mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-04 12:51:27 +00:00
annotations(steerable): remove Phase 0 canonical vocabulary discovery
Drops the optional Phase 0 vocabulary-discovery feature entirely.
With the new structured action records (Phase 1a + 1b) providing
cross-episode consistency via the deterministic template renderer,
the older vocabulary-constraint path is redundant and adds a second
constraint mechanism that wasn't well-validated in practice.
Removed:
* src/lerobot/annotations/steerable_pipeline/vocabulary.py
(Vocabulary dataclass + VocabularyDiscoveryModule + load_/
save_vocabulary helpers; canonical_vocabulary.json on-disk format)
* src/lerobot/annotations/steerable_pipeline/prompts/module_0_vocabulary.txt
(Phase 0 VLM prompt)
* tests/annotations/test_vocabulary.py
Pruned wiring across:
* config.py: VocabularyConfig dataclass + AnnotationPipelineConfig.
vocabulary field
* executor.py: vocabulary attribute on Executor + _run_vocabulary_
phase method + Phase 0 phases.append call in run()
* modules/plan_subtasks_memory.py: Vocabulary import + vocabulary
attribute + _subtask_vocabulary_block / _memory_vocabulary_block
helpers + _canonicalize_subtask / _normalize / _invalid_subtasks
/ _build_subtask_retry_message methods + vocabulary-gated retry
path in _generate_subtasks + empty-episode warning + _NORMALIZE_
STRIP_TOKENS constant
* prompts/module_1_subtasks.txt: {vocabulary_block} placeholder
* prompts/module_1_memory.txt: {vocabulary_block} placeholder
* __init__.py: Vocabulary / VocabularyDiscoveryModule / load_
vocabulary / save_vocabulary / vocabulary_path / VOCABULARY_
FILENAME re-exports
* scripts/lerobot_annotate.py: VocabularyDiscoveryModule import +
instantiation + executor argument
* examples/annotations/run_hf_job.py: --vocabulary.enabled=false
flag + docstring references + inline phase-0 comment
The original free-form rephrasings path stays (PlanConfig.
n_task_rephrasings still works when task_aug_axes.enabled=False).
Action records remain the preferred mechanism for cross-episode
subtask consistency.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -26,25 +26,11 @@ outputs are staged per-episode before a final parquet rewrite:
|
||||
|
||||
from .config import AnnotationPipelineConfig
|
||||
from .validator import StagingValidator, ValidationReport
|
||||
from .vocabulary import (
|
||||
VOCABULARY_FILENAME,
|
||||
Vocabulary,
|
||||
VocabularyDiscoveryModule,
|
||||
load_vocabulary,
|
||||
save_vocabulary,
|
||||
vocabulary_path,
|
||||
)
|
||||
from .writer import LanguageColumnsWriter
|
||||
|
||||
__all__ = [
|
||||
"VOCABULARY_FILENAME",
|
||||
"AnnotationPipelineConfig",
|
||||
"LanguageColumnsWriter",
|
||||
"StagingValidator",
|
||||
"ValidationReport",
|
||||
"Vocabulary",
|
||||
"VocabularyDiscoveryModule",
|
||||
"load_vocabulary",
|
||||
"save_vocabulary",
|
||||
"vocabulary_path",
|
||||
]
|
||||
|
||||
@@ -21,41 +21,6 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass
|
||||
class VocabularyConfig:
|
||||
"""Phase 0 — dataset-level canonical vocabulary discovery.
|
||||
|
||||
Watches the first ``sample_episodes`` episode videos and asks the VLM
|
||||
to derive a small canonical vocabulary (subtask labels + memory
|
||||
milestones) that every episode in the dataset will reuse. The VLM
|
||||
decides the count itself from what it sees in the clips — short
|
||||
pick-and-place demos get ~6 labels, longer multi-step recipes more.
|
||||
The output lands at ``meta/canonical_vocabulary.json`` and feeds
|
||||
phase 1's subtask + memory generation as both a prompt-side
|
||||
constraint and a post-VLM validation gate.
|
||||
|
||||
Why this exists: free-form LLM rephrasing per episode produces near-
|
||||
unique subtask strings, which makes the downstream low-level policy's
|
||||
conditioning effectively noise — at inference the policy generates a
|
||||
*new* paraphrase the action expert has never seen and produces tiny
|
||||
cautious actions. Forcing every episode onto the same small set of
|
||||
canonical strings gives the action expert dense supervision per
|
||||
string and a small target distribution to learn against.
|
||||
|
||||
Set ``enabled=False`` to fall back to free-form generation (original
|
||||
behaviour). ``reuse_existing=True`` keeps a hand-edited vocabulary
|
||||
file from being clobbered on re-runs.
|
||||
"""
|
||||
|
||||
enabled: bool = True
|
||||
sample_episodes: int = 3
|
||||
max_video_frames_per_episode: int = 32
|
||||
# When True (default), an existing meta/canonical_vocabulary.json is
|
||||
# loaded as-is and no VLM call is made — lets operators hand-edit the
|
||||
# file. Set False to always rediscover from the sample episodes.
|
||||
reuse_existing: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class PlanConfig:
|
||||
"""``plan`` module: plan + subtasks + memory + task augmentation.
|
||||
@@ -351,7 +316,6 @@ class AnnotationPipelineConfig:
|
||||
|
||||
seed: int = 1729
|
||||
|
||||
vocabulary: VocabularyConfig = field(default_factory=VocabularyConfig)
|
||||
plan: PlanConfig = field(default_factory=PlanConfig)
|
||||
interjections: InterjectionsConfig = field(default_factory=InterjectionsConfig)
|
||||
vqa: VqaConfig = field(default_factory=VqaConfig)
|
||||
|
||||
@@ -94,7 +94,6 @@ class Executor:
|
||||
vqa: Any # GeneralVqaModule
|
||||
writer: LanguageColumnsWriter
|
||||
validator: StagingValidator
|
||||
vocabulary: Any = None # VocabularyDiscoveryModule | None
|
||||
|
||||
def run(self, root: Path) -> PipelineRunSummary:
|
||||
records = list(iter_episodes(root, only_episodes=self.config.only_episodes))
|
||||
@@ -109,10 +108,6 @@ class Executor:
|
||||
|
||||
phases: list[PhaseResult] = []
|
||||
|
||||
# Phase 0: vocabulary discovery. Mutates ``self.plan.vocabulary``
|
||||
# so subsequent per-episode plan calls see the canonical labels.
|
||||
phases.append(self._run_vocabulary_phase(records, root))
|
||||
|
||||
# Phase 1: ``plan`` module (plan + subtasks + memory)
|
||||
phases.append(self._run_module_phase("plan", records, staging_dir, self.plan))
|
||||
# Phase 2: ``interjections`` module (interjections + speech). It
|
||||
@@ -183,62 +178,6 @@ class Executor:
|
||||
flush=True,
|
||||
)
|
||||
|
||||
def _run_vocabulary_phase(
|
||||
self, records: list[EpisodeRecord], root: Path
|
||||
) -> PhaseResult:
|
||||
"""Discover (or load) the canonical vocabulary, wire it into ``self.plan``.
|
||||
|
||||
Returns a ``PhaseResult`` whose ``episodes_processed`` is the number
|
||||
of sample episodes consulted (0 when disabled or no VLM call was
|
||||
needed); ``episodes_skipped`` is always ``0`` because vocabulary is
|
||||
a once-per-dataset artifact, not a per-episode product.
|
||||
"""
|
||||
from .vocabulary import load_vocabulary, save_vocabulary # noqa: PLC0415
|
||||
|
||||
if self.vocabulary is None or not getattr(self.vocabulary, "enabled", False):
|
||||
print(
|
||||
"[annotate] phase=vocabulary skipped (module disabled or unset)",
|
||||
flush=True,
|
||||
)
|
||||
return PhaseResult(name="vocabulary", episodes_processed=0, episodes_skipped=0)
|
||||
|
||||
existing = load_vocabulary(root)
|
||||
if existing is not None and self.config.vocabulary.reuse_existing:
|
||||
print(
|
||||
f"[annotate] phase=vocabulary reusing {root / 'meta' / 'canonical_vocabulary.json'} "
|
||||
f"({len(existing.subtasks)} subtask labels, "
|
||||
f"{len(existing.memory_milestones)} memory milestones)",
|
||||
flush=True,
|
||||
)
|
||||
self.plan.vocabulary = existing
|
||||
return PhaseResult(name="vocabulary", episodes_processed=0, episodes_skipped=0)
|
||||
|
||||
sample_n = max(1, min(int(self.config.vocabulary.sample_episodes), len(records)))
|
||||
print(
|
||||
f"[annotate] phase=vocabulary discovering from {sample_n} sample episode(s)...",
|
||||
flush=True,
|
||||
)
|
||||
t0 = time.time()
|
||||
vocab = self.vocabulary.discover(records[:sample_n], existing=existing)
|
||||
if vocab is None:
|
||||
print(
|
||||
"[annotate] phase=vocabulary returned no vocabulary — "
|
||||
"plan module will fall back to free-form generation",
|
||||
flush=True,
|
||||
)
|
||||
return PhaseResult(name="vocabulary", episodes_processed=0, episodes_skipped=0)
|
||||
|
||||
save_path = save_vocabulary(root, vocab)
|
||||
print(
|
||||
f"[annotate] phase=vocabulary wrote {save_path} "
|
||||
f"({len(vocab.subtasks)} subtask labels, "
|
||||
f"{len(vocab.memory_milestones)} memory milestones) in "
|
||||
f"{time.time() - t0:.1f}s",
|
||||
flush=True,
|
||||
)
|
||||
self.plan.vocabulary = vocab
|
||||
return PhaseResult(name="vocabulary", episodes_processed=sample_n, episodes_skipped=0)
|
||||
|
||||
def _run_module_phase(
|
||||
self,
|
||||
name: str,
|
||||
|
||||
@@ -37,7 +37,6 @@ from ..prompts import load as load_prompt
|
||||
from ..reader import EpisodeRecord, reconstruct_subtask_spans, snap_to_frame
|
||||
from ..staging import EpisodeStaging
|
||||
from ..vlm_client import VlmClient
|
||||
from ..vocabulary import Vocabulary
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -60,11 +59,6 @@ class PlanSubtasksMemoryModule:
|
||||
vlm: VlmClient
|
||||
config: PlanConfig
|
||||
frame_provider: FrameProvider = field(default_factory=null_provider)
|
||||
vocabulary: Vocabulary | None = None
|
||||
"""When set, the module constrains subtask + memory generation to the
|
||||
canonical strings in ``vocabulary``. Phase 0 (vocabulary discovery)
|
||||
populates this once per dataset; ``None`` falls back to free-form
|
||||
generation (original behaviour)."""
|
||||
|
||||
@property
|
||||
def enabled(self) -> bool:
|
||||
@@ -575,28 +569,9 @@ class PlanSubtasksMemoryModule:
|
||||
min_subtask_seconds=self.config.min_subtask_seconds,
|
||||
max_steps=self.config.plan_max_steps,
|
||||
episode_duration=f"{episode_duration:.3f}",
|
||||
vocabulary_block=self._subtask_vocabulary_block(),
|
||||
)
|
||||
messages = self._video_message(record, prompt)
|
||||
spans = self._vlm_field(messages, "subtasks")
|
||||
# When a vocabulary is in force, do a single targeted retry if
|
||||
# any returned subtask is off-vocab — strict exact-match only,
|
||||
# no fuzzy snapping. The retry includes the offending strings
|
||||
# and the full canonical list so the VLM can correct itself.
|
||||
if self.vocabulary is not None and self.vocabulary.subtasks and spans:
|
||||
invalid = self._invalid_subtasks(spans)
|
||||
if invalid:
|
||||
logger.info(
|
||||
"episode %d: VLM emitted %d off-vocab subtask(s) (%s); retrying once",
|
||||
record.episode_index,
|
||||
len(invalid),
|
||||
invalid,
|
||||
)
|
||||
retry_msg = self._build_subtask_retry_message(messages, invalid)
|
||||
retried = self._vlm_field(retry_msg, "subtasks")
|
||||
if retried:
|
||||
spans = retried
|
||||
|
||||
if not spans:
|
||||
return []
|
||||
# clamp to [t0, t_last] and sort
|
||||
@@ -614,21 +589,11 @@ class PlanSubtasksMemoryModule:
|
||||
end = max(t0, min(end, t_last))
|
||||
if end < start:
|
||||
start, end = end, start
|
||||
if not text:
|
||||
continue
|
||||
text = self._canonicalize_subtask(text)
|
||||
if not text:
|
||||
continue
|
||||
cleaned.append({"text": text, "start": start, "end": end})
|
||||
cleaned.sort(key=lambda s: s["start"])
|
||||
cleaned = self._dedupe_starts_to_distinct_frames(cleaned, record)
|
||||
if self.vocabulary is not None and self.vocabulary.subtasks and not cleaned:
|
||||
logger.warning(
|
||||
"episode %d: every VLM subtask was off-vocab even after retry — "
|
||||
"episode left empty (extend meta/canonical_vocabulary.json to "
|
||||
"cover the missing phase)",
|
||||
record.episode_index,
|
||||
)
|
||||
return cleaned
|
||||
|
||||
@staticmethod
|
||||
@@ -679,132 +644,6 @@ class PlanSubtasksMemoryModule:
|
||||
out.append(new_span)
|
||||
return out
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Canonical-vocabulary helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _subtask_vocabulary_block(self) -> str:
|
||||
"""Bullet-list of canonical subtasks the VLM must pick from.
|
||||
|
||||
Returns an empty string when no vocabulary is configured —
|
||||
``module_1_subtasks.txt`` then falls back to its free-form
|
||||
rules (original behaviour).
|
||||
"""
|
||||
if self.vocabulary is None or not self.vocabulary.subtasks:
|
||||
return ""
|
||||
bullets = "\n".join(f"- {s}" for s in self.vocabulary.subtasks)
|
||||
return (
|
||||
"You MUST choose each subtask label verbatim from this canonical "
|
||||
"vocabulary — pick the closest match for each phase of the demo, "
|
||||
"and reuse the SAME string every time that phase recurs. The "
|
||||
"low-level policy is conditioned on these exact strings; any "
|
||||
"novel paraphrase you invent will make its conditioning OOD.\n"
|
||||
"Canonical subtask labels:\n"
|
||||
f"{bullets}\n\n"
|
||||
)
|
||||
|
||||
def _memory_vocabulary_block(self) -> str:
|
||||
"""Bullet-list of canonical memory milestones the VLM must pick from."""
|
||||
if self.vocabulary is None or not self.vocabulary.memory_milestones:
|
||||
return ""
|
||||
bullets = "\n".join(f"- {m}" for m in self.vocabulary.memory_milestones)
|
||||
return (
|
||||
"Compose the memory by picking ONLY from this canonical milestone "
|
||||
"list — append a milestone (or rewrite the running memory to "
|
||||
"compress past ones) using these exact phrases. Do not invent new "
|
||||
"wording: every paraphrase weakens the downstream conditioning.\n"
|
||||
"Canonical memory milestones:\n"
|
||||
f"{bullets}\n\n"
|
||||
)
|
||||
|
||||
_NORMALIZE_STRIP_TOKENS: frozenset[str] = frozenset({"the", "a", "an"})
|
||||
|
||||
def _canonicalize_subtask(self, text: str) -> str:
|
||||
"""Validate ``text`` against the canonical vocabulary; no fuzzy snap.
|
||||
|
||||
Without a vocabulary, the original text passes through. With a
|
||||
vocabulary, accept the span only if its normalised form (lower-
|
||||
cased, articles stripped, whitespace collapsed) matches a
|
||||
canonical entry exactly — the canonical wording is returned so
|
||||
the supervised string is byte-identical across episodes.
|
||||
|
||||
Off-vocab spans are dropped (empty string). Upstream
|
||||
``_generate_subtasks`` triggers a targeted retry before reaching
|
||||
the drop path; this function never snaps or warps a span into
|
||||
a different label.
|
||||
"""
|
||||
if self.vocabulary is None or not self.vocabulary.subtasks:
|
||||
return text.strip()
|
||||
normalised = self._normalize(text)
|
||||
if not normalised:
|
||||
return ""
|
||||
for candidate in self.vocabulary.subtasks:
|
||||
if self._normalize(candidate) == normalised:
|
||||
return candidate
|
||||
return ""
|
||||
|
||||
@classmethod
|
||||
def _normalize(cls, text: str) -> str:
|
||||
"""Lowercase, strip articles, collapse whitespace, drop punctuation."""
|
||||
words = [
|
||||
w.strip(".,:;\"'!?()")
|
||||
for w in text.lower().replace(",", " ").split()
|
||||
]
|
||||
return " ".join(w for w in words if w and w not in cls._NORMALIZE_STRIP_TOKENS)
|
||||
|
||||
def _invalid_subtasks(self, spans: list[dict[str, Any]]) -> list[str]:
|
||||
"""Return the unique off-vocab subtask strings the VLM produced."""
|
||||
seen: list[str] = []
|
||||
for span in spans:
|
||||
text = str((span or {}).get("text") or "").strip()
|
||||
if not text:
|
||||
continue
|
||||
if self._canonicalize_subtask(text):
|
||||
continue
|
||||
if text not in seen:
|
||||
seen.append(text)
|
||||
return seen
|
||||
|
||||
def _build_subtask_retry_message(
|
||||
self, original_messages: list[dict[str, Any]], invalid: list[str]
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Compose a one-shot correction prompt naming the off-vocab strings."""
|
||||
assert self.vocabulary is not None
|
||||
canonical = "\n".join(f"- {s}" for s in self.vocabulary.subtasks)
|
||||
invalid_list = "\n".join(f"- {s!r}" for s in invalid)
|
||||
correction = (
|
||||
"Your previous response included subtask labels that are NOT in "
|
||||
"the canonical vocabulary:\n"
|
||||
f"{invalid_list}\n\n"
|
||||
"Re-emit the same segmentation (same number of spans, same start/end "
|
||||
"timestamps where they were valid) but replace every off-vocab "
|
||||
"label with the EXACT canonical string for that phase, copied "
|
||||
"verbatim from this list:\n"
|
||||
f"{canonical}\n\n"
|
||||
"Strict rules:\n"
|
||||
"- Output strings must be byte-for-byte identical to entries above.\n"
|
||||
"- No articles, no adverbs, no extra words.\n"
|
||||
"- If a phase truly has no canonical match, omit that span entirely.\n"
|
||||
"Return the same JSON shape as before."
|
||||
)
|
||||
# Append the correction as an additional user turn; the model
|
||||
# sees the original prompt + its prior output is implied by the
|
||||
# conversation context (the VLM client is stateless, so we
|
||||
# re-send the original content plus this correction).
|
||||
retry_messages = [
|
||||
{
|
||||
"role": m.get("role", "user"),
|
||||
"content": (
|
||||
m.get("content")
|
||||
if isinstance(m.get("content"), str)
|
||||
else list(m.get("content") or [])
|
||||
),
|
||||
}
|
||||
for m in original_messages
|
||||
]
|
||||
retry_messages.append({"role": "user", "content": correction})
|
||||
return retry_messages
|
||||
|
||||
def _generate_plan(
|
||||
self,
|
||||
record: EpisodeRecord, # noqa: ARG002 (kept for signature stability)
|
||||
@@ -866,7 +705,6 @@ class PlanSubtasksMemoryModule:
|
||||
prior_memory=prior_memory or "(none)",
|
||||
completed_subtask=completed,
|
||||
remaining_subtasks=", ".join(remaining) if remaining else "(none)",
|
||||
vocabulary_block=self._memory_vocabulary_block(),
|
||||
)
|
||||
memory = self._vlm_field(self._text_message(prompt), "memory")
|
||||
return memory.strip() if isinstance(memory, str) else ""
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
You are inspecting {n_episodes} sample episode video(s) from a teleoperated
|
||||
robot dataset. Every episode in the dataset performs the SAME task; the
|
||||
user originally asked: "{episode_task}".
|
||||
|
||||
Watch all the clips and produce a SHORT canonical vocabulary that every
|
||||
episode in this dataset will reuse. The downstream low-level policy is
|
||||
conditioned on these strings — duplicate phrasings (e.g. "grasp blue
|
||||
cube" vs "pick up the blue cube") would destroy the conditioning, so
|
||||
pick one wording per concept and reuse it everywhere.
|
||||
|
||||
Decide how many entries each list needs YOURSELF based on what you see —
|
||||
the smallest set that still covers every recurring phase in the demos.
|
||||
A simple two-object pick-and-place might need ~6 subtask labels and 2
|
||||
memory milestones; a long multi-step recipe needs more. Err on the side
|
||||
of FEWER — extra entries that don't recur across episodes weaken the
|
||||
conditioning.
|
||||
|
||||
You output two lists:
|
||||
|
||||
1. `subtasks`: imperative, telegraphic commands the robot can execute.
|
||||
- Verb-first. Drop articles, adverbs, qualifiers.
|
||||
- Consistent object nouns (if the task says "cube", every subtask says
|
||||
"cube" — never "block" / "object").
|
||||
- Atomic — one skill per subtask (gripper-open events, contact, regrasps,
|
||||
transitions all become cut points).
|
||||
- Each label must recur across the demos. If you see a motion only
|
||||
once across all sample clips, it probably isn't a canonical phase.
|
||||
- Good: "move to blue cube", "grasp blue cube", "lift blue cube",
|
||||
"place blue cube in box", "release blue cube", "retract arm".
|
||||
- Bad: "the robot arm moves towards the blue cube" (third person,
|
||||
too long), "carefully pick up the cube" (adverb, article),
|
||||
"carrying the yellow cube over the green basket" (gerund — should
|
||||
be imperative "transport yellow cube to green basket").
|
||||
|
||||
2. `memory_milestones`: first-person past-tense sentences the running
|
||||
memory composes from. Each subtask phase that produces a lasting
|
||||
change should have a milestone; transient motions (move, retract)
|
||||
should NOT.
|
||||
- First person, past tense. Start with "I".
|
||||
- One sentence. Functional outcome only — no grasp / motion detail.
|
||||
- Good: "I picked up the blue cube.", "I placed the blue cube in
|
||||
the green box.", "I wiped the counter."
|
||||
- Bad: "The robot arm grasped the blue cube." (third person),
|
||||
"I carefully grasped the blue cube with the parallel gripper."
|
||||
(irrelevant detail), "I moved towards the blue cube." (transient
|
||||
motion — should be omitted, not memorialised).
|
||||
|
||||
Output strictly valid JSON of shape:
|
||||
|
||||
{{
|
||||
"subtasks": ["<verb phrase>", ...],
|
||||
"memory_milestones": ["I <past-tense sentence>.", ...]
|
||||
}}
|
||||
@@ -13,7 +13,7 @@ Previous memory: {prior_memory}
|
||||
Just-completed subtask: "{completed_subtask}"
|
||||
Remaining subtasks (for relevance judgement only): {remaining_subtasks}
|
||||
|
||||
{vocabulary_block}Write the memory as a short FIRST-PERSON, PAST-TENSE narrative of what the
|
||||
Write the memory as a short FIRST-PERSON, PAST-TENSE narrative of what the
|
||||
robot has accomplished so far — the running story it would tell itself.
|
||||
|
||||
Authoring rules:
|
||||
|
||||
@@ -6,7 +6,7 @@ You are shown the entire demonstration as a single video. Watch the
|
||||
whole clip, then segment it into a list of consecutive atomic subtasks
|
||||
the robot performs.
|
||||
|
||||
{vocabulary_block}Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
|
||||
Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
|
||||
|
||||
- Each subtask = one COMPOSITE atomic skill the low-level policy can
|
||||
execute end-to-end. A "skill" bundles its own approach motion with
|
||||
|
||||
@@ -1,222 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Dataset-level canonical vocabulary discovery (Phase 0).
|
||||
|
||||
The downstream consumer of these annotations is a low-level action expert
|
||||
conditioned on the ``subtask`` string. Free-form per-episode LLM rephrasing
|
||||
gives near-unique strings per occurrence, which collapses the action
|
||||
expert's conditioning to noise and makes runtime subtask-paraphrase drift
|
||||
catastrophic. The Hi-Robot / π0.6-MEM recipe ships a small canonical
|
||||
vocabulary per environment (~10 strings) that every episode reuses; this
|
||||
module derives that vocabulary automatically from the first few episode
|
||||
videos and persists it next to the dataset.
|
||||
|
||||
Pipeline-level flow:
|
||||
|
||||
Phase 0 (here): watch N sample episodes → produce vocabulary.json
|
||||
Phase 1 (plan module): reuse vocabulary on every episode, both as
|
||||
prompt-side constraint *and* post-VLM validation
|
||||
|
||||
The vocabulary is JSON, lives at ``<root>/meta/canonical_vocabulary.json``,
|
||||
and is human-inspectable / hand-editable — if the discovered set is wrong,
|
||||
operators edit the file and re-run the pipeline without phase 0.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from collections.abc import Sequence
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .config import VocabularyConfig
|
||||
from .frames import FrameProvider, null_provider, to_video_block
|
||||
from .prompts import load as load_prompt
|
||||
from .reader import EpisodeRecord
|
||||
from .vlm_client import VlmClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
VOCABULARY_FILENAME = "canonical_vocabulary.json"
|
||||
|
||||
|
||||
@dataclass
|
||||
class Vocabulary:
|
||||
"""Canonical phrasings shared across every episode of one dataset.
|
||||
|
||||
Both lists are strict: per-episode subtask + memory generation pick
|
||||
from these strings only; the downstream policy then has a small,
|
||||
repeatable target distribution to learn instead of thousands of
|
||||
LLM paraphrases.
|
||||
"""
|
||||
|
||||
subtasks: tuple[str, ...]
|
||||
"""Imperative subtask labels — what the low-level policy is conditioned
|
||||
on. Verb-first, telegraphic, consistent object nouns. Example:
|
||||
``("move to blue cube", "grasp blue cube", "lift blue cube",
|
||||
"place blue cube in box", "retract arm")``.
|
||||
"""
|
||||
|
||||
memory_milestones: tuple[str, ...]
|
||||
"""First-person past-tense milestone sentences — building blocks for
|
||||
the running memory string. Example: ``("I picked up the blue cube.",
|
||||
"I placed the blue cube in the green box.")``. Each milestone maps
|
||||
1:1 onto a completed subtask phase; ``memory_at_step_k`` is the
|
||||
concatenation of milestones for completed phases.
|
||||
"""
|
||||
|
||||
def to_json(self) -> dict[str, list[str]]:
|
||||
return {
|
||||
"subtasks": list(self.subtasks),
|
||||
"memory_milestones": list(self.memory_milestones),
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, payload: dict[str, Any]) -> Vocabulary:
|
||||
subtasks = tuple(
|
||||
str(s).strip() for s in (payload.get("subtasks") or []) if str(s).strip()
|
||||
)
|
||||
memory_milestones = tuple(
|
||||
str(s).strip() for s in (payload.get("memory_milestones") or []) if str(s).strip()
|
||||
)
|
||||
return cls(subtasks=subtasks, memory_milestones=memory_milestones)
|
||||
|
||||
def is_empty(self) -> bool:
|
||||
return not self.subtasks and not self.memory_milestones
|
||||
|
||||
|
||||
def vocabulary_path(root: Path) -> Path:
|
||||
"""Return the canonical on-disk location for the vocabulary file."""
|
||||
return root / "meta" / VOCABULARY_FILENAME
|
||||
|
||||
|
||||
def load_vocabulary(root: Path) -> Vocabulary | None:
|
||||
"""Read ``<root>/meta/canonical_vocabulary.json`` if present.
|
||||
|
||||
Returns ``None`` when the file does not exist — callers fall back to
|
||||
free-form (unconstrained) subtask + memory generation, preserving the
|
||||
pipeline's behaviour on datasets that never ran phase 0.
|
||||
"""
|
||||
path = vocabulary_path(root)
|
||||
if not path.exists():
|
||||
return None
|
||||
try:
|
||||
payload = json.loads(path.read_text(encoding="utf-8"))
|
||||
except (OSError, json.JSONDecodeError) as exc:
|
||||
logger.warning("could not read %s: %s — proceeding without vocabulary", path, exc)
|
||||
return None
|
||||
if not isinstance(payload, dict):
|
||||
logger.warning("%s is not a JSON object — ignoring", path)
|
||||
return None
|
||||
vocab = Vocabulary.from_json(payload)
|
||||
if vocab.is_empty():
|
||||
return None
|
||||
return vocab
|
||||
|
||||
|
||||
def save_vocabulary(root: Path, vocab: Vocabulary) -> Path:
|
||||
"""Atomically persist ``vocab`` to ``<root>/meta/canonical_vocabulary.json``."""
|
||||
path = vocabulary_path(root)
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
tmp = path.with_suffix(path.suffix + ".tmp")
|
||||
tmp.write_text(
|
||||
json.dumps(vocab.to_json(), indent=2, ensure_ascii=False) + "\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
tmp.replace(path)
|
||||
return path
|
||||
|
||||
|
||||
@dataclass
|
||||
class VocabularyDiscoveryModule:
|
||||
"""Derive a dataset-level canonical vocabulary from sample episodes.
|
||||
|
||||
Phase 0 of the executor: pulls ``config.sample_episodes`` episode
|
||||
videos, packs them into one Qwen-VL multi-video prompt, and asks the
|
||||
model to enumerate the small set of canonical subtask labels +
|
||||
memory milestones that recur across them. The output is persisted
|
||||
to ``meta/canonical_vocabulary.json`` and consumed by phase 1.
|
||||
"""
|
||||
|
||||
vlm: VlmClient
|
||||
config: VocabularyConfig
|
||||
frame_provider: FrameProvider = field(default_factory=null_provider)
|
||||
|
||||
@property
|
||||
def enabled(self) -> bool:
|
||||
return self.config.enabled
|
||||
|
||||
def discover(
|
||||
self,
|
||||
records: Sequence[EpisodeRecord],
|
||||
*,
|
||||
existing: Vocabulary | None = None,
|
||||
) -> Vocabulary | None:
|
||||
"""Run vocabulary discovery against the first N sample episodes.
|
||||
|
||||
``existing`` short-circuits the VLM call when ``config.reuse_existing``
|
||||
is True and an on-disk vocabulary is already present — keeps re-runs
|
||||
cheap and lets operators hand-edit the file without it getting
|
||||
overwritten.
|
||||
"""
|
||||
if existing is not None and self.config.reuse_existing:
|
||||
logger.info(
|
||||
"vocabulary: reusing existing (%d subtasks, %d memory milestones)",
|
||||
len(existing.subtasks),
|
||||
len(existing.memory_milestones),
|
||||
)
|
||||
return existing
|
||||
|
||||
sample = list(records[: max(1, int(self.config.sample_episodes))])
|
||||
if not sample:
|
||||
return None
|
||||
|
||||
task_hint = next((r.episode_task for r in sample if r.episode_task), "")
|
||||
prompt = load_prompt("module_0_vocabulary").format(
|
||||
episode_task=task_hint or "(unspecified)",
|
||||
n_episodes=len(sample),
|
||||
)
|
||||
# Pack one video block per sample episode so the VLM sees the
|
||||
# variation across episodes (different starting poses, different
|
||||
# object placements) rather than overfitting to one trajectory.
|
||||
content: list[dict[str, Any]] = []
|
||||
for record in sample:
|
||||
video_frames = self.frame_provider.video_for_episode(
|
||||
record, int(self.config.max_video_frames_per_episode)
|
||||
)
|
||||
if video_frames:
|
||||
content.extend(to_video_block(video_frames))
|
||||
content.append({"type": "text", "text": prompt})
|
||||
messages = [{"role": "user", "content": content}]
|
||||
|
||||
result = self.vlm.generate_json([messages])[0]
|
||||
if not isinstance(result, dict):
|
||||
logger.warning("vocabulary: VLM did not return a JSON object — skipping")
|
||||
return None
|
||||
|
||||
vocab = Vocabulary.from_json(result)
|
||||
if vocab.is_empty():
|
||||
logger.warning("vocabulary: VLM returned an empty vocabulary — skipping")
|
||||
return None
|
||||
logger.info(
|
||||
"vocabulary: discovered %d subtask labels + %d memory milestones from %d episodes",
|
||||
len(vocab.subtasks),
|
||||
len(vocab.memory_milestones),
|
||||
len(sample),
|
||||
)
|
||||
return vocab
|
||||
@@ -40,7 +40,6 @@ from lerobot.annotations.steerable_pipeline.modules import (
|
||||
)
|
||||
from lerobot.annotations.steerable_pipeline.validator import StagingValidator
|
||||
from lerobot.annotations.steerable_pipeline.vlm_client import make_vlm_client
|
||||
from lerobot.annotations.steerable_pipeline.vocabulary import VocabularyDiscoveryModule
|
||||
from lerobot.annotations.steerable_pipeline.writer import LanguageColumnsWriter
|
||||
from lerobot.configs import parser
|
||||
|
||||
@@ -89,9 +88,6 @@ def annotate(cfg: AnnotationPipelineConfig) -> None:
|
||||
vlm=vlm, config=cfg.interjections, seed=cfg.seed, frame_provider=frame_provider
|
||||
)
|
||||
vqa = GeneralVqaModule(vlm=vlm, config=cfg.vqa, seed=cfg.seed, frame_provider=frame_provider)
|
||||
vocabulary = VocabularyDiscoveryModule(
|
||||
vlm=vlm, config=cfg.vocabulary, frame_provider=frame_provider
|
||||
)
|
||||
writer = LanguageColumnsWriter()
|
||||
validator = StagingValidator(
|
||||
dataset_camera_keys=tuple(getattr(frame_provider, "camera_keys", []) or []) or None,
|
||||
@@ -102,7 +98,6 @@ def annotate(cfg: AnnotationPipelineConfig) -> None:
|
||||
plan=plan,
|
||||
interjections=interjections,
|
||||
vqa=vqa,
|
||||
vocabulary=vocabulary,
|
||||
writer=writer,
|
||||
validator=validator,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user