annotations(steerable): remove Phase 0 canonical vocabulary discovery

Drops the optional Phase 0 vocabulary-discovery feature entirely.
With the new structured action records (Phase 1a + 1b) providing
cross-episode consistency via the deterministic template renderer,
the older vocabulary-constraint path is redundant and adds a second
constraint mechanism that wasn't well-validated in practice.

Removed:
  * src/lerobot/annotations/steerable_pipeline/vocabulary.py
    (Vocabulary dataclass + VocabularyDiscoveryModule + load_/
    save_vocabulary helpers; canonical_vocabulary.json on-disk format)
  * src/lerobot/annotations/steerable_pipeline/prompts/module_0_vocabulary.txt
    (Phase 0 VLM prompt)
  * tests/annotations/test_vocabulary.py

Pruned wiring across:
  * config.py: VocabularyConfig dataclass + AnnotationPipelineConfig.
    vocabulary field
  * executor.py: vocabulary attribute on Executor + _run_vocabulary_
    phase method + Phase 0 phases.append call in run()
  * modules/plan_subtasks_memory.py: Vocabulary import + vocabulary
    attribute + _subtask_vocabulary_block / _memory_vocabulary_block
    helpers + _canonicalize_subtask / _normalize / _invalid_subtasks
    / _build_subtask_retry_message methods + vocabulary-gated retry
    path in _generate_subtasks + empty-episode warning + _NORMALIZE_
    STRIP_TOKENS constant
  * prompts/module_1_subtasks.txt: {vocabulary_block} placeholder
  * prompts/module_1_memory.txt: {vocabulary_block} placeholder
  * __init__.py: Vocabulary / VocabularyDiscoveryModule / load_
    vocabulary / save_vocabulary / vocabulary_path / VOCABULARY_
    FILENAME re-exports
  * scripts/lerobot_annotate.py: VocabularyDiscoveryModule import +
    instantiation + executor argument
  * examples/annotations/run_hf_job.py: --vocabulary.enabled=false
    flag + docstring references + inline phase-0 comment

The original free-form rephrasings path stays (PlanConfig.
n_task_rephrasings still works when task_aug_axes.enabled=False).
Action records remain the preferred mechanism for cross-episode
subtask consistency.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-06-02 11:48:05 +02:00
parent 2bfaf44db2
commit 5dbf0fac5f
11 changed files with 4 additions and 981 deletions

View File

@@ -26,25 +26,11 @@ outputs are staged per-episode before a final parquet rewrite:
from .config import AnnotationPipelineConfig
from .validator import StagingValidator, ValidationReport
from .vocabulary import (
VOCABULARY_FILENAME,
Vocabulary,
VocabularyDiscoveryModule,
load_vocabulary,
save_vocabulary,
vocabulary_path,
)
from .writer import LanguageColumnsWriter
__all__ = [
"VOCABULARY_FILENAME",
"AnnotationPipelineConfig",
"LanguageColumnsWriter",
"StagingValidator",
"ValidationReport",
"Vocabulary",
"VocabularyDiscoveryModule",
"load_vocabulary",
"save_vocabulary",
"vocabulary_path",
]

View File

@@ -21,41 +21,6 @@ from pathlib import Path
from typing import Any
@dataclass
class VocabularyConfig:
"""Phase 0 — dataset-level canonical vocabulary discovery.
Watches the first ``sample_episodes`` episode videos and asks the VLM
to derive a small canonical vocabulary (subtask labels + memory
milestones) that every episode in the dataset will reuse. The VLM
decides the count itself from what it sees in the clips — short
pick-and-place demos get ~6 labels, longer multi-step recipes more.
The output lands at ``meta/canonical_vocabulary.json`` and feeds
phase 1's subtask + memory generation as both a prompt-side
constraint and a post-VLM validation gate.
Why this exists: free-form LLM rephrasing per episode produces near-
unique subtask strings, which makes the downstream low-level policy's
conditioning effectively noise — at inference the policy generates a
*new* paraphrase the action expert has never seen and produces tiny
cautious actions. Forcing every episode onto the same small set of
canonical strings gives the action expert dense supervision per
string and a small target distribution to learn against.
Set ``enabled=False`` to fall back to free-form generation (original
behaviour). ``reuse_existing=True`` keeps a hand-edited vocabulary
file from being clobbered on re-runs.
"""
enabled: bool = True
sample_episodes: int = 3
max_video_frames_per_episode: int = 32
# When True (default), an existing meta/canonical_vocabulary.json is
# loaded as-is and no VLM call is made — lets operators hand-edit the
# file. Set False to always rediscover from the sample episodes.
reuse_existing: bool = True
@dataclass
class PlanConfig:
"""``plan`` module: plan + subtasks + memory + task augmentation.
@@ -351,7 +316,6 @@ class AnnotationPipelineConfig:
seed: int = 1729
vocabulary: VocabularyConfig = field(default_factory=VocabularyConfig)
plan: PlanConfig = field(default_factory=PlanConfig)
interjections: InterjectionsConfig = field(default_factory=InterjectionsConfig)
vqa: VqaConfig = field(default_factory=VqaConfig)

View File

@@ -94,7 +94,6 @@ class Executor:
vqa: Any # GeneralVqaModule
writer: LanguageColumnsWriter
validator: StagingValidator
vocabulary: Any = None # VocabularyDiscoveryModule | None
def run(self, root: Path) -> PipelineRunSummary:
records = list(iter_episodes(root, only_episodes=self.config.only_episodes))
@@ -109,10 +108,6 @@ class Executor:
phases: list[PhaseResult] = []
# Phase 0: vocabulary discovery. Mutates ``self.plan.vocabulary``
# so subsequent per-episode plan calls see the canonical labels.
phases.append(self._run_vocabulary_phase(records, root))
# Phase 1: ``plan`` module (plan + subtasks + memory)
phases.append(self._run_module_phase("plan", records, staging_dir, self.plan))
# Phase 2: ``interjections`` module (interjections + speech). It
@@ -183,62 +178,6 @@ class Executor:
flush=True,
)
def _run_vocabulary_phase(
self, records: list[EpisodeRecord], root: Path
) -> PhaseResult:
"""Discover (or load) the canonical vocabulary, wire it into ``self.plan``.
Returns a ``PhaseResult`` whose ``episodes_processed`` is the number
of sample episodes consulted (0 when disabled or no VLM call was
needed); ``episodes_skipped`` is always ``0`` because vocabulary is
a once-per-dataset artifact, not a per-episode product.
"""
from .vocabulary import load_vocabulary, save_vocabulary # noqa: PLC0415
if self.vocabulary is None or not getattr(self.vocabulary, "enabled", False):
print(
"[annotate] phase=vocabulary skipped (module disabled or unset)",
flush=True,
)
return PhaseResult(name="vocabulary", episodes_processed=0, episodes_skipped=0)
existing = load_vocabulary(root)
if existing is not None and self.config.vocabulary.reuse_existing:
print(
f"[annotate] phase=vocabulary reusing {root / 'meta' / 'canonical_vocabulary.json'} "
f"({len(existing.subtasks)} subtask labels, "
f"{len(existing.memory_milestones)} memory milestones)",
flush=True,
)
self.plan.vocabulary = existing
return PhaseResult(name="vocabulary", episodes_processed=0, episodes_skipped=0)
sample_n = max(1, min(int(self.config.vocabulary.sample_episodes), len(records)))
print(
f"[annotate] phase=vocabulary discovering from {sample_n} sample episode(s)...",
flush=True,
)
t0 = time.time()
vocab = self.vocabulary.discover(records[:sample_n], existing=existing)
if vocab is None:
print(
"[annotate] phase=vocabulary returned no vocabulary — "
"plan module will fall back to free-form generation",
flush=True,
)
return PhaseResult(name="vocabulary", episodes_processed=0, episodes_skipped=0)
save_path = save_vocabulary(root, vocab)
print(
f"[annotate] phase=vocabulary wrote {save_path} "
f"({len(vocab.subtasks)} subtask labels, "
f"{len(vocab.memory_milestones)} memory milestones) in "
f"{time.time() - t0:.1f}s",
flush=True,
)
self.plan.vocabulary = vocab
return PhaseResult(name="vocabulary", episodes_processed=sample_n, episodes_skipped=0)
def _run_module_phase(
self,
name: str,

View File

@@ -37,7 +37,6 @@ from ..prompts import load as load_prompt
from ..reader import EpisodeRecord, reconstruct_subtask_spans, snap_to_frame
from ..staging import EpisodeStaging
from ..vlm_client import VlmClient
from ..vocabulary import Vocabulary
logger = logging.getLogger(__name__)
@@ -60,11 +59,6 @@ class PlanSubtasksMemoryModule:
vlm: VlmClient
config: PlanConfig
frame_provider: FrameProvider = field(default_factory=null_provider)
vocabulary: Vocabulary | None = None
"""When set, the module constrains subtask + memory generation to the
canonical strings in ``vocabulary``. Phase 0 (vocabulary discovery)
populates this once per dataset; ``None`` falls back to free-form
generation (original behaviour)."""
@property
def enabled(self) -> bool:
@@ -575,28 +569,9 @@ class PlanSubtasksMemoryModule:
min_subtask_seconds=self.config.min_subtask_seconds,
max_steps=self.config.plan_max_steps,
episode_duration=f"{episode_duration:.3f}",
vocabulary_block=self._subtask_vocabulary_block(),
)
messages = self._video_message(record, prompt)
spans = self._vlm_field(messages, "subtasks")
# When a vocabulary is in force, do a single targeted retry if
# any returned subtask is off-vocab — strict exact-match only,
# no fuzzy snapping. The retry includes the offending strings
# and the full canonical list so the VLM can correct itself.
if self.vocabulary is not None and self.vocabulary.subtasks and spans:
invalid = self._invalid_subtasks(spans)
if invalid:
logger.info(
"episode %d: VLM emitted %d off-vocab subtask(s) (%s); retrying once",
record.episode_index,
len(invalid),
invalid,
)
retry_msg = self._build_subtask_retry_message(messages, invalid)
retried = self._vlm_field(retry_msg, "subtasks")
if retried:
spans = retried
if not spans:
return []
# clamp to [t0, t_last] and sort
@@ -614,21 +589,11 @@ class PlanSubtasksMemoryModule:
end = max(t0, min(end, t_last))
if end < start:
start, end = end, start
if not text:
continue
text = self._canonicalize_subtask(text)
if not text:
continue
cleaned.append({"text": text, "start": start, "end": end})
cleaned.sort(key=lambda s: s["start"])
cleaned = self._dedupe_starts_to_distinct_frames(cleaned, record)
if self.vocabulary is not None and self.vocabulary.subtasks and not cleaned:
logger.warning(
"episode %d: every VLM subtask was off-vocab even after retry — "
"episode left empty (extend meta/canonical_vocabulary.json to "
"cover the missing phase)",
record.episode_index,
)
return cleaned
@staticmethod
@@ -679,132 +644,6 @@ class PlanSubtasksMemoryModule:
out.append(new_span)
return out
# ------------------------------------------------------------------
# Canonical-vocabulary helpers
# ------------------------------------------------------------------
def _subtask_vocabulary_block(self) -> str:
"""Bullet-list of canonical subtasks the VLM must pick from.
Returns an empty string when no vocabulary is configured —
``module_1_subtasks.txt`` then falls back to its free-form
rules (original behaviour).
"""
if self.vocabulary is None or not self.vocabulary.subtasks:
return ""
bullets = "\n".join(f"- {s}" for s in self.vocabulary.subtasks)
return (
"You MUST choose each subtask label verbatim from this canonical "
"vocabulary — pick the closest match for each phase of the demo, "
"and reuse the SAME string every time that phase recurs. The "
"low-level policy is conditioned on these exact strings; any "
"novel paraphrase you invent will make its conditioning OOD.\n"
"Canonical subtask labels:\n"
f"{bullets}\n\n"
)
def _memory_vocabulary_block(self) -> str:
"""Bullet-list of canonical memory milestones the VLM must pick from."""
if self.vocabulary is None or not self.vocabulary.memory_milestones:
return ""
bullets = "\n".join(f"- {m}" for m in self.vocabulary.memory_milestones)
return (
"Compose the memory by picking ONLY from this canonical milestone "
"list — append a milestone (or rewrite the running memory to "
"compress past ones) using these exact phrases. Do not invent new "
"wording: every paraphrase weakens the downstream conditioning.\n"
"Canonical memory milestones:\n"
f"{bullets}\n\n"
)
_NORMALIZE_STRIP_TOKENS: frozenset[str] = frozenset({"the", "a", "an"})
def _canonicalize_subtask(self, text: str) -> str:
"""Validate ``text`` against the canonical vocabulary; no fuzzy snap.
Without a vocabulary, the original text passes through. With a
vocabulary, accept the span only if its normalised form (lower-
cased, articles stripped, whitespace collapsed) matches a
canonical entry exactly — the canonical wording is returned so
the supervised string is byte-identical across episodes.
Off-vocab spans are dropped (empty string). Upstream
``_generate_subtasks`` triggers a targeted retry before reaching
the drop path; this function never snaps or warps a span into
a different label.
"""
if self.vocabulary is None or not self.vocabulary.subtasks:
return text.strip()
normalised = self._normalize(text)
if not normalised:
return ""
for candidate in self.vocabulary.subtasks:
if self._normalize(candidate) == normalised:
return candidate
return ""
@classmethod
def _normalize(cls, text: str) -> str:
"""Lowercase, strip articles, collapse whitespace, drop punctuation."""
words = [
w.strip(".,:;\"'!?()")
for w in text.lower().replace(",", " ").split()
]
return " ".join(w for w in words if w and w not in cls._NORMALIZE_STRIP_TOKENS)
def _invalid_subtasks(self, spans: list[dict[str, Any]]) -> list[str]:
"""Return the unique off-vocab subtask strings the VLM produced."""
seen: list[str] = []
for span in spans:
text = str((span or {}).get("text") or "").strip()
if not text:
continue
if self._canonicalize_subtask(text):
continue
if text not in seen:
seen.append(text)
return seen
def _build_subtask_retry_message(
self, original_messages: list[dict[str, Any]], invalid: list[str]
) -> list[dict[str, Any]]:
"""Compose a one-shot correction prompt naming the off-vocab strings."""
assert self.vocabulary is not None
canonical = "\n".join(f"- {s}" for s in self.vocabulary.subtasks)
invalid_list = "\n".join(f"- {s!r}" for s in invalid)
correction = (
"Your previous response included subtask labels that are NOT in "
"the canonical vocabulary:\n"
f"{invalid_list}\n\n"
"Re-emit the same segmentation (same number of spans, same start/end "
"timestamps where they were valid) but replace every off-vocab "
"label with the EXACT canonical string for that phase, copied "
"verbatim from this list:\n"
f"{canonical}\n\n"
"Strict rules:\n"
"- Output strings must be byte-for-byte identical to entries above.\n"
"- No articles, no adverbs, no extra words.\n"
"- If a phase truly has no canonical match, omit that span entirely.\n"
"Return the same JSON shape as before."
)
# Append the correction as an additional user turn; the model
# sees the original prompt + its prior output is implied by the
# conversation context (the VLM client is stateless, so we
# re-send the original content plus this correction).
retry_messages = [
{
"role": m.get("role", "user"),
"content": (
m.get("content")
if isinstance(m.get("content"), str)
else list(m.get("content") or [])
),
}
for m in original_messages
]
retry_messages.append({"role": "user", "content": correction})
return retry_messages
def _generate_plan(
self,
record: EpisodeRecord, # noqa: ARG002 (kept for signature stability)
@@ -866,7 +705,6 @@ class PlanSubtasksMemoryModule:
prior_memory=prior_memory or "(none)",
completed_subtask=completed,
remaining_subtasks=", ".join(remaining) if remaining else "(none)",
vocabulary_block=self._memory_vocabulary_block(),
)
memory = self._vlm_field(self._text_message(prompt), "memory")
return memory.strip() if isinstance(memory, str) else ""

View File

@@ -1,53 +0,0 @@
You are inspecting {n_episodes} sample episode video(s) from a teleoperated
robot dataset. Every episode in the dataset performs the SAME task; the
user originally asked: "{episode_task}".
Watch all the clips and produce a SHORT canonical vocabulary that every
episode in this dataset will reuse. The downstream low-level policy is
conditioned on these strings — duplicate phrasings (e.g. "grasp blue
cube" vs "pick up the blue cube") would destroy the conditioning, so
pick one wording per concept and reuse it everywhere.
Decide how many entries each list needs YOURSELF based on what you see —
the smallest set that still covers every recurring phase in the demos.
A simple two-object pick-and-place might need ~6 subtask labels and 2
memory milestones; a long multi-step recipe needs more. Err on the side
of FEWER — extra entries that don't recur across episodes weaken the
conditioning.
You output two lists:
1. `subtasks`: imperative, telegraphic commands the robot can execute.
- Verb-first. Drop articles, adverbs, qualifiers.
- Consistent object nouns (if the task says "cube", every subtask says
"cube" — never "block" / "object").
- Atomic — one skill per subtask (gripper-open events, contact, regrasps,
transitions all become cut points).
- Each label must recur across the demos. If you see a motion only
once across all sample clips, it probably isn't a canonical phase.
- Good: "move to blue cube", "grasp blue cube", "lift blue cube",
"place blue cube in box", "release blue cube", "retract arm".
- Bad: "the robot arm moves towards the blue cube" (third person,
too long), "carefully pick up the cube" (adverb, article),
"carrying the yellow cube over the green basket" (gerund — should
be imperative "transport yellow cube to green basket").
2. `memory_milestones`: first-person past-tense sentences the running
memory composes from. Each subtask phase that produces a lasting
change should have a milestone; transient motions (move, retract)
should NOT.
- First person, past tense. Start with "I".
- One sentence. Functional outcome only — no grasp / motion detail.
- Good: "I picked up the blue cube.", "I placed the blue cube in
the green box.", "I wiped the counter."
- Bad: "The robot arm grasped the blue cube." (third person),
"I carefully grasped the blue cube with the parallel gripper."
(irrelevant detail), "I moved towards the blue cube." (transient
motion — should be omitted, not memorialised).
Output strictly valid JSON of shape:
{{
"subtasks": ["<verb phrase>", ...],
"memory_milestones": ["I <past-tense sentence>.", ...]
}}

View File

@@ -13,7 +13,7 @@ Previous memory: {prior_memory}
Just-completed subtask: "{completed_subtask}"
Remaining subtasks (for relevance judgement only): {remaining_subtasks}
{vocabulary_block}Write the memory as a short FIRST-PERSON, PAST-TENSE narrative of what the
Write the memory as a short FIRST-PERSON, PAST-TENSE narrative of what the
robot has accomplished so far — the running story it would tell itself.
Authoring rules:

View File

@@ -6,7 +6,7 @@ You are shown the entire demonstration as a single video. Watch the
whole clip, then segment it into a list of consecutive atomic subtasks
the robot performs.
{vocabulary_block}Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
- Each subtask = one COMPOSITE atomic skill the low-level policy can
execute end-to-end. A "skill" bundles its own approach motion with

View File

@@ -1,222 +0,0 @@
#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataset-level canonical vocabulary discovery (Phase 0).
The downstream consumer of these annotations is a low-level action expert
conditioned on the ``subtask`` string. Free-form per-episode LLM rephrasing
gives near-unique strings per occurrence, which collapses the action
expert's conditioning to noise and makes runtime subtask-paraphrase drift
catastrophic. The Hi-Robot / π0.6-MEM recipe ships a small canonical
vocabulary per environment (~10 strings) that every episode reuses; this
module derives that vocabulary automatically from the first few episode
videos and persists it next to the dataset.
Pipeline-level flow:
Phase 0 (here): watch N sample episodes → produce vocabulary.json
Phase 1 (plan module): reuse vocabulary on every episode, both as
prompt-side constraint *and* post-VLM validation
The vocabulary is JSON, lives at ``<root>/meta/canonical_vocabulary.json``,
and is human-inspectable / hand-editable — if the discovered set is wrong,
operators edit the file and re-run the pipeline without phase 0.
"""
from __future__ import annotations
import json
import logging
from collections.abc import Sequence
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from .config import VocabularyConfig
from .frames import FrameProvider, null_provider, to_video_block
from .prompts import load as load_prompt
from .reader import EpisodeRecord
from .vlm_client import VlmClient
logger = logging.getLogger(__name__)
VOCABULARY_FILENAME = "canonical_vocabulary.json"
@dataclass
class Vocabulary:
"""Canonical phrasings shared across every episode of one dataset.
Both lists are strict: per-episode subtask + memory generation pick
from these strings only; the downstream policy then has a small,
repeatable target distribution to learn instead of thousands of
LLM paraphrases.
"""
subtasks: tuple[str, ...]
"""Imperative subtask labels — what the low-level policy is conditioned
on. Verb-first, telegraphic, consistent object nouns. Example:
``("move to blue cube", "grasp blue cube", "lift blue cube",
"place blue cube in box", "retract arm")``.
"""
memory_milestones: tuple[str, ...]
"""First-person past-tense milestone sentences — building blocks for
the running memory string. Example: ``("I picked up the blue cube.",
"I placed the blue cube in the green box.")``. Each milestone maps
1:1 onto a completed subtask phase; ``memory_at_step_k`` is the
concatenation of milestones for completed phases.
"""
def to_json(self) -> dict[str, list[str]]:
return {
"subtasks": list(self.subtasks),
"memory_milestones": list(self.memory_milestones),
}
@classmethod
def from_json(cls, payload: dict[str, Any]) -> Vocabulary:
subtasks = tuple(
str(s).strip() for s in (payload.get("subtasks") or []) if str(s).strip()
)
memory_milestones = tuple(
str(s).strip() for s in (payload.get("memory_milestones") or []) if str(s).strip()
)
return cls(subtasks=subtasks, memory_milestones=memory_milestones)
def is_empty(self) -> bool:
return not self.subtasks and not self.memory_milestones
def vocabulary_path(root: Path) -> Path:
"""Return the canonical on-disk location for the vocabulary file."""
return root / "meta" / VOCABULARY_FILENAME
def load_vocabulary(root: Path) -> Vocabulary | None:
"""Read ``<root>/meta/canonical_vocabulary.json`` if present.
Returns ``None`` when the file does not exist — callers fall back to
free-form (unconstrained) subtask + memory generation, preserving the
pipeline's behaviour on datasets that never ran phase 0.
"""
path = vocabulary_path(root)
if not path.exists():
return None
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError) as exc:
logger.warning("could not read %s: %s — proceeding without vocabulary", path, exc)
return None
if not isinstance(payload, dict):
logger.warning("%s is not a JSON object — ignoring", path)
return None
vocab = Vocabulary.from_json(payload)
if vocab.is_empty():
return None
return vocab
def save_vocabulary(root: Path, vocab: Vocabulary) -> Path:
"""Atomically persist ``vocab`` to ``<root>/meta/canonical_vocabulary.json``."""
path = vocabulary_path(root)
path.parent.mkdir(parents=True, exist_ok=True)
tmp = path.with_suffix(path.suffix + ".tmp")
tmp.write_text(
json.dumps(vocab.to_json(), indent=2, ensure_ascii=False) + "\n",
encoding="utf-8",
)
tmp.replace(path)
return path
@dataclass
class VocabularyDiscoveryModule:
"""Derive a dataset-level canonical vocabulary from sample episodes.
Phase 0 of the executor: pulls ``config.sample_episodes`` episode
videos, packs them into one Qwen-VL multi-video prompt, and asks the
model to enumerate the small set of canonical subtask labels +
memory milestones that recur across them. The output is persisted
to ``meta/canonical_vocabulary.json`` and consumed by phase 1.
"""
vlm: VlmClient
config: VocabularyConfig
frame_provider: FrameProvider = field(default_factory=null_provider)
@property
def enabled(self) -> bool:
return self.config.enabled
def discover(
self,
records: Sequence[EpisodeRecord],
*,
existing: Vocabulary | None = None,
) -> Vocabulary | None:
"""Run vocabulary discovery against the first N sample episodes.
``existing`` short-circuits the VLM call when ``config.reuse_existing``
is True and an on-disk vocabulary is already present — keeps re-runs
cheap and lets operators hand-edit the file without it getting
overwritten.
"""
if existing is not None and self.config.reuse_existing:
logger.info(
"vocabulary: reusing existing (%d subtasks, %d memory milestones)",
len(existing.subtasks),
len(existing.memory_milestones),
)
return existing
sample = list(records[: max(1, int(self.config.sample_episodes))])
if not sample:
return None
task_hint = next((r.episode_task for r in sample if r.episode_task), "")
prompt = load_prompt("module_0_vocabulary").format(
episode_task=task_hint or "(unspecified)",
n_episodes=len(sample),
)
# Pack one video block per sample episode so the VLM sees the
# variation across episodes (different starting poses, different
# object placements) rather than overfitting to one trajectory.
content: list[dict[str, Any]] = []
for record in sample:
video_frames = self.frame_provider.video_for_episode(
record, int(self.config.max_video_frames_per_episode)
)
if video_frames:
content.extend(to_video_block(video_frames))
content.append({"type": "text", "text": prompt})
messages = [{"role": "user", "content": content}]
result = self.vlm.generate_json([messages])[0]
if not isinstance(result, dict):
logger.warning("vocabulary: VLM did not return a JSON object — skipping")
return None
vocab = Vocabulary.from_json(result)
if vocab.is_empty():
logger.warning("vocabulary: VLM returned an empty vocabulary — skipping")
return None
logger.info(
"vocabulary: discovered %d subtask labels + %d memory milestones from %d episodes",
len(vocab.subtasks),
len(vocab.memory_milestones),
len(sample),
)
return vocab

View File

@@ -40,7 +40,6 @@ from lerobot.annotations.steerable_pipeline.modules import (
)
from lerobot.annotations.steerable_pipeline.validator import StagingValidator
from lerobot.annotations.steerable_pipeline.vlm_client import make_vlm_client
from lerobot.annotations.steerable_pipeline.vocabulary import VocabularyDiscoveryModule
from lerobot.annotations.steerable_pipeline.writer import LanguageColumnsWriter
from lerobot.configs import parser
@@ -89,9 +88,6 @@ def annotate(cfg: AnnotationPipelineConfig) -> None:
vlm=vlm, config=cfg.interjections, seed=cfg.seed, frame_provider=frame_provider
)
vqa = GeneralVqaModule(vlm=vlm, config=cfg.vqa, seed=cfg.seed, frame_provider=frame_provider)
vocabulary = VocabularyDiscoveryModule(
vlm=vlm, config=cfg.vocabulary, frame_provider=frame_provider
)
writer = LanguageColumnsWriter()
validator = StagingValidator(
dataset_camera_keys=tuple(getattr(frame_provider, "camera_keys", []) or []) or None,
@@ -102,7 +98,6 @@ def annotate(cfg: AnnotationPipelineConfig) -> None:
plan=plan,
interjections=interjections,
vqa=vqa,
vocabulary=vocabulary,
writer=writer,
validator=validator,
)