Files
lerobot-clone/src/lerobot/scripts/lerobot_annotate.py
Pepijn 5dbf0fac5f annotations(steerable): remove Phase 0 canonical vocabulary discovery
Drops the optional Phase 0 vocabulary-discovery feature entirely.
With the new structured action records (Phase 1a + 1b) providing
cross-episode consistency via the deterministic template renderer,
the older vocabulary-constraint path is redundant and adds a second
constraint mechanism that wasn't well-validated in practice.

Removed:
  * src/lerobot/annotations/steerable_pipeline/vocabulary.py
    (Vocabulary dataclass + VocabularyDiscoveryModule + load_/
    save_vocabulary helpers; canonical_vocabulary.json on-disk format)
  * src/lerobot/annotations/steerable_pipeline/prompts/module_0_vocabulary.txt
    (Phase 0 VLM prompt)
  * tests/annotations/test_vocabulary.py

Pruned wiring across:
  * config.py: VocabularyConfig dataclass + AnnotationPipelineConfig.
    vocabulary field
  * executor.py: vocabulary attribute on Executor + _run_vocabulary_
    phase method + Phase 0 phases.append call in run()
  * modules/plan_subtasks_memory.py: Vocabulary import + vocabulary
    attribute + _subtask_vocabulary_block / _memory_vocabulary_block
    helpers + _canonicalize_subtask / _normalize / _invalid_subtasks
    / _build_subtask_retry_message methods + vocabulary-gated retry
    path in _generate_subtasks + empty-episode warning + _NORMALIZE_
    STRIP_TOKENS constant
  * prompts/module_1_subtasks.txt: {vocabulary_block} placeholder
  * prompts/module_1_memory.txt: {vocabulary_block} placeholder
  * __init__.py: Vocabulary / VocabularyDiscoveryModule / load_
    vocabulary / save_vocabulary / vocabulary_path / VOCABULARY_
    FILENAME re-exports
  * scripts/lerobot_annotate.py: VocabularyDiscoveryModule import +
    instantiation + executor argument
  * examples/annotations/run_hf_job.py: --vocabulary.enabled=false
    flag + docstring references + inline phase-0 comment

The original free-form rephrasings path stays (PlanConfig.
n_task_rephrasings still works when task_aug_axes.enabled=False).
Action records remain the preferred mechanism for cross-episode
subtask consistency.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-02 11:48:27 +02:00

201 lines
7.6 KiB
Python

#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""``lerobot-annotate`` — populate ``language_persistent`` and
``language_events`` columns on a LeRobot dataset.
Annotations live directly in ``data/chunk-*/file-*.parquet``.
Example:
uv run lerobot-annotate \\
--root=/path/to/dataset \\
--vlm.model_id=Qwen/Qwen2.5-VL-7B-Instruct
For distributed runs, see ``examples/annotations/run_hf_job.py``.
"""
import logging
from pathlib import Path
from lerobot.annotations.steerable_pipeline.config import AnnotationPipelineConfig
from lerobot.annotations.steerable_pipeline.executor import Executor
from lerobot.annotations.steerable_pipeline.frames import make_frame_provider
from lerobot.annotations.steerable_pipeline.modules import (
GeneralVqaModule,
InterjectionsAndSpeechModule,
PlanSubtasksMemoryModule,
)
from lerobot.annotations.steerable_pipeline.validator import StagingValidator
from lerobot.annotations.steerable_pipeline.vlm_client import make_vlm_client
from lerobot.annotations.steerable_pipeline.writer import LanguageColumnsWriter
from lerobot.configs import parser
logger = logging.getLogger(__name__)
def _resolve_root(cfg: AnnotationPipelineConfig) -> Path:
if cfg.root is not None:
return Path(cfg.root)
if cfg.repo_id is not None:
from huggingface_hub import snapshot_download
return Path(snapshot_download(repo_id=cfg.repo_id, repo_type="dataset"))
raise ValueError("Either --root or --repo_id must be provided.")
@parser.wrap()
def annotate(cfg: AnnotationPipelineConfig) -> None:
"""Run the steerable annotation pipeline against a dataset."""
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
root = _resolve_root(cfg)
logger.info("annotate: root=%s", root)
vlm = make_vlm_client(cfg.vlm)
frame_provider = make_frame_provider(
root, camera_key=cfg.vlm.camera_key, video_backend=cfg.video_backend
)
# Surface the resolved cameras up front so a silent vqa-module no-op
# is obvious in job output rather than discovered post-hoc by counting
# parquet rows.
cam_keys = list(getattr(frame_provider, "camera_keys", []) or [])
logger.info(
"annotate: frame_provider default camera=%r, all cameras=%s",
getattr(frame_provider, "camera_key", None),
cam_keys,
)
if cfg.vqa.enabled and not cam_keys:
logger.warning(
"annotate: the vqa module is enabled but no cameras were "
"resolved — it will produce zero VQA rows. Check "
"meta/info.json for observation.images.* features, or pass "
"--vlm.camera_key=<key> to seed the cameras list."
)
plan = PlanSubtasksMemoryModule(vlm=vlm, config=cfg.plan, frame_provider=frame_provider)
interjections = InterjectionsAndSpeechModule(
vlm=vlm, config=cfg.interjections, seed=cfg.seed, frame_provider=frame_provider
)
vqa = GeneralVqaModule(vlm=vlm, config=cfg.vqa, seed=cfg.seed, frame_provider=frame_provider)
writer = LanguageColumnsWriter()
validator = StagingValidator(
dataset_camera_keys=tuple(getattr(frame_provider, "camera_keys", []) or []) or None,
)
executor = Executor(
config=cfg,
plan=plan,
interjections=interjections,
vqa=vqa,
writer=writer,
validator=validator,
)
summary = executor.run(root)
logger.info("annotate: wrote %d shard(s)", len(summary.written_paths))
for phase in summary.phases:
logger.info(
"annotate: phase=%s processed=%d skipped=%d",
phase.name,
phase.episodes_processed,
phase.episodes_skipped,
)
if summary.validation_report.warnings:
for w in summary.validation_report.warnings:
logger.warning(w)
if cfg.push_to_hub:
if cfg.repo_id is None and cfg.dest_repo_id is None:
raise ValueError(
"--push_to_hub requires --repo_id or --dest_repo_id (the dataset repo to push to)."
)
_push_to_hub(root, cfg)
def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
"""Upload the annotated dataset directory to the Hub.
Pushes to ``cfg.dest_repo_id`` when set, otherwise back to ``cfg.repo_id``.
"""
from huggingface_hub import HfApi # noqa: PLC0415
repo_id = cfg.dest_repo_id or cfg.repo_id
commit_message = cfg.push_commit_message or "Add steerable annotations (lerobot-annotate)"
api = HfApi()
print(f"[lerobot-annotate] creating/locating dataset repo {repo_id}...", flush=True)
api.create_repo(
repo_id=repo_id,
repo_type="dataset",
private=cfg.push_private,
exist_ok=True,
)
print(f"[lerobot-annotate] uploading {root} -> {repo_id}...", flush=True)
commit_info = api.upload_folder(
folder_path=str(root),
repo_id=repo_id,
repo_type="dataset",
commit_message=commit_message,
ignore_patterns=[".annotate_staging/**", "**/.DS_Store"],
)
print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True)
# Tag the upload with the codebase version. ``LeRobotDatasetMetadata``
# resolves the dataset revision via ``get_safe_version`` which scans
# for tags like ``v3.0``; without a tag it raises
# ``RevisionNotFoundError``. Read the version straight from the
# dataset's own ``meta/info.json`` so we tag whatever the writer
# actually wrote (no accidental drift if the codebase floor moves).
from lerobot.datasets.dataset_metadata import CODEBASE_VERSION # noqa: PLC0415
info_path = root / "meta" / "info.json"
version_tag = CODEBASE_VERSION
if info_path.exists():
try:
from lerobot.utils.io_utils import load_json # noqa: PLC0415
info = load_json(info_path)
ds_version = info.get("codebase_version")
if isinstance(ds_version, str) and ds_version.startswith("v"):
version_tag = ds_version
except Exception as exc: # noqa: BLE001
print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True)
revision = getattr(commit_info, "oid", None)
tag_kwargs = {
"repo_id": repo_id,
"tag": version_tag,
"repo_type": "dataset",
"exist_ok": True,
}
if revision is not None:
tag_kwargs["revision"] = revision
try:
api.create_tag(**tag_kwargs)
print(f"[lerobot-annotate] tagged {repo_id} as {version_tag}", flush=True)
except Exception as exc: # noqa: BLE001
print(
f"[lerobot-annotate] WARNING: could not create tag {version_tag!r} on {repo_id}: {exc}. "
"Dataset is uploaded but ``LeRobotDataset`` won't be able to load it until it's tagged. "
"Run: from huggingface_hub import HfApi; "
f"HfApi().create_tag({repo_id!r}, tag={version_tag!r}, repo_type='dataset', exist_ok=True)",
flush=True,
)
def main() -> None:
annotate()
if __name__ == "__main__":
main()