mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-02 11:51:25 +00:00
Drops the optional Phase 0 vocabulary-discovery feature entirely.
With the new structured action records (Phase 1a + 1b) providing
cross-episode consistency via the deterministic template renderer,
the older vocabulary-constraint path is redundant and adds a second
constraint mechanism that wasn't well-validated in practice.
Removed:
* src/lerobot/annotations/steerable_pipeline/vocabulary.py
(Vocabulary dataclass + VocabularyDiscoveryModule + load_/
save_vocabulary helpers; canonical_vocabulary.json on-disk format)
* src/lerobot/annotations/steerable_pipeline/prompts/module_0_vocabulary.txt
(Phase 0 VLM prompt)
* tests/annotations/test_vocabulary.py
Pruned wiring across:
* config.py: VocabularyConfig dataclass + AnnotationPipelineConfig.
vocabulary field
* executor.py: vocabulary attribute on Executor + _run_vocabulary_
phase method + Phase 0 phases.append call in run()
* modules/plan_subtasks_memory.py: Vocabulary import + vocabulary
attribute + _subtask_vocabulary_block / _memory_vocabulary_block
helpers + _canonicalize_subtask / _normalize / _invalid_subtasks
/ _build_subtask_retry_message methods + vocabulary-gated retry
path in _generate_subtasks + empty-episode warning + _NORMALIZE_
STRIP_TOKENS constant
* prompts/module_1_subtasks.txt: {vocabulary_block} placeholder
* prompts/module_1_memory.txt: {vocabulary_block} placeholder
* __init__.py: Vocabulary / VocabularyDiscoveryModule / load_
vocabulary / save_vocabulary / vocabulary_path / VOCABULARY_
FILENAME re-exports
* scripts/lerobot_annotate.py: VocabularyDiscoveryModule import +
instantiation + executor argument
* examples/annotations/run_hf_job.py: --vocabulary.enabled=false
flag + docstring references + inline phase-0 comment
The original free-form rephrasings path stays (PlanConfig.
n_task_rephrasings still works when task_aug_axes.enabled=False).
Action records remain the preferred mechanism for cross-episode
subtask consistency.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
201 lines
7.6 KiB
Python
201 lines
7.6 KiB
Python
#!/usr/bin/env python
|
|
|
|
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""``lerobot-annotate`` — populate ``language_persistent`` and
|
|
``language_events`` columns on a LeRobot dataset.
|
|
|
|
Annotations live directly in ``data/chunk-*/file-*.parquet``.
|
|
|
|
Example:
|
|
|
|
uv run lerobot-annotate \\
|
|
--root=/path/to/dataset \\
|
|
--vlm.model_id=Qwen/Qwen2.5-VL-7B-Instruct
|
|
|
|
For distributed runs, see ``examples/annotations/run_hf_job.py``.
|
|
"""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
from lerobot.annotations.steerable_pipeline.config import AnnotationPipelineConfig
|
|
from lerobot.annotations.steerable_pipeline.executor import Executor
|
|
from lerobot.annotations.steerable_pipeline.frames import make_frame_provider
|
|
from lerobot.annotations.steerable_pipeline.modules import (
|
|
GeneralVqaModule,
|
|
InterjectionsAndSpeechModule,
|
|
PlanSubtasksMemoryModule,
|
|
)
|
|
from lerobot.annotations.steerable_pipeline.validator import StagingValidator
|
|
from lerobot.annotations.steerable_pipeline.vlm_client import make_vlm_client
|
|
from lerobot.annotations.steerable_pipeline.writer import LanguageColumnsWriter
|
|
from lerobot.configs import parser
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _resolve_root(cfg: AnnotationPipelineConfig) -> Path:
|
|
if cfg.root is not None:
|
|
return Path(cfg.root)
|
|
if cfg.repo_id is not None:
|
|
from huggingface_hub import snapshot_download
|
|
|
|
return Path(snapshot_download(repo_id=cfg.repo_id, repo_type="dataset"))
|
|
raise ValueError("Either --root or --repo_id must be provided.")
|
|
|
|
|
|
@parser.wrap()
|
|
def annotate(cfg: AnnotationPipelineConfig) -> None:
|
|
"""Run the steerable annotation pipeline against a dataset."""
|
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
root = _resolve_root(cfg)
|
|
logger.info("annotate: root=%s", root)
|
|
|
|
vlm = make_vlm_client(cfg.vlm)
|
|
frame_provider = make_frame_provider(
|
|
root, camera_key=cfg.vlm.camera_key, video_backend=cfg.video_backend
|
|
)
|
|
# Surface the resolved cameras up front so a silent vqa-module no-op
|
|
# is obvious in job output rather than discovered post-hoc by counting
|
|
# parquet rows.
|
|
cam_keys = list(getattr(frame_provider, "camera_keys", []) or [])
|
|
logger.info(
|
|
"annotate: frame_provider default camera=%r, all cameras=%s",
|
|
getattr(frame_provider, "camera_key", None),
|
|
cam_keys,
|
|
)
|
|
if cfg.vqa.enabled and not cam_keys:
|
|
logger.warning(
|
|
"annotate: the vqa module is enabled but no cameras were "
|
|
"resolved — it will produce zero VQA rows. Check "
|
|
"meta/info.json for observation.images.* features, or pass "
|
|
"--vlm.camera_key=<key> to seed the cameras list."
|
|
)
|
|
plan = PlanSubtasksMemoryModule(vlm=vlm, config=cfg.plan, frame_provider=frame_provider)
|
|
interjections = InterjectionsAndSpeechModule(
|
|
vlm=vlm, config=cfg.interjections, seed=cfg.seed, frame_provider=frame_provider
|
|
)
|
|
vqa = GeneralVqaModule(vlm=vlm, config=cfg.vqa, seed=cfg.seed, frame_provider=frame_provider)
|
|
writer = LanguageColumnsWriter()
|
|
validator = StagingValidator(
|
|
dataset_camera_keys=tuple(getattr(frame_provider, "camera_keys", []) or []) or None,
|
|
)
|
|
|
|
executor = Executor(
|
|
config=cfg,
|
|
plan=plan,
|
|
interjections=interjections,
|
|
vqa=vqa,
|
|
writer=writer,
|
|
validator=validator,
|
|
)
|
|
summary = executor.run(root)
|
|
logger.info("annotate: wrote %d shard(s)", len(summary.written_paths))
|
|
for phase in summary.phases:
|
|
logger.info(
|
|
"annotate: phase=%s processed=%d skipped=%d",
|
|
phase.name,
|
|
phase.episodes_processed,
|
|
phase.episodes_skipped,
|
|
)
|
|
if summary.validation_report.warnings:
|
|
for w in summary.validation_report.warnings:
|
|
logger.warning(w)
|
|
|
|
if cfg.push_to_hub:
|
|
if cfg.repo_id is None and cfg.dest_repo_id is None:
|
|
raise ValueError(
|
|
"--push_to_hub requires --repo_id or --dest_repo_id (the dataset repo to push to)."
|
|
)
|
|
_push_to_hub(root, cfg)
|
|
|
|
|
|
def _push_to_hub(root: Path, cfg: AnnotationPipelineConfig) -> None:
|
|
"""Upload the annotated dataset directory to the Hub.
|
|
|
|
Pushes to ``cfg.dest_repo_id`` when set, otherwise back to ``cfg.repo_id``.
|
|
"""
|
|
from huggingface_hub import HfApi # noqa: PLC0415
|
|
|
|
repo_id = cfg.dest_repo_id or cfg.repo_id
|
|
commit_message = cfg.push_commit_message or "Add steerable annotations (lerobot-annotate)"
|
|
api = HfApi()
|
|
print(f"[lerobot-annotate] creating/locating dataset repo {repo_id}...", flush=True)
|
|
api.create_repo(
|
|
repo_id=repo_id,
|
|
repo_type="dataset",
|
|
private=cfg.push_private,
|
|
exist_ok=True,
|
|
)
|
|
print(f"[lerobot-annotate] uploading {root} -> {repo_id}...", flush=True)
|
|
commit_info = api.upload_folder(
|
|
folder_path=str(root),
|
|
repo_id=repo_id,
|
|
repo_type="dataset",
|
|
commit_message=commit_message,
|
|
ignore_patterns=[".annotate_staging/**", "**/.DS_Store"],
|
|
)
|
|
print(f"[lerobot-annotate] uploaded to https://huggingface.co/datasets/{repo_id}", flush=True)
|
|
|
|
# Tag the upload with the codebase version. ``LeRobotDatasetMetadata``
|
|
# resolves the dataset revision via ``get_safe_version`` which scans
|
|
# for tags like ``v3.0``; without a tag it raises
|
|
# ``RevisionNotFoundError``. Read the version straight from the
|
|
# dataset's own ``meta/info.json`` so we tag whatever the writer
|
|
# actually wrote (no accidental drift if the codebase floor moves).
|
|
from lerobot.datasets.dataset_metadata import CODEBASE_VERSION # noqa: PLC0415
|
|
|
|
info_path = root / "meta" / "info.json"
|
|
version_tag = CODEBASE_VERSION
|
|
if info_path.exists():
|
|
try:
|
|
from lerobot.utils.io_utils import load_json # noqa: PLC0415
|
|
|
|
info = load_json(info_path)
|
|
ds_version = info.get("codebase_version")
|
|
if isinstance(ds_version, str) and ds_version.startswith("v"):
|
|
version_tag = ds_version
|
|
except Exception as exc: # noqa: BLE001
|
|
print(f"[lerobot-annotate] could not read codebase_version from info.json ({exc}); falling back to {version_tag}", flush=True)
|
|
revision = getattr(commit_info, "oid", None)
|
|
tag_kwargs = {
|
|
"repo_id": repo_id,
|
|
"tag": version_tag,
|
|
"repo_type": "dataset",
|
|
"exist_ok": True,
|
|
}
|
|
if revision is not None:
|
|
tag_kwargs["revision"] = revision
|
|
|
|
try:
|
|
api.create_tag(**tag_kwargs)
|
|
print(f"[lerobot-annotate] tagged {repo_id} as {version_tag}", flush=True)
|
|
except Exception as exc: # noqa: BLE001
|
|
print(
|
|
f"[lerobot-annotate] WARNING: could not create tag {version_tag!r} on {repo_id}: {exc}. "
|
|
"Dataset is uploaded but ``LeRobotDataset`` won't be able to load it until it's tagged. "
|
|
"Run: from huggingface_hub import HfApi; "
|
|
f"HfApi().create_tag({repo_id!r}, tag={version_tag!r}, repo_type='dataset', exist_ok=True)",
|
|
flush=True,
|
|
)
|
|
|
|
|
|
def main() -> None:
|
|
annotate()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|