tests/annotations/run_e2e_smoke.py

#!/usr/bin/env python

# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Opt-in E2E smoke run for ``make annotation-e2e``.

Builds the shared annotation fixture (:func:`build_annotation_dataset`),
runs the full annotation pipeline against it with a stub VLM, and prints a
short report. This is intentionally not a pytest test — it exercises the
CLI plumbing — but it reuses the same on-disk dataset builder as the pytest
fixtures so there is no duplicated fixture code.
"""

from __future__ import annotations

import sys
import tempfile
from pathlib import Path

from lerobot.annotations.steerable_pipeline.config import AnnotationPipelineConfig
from lerobot.annotations.steerable_pipeline.executor import Executor
from lerobot.annotations.steerable_pipeline.modules import (
    GeneralVqaModule,
    InterjectionsAndSpeechModule,
    PlanSubtasksMemoryModule,
)
from lerobot.annotations.steerable_pipeline.validator import StagingValidator
from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient
from lerobot.annotations.steerable_pipeline.writer import LanguageColumnsWriter
from tests.fixtures.dataset_factories import build_annotation_dataset


def _stub_responder(messages):
    text = ""
    for m in messages:
        if m.get("role") == "user":
            content = m.get("content")
            if isinstance(content, list):
                for block in content:
                    if isinstance(block, dict) and block.get("type") == "text":
                        text = block.get("text", "")
            elif isinstance(content, str):
                text = content
    if "atomic subtasks" in text:
        return {
            "subtasks": [
                {"text": "grasp the bottle", "start": 0.0, "end": 1.0},
                {"text": "pour into the cup", "start": 1.0, "end": 2.0},
                {"text": "place the bottle down", "start": 2.0, "end": 3.0},
            ]
        }
    if "concise hierarchical PLAN" in text:
        return {"plan": "1. grasp\n2. pour\n3. place"}
    if "Update the memory" in text:
        return {"memory": "poured once"}
    if "acknowledgement the robot" in text:
        return {"text": "Sure."}
    if "ONE realistic interruption" in text:
        return {"interjection": "use less water", "speech": "Using less water."}
    if "frame-grounded visual question" in text:
        return {"question": "How many cups?", "answer": {"label": "cup", "count": 1}}
    return None


def main() -> int:
    """Run the annotation pipeline once on a throwaway fixture and print a report.

    A dataset is built with ``build_annotation_dataset`` under a temporary
    directory, the executor is wired with the three annotation modules (all
    sharing one stub VLM client), the writer and the validator, and
    ``Executor.run`` is invoked on the dataset root. Three summary lines are
    printed: per-phase processed-episode counts, the validation summary and
    the number of rewritten shards.

    Returns:
        Always ``0``. There is no error handling here, so any exception raised
        while building the fixture or running the pipeline propagates.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        dataset_root = build_annotation_dataset(
            Path(scratch_dir) / "ds",
            episode_specs=[(0, 30, "Pour water into the cup.")],
            fps=10,
        )

        stub_vlm = StubVlmClient(responder=_stub_responder)
        pipeline_cfg = AnnotationPipelineConfig()

        # Collaborators are created in the same order the executor receives them.
        plan_module = PlanSubtasksMemoryModule(vlm=stub_vlm, config=pipeline_cfg.plan)
        interjections_module = InterjectionsAndSpeechModule(
            vlm=stub_vlm,
            config=pipeline_cfg.interjections,
            seed=pipeline_cfg.seed,
        )
        vqa_module = GeneralVqaModule(
            vlm=stub_vlm,
            config=pipeline_cfg.vqa,
            seed=pipeline_cfg.seed,
        )
        columns_writer = LanguageColumnsWriter()
        staging_validator = StagingValidator()

        pipeline = Executor(
            config=pipeline_cfg,
            plan=plan_module,
            interjections=interjections_module,
            vqa=vqa_module,
            writer=columns_writer,
            validator=staging_validator,
        )

        # Report while the temporary dataset still exists on disk.
        summary = pipeline.run(dataset_root)
        print(f"phases={[(p.name, p.episodes_processed) for p in summary.phases]}")
        print(f"validation: {summary.validation_report.summary()}")
        print(f"shards rewritten: {len(summary.written_paths)}")
    return 0


# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
feat: language annotation pipeline (PR 2/3) Adds the steerable annotation pipeline (`lerobot-annotate`) that populates the `language_persistent` and `language_events` columns introduced in PR 1 directly into `data/chunk-/file-.parquet`. No flavor namespace, no sidecar tree. Modules produced: - Module 1 (plan_subtasks_memory): Pi0.7-style subtasks, plan (init + refresh on interjection), MEM-style memory at subtask boundaries. - Module 2 (interjections_and_speech): t=0 speech-only acknowledgement, mid-episode paired interjection + speech tool-call atom. - Module 3 (general_vqa): bbox/keypoint/count/attribute/spatial pairs at configurable cadence with one-retry JSON validation. Writer enforces: per-episode persistent identity, exact-frame event timestamps, column routing per `column_for_style`, dataset-level `tools` column with the `say` schema, drops legacy `subtask_index`. Validator runs against staged JSONL artifacts before the writer rewrites parquet. Adds `lerobot-annotate` console script, `annotations` extra (datatrove + optional vllm), `make annotation-e2e` opt-in smoke target, and `docs/source/annotation_pipeline.mdx`. Branched from PR 1 (`feat/language-columns`). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-27 16:22:51 +02:00			`#!/usr/bin/env python`

			`# Copyright 2026 The HuggingFace Inc. team. All rights reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			"""Opt-in E2E smoke run for ``make annotation-e2e``.

review: address CarolinePascal feedback - name the three modules everywhere (plan / interjections / vqa) instead of module_1/2/3 — config classes, config fields, executor params, staging keys and phase names now carry the module name - rename examples/annotation -> examples/annotations; add the Apache header to run_hf_job.py - drop the unused GeneralVqaModule._generate_one - remove "PR 1" references from comments/docstrings - frames.py: rely on the always-defined LeRobotDatasetMetadata.camera_keys - executor.py: read/write meta/info.json via load_info / write_info - reader.py: load meta/tasks.parquet via io_utils.load_tasks - make --push_to_hub a bool; push the annotated dataset back to --repo_id - move the on-disk test dataset builder into tests/fixtures (build_annotation_dataset); run_e2e_smoke reuses it - clarify in the docs that the vqa module grounds each pair on a single frame (K = per-tick anchor count) - hoist stdlib dynamic imports to module scope Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-05-18 12:03:25 +02:00			Builds the shared annotation fixture (:func:`build_annotation_dataset`),
			`runs the full annotation pipeline against it with a stub VLM, and prints a`
			`short report. This is intentionally not a pytest test — it exercises the`
			`CLI plumbing — but it reuses the same on-disk dataset builder as the pytest`
			`fixtures so there is no duplicated fixture code.`
feat: language annotation pipeline (PR 2/3) Adds the steerable annotation pipeline (`lerobot-annotate`) that populates the `language_persistent` and `language_events` columns introduced in PR 1 directly into `data/chunk-/file-.parquet`. No flavor namespace, no sidecar tree. Modules produced: - Module 1 (plan_subtasks_memory): Pi0.7-style subtasks, plan (init + refresh on interjection), MEM-style memory at subtask boundaries. - Module 2 (interjections_and_speech): t=0 speech-only acknowledgement, mid-episode paired interjection + speech tool-call atom. - Module 3 (general_vqa): bbox/keypoint/count/attribute/spatial pairs at configurable cadence with one-retry JSON validation. Writer enforces: per-episode persistent identity, exact-frame event timestamps, column routing per `column_for_style`, dataset-level `tools` column with the `say` schema, drops legacy `subtask_index`. Validator runs against staged JSONL artifacts before the writer rewrites parquet. Adds `lerobot-annotate` console script, `annotations` extra (datatrove + optional vllm), `make annotation-e2e` opt-in smoke target, and `docs/source/annotation_pipeline.mdx`. Branched from PR 1 (`feat/language-columns`). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-27 16:22:51 +02:00			`"""`

			`from __future__ import annotations`

			`import sys`
			`import tempfile`
			`from pathlib import Path`

			`from lerobot.annotations.steerable_pipeline.config import AnnotationPipelineConfig`
			`from lerobot.annotations.steerable_pipeline.executor import Executor`
			`from lerobot.annotations.steerable_pipeline.modules import (`
			`GeneralVqaModule,`
			`InterjectionsAndSpeechModule,`
			`PlanSubtasksMemoryModule,`
			`)`
			`from lerobot.annotations.steerable_pipeline.validator import StagingValidator`
			`from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient`
			`from lerobot.annotations.steerable_pipeline.writer import LanguageColumnsWriter`
review: address CarolinePascal feedback - name the three modules everywhere (plan / interjections / vqa) instead of module_1/2/3 — config classes, config fields, executor params, staging keys and phase names now carry the module name - rename examples/annotation -> examples/annotations; add the Apache header to run_hf_job.py - drop the unused GeneralVqaModule._generate_one - remove "PR 1" references from comments/docstrings - frames.py: rely on the always-defined LeRobotDatasetMetadata.camera_keys - executor.py: read/write meta/info.json via load_info / write_info - reader.py: load meta/tasks.parquet via io_utils.load_tasks - make --push_to_hub a bool; push the annotated dataset back to --repo_id - move the on-disk test dataset builder into tests/fixtures (build_annotation_dataset); run_e2e_smoke reuses it - clarify in the docs that the vqa module grounds each pair on a single frame (K = per-tick anchor count) - hoist stdlib dynamic imports to module scope Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-05-18 12:03:25 +02:00			`from tests.fixtures.dataset_factories import build_annotation_dataset`
feat: language annotation pipeline (PR 2/3) Adds the steerable annotation pipeline (`lerobot-annotate`) that populates the `language_persistent` and `language_events` columns introduced in PR 1 directly into `data/chunk-/file-.parquet`. No flavor namespace, no sidecar tree. Modules produced: - Module 1 (plan_subtasks_memory): Pi0.7-style subtasks, plan (init + refresh on interjection), MEM-style memory at subtask boundaries. - Module 2 (interjections_and_speech): t=0 speech-only acknowledgement, mid-episode paired interjection + speech tool-call atom. - Module 3 (general_vqa): bbox/keypoint/count/attribute/spatial pairs at configurable cadence with one-retry JSON validation. Writer enforces: per-episode persistent identity, exact-frame event timestamps, column routing per `column_for_style`, dataset-level `tools` column with the `say` schema, drops legacy `subtask_index`. Validator runs against staged JSONL artifacts before the writer rewrites parquet. Adds `lerobot-annotate` console script, `annotations` extra (datatrove + optional vllm), `make annotation-e2e` opt-in smoke target, and `docs/source/annotation_pipeline.mdx`. Branched from PR 1 (`feat/language-columns`). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-27 16:22:51 +02:00

			`def _stub_responder(messages):`
			`text = ""`
			`for m in messages:`
			`if m.get("role") == "user":`
			`content = m.get("content")`
			`if isinstance(content, list):`
			`for block in content:`
			`if isinstance(block, dict) and block.get("type") == "text":`
			`text = block.get("text", "")`
			`elif isinstance(content, str):`
			`text = content`
feat(annotate): Module 1 sees the whole episode as one video block Replaces keyframe sampling with a single Qwen-VL video block covering the whole demonstration. The model pools temporally itself and chooses where to cut subtasks — no stride, no count, no keyframe count knob to tune. - frames.py: ``FrameProvider`` gains ``video_for_episode(record, max_frames)``; ``VideoFrameProvider`` samples up to ``max_frames`` uniformly across the episode duration; ``_NullProvider`` returns [] for the no-video fallback. New ``to_video_block`` helper. - Module 1: drops keyframe sampling. The subtask prompt now goes out as ``[{"type":"video", "video":[<frames>]}, {"type":"text", ...}]`` and the prompt template asks the model to "watch the whole clip, then segment it" with cut points decided from gripper/contact/regrasp events the model sees. - Module1Config: ``keyframes_per_episode`` removed; replaced with ``max_video_frames: int = 32`` (model-capacity bound, not annotation logic). - Test: ``test_module1_attaches_video_block_to_subtask_prompt`` locks in the single-video-block invariant. - Stub-VLM markers updated: tests now key on "atomic subtasks" instead of the old "Decompose the demonstration" phrase that no longer appears in the prompt. - Docs: updated to describe the whole-episode video-block behavior and the no-video fallback. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-27 17:08:36 +02:00			`if "atomic subtasks" in text:`
feat: language annotation pipeline (PR 2/3) Adds the steerable annotation pipeline (`lerobot-annotate`) that populates the `language_persistent` and `language_events` columns introduced in PR 1 directly into `data/chunk-/file-.parquet`. No flavor namespace, no sidecar tree. Modules produced: - Module 1 (plan_subtasks_memory): Pi0.7-style subtasks, plan (init + refresh on interjection), MEM-style memory at subtask boundaries. - Module 2 (interjections_and_speech): t=0 speech-only acknowledgement, mid-episode paired interjection + speech tool-call atom. - Module 3 (general_vqa): bbox/keypoint/count/attribute/spatial pairs at configurable cadence with one-retry JSON validation. Writer enforces: per-episode persistent identity, exact-frame event timestamps, column routing per `column_for_style`, dataset-level `tools` column with the `say` schema, drops legacy `subtask_index`. Validator runs against staged JSONL artifacts before the writer rewrites parquet. Adds `lerobot-annotate` console script, `annotations` extra (datatrove + optional vllm), `make annotation-e2e` opt-in smoke target, and `docs/source/annotation_pipeline.mdx`. Branched from PR 1 (`feat/language-columns`). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-27 16:22:51 +02:00			`return {`
			`"subtasks": [`
			`{"text": "grasp the bottle", "start": 0.0, "end": 1.0},`
			`{"text": "pour into the cup", "start": 1.0, "end": 2.0},`
			`{"text": "place the bottle down", "start": 2.0, "end": 3.0},`
			`]`
			`}`
			`if "concise hierarchical PLAN" in text:`
			`return {"plan": "1. grasp\n2. pour\n3. place"}`
			`if "Update the memory" in text:`
			`return {"memory": "poured once"}`
			`if "acknowledgement the robot" in text:`
			`return {"text": "Sure."}`
			`if "ONE realistic interruption" in text:`
			`return {"interjection": "use less water", "speech": "Using less water."}`
			`if "frame-grounded visual question" in text:`
			`return {"question": "How many cups?", "answer": {"label": "cup", "count": 1}}`
			`return None`


			`def main() -> int:`
			`with tempfile.TemporaryDirectory() as tmp:`
review: address CarolinePascal feedback - name the three modules everywhere (plan / interjections / vqa) instead of module_1/2/3 — config classes, config fields, executor params, staging keys and phase names now carry the module name - rename examples/annotation -> examples/annotations; add the Apache header to run_hf_job.py - drop the unused GeneralVqaModule._generate_one - remove "PR 1" references from comments/docstrings - frames.py: rely on the always-defined LeRobotDatasetMetadata.camera_keys - executor.py: read/write meta/info.json via load_info / write_info - reader.py: load meta/tasks.parquet via io_utils.load_tasks - make --push_to_hub a bool; push the annotated dataset back to --repo_id - move the on-disk test dataset builder into tests/fixtures (build_annotation_dataset); run_e2e_smoke reuses it - clarify in the docs that the vqa module grounds each pair on a single frame (K = per-tick anchor count) - hoist stdlib dynamic imports to module scope Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-05-18 12:03:25 +02:00			`root = build_annotation_dataset(`
			`Path(tmp) / "ds",`
			`episode_specs=[(0, 30, "Pour water into the cup.")],`
			`fps=10,`
			`)`
feat: language annotation pipeline (PR 2/3) Adds the steerable annotation pipeline (`lerobot-annotate`) that populates the `language_persistent` and `language_events` columns introduced in PR 1 directly into `data/chunk-/file-.parquet`. No flavor namespace, no sidecar tree. Modules produced: - Module 1 (plan_subtasks_memory): Pi0.7-style subtasks, plan (init + refresh on interjection), MEM-style memory at subtask boundaries. - Module 2 (interjections_and_speech): t=0 speech-only acknowledgement, mid-episode paired interjection + speech tool-call atom. - Module 3 (general_vqa): bbox/keypoint/count/attribute/spatial pairs at configurable cadence with one-retry JSON validation. Writer enforces: per-episode persistent identity, exact-frame event timestamps, column routing per `column_for_style`, dataset-level `tools` column with the `say` schema, drops legacy `subtask_index`. Validator runs against staged JSONL artifacts before the writer rewrites parquet. Adds `lerobot-annotate` console script, `annotations` extra (datatrove + optional vllm), `make annotation-e2e` opt-in smoke target, and `docs/source/annotation_pipeline.mdx`. Branched from PR 1 (`feat/language-columns`). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-27 16:22:51 +02:00			`vlm = StubVlmClient(responder=_stub_responder)`
			`cfg = AnnotationPipelineConfig()`
			`executor = Executor(`
			`config=cfg,`
review: address CarolinePascal feedback - name the three modules everywhere (plan / interjections / vqa) instead of module_1/2/3 — config classes, config fields, executor params, staging keys and phase names now carry the module name - rename examples/annotation -> examples/annotations; add the Apache header to run_hf_job.py - drop the unused GeneralVqaModule._generate_one - remove "PR 1" references from comments/docstrings - frames.py: rely on the always-defined LeRobotDatasetMetadata.camera_keys - executor.py: read/write meta/info.json via load_info / write_info - reader.py: load meta/tasks.parquet via io_utils.load_tasks - make --push_to_hub a bool; push the annotated dataset back to --repo_id - move the on-disk test dataset builder into tests/fixtures (build_annotation_dataset); run_e2e_smoke reuses it - clarify in the docs that the vqa module grounds each pair on a single frame (K = per-tick anchor count) - hoist stdlib dynamic imports to module scope Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-05-18 12:03:25 +02:00			`plan=PlanSubtasksMemoryModule(vlm=vlm, config=cfg.plan),`
			`interjections=InterjectionsAndSpeechModule(vlm=vlm, config=cfg.interjections, seed=cfg.seed),`
			`vqa=GeneralVqaModule(vlm=vlm, config=cfg.vqa, seed=cfg.seed),`
feat: language annotation pipeline (PR 2/3) Adds the steerable annotation pipeline (`lerobot-annotate`) that populates the `language_persistent` and `language_events` columns introduced in PR 1 directly into `data/chunk-/file-.parquet`. No flavor namespace, no sidecar tree. Modules produced: - Module 1 (plan_subtasks_memory): Pi0.7-style subtasks, plan (init + refresh on interjection), MEM-style memory at subtask boundaries. - Module 2 (interjections_and_speech): t=0 speech-only acknowledgement, mid-episode paired interjection + speech tool-call atom. - Module 3 (general_vqa): bbox/keypoint/count/attribute/spatial pairs at configurable cadence with one-retry JSON validation. Writer enforces: per-episode persistent identity, exact-frame event timestamps, column routing per `column_for_style`, dataset-level `tools` column with the `say` schema, drops legacy `subtask_index`. Validator runs against staged JSONL artifacts before the writer rewrites parquet. Adds `lerobot-annotate` console script, `annotations` extra (datatrove + optional vllm), `make annotation-e2e` opt-in smoke target, and `docs/source/annotation_pipeline.mdx`. Branched from PR 1 (`feat/language-columns`). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-27 16:22:51 +02:00			`writer=LanguageColumnsWriter(),`
			`validator=StagingValidator(),`
			`)`
			`summary = executor.run(root)`
			`print(f"phases={[(p.name, p.episodes_processed) for p in summary.phases]}")`
			`print(f"validation: {summary.validation_report.summary()}")`
			`print(f"shards rewritten: {len(summary.written_paths)}")`
			`return 0`


			`if __name__ == "__main__":`
			`sys.exit(main())`