From ba5d4c5cd824f9b23df525bf94f41638c7ea43ee Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 15:08:25 +0200
Subject: [PATCH] annotate: kill subtask hallucination + single-camera
 grounding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes for 'subtasks describe actions not in the video' plus a way
to focus the whole pipeline on one camera.

ANTI-HALLUCINATION
  1. _episode_video_block: when use_video_url is set but clip extraction
     fails, FALL BACK to embedded frames instead of returning an empty
     block. An empty block left the VLM with zero visual grounding, so
     it invented subtasks from the task text alone — the likely root
     cause of hallucinated steps. Now logs a warning and embeds frames.
  2. module_1_subtasks.txt gains a GROUNDING preamble (overrides all
     other rules): label only motion visible in specific frames; never
     invent/anticipate/pad; max_steps is a CEILING not a target; atomic
     demos may be exactly ONE subtask; the VIDEO is ground truth, not
     the instruction text.

SINGLE-CAMERA GROUNDING
  * New VqaConfig.restrict_to_default_camera (default False). When True,
    the VQA module grounds on only the --vlm.camera_key stream instead
    of iterating every camera — matching the plan / interjection
    modules, which already use that single camera. Now the whole
    pipeline can focus on one view (e.g. observation.images.base).

run_hf_job.py updated:
  * use_video_url=false + frames_per_second=2.0 — embed frames directly
    (most reliable; no silent text-only failure mode) with dense
    grounding.
  * vqa.restrict_to_default_camera=true — VQA on the single camera too.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py            | 15 ++++++++++---
 .../annotations/steerable_pipeline/config.py  |  9 ++++++++
 .../steerable_pipeline/modules/general_vqa.py | 14 ++++++++++++-
 .../modules/plan_subtasks_memory.py           | 21 ++++++++++++++-----
 .../prompts/module_1_subtasks.txt             | 15 +++++++++++++
 5 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 8ce22c28f..5e1b11d4b 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -54,9 +54,14 @@ CMD = (
     "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
     "--vlm.camera_key=observation.images.robot0_agentview_right "
     # Phase 1 — plan module (subtasks + plan + memory).
-    "--plan.frames_per_second=1.0 "
-    "--plan.use_video_url=true "
-    "--plan.use_video_url_fps=1.0 "
+    # Embed decoded frames directly (use_video_url=false) rather than
+    # handing the server a file:// clip. The embedded path never depends
+    # on clip extraction; a failed extraction used to send NO video, so
+    # the VLM hallucinated subtasks from the task text alone (it now
+    # warns and falls back to frames). 2 fps gives dense visual grounding
+    # so the VLM labels what actually happens.
+    "--plan.frames_per_second=2.0 "
+    "--plan.use_video_url=false "
     # IMPORTANT for RoboCasa: the dataset's task string ("Navigate to the
     # stove", "Pick the mug...") is authoritative and is what eval uses.
     # ``derive_task_from_video=off`` keeps that canonical task driving
@@ -80,6 +85,10 @@ CMD = (
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
     # Phase 4 — general VQA.
+    # Ground VQA on the SAME single camera as plan/interjections
+    # (--vlm.camera_key) instead of iterating every camera, so the whole
+    # pipeline focuses on one view (here robot0_agentview_right).
+    "--vqa.restrict_to_default_camera=true "
     "--vqa.K=1 "
     "--vqa.vqa_emission_hz=1.0"
 )
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index f84fdaa08..1cecfa772 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -219,6 +219,15 @@ class VqaConfig:
     precision for more (noisier) VQA frames."""
     question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
 
+    # Camera restriction. By default VQA iterates EVERY camera the
+    # dataset declares (one VQA pair per camera per emission tick). Set
+    # ``restrict_to_default_camera=True`` to ground VQA on only the
+    # single ``--vlm.camera_key`` stream — the same camera the plan /
+    # interjection modules use — so the whole pipeline focuses on one
+    # view. Use this when you want every annotation grounded on, e.g.,
+    # ``observation.images.base`` and nothing else.
+    restrict_to_default_camera: bool = False
+
 
 @dataclass
 class VlmConfig:
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
index adabff731..1e5ad8838 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
@@ -180,8 +180,20 @@ class GeneralVqaModule:
         Defaults to every camera the provider exposes. Datasets with no
         cameras (or test/null providers) yield an empty list, which makes
         ``run_episode`` a no-op.
+
+        When ``config.restrict_to_default_camera`` is set, VQA grounds on
+        only the provider's default camera (the single ``--vlm.camera_key``
+        stream), matching the plan / interjection modules so the whole
+        pipeline focuses on one view.
         """
-        return list(getattr(self.frame_provider, "camera_keys", []) or [])
+        all_cameras = list(getattr(self.frame_provider, "camera_keys", []) or [])
+        if getattr(self.config, "restrict_to_default_camera", False):
+            default = getattr(self.frame_provider, "camera_key", None)
+            if default and default in all_cameras:
+                return [default]
+            if default:
+                return [default]
+        return all_cameras
 
     def _build_messages(
         self,
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index 5e66f67be..c46b20bac 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -443,16 +443,27 @@ class PlanSubtasksMemoryModule:
         return flat
 
     def _episode_video_block(self, record: EpisodeRecord) -> list[dict[str, Any]]:
-        """Same video block ``_generate_subtasks`` builds — extracted helper."""
+        """Same video block ``_generate_subtasks`` builds — extracted helper.
+
+        Returns ``[]`` when the record has no frame timestamps;
+        otherwise the block is meant to carry the video. When
+        ``use_video_url`` is set we try the server-side ``video_url``
+        path first, but if clip extraction fails we FALL BACK to
+        decoding + embedding frames rather than returning an empty
+        block — an empty block would leave the VLM with no visual
+        grounding and it would hallucinate subtasks from the task text.
+        """
         if not record.frame_timestamps:
             return []
         if self.config.use_video_url and isinstance(self.frame_provider, VideoFrameProvider):
             cache_dir = Path(self.frame_provider.root) / ".annotate_staging" / ".video_clips"
             clip = self.frame_provider.episode_clip_path(record, cache_dir)
-            return (
-                to_video_url_block(f"file://{clip}", fps=self.config.use_video_url_fps)
-                if clip is not None
-                else []
+            if clip is not None:
+                return to_video_url_block(f"file://{clip}", fps=self.config.use_video_url_fps)
+            logger.warning(
+                "episode %d: video_url clip extraction failed — falling back to "
+                "embedded frames so the VLM still sees the demonstration",
+                record.episode_index,
             )
         episode_duration = record.frame_timestamps[-1] - record.frame_timestamps[0]
         target_count = max(1, int(round(episode_duration * self.config.frames_per_second)))
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
index a49096682..4ea7407e6 100644
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
@@ -6,6 +6,21 @@ You are shown the entire demonstration as a single video. Watch the
 whole clip, then segment it into a list of consecutive atomic subtasks
 the robot performs.
 
+GROUNDING — read this first, it overrides everything below:
+- Label ONLY what the robot actually does in the video. Every subtask
+  you emit must correspond to motion you can SEE in specific frames.
+- Do NOT invent, anticipate, or pad. If the robot only does one thing
+  (e.g. it just navigates to a location and the clip ends), emit
+  EXACTLY ONE subtask. Many demonstrations are a single atomic skill.
+- ``max_steps`` below is a hard CEILING, not a target. Emitting fewer
+  subtasks than the ceiling is not just allowed, it is expected for
+  short / atomic demonstrations. One correct subtask is far better
+  than several invented ones.
+- If the video does not clearly show the action implied by the task,
+  describe what you actually see — do NOT fabricate the task's steps
+  from the instruction text. The instruction tells you the goal; the
+  VIDEO is the ground truth for what happened.
+
 Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
 
 - Each subtask = one COMPOSITE atomic skill the low-level policy can