diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 8ce22c28f..5e1b11d4b 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -54,9 +54,14 @@ CMD = (
     "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
     "--vlm.camera_key=observation.images.robot0_agentview_right "
     # Phase 1 — plan module (subtasks + plan + memory).
-    "--plan.frames_per_second=1.0 "
-    "--plan.use_video_url=true "
-    "--plan.use_video_url_fps=1.0 "
+    # Embed decoded frames directly (use_video_url=false) rather than
+    # handing the server a file:// clip. The embedded path is more
+    # reliable: if clip extraction ever fails, the video_url path would
+    # silently send NO video and the VLM would hallucinate subtasks from
+    # the task text alone. 2 fps gives dense visual grounding so the VLM
+    # labels what actually happens.
+    "--plan.frames_per_second=2.0 "
+    "--plan.use_video_url=false "
     # IMPORTANT for RoboCasa: the dataset's task string ("Navigate to the
     # stove", "Pick the mug...") is authoritative and is what eval uses.
     # ``derive_task_from_video=off`` keeps that canonical task driving
@@ -80,6 +85,10 @@ CMD = (
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
     # Phase 4 — general VQA.
+    # Ground VQA on the SAME single camera as plan/interjections
+    # (--vlm.camera_key) instead of iterating every camera. The whole
+    # pipeline then focuses on one view: robot0_agentview_right here.
+    "--vqa.restrict_to_default_camera=true "
     "--vqa.K=1 "
     "--vqa.vqa_emission_hz=1.0"
 )
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index f84fdaa08..1cecfa772 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -219,6 +219,15 @@ class VqaConfig:
     precision for more (noisier) VQA frames."""
     question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
 
+    # Camera restriction. By default VQA iterates EVERY camera the
+    # dataset declares (one VQA pair per camera per emission tick). Set
+    # ``restrict_to_default_camera=True`` to ground VQA on only the
+    # single ``--vlm.camera_key`` stream — the same camera the plan /
+    # interjection modules use — so the whole pipeline focuses on one
+    # view. Use this when you want every annotation grounded on, e.g.,
+    # ``observation.images.base`` and nothing else.
+    restrict_to_default_camera: bool = False
+
 
 @dataclass
 class VlmConfig:
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
index adabff731..1e5ad8838 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
@@ -180,8 +180,20 @@ class GeneralVqaModule:
         Defaults to every camera the provider exposes. Datasets with no
         cameras (or test/null providers) yield an empty list, which makes
         ``run_episode`` a no-op.
+
+        When ``config.restrict_to_default_camera`` is set, VQA grounds on only
+        the provider's default ``camera_key`` (the ``--vlm.camera_key`` stream),
+        matching the plan / interjection modules so the pipeline focuses on one
+        view. A provider without a default camera falls back to every camera.
         """
-        return list(getattr(self.frame_provider, "camera_keys", []) or [])
+        all_cameras = list(getattr(self.frame_provider, "camera_keys", []) or [])
+        if getattr(self.config, "restrict_to_default_camera", False):
+            default = getattr(self.frame_provider, "camera_key", None)
+            if default and default in all_cameras:
+                return [default]
+            if default:
+                return [default]
+        return all_cameras
 
     def _build_messages(
         self,
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index 5e66f67be..c46b20bac 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -443,16 +443,27 @@ class PlanSubtasksMemoryModule:
         return flat
 
     def _episode_video_block(self, record: EpisodeRecord) -> list[dict[str, Any]]:
-        """Same video block ``_generate_subtasks`` builds — extracted helper."""
+        """Same video block ``_generate_subtasks`` builds — extracted helper.
+
+        An episode with no frame timestamps yields ``[]``; otherwise the
+        block is meant to actually carry the video. When
+        ``use_video_url`` is set we try the server-side ``video_url``
+        path first, but if clip extraction fails we FALL BACK to
+        decoding + embedding frames rather than returning an empty
+        block — an empty block would leave the VLM with no visual
+        grounding and it would hallucinate subtasks from the task text.
+        """
         if not record.frame_timestamps:
             return []
         if self.config.use_video_url and isinstance(self.frame_provider, VideoFrameProvider):
             cache_dir = Path(self.frame_provider.root) / ".annotate_staging" / ".video_clips"
             clip = self.frame_provider.episode_clip_path(record, cache_dir)
-            return (
-                to_video_url_block(f"file://{clip}", fps=self.config.use_video_url_fps)
-                if clip is not None
-                else []
+            if clip is not None:
+                return to_video_url_block(f"file://{clip}", fps=self.config.use_video_url_fps)
+            logger.warning(
+                "episode %d: video_url clip extraction failed — falling back to "
+                "embedded frames so the VLM still sees the demonstration",
+                record.episode_index,
             )
         episode_duration = record.frame_timestamps[-1] - record.frame_timestamps[0]
         target_count = max(1, int(round(episode_duration * self.config.frames_per_second)))
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
index a49096682..4ea7407e6 100644
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
@@ -6,6 +6,21 @@ You are shown the entire demonstration as a single video. Watch the
 whole clip, then segment it into a list of consecutive atomic subtasks
 the robot performs.
 
+GROUNDING — read this first, it overrides everything below:
+- Label ONLY what the robot actually does in the video. Every subtask
+  you emit must correspond to motion you can SEE in specific frames.
+- Do NOT invent, anticipate, or pad. If the robot only does one thing
+  (e.g. it just navigates to a location and the clip ends), emit
+  EXACTLY ONE subtask. Many demonstrations are a single atomic skill.
+- ``max_steps`` below is a hard CEILING, not a target. Emitting fewer
+  subtasks than the ceiling is not just allowed, it is expected for
+  short / atomic demonstrations. One correct subtask is far better
+  than several invented ones.
+- If the video does not clearly show the action implied by the task,
+  describe what you actually see — do NOT fabricate the task's steps
+  from the instruction text. The instruction tells you the goal; the
+  VIDEO is the ground truth for what happened.
+
 Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
 
 - Each subtask = one COMPOSITE atomic skill the low-level policy can