diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 8ce22c28f..5e1b11d4b 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -54,9 +54,14 @@ CMD = (
     "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
     "--vlm.camera_key=observation.images.robot0_agentview_right "
     # Phase 1 — plan module (subtasks + plan + memory).
-    "--plan.frames_per_second=1.0 "
-    "--plan.use_video_url=true "
-    "--plan.use_video_url_fps=1.0 "
+    # Embed decoded frames directly (use_video_url=false) rather than
+    # handing the server a file:// clip. The embedded path is more
+    # reliable because it never depends on clip extraction; the
+    # video_url path can only fall back to these same embedded frames
+    # when extraction fails. 2 fps gives dense visual grounding so the
+    # VLM labels what actually happens.
+    "--plan.frames_per_second=2.0 "
+    "--plan.use_video_url=false "
     # IMPORTANT for RoboCasa: the dataset's task string ("Navigate to the
     # stove", "Pick the mug...") is authoritative and is what eval uses.
     # ``derive_task_from_video=off`` keeps that canonical task driving
@@ -80,6 +85,10 @@ CMD = (
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
     # Phase 4 — general VQA.
+    # Ground VQA on the SAME single camera as plan/interjections
+    # (--vlm.camera_key) instead of iterating every camera. The whole
+    # pipeline then focuses on one view, e.g. observation.images.base.
+    "--vqa.restrict_to_default_camera=true "
     "--vqa.K=1 "
     "--vqa.vqa_emission_hz=1.0"
 )
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index f84fdaa08..1cecfa772 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -219,6 +219,15 @@ class VqaConfig:
     precision for more (noisier) VQA frames."""
     question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")
 
+    # Camera restriction. By default VQA iterates EVERY camera the
+    # dataset declares (one VQA pair per camera per emission tick). Set
+    # ``restrict_to_default_camera=True`` to ground VQA on only the
+    # single ``--vlm.camera_key`` stream — the same camera the plan /
+    # interjection modules use — so the whole pipeline focuses on one
+    # view. Use this when you want every annotation grounded on, e.g.,
+    # ``observation.images.base`` and nothing else.
+    restrict_to_default_camera: bool = False
+
 
 @dataclass
 class VlmConfig:
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
index adabff731..1e5ad8838 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/general_vqa.py
@@ -180,8 +180,20 @@ class GeneralVqaModule:
         Defaults to every camera the provider exposes. Datasets with no
         cameras (or test/null providers) yield an empty list, which makes
         ``run_episode`` a no-op.
+
+        When ``config.restrict_to_default_camera`` is set, VQA grounds on
+        only the provider's default camera (the single ``--vlm.camera_key``
+        stream), matching the plan / interjection modules so the whole
+        pipeline focuses on one view.
         """
-        return list(getattr(self.frame_provider, "camera_keys", []) or [])
+        all_cameras = list(getattr(self.frame_provider, "camera_keys", []) or [])
+        if getattr(self.config, "restrict_to_default_camera", False):
+            default = getattr(self.frame_provider, "camera_key", None)
+            if default:
+                # The default camera is used even when the provider does
+                # not list it in ``camera_keys``, so no membership check.
+                return [default]
+        return all_cameras
 
     def _build_messages(
         self,
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index 5e66f67be..c46b20bac 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -443,16 +443,27 @@ class PlanSubtasksMemoryModule:
         return flat
 
     def _episode_video_block(self, record: EpisodeRecord) -> list[dict[str, Any]]:
-        """Same video block ``_generate_subtasks`` builds — extracted helper."""
+        """Same video block ``_generate_subtasks`` builds — extracted helper.
+
+        Returns ``[]`` for an episode with no frame timestamps. Otherwise,
+        when ``use_video_url`` is set we try the server-side ``video_url``
+        path first, but if clip extraction fails we FALL BACK to
+        decoding + embedding frames rather than returning an empty
+        block — an empty block would leave the VLM with no visual
+        grounding at all and it would hallucinate subtasks purely from
+        the task text.
+        """
         if not record.frame_timestamps:
             return []
         if self.config.use_video_url and isinstance(self.frame_provider, VideoFrameProvider):
             cache_dir = Path(self.frame_provider.root) / ".annotate_staging" / ".video_clips"
             clip = self.frame_provider.episode_clip_path(record, cache_dir)
-            return (
-                to_video_url_block(f"file://{clip}", fps=self.config.use_video_url_fps)
-                if clip is not None
-                else []
+            if clip is not None:
+                return to_video_url_block(f"file://{clip}", fps=self.config.use_video_url_fps)
+            logger.warning(
+                "episode %d: video_url clip extraction failed — falling back to "
+                "embedded frames so the VLM still sees the demonstration",
+                record.episode_index,
             )
         episode_duration = record.frame_timestamps[-1] - record.frame_timestamps[0]
         target_count = max(1, int(round(episode_duration * self.config.frames_per_second)))
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
index a49096682..4ea7407e6 100644
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
@@ -6,6 +6,21 @@
 You are shown the entire demonstration as a single video. Watch the
 whole clip, then segment it into a list of consecutive atomic subtasks
 the robot performs.
 
+GROUNDING — read this first, it overrides everything below:
+- Label ONLY what the robot actually does in the video. Every subtask
+  you emit must correspond to motion you can SEE in specific frames.
+- Do NOT invent, anticipate, or pad. If the robot only does one thing
+  (e.g. it just navigates to a location and the clip ends), emit
+  EXACTLY ONE subtask. Many demonstrations are a single atomic skill.
+- ``max_steps`` below is a hard CEILING, not a target. Emitting fewer
+  subtasks than the ceiling is not just allowed, it is expected for
+  short / atomic demonstrations. One correct subtask is far better
+  than several invented ones.
+- If the video does not clearly show the action implied by the task,
+  describe what you actually see — do NOT fabricate the task's steps
+  from the instruction text. The instruction tells you the goal; the
+  VIDEO is the ground truth for what happened.
+
 Authoring rules — Hi Robot atom granularity, pi0.7-style short prompts:
 - Each subtask = one COMPOSITE atomic skill the low-level policy can