From dcd368e1f80702718e98a9bd6b230acae28c04ce Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 15:12:46 +0200
Subject: [PATCH] annotate: multi-call subtask quality chain (describe ->
 segment -> verify)

The single-call 'watch video -> emit subtask JSON' pattern makes the
VLM commit to structured output before reasoning about what it saw, so
it pattern-matches the task text and hallucinates steps. Split it into
an opt-in multi-call chain that grounds first and prunes last.

New PlanConfig flags (both default False -> single-call unchanged):
  * subtask_describe_first: a grounding pass narrates ONLY what is
    visible in the video (no subtask JSON yet). That description is
    injected into the segmentation prompt via a new {observation_block}
    placeholder, so the model segments its own grounded observations
    instead of the instruction text. +1 VLM call/episode.
  * subtask_verify: after segmentation, an adversarial pass re-watches
    the video and drops any candidate subtask it cannot see. Can only
    PRUNE (never add/rewrite/move) and fails open (keeps un-verified
    spans if the call returns nothing). +1 VLM call/episode.

Implementation:
  * _generate_subtasks now orchestrates describe -> segment -> verify.
  * Factored span cleaning into _clean_spans (shared by segment + verify
    outputs); added _describe_episode and _verify_subtasks helpers.
  * New prompts module_1_subtask_describe.txt (returns {description})
    and module_1_subtask_verify.txt (returns pruned {subtasks}).
  * module_1_subtasks.txt gains a {observation_block} slot at the top.

run_hf_job.py enables both for the RoboCasa run (3 VLM calls/episode
for subtasks). Combined with single-camera grounding + the embedded-
frame path, this is the high-quality configuration.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/annotations/run_hf_job.py            |   7 ++
 .../annotations/steerable_pipeline/config.py  |  16 +++
 .../modules/plan_subtasks_memory.py           | 105 +++++++++++++++++-
 .../prompts/module_1_subtask_describe.txt     |  27 +++++
 .../prompts/module_1_subtask_verify.txt       |  33 ++++++
 .../prompts/module_1_subtasks.txt             |   2 +-
 6 files changed, 183 insertions(+), 7 deletions(-)
 create mode 100644 src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_describe.txt
 create mode 100644 src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_verify.txt

diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index 5e1b11d4b..421b263da 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -82,6 +82,13 @@ CMD = (
     # tasks. Leave off for RoboCasa atomic / navigation.
     # Keep subtask decomposition tight for atomic tasks:
     "--plan.plan_max_steps=6 "
+    # Multi-call quality chain (3 VLM calls/episode for subtasks):
+    #   1. describe-first: narrate ONLY what is visible before segmenting
+    #      — the strongest fix for subtasks invented from the task text.
+    #   2. (segment)
+    #   3. verify: re-watch and prune any subtask not actually seen.
+    "--plan.subtask_describe_first=true "
+    "--plan.subtask_verify=true "
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
     # Phase 4 — general VQA.
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index 1cecfa772..9a0dd4232 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -51,6 +51,22 @@ class PlanConfig:
     min_subtask_seconds: float = 1.5
     plan_max_steps: int = 8
 
+    # Multi-call subtask quality chain (opt-in, more VLM calls, higher
+    # quality). Both off by default → single-call behaviour unchanged.
+    #
+    # ``subtask_describe_first``: run a grounding pass that narrates ONLY
+    # what is visible in the video (no subtask JSON yet), then inject that
+    # description into the segmentation prompt. Forces the model to
+    # observe before committing to structured output — the strongest
+    # lever against subtasks invented from the task text. +1 VLM call/ep.
+    subtask_describe_first: bool = False
+    # ``subtask_verify``: after segmentation, re-watch the video and drop
+    # any proposed subtask that can't be verified as visible. Prunes
+    # hallucinations; can only remove subtasks, never add/rewrite them.
+    # Fail-open (keeps un-verified spans if the verify call returns
+    # nothing). +1 VLM call/ep.
+    subtask_verify: bool = False
+
     # When True (and backend supports it, e.g. ``openai``), the ``plan``
     # module sends a ``video_url`` block pointing at a per-episode mp4
     # subclip and lets the server sample frames at ``use_video_url_fps``.
diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
index c46b20bac..1ba9b142b 100644
--- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
+++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py
@@ -521,20 +521,65 @@ class PlanSubtasksMemoryModule:
         staging.write("plan", new_rows)
 
     def _generate_subtasks(self, record: EpisodeRecord, *, task: str | None = None) -> list[dict[str, Any]]:
+        """Generate subtask spans, optionally via a multi-call quality chain.
+
+        Single call (default): watch video → emit subtask JSON.
+
+        Multi-call (opt-in, higher quality, more VLM calls):
+          1. ``subtask_describe_first`` — narrate ONLY what is visible; a
+             non-empty description is injected into the segmentation
+             prompt so the model segments its own grounded observations.
+          2. segmentation — emit subtask JSON (as before).
+          3. ``subtask_verify`` — re-watches the video and drops proposed
+             subtasks it cannot see; skipped when step 2 yields no spans.
+
+        ``task`` replaces ``record.episode_task`` in every prompt if given.
+        Returns ``[]`` for an empty record or an empty segmentation.
+        """
         if record.row_count == 0 or not record.frame_timestamps:
             return []
         episode_duration = record.frame_timestamps[-1] - record.frame_timestamps[0]
+        effective_task = task if task is not None else record.episode_task
+
+        # ---- Pass 1 (optional): grounding description ----------------
+        observation_block = ""
+        if getattr(self.config, "subtask_describe_first", False):
+            description = self._describe_episode(record, effective_task)
+            if description:
+                observation_block = (
+                    "You watched this video and described, chronologically, "
+                    "ONLY what the robot actually does:\n"
+                    f'"""{description}"""\n\n'
+                    "Segment THAT grounded description (cross-checked against "
+                    "the video) into atomic subtasks. Do not introduce any "
+                    "action that is not in your description above.\n\n"
+                )
+
+        # ---- Pass 2: segmentation ------------------------------------
         prompt = load_prompt("module_1_subtasks").format(
-            episode_task=(task if task is not None else record.episode_task),
+            episode_task=effective_task,
             min_subtask_seconds=self.config.min_subtask_seconds,
             max_steps=self.config.plan_max_steps,
             episode_duration=f"{episode_duration:.3f}",
+            observation_block=observation_block,
         )
-        messages = self._video_message(record, prompt)
-        spans = self._vlm_field(messages, "subtasks")
+        spans = self._vlm_field(self._video_message(record, prompt), "subtasks")
+        cleaned = self._clean_spans(spans, record)
+        if not cleaned:
+            return []
+
+        # ---- Pass 3 (optional): verification / pruning ---------------
+        if getattr(self.config, "subtask_verify", False):
+            cleaned = self._verify_subtasks(record, effective_task, cleaned)
+
+        return cleaned
+
+    def _clean_spans(
+        self, spans: Any, record: EpisodeRecord
+    ) -> list[dict[str, Any]]:
+        """Clamp / sort / dedupe raw VLM subtask spans into valid rows."""
         if not spans:
             return []
-        # clamp to [t0, t_last] and sort
         t0 = record.frame_timestamps[0]
         t_last = record.frame_timestamps[-1]
         cleaned: list[dict[str, Any]] = []
@@ -553,8 +598,56 @@ class PlanSubtasksMemoryModule:
                 continue
             cleaned.append({"text": text, "start": start, "end": end})
         cleaned.sort(key=lambda s: s["start"])
-        cleaned = self._dedupe_starts_to_distinct_frames(cleaned, record)
-        return cleaned
+        return self._dedupe_starts_to_distinct_frames(cleaned, record)
+
+    def _describe_episode(self, record: EpisodeRecord, task: str) -> str:
+        """Grounding pass: return the VLM's stripped narration, or "" if it is not a str."""
+        template = load_prompt("module_1_subtask_describe")
+        raw = self._vlm_field(self._video_message(record, template.format(episode_task=task)), "description")
+        return raw.strip() if isinstance(raw, str) else ""
+
+    def _verify_subtasks(
+        self,
+        record: EpisodeRecord,
+        task: str,
+        spans: list[dict[str, Any]],
+    ) -> list[dict[str, Any]]:
+        """Adversarial pass: drop proposed subtasks not visible in the video.
+
+        The VLM reply is used only as a keep-list. It is parsed with
+        ``_clean_spans`` and an original span survives iff its ``text``
+        appears among the parsed entries; survivors are returned
+        unmodified, so this pass can only PRUNE — rewritten text, moved
+        timestamps and invented subtasks never reach the output. If no
+        original span matches (e.g. nothing parseable or an empty list),
+        the un-verified spans are kept (fail open: better a possibly-good
+        label than dropping everything on a transient VLM hiccup).
+        """
+        import json  # noqa: PLC0415
+
+        rows = [{"text": s["text"], "start": round(s["start"], 3), "end": round(s["end"], 3)} for s in spans]
+        prompt = load_prompt("module_1_subtask_verify").format(
+            episode_task=task, subtasks_json=json.dumps({"subtasks": rows}, indent=2)
+        )
+        kept_raw = self._vlm_field(self._video_message(record, prompt), "subtasks")
+        kept_texts = {entry["text"] for entry in self._clean_spans(kept_raw, record)}
+        kept = [s for s in spans if s["text"] in kept_texts]
+        if not kept:
+            logger.info(
+                "episode %d: verify pass returned nothing — keeping the %d "
+                "un-verified subtask(s) (fail-open)",
+                record.episode_index,
+                len(spans),
+            )
+            return spans
+        if len(kept) < len(spans):
+            logger.info(
+                "episode %d: verify pass pruned %d -> %d subtask(s)",
+                record.episode_index,
+                len(spans),
+                len(kept),
+            )
+        return kept
 
     @staticmethod
     def _dedupe_starts_to_distinct_frames(
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_describe.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_describe.txt
new file mode 100644
index 000000000..6b709e41d
--- /dev/null
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_describe.txt
@@ -0,0 +1,27 @@
+You are watching a teleoperated robot demonstration from a single
+camera. The user asked the robot to: "{episode_task}"
+
+This is an OBSERVATION pass. Watch the entire clip and describe, in
+chronological order, ONLY what the robot physically does — the concrete
+motions, approaches, contacts, grasps, releases, and relocations you can
+actually SEE in the frames.
+
+Hard rules:
+- Describe only motion visible in the video. Do NOT use the task
+  instruction to guess steps that aren't shown. The instruction is the
+  goal; the video is ground truth.
+- Do NOT segment into named subtasks yet and do NOT output JSON beyond
+  the single field below. Just narrate what happens.
+- Give an approximate timestamp (in seconds) for each distinct event,
+  e.g. "0.0-1.4s: the base drives forward toward the stove".
+- Do NOT invent objects, grasps, destinations, or steps. If the robot
+  only does one thing (e.g. it just navigates and the clip ends), say
+  exactly that and nothing more.
+- Be concrete and literal. "the gripper closes on the mug" — not "the
+  robot prepares to make coffee".
+
+Output strictly valid JSON:
+
+  {{
+    "description": "<chronological, timestamped description of ONLY what is visible>"
+  }}
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_verify.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_verify.txt
new file mode 100644
index 000000000..e52dc0aeb
--- /dev/null
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtask_verify.txt
@@ -0,0 +1,33 @@
+You previously segmented a teleoperated robot demonstration into these
+candidate subtasks (JSON):
+
+{subtasks_json}
+
+The user's task was: "{episode_task}"
+
+This is a VERIFICATION pass. Re-watch the video. For EACH candidate
+subtask, decide whether the robot can ACTUALLY be seen performing that
+action within its [start, end] time window.
+
+Rules:
+- KEEP a subtask only if its action is clearly visible in the video in
+  roughly that time window.
+- DROP any subtask whose action you cannot see, that describes
+  something not actually present in the video, that was inferred from
+  the task instruction rather than observed, or that duplicates another
+  kept subtask.
+- Do NOT add new subtasks. Do NOT rewrite the text of kept subtasks.
+  Do NOT change the start/end timestamps of kept subtasks.
+- It is correct and expected to return FEWER subtasks than you were
+  given — even just one — if that is all the video supports. Returning
+  zero is allowed if none can be verified.
+
+Output strictly valid JSON of the SAME shape, containing only the kept
+subtasks in chronological order:
+
+  {{
+    "subtasks": [
+      {{"text": "<kept verbatim>", "start": <float>, "end": <float>}},
+      ...
+    ]
+  }}
diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
index 4ea7407e6..e1c8f822e 100644
--- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
+++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt
@@ -6,7 +6,7 @@ You are shown the entire demonstration as a single video. Watch the
 whole clip, then segment it into a list of consecutive atomic subtasks
 the robot performs.
 
-GROUNDING — read this first, it overrides everything below:
+{observation_block}GROUNDING — read this first, it overrides everything below:
 - Label ONLY what the robot actually does in the video. Every subtask
   you emit must correspond to motion you can SEE in specific frames.
 - Do NOT invent, anticipate, or pad. If the robot only does one thing