From dd97c33814d7e1dd80816ce914ca3cd019574e8d Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 13 May 2026 15:55:02 +0200 Subject: [PATCH] refactor(annotate): plan = summary of still-todo subtasks, drop VLM call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The plan was being generated by a separate VLM call (one per episode + one per interjection refresh) with a prompt that asked the model to "compress the subtasks into a compact hierarchical plan". In practice the plans came out longer than necessary and sometimes drifted from the actual subtask sequence the runtime would execute. Replaced the body of ``_generate_plan`` with a deterministic numbered list of the upcoming subtasks; the method signature is kept unchanged for callers. On a refresh, the list shrinks to the subtasks whose start ≥ ``refresh_t`` — the plan describes what's *left* to do, so it gets shorter as work progresses. Note that a subtask already in progress at ``refresh_t`` (start < ``refresh_t``) is therefore not listed in the refreshed plan. This saves the per-episode + per-interjection VLM round-trip in the annotation pipeline and keeps train-time plan text bit-aligned with the subtask annotations the rest of Module 1 emits. Removed the now-unused ``prompts/module_1_plan.txt``. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../modules/plan_subtasks_memory.py | 78 +++++++++---------- .../prompts/module_1_plan.txt | 18 ----- 2 files changed, 37 insertions(+), 59 deletions(-) delete mode 100644 src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt diff --git a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py index c48d888fb..bf04eddfd 100644 --- a/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py +++ b/src/lerobot/annotations/steerable_pipeline/modules/plan_subtasks_memory.py @@ -372,54 +372,50 @@ class PlanSubtasksMemoryModule: def _generate_plan( self, - record: EpisodeRecord, + record: EpisodeRecord, # noqa: ARG002 (kept for signature stability) subtask_spans: Sequence[dict[str, Any]], *, refresh_t: float | None = None, - interjection: str | None = None, - task: str | None = None, + interjection: str | None = None, # noqa: ARG002 + task: str | None = None, # noqa: ARG002 ) -> str | None: + """Deterministic plan = numbered list of *still-todo* subtasks. + + Previously this called the VLM with a prompt that asked it to + compress the subtasks into a "compact hierarchical plan". That + produced longer-than-necessary plans, cost an extra VLM round-trip + per episode (plus one per interjection on refresh), and could + diverge from the actual subtask sequence the model is going to + execute. Replacing it with a plain summarisation keeps the plan + tightly aligned with the upcoming subtasks and removes the VLM + call entirely. + + Layout (matches the v2 plan style — short imperative fragments + prefixed by "N. "): + + 1. + 2. + ... + + On a refresh at ``refresh_t`` (called from ``run_plan_updates`` + on interjection events), only subtasks whose start is at or + after ``refresh_t`` are included — the plan shrinks as work + progresses, so it always describes what's left. 
+ """ if not subtask_spans: return None - subtasks_text = "\n".join(f"- {s['text']}" for s in subtask_spans) - prompt = load_prompt("module_1_plan").format( - episode_task=(task if task is not None else record.episode_task), - subtasks_text=subtasks_text, - plan_max_steps=self.config.plan_max_steps, + remaining = [ + s for s in subtask_spans + if refresh_t is None or float(s.get("start", 0.0)) >= float(refresh_t) + ] + if not remaining: + # Past the last subtask boundary on a late refresh — nothing + # left to plan; emit None so the caller skips the row. + return None + return "\n".join( + f"{i}. {span.get('text', '').strip()}" + for i, span in enumerate(remaining, start=1) ) - if refresh_t is not None: - # ``current_subtask`` is the span the refresh time falls into, - # so the model knows where in the demonstration the planner is - # standing when it re-emits. - current_subtask = "" - for span in subtask_spans: - if float(span["start"]) <= refresh_t and ( - "end" not in span or float(span["end"]) > refresh_t - ): - current_subtask = span.get("text", "") - break - if interjection: - prompt += ( - f"\n\n(Plan refresh at t={refresh_t:.2f}s after a user " - f"interjection: {interjection!r}. Current subtask just " - f"before the interjection: {current_subtask!r}. Update " - f"the plan so it reflects the interjection — drop or " - f"reorder steps as needed; do not just restate.)\n" - ) - else: - # Refresh without an interjection text: still tell the model - # where in the episode the plan stands so the re-emission - # is grounded. Should be rare — plan refreshes are - # interjection-driven by design. - prompt += ( - f"\n\n(Plan refresh at t={refresh_t:.2f}s. 
Current " - f"subtask: {current_subtask!r}.)\n" - ) - messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}] - result = self.vlm.generate_json([messages])[0] - if isinstance(result, dict) and isinstance(result.get("plan"), str): - return result["plan"].strip() - return None def _generate_memory( self, diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt deleted file mode 100644 index 528c6f0c9..000000000 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt +++ /dev/null @@ -1,18 +0,0 @@ -You are the high-level planner for a robot demonstrating: "{episode_task}". - -Given the subtask decomposition below, write a compact hierarchical PLAN. -Use short imperative fragments, like pi0.7 context prompts. - -Subtasks for context: -{subtasks_text} - -Authoring rules: -- 3 to {plan_max_steps} steps. -- Each step is one logical chunk, not one motion. -- Steps must be in execution order. -- Brief commands, not full sentences. -- Prefer: "open air fryer"; avoid: "The robot should open the air fryer." -- Plain text, no markdown headers. - -Output strictly valid JSON: - {{ "plan": "1. ...\n2. ...\n3. ..." }}