From ffd8c92ce5b2ab6bf8f2f82ca82850aac0094ad4 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 13 May 2026 15:42:29 +0200 Subject: [PATCH] fix(inference): always emit Plan:/Memory: labels in the high-level prompt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recipe renders ``"${task}\nPlan: ${plan}\nMemory: ${memory}"`` unconditionally — when a binding resolves to None, ``language_render._substitute`` substitutes an empty string, so the training-time user turn always contains the literal ``Plan: `` / ``Memory: `` prefixes even with empty values. The inference message builders were skipping those lines entirely when ``state['current_plan']`` / ``state['current_memory']`` was empty, producing a different prompt shape on early frames (before the plan-generation step runs) and on datasets without plan/memory annotations. Factored a shared ``_hirobot_user_head`` helper used by ``_msgs_for_subtask``, ``_msgs_for_memory``, and the legacy ``_control_context_messages`` so they all match training byte-for-byte regardless of which bindings are populated. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../policies/smolvla2/inference/steps.py | 69 +++++++++---------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py index 1d7a28853..3bd34074d 100644 --- a/src/lerobot/policies/smolvla2/inference/steps.py +++ b/src/lerobot/policies/smolvla2/inference/steps.py @@ -704,13 +704,12 @@ def _control_context_messages( Mirrors what ``hirobot.yaml`` renders into ``${task}\nPlan: ${plan}\nMemory: ${memory}`` for the high-level branches. """ - parts: list[str] = [] + # Always emit ``Plan: `` / ``Memory: `` labels — even with empty + # values — to mirror the training-time recipe substitution. 
task = state.get("task") or "" - parts.append(task) - if state.get("current_plan"): - parts.append(f"Plan: {state['current_plan']}") - if state.get("current_memory"): - parts.append(f"Memory: {state['current_memory']}") + plan = state.get("current_plan") or "" + memory = state.get("current_memory") or "" + parts = [task, f"Plan: {plan}", f"Memory: {memory}"] if include_completed and state.get("current_subtask"): parts.append(f"Completed subtask: {state['current_subtask']}") head = "\n".join(parts) @@ -729,51 +728,45 @@ def _control_context_messages( # --------------------------------------------------------------------------- -def _msgs_for_subtask(state: dict[str, Any]) -> list[dict[str, Any]]: - """``high_level_subtask`` recipe layout (v2 — predict current subtask). +def _hirobot_user_head(state: dict[str, Any]) -> str: + """Build the ``task\\nPlan: …\\nMemory: …`` user content string. - The training-time recipe was changed to supervise the model on the - *current* active subtask span at every frame, not the next-span text - only at transitions. So the inference-time prompt no longer feeds a - "Current subtask: X" user message — that would be circular (we'd be - telling the model the answer). The model now decides the subtask - purely from the task + plan + memory context plus the visual prefix. - - Transition detection moves into the runtime: when the predicted - subtask differs from ``state['current_subtask']``, fire the - ``subtask_change`` event so memory updates. Same downstream signal - as before, just produced by an always-non-empty supervision target. + Mirrors what the recipe renders at training time, where + ``language_render._substitute`` substitutes empty strings for + missing ``${plan}`` / ``${memory}`` bindings — i.e. the + ``Plan: `` / ``Memory: `` prefix labels are *always* in the + user turn, even when their values aren't set yet. 
Skipping them + here (the previous behaviour) produced a different prompt shape + on early frames before plan / memory are populated and on + samples where the dataset has no plan / memory annotation. """ - head_parts = [state.get("task") or ""] - if state.get("current_plan"): - head_parts.append(f"Plan: {state['current_plan']}") - if state.get("current_memory"): - head_parts.append(f"Memory: {state['current_memory']}") - return [{"role": "user", "content": "\n".join(head_parts)}] + task = state.get("task") or "" + plan = state.get("current_plan") or "" + memory = state.get("current_memory") or "" + return f"{task}\nPlan: {plan}\nMemory: {memory}" + + +def _msgs_for_subtask(state: dict[str, Any]) -> list[dict[str, Any]]: + """``high_level_subtask`` recipe layout — predict the current subtask + from (task + plan + memory). Even when plan / memory aren't set yet + the labels render as bare ``Plan: `` / ``Memory: `` to match training. + """ + return [{"role": "user", "content": _hirobot_user_head(state)}] def _msgs_for_memory(state: dict[str, Any]) -> list[dict[str, Any]]: - """Memory-update prompt — matches the boundary-frame tail of - ``action_execution`` in the v2 recipes. + """Memory-update prompt — boundary-frame tail of ``high_level_subtask``. Recipe layout on a boundary frame: user: "${task}\\nPlan: ${plan}\\nMemory: ${memory}" assistant: "${subtask}" assistant: → predicts new memory - At inference we fire this when the runtime detects a subtask - transition; the freshly-predicted subtask lives in - ``state['current_subtask']``. No "Completed subtask: X" user - filler — the second assistant turn is generated immediately - after the subtask turn. + Fired when the runtime detects a subtask transition; the + just-predicted subtask lives in ``state['current_subtask']``. 
""" - head_parts = [state.get("task") or ""] - if state.get("current_plan"): - head_parts.append(f"Plan: {state['current_plan']}") - if state.get("current_memory"): - head_parts.append(f"Memory: {state['current_memory']}") msgs: list[dict[str, Any]] = [ - {"role": "user", "content": "\n".join(head_parts)}, + {"role": "user", "content": _hirobot_user_head(state)}, ] if state.get("current_subtask"): msgs.append({"role": "assistant", "content": state["current_subtask"]})