From ffd8c92ce5b2ab6bf8f2f82ca82850aac0094ad4 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 13 May 2026 15:42:29 +0200
Subject: [PATCH] fix(inference): always emit Plan:/Memory: labels in the
 high-level prompt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The recipe renders ``"${task}\nPlan: ${plan}\nMemory: ${memory}"``
unconditionally — when a binding resolves to None,
``language_render._substitute`` substitutes an empty string, so the
training-time user turn always contains the literal ``Plan: `` /
``Memory: `` prefixes even with empty values.

The inference message builders were skipping those lines entirely
when ``state['current_plan']`` / ``state['current_memory']`` was
empty, producing a different prompt shape on early frames (before
the plan-generation step runs) and on datasets without plan/memory
annotations.

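Factored a shared ``_hirobot_user_head`` helper used by
``_msgs_for_subtask`` and ``_msgs_for_memory``. The legacy
``_control_context_messages`` builds the same head inline (it can
still append a ``Completed subtask:`` line afterwards), so the
task / ``Plan: `` / ``Memory: `` lines of all three match training
byte-for-byte regardless of which bindings are populated.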

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../policies/smolvla2/inference/steps.py      | 69 +++++++++----------
 1 file changed, 31 insertions(+), 38 deletions(-)

diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py
index 1d7a28853..3bd34074d 100644
--- a/src/lerobot/policies/smolvla2/inference/steps.py
+++ b/src/lerobot/policies/smolvla2/inference/steps.py
@@ -704,13 +704,12 @@ def _control_context_messages(
     Mirrors what ``hirobot.yaml`` renders into ``${task}\nPlan:
     ${plan}\nMemory: ${memory}`` for the high-level branches.
     """
-    parts: list[str] = []
+    # Always emit ``Plan: `` / ``Memory: `` labels — even with empty
+    # values — to mirror the training-time recipe substitution.
     task = state.get("task") or ""
-    parts.append(task)
-    if state.get("current_plan"):
-        parts.append(f"Plan: {state['current_plan']}")
-    if state.get("current_memory"):
-        parts.append(f"Memory: {state['current_memory']}")
+    plan = state.get("current_plan") or ""
+    memory = state.get("current_memory") or ""
+    parts = [task, f"Plan: {plan}", f"Memory: {memory}"]
     if include_completed and state.get("current_subtask"):
         parts.append(f"Completed subtask: {state['current_subtask']}")
     head = "\n".join(parts)
@@ -729,51 +728,45 @@ def _control_context_messages(
 # ---------------------------------------------------------------------------
 
 
-def _msgs_for_subtask(state: dict[str, Any]) -> list[dict[str, Any]]:
-    """``high_level_subtask`` recipe layout (v2 — predict current subtask).
+def _hirobot_user_head(state: dict[str, Any]) -> str:
+    """Build the ``task\\nPlan: …\\nMemory: …`` user content string.
 
-    The training-time recipe was changed to supervise the model on the
-    *current* active subtask span at every frame, not the next-span text
-    only at transitions. So the inference-time prompt no longer feeds a
-    "Current subtask: X" user message — that would be circular (we'd be
-    telling the model the answer). The model now decides the subtask
-    purely from the task + plan + memory context plus the visual prefix.
-
-    Transition detection moves into the runtime: when the predicted
-    subtask differs from ``state['current_subtask']``, fire the
-    ``subtask_change`` event so memory updates. Same downstream signal
-    as before, just produced by an always-non-empty supervision target.
+    Mirrors what the recipe renders at training time, where
+    ``language_render._substitute`` substitutes empty strings for
+    missing ``${plan}`` / ``${memory}`` bindings — i.e. the
+    ``Plan: `` / ``Memory: `` prefix labels are *always* in the
+    user turn, even when their values aren't set yet. Skipping them
+    here (the previous behaviour) produced a different prompt shape
+    on early frames before plan / memory are populated and on
+    samples where the dataset has no plan / memory annotation.
     """
-    head_parts = [state.get("task") or ""]
-    if state.get("current_plan"):
-        head_parts.append(f"Plan: {state['current_plan']}")
-    if state.get("current_memory"):
-        head_parts.append(f"Memory: {state['current_memory']}")
-    return [{"role": "user", "content": "\n".join(head_parts)}]
+    task = state.get("task") or ""
+    plan = state.get("current_plan") or ""
+    memory = state.get("current_memory") or ""
+    return f"{task}\nPlan: {plan}\nMemory: {memory}"
+
+
+def _msgs_for_subtask(state: dict[str, Any]) -> list[dict[str, Any]]:
+    """``high_level_subtask`` recipe layout — predict the current subtask
+    from (task + plan + memory). Even when plan / memory aren't set yet
+    the labels render as bare ``Plan: `` / ``Memory: `` to match training.
+    """
+    return [{"role": "user", "content": _hirobot_user_head(state)}]
 
 
 def _msgs_for_memory(state: dict[str, Any]) -> list[dict[str, Any]]:
-    """Memory-update prompt — matches the boundary-frame tail of
-    ``action_execution`` in the v2 recipes.
+    """Memory-update prompt — boundary-frame tail of ``high_level_subtask``.
 
     Recipe layout on a boundary frame:
         user:      "${task}\\nPlan: ${plan}\\nMemory: ${memory}"
         assistant: "${subtask}"
         assistant: → predicts new memory
 
-    At inference we fire this when the runtime detects a subtask
-    transition; the freshly-predicted subtask lives in
-    ``state['current_subtask']``. No "Completed subtask: X" user
-    filler — the second assistant turn is generated immediately
-    after the subtask turn.
+    Fired when the runtime detects a subtask transition; the
+    just-predicted subtask lives in ``state['current_subtask']``.
     """
-    head_parts = [state.get("task") or ""]
-    if state.get("current_plan"):
-        head_parts.append(f"Plan: {state['current_plan']}")
-    if state.get("current_memory"):
-        head_parts.append(f"Memory: {state['current_memory']}")
     msgs: list[dict[str, Any]] = [
-        {"role": "user", "content": "\n".join(head_parts)},
+        {"role": "user", "content": _hirobot_user_head(state)},
     ]
     if state.get("current_subtask"):
         msgs.append({"role": "assistant", "content": state["current_subtask"]})