From 9f630e2a413e08a4dda75265ea76adfc46e52438 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 13 May 2026 14:59:01 +0200
Subject: [PATCH] fix(recipes,training): stop tool prompt leak + drop subtask
 copy-supervision
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CRITICAL (smolvla2) — the SmolVLM2 chat template was rendering the
``say`` tool's JSON schema as a system message on every training
sample because ``DEFAULT_TOOLS`` was the default in
``SmolVLA2ChatTokenizerStep``. That schema was only relevant to
the now-removed ``user_interjection_response`` recipe; with it
gone the schema is dead weight that polluted every action-expert
prefix AND created a train/inference mismatch (the inference
``_build_text_batch`` doesn't pass ``tools=``). Default is now
``[]``; callers needing tools can still set them via
``with_tools(meta.tools)``.

LIKELY-BUG — ``low_level_execution`` had ``target: true`` on its
assistant turn, so text-CE trained the LM head to predict the
same subtask string the user just stated (trivial "copy previous
turn" supervision that diluted LM head capacity). Dropped the
assistant turn entirely; ``high_level_subtask`` (w=0.50) already
owns subtask prediction from real context.

The chat-tokenizer's ``predict_actions`` detection used to scan
target streams only. With the new no-target low_level recipe it
would mis-fire as False. Switched both
``chat_processor_smolvla2.py`` and ``text_processor_pi052.py`` to
scan all message streams — any ``stream: low_level`` on the
sample is enough to trigger flow loss.

Inference: the low-level loop sends only ``[user(subtask)]`` now,
matching the new recipe shape.

PI052 — hardened the forward fallthrough so a degenerate batch
where no sample predicts actions (every recipe is text-only), text
supervision is disabled (text_loss_weight<=0 or text_labels
missing) AND the FAST action loss is off cleanly delegates to
``PI05Policy.forward`` instead of raising "nothing to train".

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../configs/recipes/pi052_hirobot.yaml        |  4 +++-
 .../configs/recipes/smolvla2_hirobot.yaml     | 13 +++++-----
 src/lerobot/policies/pi052/modeling_pi052.py  | 22 +++++++++++++----
 .../policies/pi052/text_processor_pi052.py    |  7 +++++-
 .../smolvla2/chat_processor_smolvla2.py       | 24 +++++++++++++------
 .../policies/smolvla2/inference/steps.py      |  9 ++++---
 6 files changed, 54 insertions(+), 25 deletions(-)

diff --git a/src/lerobot/configs/recipes/pi052_hirobot.yaml b/src/lerobot/configs/recipes/pi052_hirobot.yaml
index 15ae92b0f..0aa19c72f 100644
--- a/src/lerobot/configs/recipes/pi052_hirobot.yaml
+++ b/src/lerobot/configs/recipes/pi052_hirobot.yaml
@@ -23,8 +23,10 @@ blend:
     weight: 0.30
     messages:
       # Action expert prefix = [images, subtask, state] only — π0.5 style.
+      # No text-CE target: ``high_level_subtask`` already supervises
+      # subtask prediction from real context. ``stream: low_level``
+      # flips ``predict_actions=True`` so the flow loss fires.
       - {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
-      - {role: assistant, content: "${subtask}", stream: low_level, target: true, if_present: subtask}
 
   plan_generation:
     weight: 0.10
diff --git a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml
index 8ff6a1e93..ffbb6b92b 100644
--- a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml
+++ b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml
@@ -32,13 +32,14 @@ blend:
   low_level_execution:
     weight: 0.30
     messages:
-      # Just the subtask in the prompt — π0.5 style. The action
-      # expert sees only [images, this subtask, state]. Marking the
-      # assistant target as ``stream: low_level`` triggers
-      # ``predict_actions=True`` so the flow loss fires; text CE on
-      # the subtask is a (small) redundant extra signal.
+      # π0.5-style action conditioning: the action expert sees just
+      # the subtask (plus images + state). No text-CE target here —
+      # ``high_level_subtask`` (w=0.50) already trains subtask
+      # prediction from real context; supervising it again as a
+      # copy-from-user turn would dilute the LM head. ``stream:
+      # low_level`` on the user turn is enough to flip
+      # ``predict_actions=True`` so the flow loss fires.
       - {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
-      - {role: assistant, content: "${subtask}", stream: low_level, target: true, if_present: subtask}
 
   plan_generation:
     weight: 0.10
diff --git a/src/lerobot/policies/pi052/modeling_pi052.py b/src/lerobot/policies/pi052/modeling_pi052.py
index 08802fa89..9553fd89a 100644
--- a/src/lerobot/policies/pi052/modeling_pi052.py
+++ b/src/lerobot/policies/pi052/modeling_pi052.py
@@ -366,14 +366,26 @@ class PI052Policy(PI05Policy):
         text_labels = batch.get("text_labels")
         predict_actions_t = batch.get("predict_actions")
 
-        # Unannotated datasets: no recipe applied → no text_labels and
-        # no FAST / predict_actions routing. Defer to PI05Policy so the
-        # plain flow-only training surface keeps working unchanged.
+        # Unannotated datasets / batches with nothing to train: fall
+        # through to PI05Policy so the plain flow-only training surface
+        # keeps working. Triggers when:
+        #   * the recipe wasn't applied (no text_labels, no
+        #     predict_actions), OR
+        #   * no sample predicts actions AND text + FAST losses are off
+        #     (would otherwise hit the "nothing to train" raise below).
+        text_disabled = (
+            self.config.text_loss_weight <= 0 or text_labels is None
+        )
+        fast_disabled = not getattr(self.config, "enable_fast_action_loss", False)
+        no_flow_samples = (
+            predict_actions_t is not None
+            and not bool(predict_actions_t.any().item())
+        )
         if (
             text_labels is None
             and predict_actions_t is None
-            and not getattr(self.config, "enable_fast_action_loss", False)
-        ):
+            and fast_disabled
+        ) or (text_disabled and no_flow_samples and fast_disabled):
             return super().forward(batch, reduction=reduction)
 
         run_flow = (
diff --git a/src/lerobot/policies/pi052/text_processor_pi052.py b/src/lerobot/policies/pi052/text_processor_pi052.py
index d17808997..649e67b90 100644
--- a/src/lerobot/policies/pi052/text_processor_pi052.py
+++ b/src/lerobot/policies/pi052/text_processor_pi052.py
@@ -211,8 +211,13 @@ class PI052TextTokenizerStep(ProcessorStep):
                     continue
                 labels[token_pos] = input_ids[token_pos]
 
+        # Scan ALL message streams (not just targets) — see
+        # ``chat_processor_smolvla2.py`` for rationale: the v2
+        # ``low_level_execution`` recipe drops the ``target: true``
+        # assistant turn to avoid trivial copy-from-user text-CE; the
+        # flow loss still needs to fire, gated by ``stream: low_level``.
         predict_actions = torch.tensor(
-            bool(any(message_streams[i] == "low_level" for i in target_indices if i < len(message_streams))),
+            bool(any(s == "low_level" for s in message_streams)),
             dtype=torch.bool,
         )
 
diff --git a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py
index d72ae9c3a..454a1c2d8 100644
--- a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py
+++ b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py
@@ -92,10 +92,16 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep):
         # installed still pass.
         self._tokenizer: Any = None
         if self.tools is None:
-            # Default: ship the canonical ``say`` schema. Users who set
-            # ``meta.tools`` differently can override via
+            # Default: no tools rendered into the system prompt. The
+            # ``say()`` tool was only used by the now-removed
+            # ``user_interjection_response`` recipe; including its
+            # schema on every sample adds a long system message to
+            # the action expert's prefix and creates a train/inference
+            # mismatch (the inference low-level loop doesn't pass
+            # tools=, so the chat template doesn't render them).
+            # Users who actually need tools can set them via
             # ``with_tools(meta.tools)``.
-            self.tools = list(DEFAULT_TOOLS)
+            self.tools = []
 
     # ------------------------------------------------------------------
     # Public API
@@ -258,10 +264,14 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep):
             for pos in range(start, end):
                 labels[pos] = int(full_ids[pos])
 
-        predict_actions = any(
-            i < len(message_streams) and message_streams[i] == "low_level"
-            for i in target_indices
-        )
+        # ``predict_actions`` is True iff this sample's recipe declares
+        # at least one ``low_level`` message — regardless of whether
+        # it's a target. The v2 ``low_level_execution`` recipe is a
+        # single ``stream: low_level`` user turn carrying the subtask;
+        # the assistant turn (and its text-CE target) was dropped to
+        # avoid trivial "copy previous turn" supervision. Scanning
+        # targets alone would leave this sample's flow loss disabled.
+        predict_actions = any(s == "low_level" for s in message_streams)
         return [int(i) for i in full_ids], labels, predict_actions
 
     def _apply_prompt_dropout(
diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py
index 7bc2d1e16..e638ca636 100644
--- a/src/lerobot/policies/smolvla2/inference/steps.py
+++ b/src/lerobot/policies/smolvla2/inference/steps.py
@@ -115,12 +115,11 @@ class LowLevelForward(InferenceStep):
         # subtask (+ images + state). No task / plan / memory in the
         # low-level prompt — those are only used by the high-level
         # loop to *generate* the subtask. Matches the training-time
-        # ``low_level_execution`` recipe shape.
+        # ``low_level_execution`` recipe shape (single user turn,
+        # no assistant target since text-CE is owned by the
+        # high-level recipe).
         subtask = state.get("current_subtask") or state.get("task") or ""
-        ctx = [
-            {"role": "user", "content": subtask},
-            {"role": "assistant", "content": subtask},
-        ]
+        ctx = [{"role": "user", "content": subtask}]
         text_batch = _build_text_batch(self.policy, ctx)
         from lerobot.utils.constants import (  # noqa: PLC0415
             OBS_LANGUAGE_ATTENTION_MASK,