From 9f630e2a413e08a4dda75265ea76adfc46e52438 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 13 May 2026 14:59:01 +0200 Subject: [PATCH] fix(recipes,training): stop tool prompt leak + drop subtask copy-supervision MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CRITICAL (smolvla2) — the SmolVLM2 chat template was rendering the ``say`` tool's JSON schema as a system message on every training sample because ``DEFAULT_TOOLS`` was the default in ``SmolVLA2ChatTokenizerStep``. That schema was only relevant to the now-removed ``user_interjection_response`` recipe; with it gone the schema is dead weight that polluted every action-expert prefix AND created a train/inference mismatch (the inference ``_build_text_batch`` doesn't pass ``tools=``). Default is now ``[]``; callers needing tools can still set them via ``with_tools(meta.tools)``. LIKELY-BUG — ``low_level_execution`` had ``target: true`` on its assistant turn, so text-CE trained the LM head to predict the same subtask string the user just stated (trivial "copy previous turn" supervision that diluted LM head capacity). Dropped the assistant turn entirely; ``high_level_subtask`` (w=0.50) already owns subtask prediction from real context. The chat-tokenizer's ``predict_actions`` detection used to scan target streams only. With the new no-target low_level recipe it would mis-fire as False. Switched both ``chat_processor_smolvla2.py`` and ``text_processor_pi052.py`` to scan all message streams — any ``stream: low_level`` on the sample is enough to trigger flow loss. Inference: the low-level loop sends only ``[user(subtask)]`` now, matching the new recipe shape. PI052 — hardened the forward fallthrough so a degenerate batch where every sample's recipe is text-only AND text supervision is disabled (text_loss_weight<=0 or text_labels missing) cleanly delegates to ``PI05Policy.forward`` instead of raising "nothing to train". 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../configs/recipes/pi052_hirobot.yaml | 4 +++- .../configs/recipes/smolvla2_hirobot.yaml | 13 +++++----- src/lerobot/policies/pi052/modeling_pi052.py | 22 +++++++++++++---- .../policies/pi052/text_processor_pi052.py | 7 +++++- .../smolvla2/chat_processor_smolvla2.py | 24 +++++++++++++------ .../policies/smolvla2/inference/steps.py | 9 ++++--- 6 files changed, 54 insertions(+), 25 deletions(-) diff --git a/src/lerobot/configs/recipes/pi052_hirobot.yaml b/src/lerobot/configs/recipes/pi052_hirobot.yaml index 15ae92b0f..0aa19c72f 100644 --- a/src/lerobot/configs/recipes/pi052_hirobot.yaml +++ b/src/lerobot/configs/recipes/pi052_hirobot.yaml @@ -23,8 +23,10 @@ blend: weight: 0.30 messages: # Action expert prefix = [images, subtask, state] only — π0.5 style. + # No text-CE target: ``high_level_subtask`` already supervises + # subtask prediction from real context. ``stream: low_level`` + # flips ``predict_actions=True`` so the flow loss fires. - {role: user, content: "${subtask}", stream: low_level, if_present: subtask} - - {role: assistant, content: "${subtask}", stream: low_level, target: true, if_present: subtask} plan_generation: weight: 0.10 diff --git a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml index 8ff6a1e93..ffbb6b92b 100644 --- a/src/lerobot/configs/recipes/smolvla2_hirobot.yaml +++ b/src/lerobot/configs/recipes/smolvla2_hirobot.yaml @@ -32,13 +32,14 @@ blend: low_level_execution: weight: 0.30 messages: - # Just the subtask in the prompt — π0.5 style. The action - # expert sees only [images, this subtask, state]. Marking the - # assistant target as ``stream: low_level`` triggers - # ``predict_actions=True`` so the flow loss fires; text CE on - # the subtask is a (small) redundant extra signal. + # π0.5-style action conditioning: the action expert sees just + # the subtask (plus images + state). 
No text-CE target here — + # ``high_level_subtask`` (w=0.50) already trains subtask + # prediction from real context; supervising it again as a + # copy-from-user turn would dilute the LM head. ``stream: + # low_level`` on the user turn is enough to flip + # ``predict_actions=True`` so the flow loss fires. - {role: user, content: "${subtask}", stream: low_level, if_present: subtask} - - {role: assistant, content: "${subtask}", stream: low_level, target: true, if_present: subtask} plan_generation: weight: 0.10 diff --git a/src/lerobot/policies/pi052/modeling_pi052.py b/src/lerobot/policies/pi052/modeling_pi052.py index 08802fa89..9553fd89a 100644 --- a/src/lerobot/policies/pi052/modeling_pi052.py +++ b/src/lerobot/policies/pi052/modeling_pi052.py @@ -366,14 +366,26 @@ class PI052Policy(PI05Policy): text_labels = batch.get("text_labels") predict_actions_t = batch.get("predict_actions") - # Unannotated datasets: no recipe applied → no text_labels and - # no FAST / predict_actions routing. Defer to PI05Policy so the - # plain flow-only training surface keeps working unchanged. + # Unannotated datasets / batches with nothing to train: fall + # through to PI05Policy so the plain flow-only training surface + # keeps working. Triggers when: + # * the recipe wasn't applied (no text_labels, no + # predict_actions), OR + # * every sample's recipe is text-only AND text is disabled + # (would otherwise hit the "nothing to train" raise below).
+ text_disabled = ( + self.config.text_loss_weight <= 0 or text_labels is None + ) + fast_disabled = not getattr(self.config, "enable_fast_action_loss", False) + no_flow_samples = ( + predict_actions_t is not None + and not bool(predict_actions_t.any().item()) + ) if ( text_labels is None and predict_actions_t is None - and not getattr(self.config, "enable_fast_action_loss", False) - ): + and fast_disabled + ) or (text_disabled and no_flow_samples and fast_disabled): return super().forward(batch, reduction=reduction) run_flow = ( diff --git a/src/lerobot/policies/pi052/text_processor_pi052.py b/src/lerobot/policies/pi052/text_processor_pi052.py index d17808997..649e67b90 100644 --- a/src/lerobot/policies/pi052/text_processor_pi052.py +++ b/src/lerobot/policies/pi052/text_processor_pi052.py @@ -211,8 +211,13 @@ class PI052TextTokenizerStep(ProcessorStep): continue labels[token_pos] = input_ids[token_pos] + # Scan ALL message streams (not just targets) — see + # ``chat_processor_smolvla2.py`` for rationale: the v2 + # ``low_level_execution`` recipe drops the ``target: true`` + # assistant turn to avoid trivial copy-from-user text-CE; the + # flow loss still needs to fire, gated by ``stream: low_level``. predict_actions = torch.tensor( - bool(any(message_streams[i] == "low_level" for i in target_indices if i < len(message_streams))), + bool(any(s == "low_level" for s in message_streams)), dtype=torch.bool, ) diff --git a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py index d72ae9c3a..454a1c2d8 100644 --- a/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py +++ b/src/lerobot/policies/smolvla2/chat_processor_smolvla2.py @@ -92,10 +92,16 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep): # installed still pass. self._tokenizer: Any = None if self.tools is None: - # Default: ship the canonical ``say`` schema.
Users who set - # ``meta.tools`` differently can override via + # Default: no tools rendered into the system prompt. The + # ``say()`` tool was only used by the now-removed + # ``user_interjection_response`` recipe; including its + # schema on every sample adds a long system message to + # the action expert's prefix and creates a train/inference + # mismatch (the inference low-level loop doesn't pass + # tools=, so the chat template doesn't render them). + # Users who actually need tools can set them via # ``with_tools(meta.tools)``. - self.tools = list(DEFAULT_TOOLS) + self.tools = [] # ------------------------------------------------------------------ # Public API @@ -258,10 +264,14 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep): for pos in range(start, end): labels[pos] = int(full_ids[pos]) - predict_actions = any( - i < len(message_streams) and message_streams[i] == "low_level" - for i in target_indices - ) + # ``predict_actions`` is True iff this sample's recipe declares + # at least one ``low_level`` message — regardless of whether + # it's a target. The ``low_level_execution`` recipe in v2 keeps + # only a ``stream: low_level`` *user* turn carrying the subtask; + # the assistant turn (and with it the text-CE target) was + # dropped to avoid trivial "copy previous turn" supervision. + # Scanning targets alone would miss this sample's action loss. + predict_actions = any(s == "low_level" for s in message_streams) return [int(i) for i in full_ids], labels, predict_actions def _apply_prompt_dropout( diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py index 7bc2d1e16..e638ca636 100644 --- a/src/lerobot/policies/smolvla2/inference/steps.py +++ b/src/lerobot/policies/smolvla2/inference/steps.py @@ -115,12 +115,11 @@ class LowLevelForward(InferenceStep): # subtask (+ images + state).
No task / plan / memory in the # low-level prompt — those are only used by the high-level # loop to *generate* the subtask. Matches the training-time - # ``low_level_execution`` recipe shape. + # ``low_level_execution`` recipe shape (single user turn, + # no assistant target since text-CE is owned by the + # high-level recipe). subtask = state.get("current_subtask") or state.get("task") or "" - ctx = [ - {"role": "user", "content": subtask}, - {"role": "assistant", "content": subtask}, - ] + ctx = [{"role": "user", "content": subtask}] text_batch = _build_text_batch(self.policy, ctx) from lerobot.utils.constants import ( # noqa: PLC0415 OBS_LANGUAGE_ATTENTION_MASK,