mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-31 19:01:28 +00:00
fix(recipes,training): stop tool prompt leak + drop subtask copy-supervision
CRITICAL (smolvla2) — the SmolVLM2 chat template was rendering the ``say`` tool's JSON schema as a system message on every training sample because ``DEFAULT_TOOLS`` was the default in ``SmolVLA2ChatTokenizerStep``. That schema was only relevant to the now-removed ``user_interjection_response`` recipe; with it gone the schema is dead weight that polluted every action-expert prefix AND created a train/inference mismatch (the inference ``_build_text_batch`` doesn't pass ``tools=``). Default is now ``[]``; callers needing tools can still set them via ``with_tools(meta.tools)``. LIKELY-BUG — ``low_level_execution`` had ``target: true`` on its assistant turn, so text-CE trained the LM head to predict the same subtask string the user just stated (trivial "copy previous turn" supervision that diluted LM head capacity). Dropped the assistant turn entirely; ``high_level_subtask`` (w=0.50) already owns subtask prediction from real context. The chat-tokenizer's ``predict_actions`` detection used to scan target streams only. With the new no-target low_level recipe it would mis-fire as False. Switched both ``chat_processor_smolvla2.py`` and ``text_processor_pi052.py`` to scan all message streams — any ``stream: low_level`` on the sample is enough to trigger flow loss. Inference: the low-level loop sends only ``[user(subtask)]`` now, matching the new recipe shape. PI052 — hardened the forward fallthrough so a degenerate batch where every sample's recipe is text-only AND text supervision is disabled (text_loss_weight<=0 or text_labels missing) cleanly delegates to ``PI05Policy.forward`` instead of raising "nothing to train". Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -23,8 +23,10 @@ blend:
|
||||
weight: 0.30
|
||||
messages:
|
||||
# Action expert prefix = [images, subtask, state] only — π0.5 style.
|
||||
# No text-CE target: ``high_level_subtask`` already supervises
|
||||
# subtask prediction from real context. ``stream: low_level``
|
||||
# flips ``predict_actions=True`` so the flow loss fires.
|
||||
- {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
|
||||
- {role: assistant, content: "${subtask}", stream: low_level, target: true, if_present: subtask}
|
||||
|
||||
plan_generation:
|
||||
weight: 0.10
|
||||
|
||||
@@ -32,13 +32,14 @@ blend:
|
||||
low_level_execution:
|
||||
weight: 0.30
|
||||
messages:
|
||||
# Just the subtask in the prompt — π0.5 style. The action
|
||||
# expert sees only [images, this subtask, state]. Marking the
|
||||
# assistant target as ``stream: low_level`` triggers
|
||||
# ``predict_actions=True`` so the flow loss fires; text CE on
|
||||
# the subtask is a (small) redundant extra signal.
|
||||
# π0.5-style action conditioning: the action expert sees just
|
||||
# the subtask (plus images + state). No text-CE target here —
|
||||
# ``high_level_subtask`` (w=0.50) already trains subtask
|
||||
# prediction from real context; supervising it again as a
|
||||
# copy-from-user turn would dilute the LM head. ``stream:
|
||||
# low_level`` on the user turn is enough to flip
|
||||
# ``predict_actions=True`` so the flow loss fires.
|
||||
- {role: user, content: "${subtask}", stream: low_level, if_present: subtask}
|
||||
- {role: assistant, content: "${subtask}", stream: low_level, target: true, if_present: subtask}
|
||||
|
||||
plan_generation:
|
||||
weight: 0.10
|
||||
|
||||
@@ -366,14 +366,26 @@ class PI052Policy(PI05Policy):
|
||||
text_labels = batch.get("text_labels")
|
||||
predict_actions_t = batch.get("predict_actions")
|
||||
|
||||
# Unannotated datasets: no recipe applied → no text_labels and
|
||||
# no FAST / predict_actions routing. Defer to PI05Policy so the
|
||||
# plain flow-only training surface keeps working unchanged.
|
||||
# Unannotated datasets / batches with nothing to train: fall
|
||||
# through to PI05Policy so the plain flow-only training surface
|
||||
# keeps working. Triggers when:
|
||||
# * the recipe wasn't applied (no text_labels, no
|
||||
# predict_actions), OR
|
||||
# * every sample's recipe is text-only AND text is disabled
|
||||
# (would otherwise hit the "nothing to train" raise below).
|
||||
text_disabled = (
|
||||
self.config.text_loss_weight <= 0 or text_labels is None
|
||||
)
|
||||
fast_disabled = not getattr(self.config, "enable_fast_action_loss", False)
|
||||
no_flow_samples = (
|
||||
predict_actions_t is not None
|
||||
and not bool(predict_actions_t.any().item())
|
||||
)
|
||||
if (
|
||||
text_labels is None
|
||||
and predict_actions_t is None
|
||||
and not getattr(self.config, "enable_fast_action_loss", False)
|
||||
):
|
||||
and fast_disabled
|
||||
) or (text_disabled and no_flow_samples and fast_disabled):
|
||||
return super().forward(batch, reduction=reduction)
|
||||
|
||||
run_flow = (
|
||||
|
||||
@@ -211,8 +211,13 @@ class PI052TextTokenizerStep(ProcessorStep):
|
||||
continue
|
||||
labels[token_pos] = input_ids[token_pos]
|
||||
|
||||
# Scan ALL message streams (not just targets) — see
|
||||
# ``chat_processor_smolvla2.py`` for rationale: the v2
|
||||
# ``low_level_execution`` recipe drops the assistant turn (and
|
||||
# its ``target: true``) to avoid trivial copy-from-user text-CE; the
|
||||
# flow loss still needs to fire, gated by ``stream: low_level``.
|
||||
predict_actions = torch.tensor(
|
||||
bool(any(message_streams[i] == "low_level" for i in target_indices if i < len(message_streams))),
|
||||
bool(any(s == "low_level" for s in message_streams)),
|
||||
dtype=torch.bool,
|
||||
)
|
||||
|
||||
|
||||
@@ -92,10 +92,16 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep):
|
||||
# installed still pass.
|
||||
self._tokenizer: Any = None
|
||||
if self.tools is None:
|
||||
# Default: ship the canonical ``say`` schema. Users who set
|
||||
# ``meta.tools`` differently can override via
|
||||
# Default: no tools rendered into the system prompt. The
|
||||
# ``say()`` tool was only used by the now-removed
|
||||
# ``user_interjection_response`` recipe; including its
|
||||
# schema on every sample adds a long system message to
|
||||
# the action expert's prefix and creates a train/inference
|
||||
# mismatch (the inference low-level loop doesn't pass
|
||||
# tools=, so the chat template doesn't render them).
|
||||
# Users who actually need tools can set them via
|
||||
# ``with_tools(meta.tools)``.
|
||||
self.tools = list(DEFAULT_TOOLS)
|
||||
self.tools = []
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
@@ -258,10 +264,14 @@ class SmolVLA2ChatTokenizerStep(ProcessorStep):
|
||||
for pos in range(start, end):
|
||||
labels[pos] = int(full_ids[pos])
|
||||
|
||||
predict_actions = any(
|
||||
i < len(message_streams) and message_streams[i] == "low_level"
|
||||
for i in target_indices
|
||||
)
|
||||
# ``predict_actions`` is True iff this sample's recipe declares
|
||||
# at least one ``low_level`` message — regardless of whether
|
||||
# it's a target. The ``low_level_execution`` recipe in v2 uses
|
||||
# ``stream: low_level`` on its single user turn and has no
|
||||
# assistant turn at all (so no text-CE target), which avoids
|
||||
# trivial "copy previous turn" supervision.
|
||||
# Scanning targets alone would miss this sample's action loss.
|
||||
predict_actions = any(s == "low_level" for s in message_streams)
|
||||
return [int(i) for i in full_ids], labels, predict_actions
|
||||
|
||||
def _apply_prompt_dropout(
|
||||
|
||||
@@ -115,12 +115,11 @@ class LowLevelForward(InferenceStep):
|
||||
# subtask (+ images + state). No task / plan / memory in the
|
||||
# low-level prompt — those are only used by the high-level
|
||||
# loop to *generate* the subtask. Matches the training-time
|
||||
# ``low_level_execution`` recipe shape.
|
||||
# ``low_level_execution`` recipe shape (single user turn,
|
||||
# no assistant target since text-CE is owned by the
|
||||
# high-level recipe).
|
||||
subtask = state.get("current_subtask") or state.get("task") or ""
|
||||
ctx = [
|
||||
{"role": "user", "content": subtask},
|
||||
{"role": "assistant", "content": subtask},
|
||||
]
|
||||
ctx = [{"role": "user", "content": subtask}]
|
||||
text_batch = _build_text_batch(self.policy, ctx)
|
||||
from lerobot.utils.constants import ( # noqa: PLC0415
|
||||
OBS_LANGUAGE_ATTENTION_MASK,
|
||||
|
||||
Reference in New Issue
Block a user