diff --git a/src/lerobot/policies/smolvla2/inference/steps.py b/src/lerobot/policies/smolvla2/inference/steps.py index 43371f6f4..8ffe66cd0 100644 --- a/src/lerobot/policies/smolvla2/inference/steps.py +++ b/src/lerobot/policies/smolvla2/inference/steps.py @@ -368,6 +368,12 @@ class HighLevelSubtaskFwd(InferenceStep): msg = _generate_with_policy( self.policy, ctx, observation=observation, state=state, label="subtask gen" ) + # Diagnostics: surface what the model is *actually* producing + # at chunk boundaries, even when the output gets rejected or + # repeats. Memorisation collapse looks like "same accepted + # subtask N times in a row" or "gibberish_count rising while + # current_subtask is stuck". The state panel renders these. + state["last_subtask_raw"] = msg or "" if msg and _looks_like_gibberish(msg): # Bump a counter so the operator can see the model is # struggling without spamming the log every tick. A first @@ -385,6 +391,17 @@ class HighLevelSubtaskFwd(InferenceStep): if changed: # Subtask change is a downstream trigger. state.setdefault("events_this_tick", []).append("subtask_change") + state["subtask_repeat_count"] = 0 + else: + # Same accepted string regenerated — memorisation tell. + # Once this counter climbs past a few, you're seeing + # the model unable to move past the current subtask + # despite the chunk having drained (visual scene may + # have changed but the LM is replaying training + # tokens). + state["subtask_repeat_count"] = ( + state.get("subtask_repeat_count", 0) + 1 + ) # Silently skip empty completions — common when the model # warms up or generates only EOS; logging it every tick at # ctrl_hz is just noise. 
@@ -417,8 +434,14 @@ class MemoryUpdateFwd(InferenceStep): new_memory = _generate_with_policy( self.policy, ctx, observation=observation, state=state, label="memory gen" ) + state["last_memory_raw"] = new_memory or "" if new_memory and _looks_like_gibberish(new_memory): - push_log(state, f" [info] memory gen rejected (gibberish): {new_memory[:60]!r}") + count = state.get("memory_gibberish_count", 0) + 1 + state["memory_gibberish_count"] = count + push_log( + state, + f" [info] memory gen rejected (gibberish ×{count}): {new_memory[:60]!r}", + ) return None if new_memory: set_if_changed(state, "current_memory", new_memory, label="memory") @@ -456,7 +479,12 @@ class UserInterjectionFwd(InferenceStep): # re-trigger by typing again. return None if _looks_like_gibberish(out): - push_log(state, f" [info] plan/say gen rejected (gibberish): {out[:60]!r}") + count = state.get("plan_gibberish_count", 0) + 1 + state["plan_gibberish_count"] = count + push_log( + state, + f" [info] plan/say gen rejected (gibberish ×{count}): {out[:60]!r}", + ) return None # Heuristic split: model is trained to emit one assistant turn # carrying both plan text AND a `say` tool call. 
Look for a diff --git a/src/lerobot/robots/utils.py b/src/lerobot/robots/utils.py index 92da597f1..248dc1b1a 100644 --- a/src/lerobot/robots/utils.py +++ b/src/lerobot/robots/utils.py @@ -21,6 +21,8 @@ from lerobot.utils.import_utils import make_device_from_device_class from .config import RobotConfig from .robot import Robot +logger = logging.getLogger(__name__) + def make_robot_from_config(config: RobotConfig) -> Robot: # TODO(Steven): Consider just using the make_device_from_device_class for all types @@ -110,7 +112,7 @@ def ensure_safe_goal_position( } if warnings_dict: - logging.warning( + logger.warning( "Relative goal position magnitude had to be clamped to be safe.\n" f"{pformat(warnings_dict, indent=4)}" ) diff --git a/src/lerobot/scripts/lerobot_smolvla2_runtime.py b/src/lerobot/scripts/lerobot_smolvla2_runtime.py index 09548f2d1..8055a5aed 100644 --- a/src/lerobot/scripts/lerobot_smolvla2_runtime.py +++ b/src/lerobot/scripts/lerobot_smolvla2_runtime.py @@ -883,6 +883,37 @@ def _make_state_panel_renderer( f"dispatched: {dispatched} " f"pending tool calls: {pending}[/]" ) + + # Overfit / memorisation diagnostics. The high-level steps + # surface the raw generation each time they fire (even when + # rejected as gibberish or unchanged), plus repeat/rejection + # counters. 
Rule of thumb: + # + # * subtask repeat ≥ ~5 and queue_len cycles fully → model + # can't move past current subtask (memorised one phase + # of the task — classic overfit signature) + # * subtask gibberish climbing → LM head collapsed to + # chat-template fragments / one-token salads + # * last raw differs from accepted → at least the LM is + # varying, the gibberish filter is doing its job + raw_subtask = st.get("last_subtask_raw") + sub_rep = int(st.get("subtask_repeat_count") or 0) + sub_gib = int(st.get("subtask_gibberish_count") or 0) + if raw_subtask is not None or sub_rep or sub_gib: + raw_display = (raw_subtask or "(empty)")[:80] + color = "yellow" if (sub_rep >= 3 or sub_gib >= 3) else "dim" + console.print( + f" [{color}]subtask diag repeat:{sub_rep} " + f"gibberish:{sub_gib} last_raw: {raw_display!r}[/]" + ) + + # Same diagnostics for memory and plan when available. + mem_gib = int(st.get("memory_gibberish_count") or 0) + plan_gib = int(st.get("plan_gibberish_count") or 0) + if mem_gib or plan_gib: + console.print( + f" [dim]gen rejects memory:{mem_gib} plan:{plan_gib}[/]" + ) console.rule(style="cyan") if robot_lines: for line in robot_lines: @@ -939,6 +970,16 @@ def _silence_noisy_loggers() -> None: ): logging.getLogger(name).setLevel(logging.WARNING) + # The robot's relative-goal-position clamp warning fires *every* + # dispatch tick on a memorised model — the LM is trying to jump + # the wrist far past where max_relative_target allows, so the + # warning floods the panel at ~30 Hz. Silence it by raising this + # logger's threshold to ERROR: the dispatch counter on the panel + # already tells the operator the loop is running, and the panel + # itself shows whether motion is happening. NOTE(review): confirm + # ``-v`` resets this level if the per-action clamp detail is needed. + logging.getLogger("lerobot.robots.utils").setLevel(logging.ERROR) + def main(argv: list[str] | None = None) -> int: args = _parse_args(argv)