def _log_obs_tensors_once(label: str, obs: Any, flag: dict) -> None:
    """Log every ``observation.*`` entry of ``obs`` once per ``flag`` lifetime.

    For each tensor the shape, dtype, device and whole-tensor
    min / max / mean / std are logged (the reductions run over all
    elements, not per channel). Non-tensor values are logged with their
    type name and ``repr``. Keys that are not strings starting with
    ``"observation."`` are skipped.

    Intended for bisecting train/inference mismatches: run the same
    checkpoint through two observation paths and compare the printed
    lines. Differing shapes, value ranges, dtypes or devices point at the
    observation prep rather than at the model.

    Args:
        label: Short tag identifying the calling path; it is embedded in
            every log line as ``obs[<label>]``.
        obs: Candidate observation mapping. Anything that is not a
            ``dict`` is ignored and does NOT consume the one-shot latch.
        flag: Mutable latch shared across calls. ``flag["done"]`` is set
            to ``True`` on the first call that receives a ``dict`` (even
            an empty one); later calls return immediately.
    """
    if flag.get("done") or not isinstance(obs, dict):
        return
    flag["done"] = True
    import torch as _torch  # noqa: PLC0415

    for k, v in obs.items():
        if not isinstance(k, str) or not k.startswith("observation."):
            continue
        if not isinstance(v, _torch.Tensor):
            logger.warning("obs[%s] %-30s type=%s value=%r", label, k, type(v).__name__, v)
            continue
        try:
            # Cast once so every reduction works on integer/bool tensors
            # too: ``mean()`` and ``std()`` reject non-floating dtypes, and
            # raw uint8 images are exactly where range mismatches show up.
            as_float = v.detach().float()
            stats = (
                f"min={float(as_float.min()):.4f} max={float(as_float.max()):.4f} "
                f"mean={float(as_float.mean()):.4f} std={float(as_float.std()):.4f}"
            )
        except Exception:  # noqa: BLE001
            # Best-effort diagnostics (e.g. empty tensors have no min/max);
            # never let a logging helper break the inference loop.
            stats = "(stats unavailable)"
        logger.warning(
            "obs[%s] %-30s shape=%s dtype=%s device=%s %s",
            label,
            k,
            tuple(v.shape),
            v.dtype,
            v.device,
            stats,
        )
_resize_logged = {"done": False} + _obs_logged = {"done": False} target_image_shapes: dict[str, tuple[int, int]] = {} if ds_features: for fkey, fmeta in ds_features.items(): @@ -770,6 +815,8 @@ def _build_robot_observation_provider( return None obs_tensors = processed if isinstance(processed, dict) else {} + _log_obs_tensors_once("robot", obs_tensors, _obs_logged) + observation = { k: v for k, v in obs_tensors.items()