From c8763e0ad542feeec64acfb2143dade00e9f2608 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 13 May 2026 11:58:40 +0200
Subject: [PATCH] fix(pi052): four real bugs in the modeling code + flip
 defaults
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Defaults
--------
* enable_fast_action_loss: False -> True   (match paper §III.B-C Eq.1)
* auto_fit_fast_tokenizer: True -> False   (opt-in; needs base.fit())

Bug fixes
---------

1. Wrong attribute path on PaliGemma. The KI port copied
   pi05_full's ``paligemma.language_model.layers[...]`` literally,
   but the production pi05 wrapper exposes the text model at
   ``paligemma.model.language_model``. With KI enabled, every layer
   would have raised AttributeError on first forward. Fixed all
   references in _compute_layer_ki + _paligemma_forward_ki.

2. ``fill_kv_cache=True`` passed to PaliGemmaWithExpertModel.forward.
   That kwarg is a SmolVLA-only concept; pi05's signature has no
   such argument, so every forward call from pi052 (text loss, FAST
   loss, select_message) would have crashed with TypeError. Dropped
   from all four call sites — pi05's forward already handles the
   cache via past_key_values, and re-forwarding the cumulative
   sequence each step in select_message is fine for our short
   subtask completions.

3. Text-loss shape mismatch. _compute_text_loss applied lm_head to
   the *full* vlm_out (image tokens + language tokens), then tried
   to cross-entropy that against text_labels which only covers the
   language portion — the .view(-1) calls would produce two
   tensors of different lengths and CE would fail. Now slices
   vlm_out to the last text_labels.shape[1] positions before
   running lm_head, matching the [images, language] order
   embed_prefix produces.

4. Dead-code conditional in _paligemma_forward_ki's single-expert
   fallback. The ``if hasattr(...) else self._pi052_orig_forward``
   ternary always took the wrong branch because the attribute is
   always set (we save it in PI052Policy.__init__). Simplified to
   just call self._pi052_orig_forward directly.

After this commit, pi052 should be runnable end-to-end for the
first time with all three loss heads + KI active. Still worth a
100-step smoke test before kicking off a long run.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../policies/pi052/configuration_pi052.py     | 28 ++++----
 src/lerobot/policies/pi052/modeling_pi052.py  | 66 ++++++++-----------
 2 files changed, 46 insertions(+), 48 deletions(-)

diff --git a/src/lerobot/policies/pi052/configuration_pi052.py b/src/lerobot/policies/pi052/configuration_pi052.py
index 67dfd446a..df9325c01 100644
--- a/src/lerobot/policies/pi052/configuration_pi052.py
+++ b/src/lerobot/policies/pi052/configuration_pi052.py
@@ -107,12 +107,12 @@ class PI052Config(PI05Config):
     # ActionTokenizerProcessorStep is wired into the preprocessor
     # pipeline when this flag is set; the loss is computed in
     # PI052Policy.forward.
-    enable_fast_action_loss: bool = False
+    enable_fast_action_loss: bool = True
     """If True, tokenise actions with the FAST tokenizer and add a
-    cross-entropy loss on the LM head. Off by default because most
-    fine-tuning runs only need the flow head + text supervision; the
-    FAST CE term is most useful when training from a base PaliGemma
-    rather than an existing π0.5 checkpoint."""
+    cross-entropy loss on the LM head. On by default to match the
+    π0.5 paper's three-loss objective (text CE + FAST CE + flow MSE,
+    §III.B-C Eq. 1). Set to False if you only want the
+    post-training-style flow + text recipe."""
 
     action_tokenizer_name: str = "physical-intelligence/fast"
     """HF identifier for the FAST action tokenizer."""
@@ -127,15 +127,21 @@ class PI052Config(PI05Config):
     fast_action_loss_weight: float = 1.0
     """Weight on the FAST-action-token CE loss. Paper §III.C uses 1.0."""
 
-    auto_fit_fast_tokenizer: bool = True
-    """If True (default), the processor factory checks
-    ``fast_tokenizer_cache_dir`` for a previously-fitted tokenizer keyed
-    on ``(dataset_repo_id, base_tokenizer_name, fit_samples)``. On cache
-    miss, it loads ``action_tokenizer_name`` as a base, samples
+    auto_fit_fast_tokenizer: bool = False
+    """If True, the processor factory checks ``fast_tokenizer_cache_dir``
+    for a previously-fitted tokenizer keyed on ``(dataset_repo_id,
+    base_tokenizer_name, fit_samples)``. On cache miss, it loads
+    ``action_tokenizer_name`` as a base, samples
     ``fast_tokenizer_fit_samples`` action chunks from the dataset, runs
     ``.fit()``, saves the result, and uses *that* fitted path as the
     actual tokenizer. Pertsch et al. 2025 (FAST paper [64], π0.5 §III.C)
-    explicitly recommend per-dataset fitting for best compression."""
+    explicitly recommend per-dataset fitting for best compression.
+
+    Off by default because the fit requires a separate pre-training
+    pass over the dataset (~1-2 min on a medium dataset) and depends
+    on the FAST tokenizer snapshot having a ``.fit()`` method. Opt in
+    when you want paper-faithful compression; leave off to fall back
+    on the universal ``physical-intelligence/fast`` codebook."""
 
     fast_tokenizer_cache_dir: str = "~/.cache/lerobot/fast_tokenizers"
     """Where fitted FAST tokenizers are stored. ``~`` expands."""
diff --git a/src/lerobot/policies/pi052/modeling_pi052.py b/src/lerobot/policies/pi052/modeling_pi052.py
index b5e4e6054..52349c614 100644
--- a/src/lerobot/policies/pi052/modeling_pi052.py
+++ b/src/lerobot/policies/pi052/modeling_pi052.py
@@ -76,7 +76,7 @@ def _compute_layer_ki(
 ):
     from transformers.models.gemma import modeling_gemma  # noqa: PLC0415
 
-    models = [paligemma.language_model, gemma_expert.model]
+    models = [paligemma.model.language_model, gemma_expert.model]
     query_states, key_states, value_states, gates = [], [], [], []
 
     vlm_len = inputs_embeds[0].shape[1]
@@ -111,7 +111,7 @@ def _compute_layer_ki(
     )
 
     batch_size = query_states.shape[0]
-    scaling = paligemma.language_model.layers[layer_idx].self_attn.scaling
+    scaling = paligemma.model.language_model.layers[layer_idx].self_attn.scaling
 
     # Split queries / K / V at the VLM-vs-action boundary.
     Q_vlm = query_states[:, :, :vlm_len, :]
@@ -133,16 +133,16 @@ def _compute_layer_ki(
     mask_for_action = attention_mask[:, :, vlm_len:, :]
 
     att_vlm, _ = modeling_gemma.eager_attention_forward(
-        paligemma.language_model.layers[layer_idx].self_attn,
+        paligemma.model.language_model.layers[layer_idx].self_attn,
         Q_vlm, K_for_vlm, V_for_vlm, mask_for_vlm, scaling,
     )
     att_action, _ = modeling_gemma.eager_attention_forward(
-        paligemma.language_model.layers[layer_idx].self_attn,
+        paligemma.model.language_model.layers[layer_idx].self_attn,
         Q_action, K_for_action, V_for_action, mask_for_action, scaling,
     )
     att = torch.cat([att_vlm, att_action], dim=1)
 
-    head_dim = paligemma.language_model.layers[layer_idx].self_attn.head_dim
+    head_dim = paligemma.model.language_model.layers[layer_idx].self_attn.head_dim
     att = att.reshape(batch_size, -1, 1 * 8 * head_dim)
 
     outputs_embeds = []
@@ -173,33 +173,24 @@ def _paligemma_forward_ki(
     inputs_embeds=None,
     use_cache=None,
     adarms_cond=None,
-    fill_kv_cache=None,
 ):
     """Replacement ``PaliGemmaWithExpertModel.forward`` that routes the
     dual-expert layer pass through :func:`_compute_layer_ki`.
 
     Bound onto the model instance when ``config.knowledge_insulation``
     is True (see ``PI052Policy.__init__``). Single-expert branches
-    (VLM-only or action-only) reuse the parent's implementation
-    because there's no KI signal to add — KI only matters when
-    actions and VLM tokens are forwarded together.
+    (VLM-only or action-only) defer back to the original forward —
+    KI only matters when actions and VLM tokens are forwarded together.
     """
     from ..pi05.modeling_pi05 import layernorm_forward  # noqa: PLC0415
 
     if adarms_cond is None:
         adarms_cond = [None, None]
 
-    # Single-expert paths: defer to the bound class method via super().
+    # Single-expert paths: defer to the original forward saved in
+    # PI052Policy.__init__.
     if inputs_embeds[0] is None or inputs_embeds[1] is None:
-        return type(self).__bases__[0].forward(
-            self,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            adarms_cond=adarms_cond,
-        ) if hasattr(self, "_pi052_orig_forward") else self._pi052_orig_forward(
+        return self._pi052_orig_forward(
             attention_mask=attention_mask,
             position_ids=position_ids,
             past_key_values=past_key_values,
@@ -429,7 +420,6 @@ class PI052Policy(PI05Policy):
             past_key_values=None,
             inputs_embeds=[full_embs, None],
             use_cache=False,
-            fill_kv_cache=True,
         )
         if vlm_out is None:
             raise RuntimeError("PI052 FAST loss: VLM forward returned no hidden states.")
@@ -456,12 +446,15 @@ class PI052Policy(PI05Policy):
     def _compute_text_loss(self, batch: dict[str, Tensor], text_labels: Tensor) -> Tensor:
         """Cross-entropy on PaliGemma's LM head over the supervised span.
 
-        Re-uses the same prefix-embedding path the flow head does:
-        embed images + state + language tokens, run a forward pass,
-        slice out the per-token logits at the supervised positions,
-        compute CE.
+        Embeds images + language, runs the VLM-only forward (the
+        action expert is skipped via ``inputs_embeds=[..., None]``),
+        slices the hidden states to the *language* portion so they
+        align with ``text_labels`` (which covers only the language
+        tokens, not the image patch tokens), then computes shifted
+        next-token CE with ``-100`` ignoring padding/non-target
+        positions.
         """
-        from torch.nn import functional as F  # noqa: PLC0415
+        from ..pi05.modeling_pi05 import make_att_2d_masks  # noqa: PLC0415
 
         images, img_masks = self.model._preprocess_images(batch)
         tokens = batch[OBS_LANGUAGE_TOKENS]
@@ -470,12 +463,6 @@ class PI052Policy(PI05Policy):
         prefix_embs, prefix_pad_masks, prefix_att_masks = self.model.embed_prefix(
             images, img_masks, tokens, masks
         )
-        # PaliGemma's text path: forward the prefix through the
-        # backbone *without* the action expert. We piggy-back on the
-        # existing PaliGemmaWithExpertModel.forward — it accepts a
-        # list of expert inputs and returns parallel outputs.
-        from ..pi05.modeling_pi05 import make_att_2d_masks  # noqa: PLC0415
-
         att_2d = make_att_2d_masks(prefix_pad_masks, prefix_att_masks)
         position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1
 
@@ -485,18 +472,24 @@ class PI052Policy(PI05Policy):
             past_key_values=None,
             inputs_embeds=[prefix_embs, None],
             use_cache=False,
-            fill_kv_cache=True,
         )
         if vlm_out is None:
             raise RuntimeError("PI052 text loss: VLM forward returned no hidden states.")
 
-        # Logits over the vocab via the PaliGemma lm_head.
+        # Slice the hidden states to the language portion. embed_prefix
+        # concatenates [images, language] in that order, so the trailing
+        # ``text_labels.shape[1]`` positions are the language tokens.
+        # Without this slice, applying lm_head to the full vlm_out and
+        # shifting against text_labels[..., 1:] produces a shape
+        # mismatch in cross_entropy.
+        lang_len = text_labels.shape[1]
+        text_hidden = vlm_out[:, -lang_len:, :]
+
         lm_head = self.model.paligemma_with_expert.paligemma.lm_head
-        logits = lm_head(vlm_out.to(lm_head.weight.dtype))
+        logits = lm_head(text_hidden.to(lm_head.weight.dtype))
 
         # Shift for next-token prediction: predict token[i+1] from
-        # hidden[i]. Both ``logits`` and ``text_labels`` are over the
-        # same sequence length, so shift logits[:-1] vs labels[1:].
+        # hidden[i] within the language span.
         shift_logits = logits[..., :-1, :].contiguous()
         shift_labels = text_labels[..., 1:].contiguous()
         loss = F.cross_entropy(
@@ -579,7 +572,6 @@ class PI052Policy(PI05Policy):
                 past_key_values=None,
                 inputs_embeds=[current_embs, None],
                 use_cache=False,
-                fill_kv_cache=True,
             )
             if vlm_out is None:
                 break