From 3f3a159cff34ebe03f8fa852f68e678c6c207f65 Mon Sep 17 00:00:00 2001
From: Jade Choghari <chogharijade@gmail.com>
Date: Tue, 24 Feb 2026 21:16:37 +0300
Subject: [PATCH] fix wall x for transformers v5 (#3008)

* tv5 fix

* various wall x fixes

* Delete tests/policies/pi0_pi05/print_pi05_output_logits.py

Signed-off-by: Jade Choghari <chogharijade@gmail.com>

* sync modeling_florence2.py with chore/bump_transformers_v5

* more

* more fixes

* more

* remove comment

* more

---------

Signed-off-by: Jade Choghari <chogharijade@gmail.com>
---
 .../policies/wall_x/modeling_wall_x.py        | 12 +++++++++-
 .../qwen_model/configuration_qwen2_5_vl.py    |  2 ++
 .../wall_x/qwen_model/qwen2_5_vl_moe.py       | 23 ++++++++++++++++++-
 src/lerobot/policies/wall_x/utils.py          |  4 ++--
 4 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/src/lerobot/policies/wall_x/modeling_wall_x.py b/src/lerobot/policies/wall_x/modeling_wall_x.py
index 36f896998..84ee05743 100644
--- a/src/lerobot/policies/wall_x/modeling_wall_x.py
+++ b/src/lerobot/policies/wall_x/modeling_wall_x.py
@@ -261,10 +261,15 @@ class Qwen2_5_VLMoEForAction(Qwen2_5_VLForConditionalGeneration):
     and optional LoRA fine-tuning support.
     """
 
-    _tied_weights_keys = ["lm_head.weight"]
+    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
     config_class = Qwen2_5_VLConfig
     _no_split_modules = ["Qwen2_5_VLDecoderLayer_with_MoE", "Qwen2_5_VLVisionBlock"]
 
+    def init_weights(self):
+        if getattr(self.model, "language_model", None) is not None:
+            return
+        super().init_weights()
+
     @classmethod
     def from_pretrained(
         cls,
@@ -312,6 +317,11 @@ class Qwen2_5_VLMoEForAction(Qwen2_5_VLForConditionalGeneration):
             processor.action_processor = action_tokenizer
         else:
             action_tokenizer = None
+
+        # copy the tokenizer's pad_token_id onto both the top-level config and its text_config
+        config.pad_token_id = processor.tokenizer.pad_token_id
+        config.text_config.pad_token_id = processor.tokenizer.pad_token_id
+
         # Initialize model with configuration and processor
         model = cls(config, processor=processor, action_tokenizer=action_tokenizer, **kwargs)
 
diff --git a/src/lerobot/policies/wall_x/qwen_model/configuration_qwen2_5_vl.py b/src/lerobot/policies/wall_x/qwen_model/configuration_qwen2_5_vl.py
index 731ef3b3e..19874b6ff 100644
--- a/src/lerobot/policies/wall_x/qwen_model/configuration_qwen2_5_vl.py
+++ b/src/lerobot/policies/wall_x/qwen_model/configuration_qwen2_5_vl.py
@@ -21,6 +21,7 @@ class Qwen2_5_VLVisionConfig(PretrainedConfig):
         window_size=112,
         out_hidden_size=3584,
         fullatt_block_indexes=[7, 15, 23, 31],
+        initializer_range=0.02,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -38,6 +39,7 @@ class Qwen2_5_VLVisionConfig(PretrainedConfig):
         self.window_size = window_size
         self.fullatt_block_indexes = fullatt_block_indexes
         self.out_hidden_size = out_hidden_size
+        self.initializer_range = initializer_range
 
 
 class Qwen2_5_VLConfig(PretrainedConfig):
diff --git a/src/lerobot/policies/wall_x/qwen_model/qwen2_5_vl_moe.py b/src/lerobot/policies/wall_x/qwen_model/qwen2_5_vl_moe.py
index a1309ea9a..6e5adc39f 100644
--- a/src/lerobot/policies/wall_x/qwen_model/qwen2_5_vl_moe.py
+++ b/src/lerobot/policies/wall_x/qwen_model/qwen2_5_vl_moe.py
@@ -602,19 +602,40 @@ class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
         return hidden_states
 
 
+def _compute_default_rope_parameters_qwen2_5_vl(config, device=None):
+    """
+    Compute default RoPE inverse frequencies for Qwen2_5_VL; returns (inv_freq, attention_scaling=1.0).
+    """
+    base = config.text_config.rope_parameters["rope_theta"]
+    dim = config.hidden_size // config.num_attention_heads
+    inv_freq = 1.0 / (
+        base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
+    )
+    return inv_freq, 1.0
+
+
 class Qwen2_5_VLRotaryEmbedding(nn.Module):
     def __init__(self, config: Qwen2_5_VLConfig, device=None):
         super().__init__()
         # BC: "rope_type" was originally "type"
         if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
             self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        elif hasattr(config, "rope_parameters") and config.rope_parameters is not None:
+            self.rope_type = config.rope_parameters.get("rope_type", "default")
         else:
             self.rope_type = "default"
         self.max_seq_len_cached = config.max_position_embeddings
         self.original_max_seq_len = config.max_position_embeddings
 
         self.config = config
-        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        if self.rope_type == "default":
+            self.rope_init_fn = _compute_default_rope_parameters_qwen2_5_vl
+            self.rope_kwargs = {}
+        else:
+            rope_type_key = "linear" if self.rope_type == "linear" else self.rope_type
+            self.rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type_key]
+            self.rope_kwargs = {}
 
         inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
diff --git a/src/lerobot/policies/wall_x/utils.py b/src/lerobot/policies/wall_x/utils.py
index 2ea40b377..e08ef69d5 100644
--- a/src/lerobot/policies/wall_x/utils.py
+++ b/src/lerobot/policies/wall_x/utils.py
@@ -144,7 +144,7 @@ def preprocesser_call(
     """
     # Process image inputs
     if images is not None and len(images) > 0:
-        image_inputs = processor.image_processor(images=images, videos=None, return_tensors=return_tensors)
+        image_inputs = processor.image_processor(images=images, return_tensors=return_tensors)
         image_grid_thw = image_inputs["image_grid_thw"]
     else:
         image_inputs = {}
@@ -152,7 +152,7 @@ def preprocesser_call(
 
     # Process video inputs
     if videos is not None:
-        videos_inputs = processor.image_processor(images=None, videos=videos, return_tensors=return_tensors)
+        videos_inputs = processor.image_processor(videos=videos, return_tensors=return_tensors)
         video_grid_thw = videos_inputs["video_grid_thw"]
     else:
         videos_inputs = {}