From 3f3a159cff34ebe03f8fa852f68e678c6c207f65 Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Tue, 24 Feb 2026 21:16:37 +0300 Subject: [PATCH] fix wall x for transformer v5 (#3008) * tv5 fix * various wall x fixes * Delete tests/policies/pi0_pi05/print_pi05_output_logits.py Signed-off-by: Jade Choghari * sync modeling_florence2.py with chore/bump_transformers_v5 * more * more fixes * more * remove comment * more --------- Signed-off-by: Jade Choghari --- .../policies/wall_x/modeling_wall_x.py | 12 +++++++++- .../qwen_model/configuration_qwen2_5_vl.py | 2 ++ .../wall_x/qwen_model/qwen2_5_vl_moe.py | 23 ++++++++++++++++++- src/lerobot/policies/wall_x/utils.py | 4 ++-- 4 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/lerobot/policies/wall_x/modeling_wall_x.py b/src/lerobot/policies/wall_x/modeling_wall_x.py index 36f896998..84ee05743 100644 --- a/src/lerobot/policies/wall_x/modeling_wall_x.py +++ b/src/lerobot/policies/wall_x/modeling_wall_x.py @@ -261,10 +261,15 @@ class Qwen2_5_VLMoEForAction(Qwen2_5_VLForConditionalGeneration): and optional LoRA fine-tuning support. """ - _tied_weights_keys = ["lm_head.weight"] + _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"} config_class = Qwen2_5_VLConfig _no_split_modules = ["Qwen2_5_VLDecoderLayer_with_MoE", "Qwen2_5_VLVisionBlock"] + def init_weights(self): + if getattr(self.model, "language_model", None) is not None: + return + super().init_weights() + @classmethod def from_pretrained( cls, @@ -312,6 +317,11 @@ class Qwen2_5_VLMoEForAction(Qwen2_5_VLForConditionalGeneration): processor.action_processor = action_tokenizer else: action_tokenizer = None + + # add pad_token_id to config + config.pad_token_id = processor.tokenizer.pad_token_id + config.text_config.pad_token_id = processor.tokenizer.pad_token_id + # Initialize model with configuration and processor model = cls(config, processor=processor, action_tokenizer=action_tokenizer, **kwargs) diff --git a/src/lerobot/policies/wall_x/qwen_model/configuration_qwen2_5_vl.py b/src/lerobot/policies/wall_x/qwen_model/configuration_qwen2_5_vl.py index 731ef3b3e..19874b6ff 100644 --- a/src/lerobot/policies/wall_x/qwen_model/configuration_qwen2_5_vl.py +++ b/src/lerobot/policies/wall_x/qwen_model/configuration_qwen2_5_vl.py @@ -21,6 +21,7 @@ class Qwen2_5_VLVisionConfig(PretrainedConfig): window_size=112, out_hidden_size=3584, fullatt_block_indexes=[7, 15, 23, 31], + initializer_range=0.02, **kwargs, ): super().__init__(**kwargs) @@ -38,6 +39,7 @@ class Qwen2_5_VLVisionConfig(PretrainedConfig): self.window_size = window_size self.fullatt_block_indexes = fullatt_block_indexes self.out_hidden_size = out_hidden_size + self.initializer_range = initializer_range class Qwen2_5_VLConfig(PretrainedConfig): diff --git a/src/lerobot/policies/wall_x/qwen_model/qwen2_5_vl_moe.py b/src/lerobot/policies/wall_x/qwen_model/qwen2_5_vl_moe.py index a1309ea9a..6e5adc39f 100644 --- a/src/lerobot/policies/wall_x/qwen_model/qwen2_5_vl_moe.py +++ b/src/lerobot/policies/wall_x/qwen_model/qwen2_5_vl_moe.py @@ -602,19 +602,40 @@ class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel): return hidden_states +def _compute_default_rope_parameters_qwen2_5_vl(config, device=None): + """ + compute default rope parameters for Qwen2_5_VL + """ + base = config.text_config.rope_parameters["rope_theta"] + dim = config.hidden_size // config.num_attention_heads + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, 1.0 + + class Qwen2_5_VLRotaryEmbedding(nn.Module): def __init__(self, config: Qwen2_5_VLConfig, device=None): super().__init__() # BC: "rope_type" was originally "type" if hasattr(config, "rope_scaling") and config.rope_scaling is not None: self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + elif hasattr(config, "rope_parameters") and config.rope_parameters is not None: + self.rope_type = config.rope_parameters.get("rope_type", "default") else: self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + if self.rope_type == "default": + self.rope_init_fn = _compute_default_rope_parameters_qwen2_5_vl + self.rope_kwargs = {} + else: + rope_type_key = "linear" if self.rope_type == "linear" else self.rope_type + self.rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type_key] + self.rope_kwargs = {} inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) self.register_buffer("inv_freq", inv_freq, persistent=False) diff --git a/src/lerobot/policies/wall_x/utils.py b/src/lerobot/policies/wall_x/utils.py index 2ea40b377..e08ef69d5 100644 --- a/src/lerobot/policies/wall_x/utils.py +++ b/src/lerobot/policies/wall_x/utils.py @@ -144,7 +144,7 @@ def preprocesser_call( """ # Process image inputs if images is not None and len(images) > 0: - image_inputs = processor.image_processor(images=images, videos=None, return_tensors=return_tensors) + image_inputs = processor.image_processor(images=images, return_tensors=return_tensors) image_grid_thw = image_inputs["image_grid_thw"] else: image_inputs = {} @@ -152,7 +152,7 @@ def preprocesser_call( # Process video inputs if videos is not None: - videos_inputs = processor.image_processor(images=None, videos=videos, return_tensors=return_tensors) + videos_inputs = processor.image_processor(videos=videos, return_tensors=return_tensors) video_grid_thw = videos_inputs["video_grid_thw"] else: videos_inputs = {}