From 0bda18726800eb9b5d05860b3106a59b7ecd71ce Mon Sep 17 00:00:00 2001
From: Jade Choghari <chogharijade@gmail.com>
Date: Thu, 26 Feb 2026 09:55:25 +0000
Subject: [PATCH] xvla: remove logger and warning calls from Florence2

---
 .../policies/xvla/configuration_florence2.py  |  3 ---
 .../policies/xvla/modeling_florence2.py       | 19 -------------------
 2 files changed, 22 deletions(-)

diff --git a/src/lerobot/policies/xvla/configuration_florence2.py b/src/lerobot/policies/xvla/configuration_florence2.py
index 77f1b3a1d..4e3240487 100644
--- a/src/lerobot/policies/xvla/configuration_florence2.py
+++ b/src/lerobot/policies/xvla/configuration_florence2.py
@@ -13,12 +13,9 @@
 import warnings
 
 from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
 
 """ Florence-2 configuration"""
 
-logger = logging.get_logger(__name__)
-
 
 class Florence2VisionConfig(PretrainedConfig):
     r"""
diff --git a/src/lerobot/policies/xvla/modeling_florence2.py b/src/lerobot/policies/xvla/modeling_florence2.py
index e33efe5c3..df8e45a14 100644
--- a/src/lerobot/policies/xvla/modeling_florence2.py
+++ b/src/lerobot/policies/xvla/modeling_florence2.py
@@ -46,7 +46,6 @@ from transformers.utils import (
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
-    logging,
     replace_return_docstrings,
 )
 
@@ -57,8 +56,6 @@ if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 
-logger = logging.get_logger(__name__)
-
 _CONFIG_FOR_DOC = "Florence2Config"
 
 
@@ -992,12 +989,6 @@ class Florence2FlashAttention2(Florence2Attention):
             else:
                 target_dtype = self.q_proj.weight.dtype
 
-            logger.warning_once(
-                f"The input hidden states seems to be silently casted in float32, this might be related to"
-                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
-                f" {target_dtype}."
-            )
-
             query_states = query_states.to(target_dtype)
             key_states = key_states.to(target_dtype)
             value_states = value_states.to(target_dtype)
@@ -1135,11 +1126,6 @@ class Florence2SdpaAttention(Florence2Attention):
     ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
         """Input shape: Batch x Time x Channel"""
         if output_attentions or layer_head_mask is not None:
-            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
-            logger.warning_once(
-                "Florence2Model is using Florence2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
-                ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-            )
             return super().forward(
                 hidden_states,
                 key_value_states=key_value_states,
@@ -1860,9 +1846,6 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
         if self.gradient_checkpointing and self.training and use_cache:
-            logger.warning_once(
-                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-            )
             use_cache = False
 
         # decoder layers
@@ -2160,8 +2143,6 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         if labels is not None:
-            if use_cache:
-                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
             use_cache = False
             if decoder_input_ids is None and decoder_inputs_embeds is None:
                 decoder_input_ids = shift_tokens_right(