load molmoact2 without remote code

This commit is contained in:
hq-fang
2026-05-20 22:56:31 +00:00
parent e1afb96474
commit 7fe49f9e54
12 changed files with 8778 additions and 527 deletions

View File

@@ -69,6 +69,7 @@ def _resolve_checkpoint_location(
repo_type="model",
revision=revision,
force_download=force_download,
ignore_patterns=["*.py", "*.pyc", "__pycache__/*"],
token=_hf_token(),
)
@@ -185,7 +186,6 @@ class MolmoAct2Config(PreTrainedConfig):
checkpoint_path: str = "allenai/MolmoAct2"
checkpoint_revision: str | None = None
checkpoint_force_download: bool = False
trust_remote_code: bool = True
n_obs_steps: int = 1
chunk_size: int = 30

View File

@@ -0,0 +1,17 @@
#!/usr/bin/env python
# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ruff: noqa

View File

@@ -0,0 +1,234 @@
#!/usr/bin/env python
# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ruff: noqa
import logging
import os
from pathlib import Path
from typing import ClassVar
import numpy as np
from scipy.fft import dct
from scipy.fft import idct
from tokenizers import ByteLevelBPETokenizer
from tokenizers.trainers import BpeTrainer
from huggingface_hub import snapshot_download
from transformers import PreTrainedTokenizerFast
from transformers.processing_utils import ProcessorMixin
def _hf_token() -> str | None:
return os.environ.get("HF_TOKEN") or os.environ.get("HF_ACCESS_TOKEN")
def _resolve_tokenizer_location(
tokenizer_path: str,
*,
revision: str | None = None,
force_download: bool = False,
) -> str:
local_path = Path(str(tokenizer_path)).expanduser()
if local_path.exists():
return str(local_path)
return snapshot_download(
repo_id=str(tokenizer_path),
repo_type="model",
revision=revision,
force_download=force_download,
ignore_patterns=["*.py", "*.pyc", "__pycache__/*"],
token=_hf_token(),
)
class UniversalActionProcessor(ProcessorMixin):
attributes: ClassVar[list[str]] = ["tokenizer"]
tokenizer_class: str = "AutoTokenizer"
def __init__(
self,
tokenizer: PreTrainedTokenizerFast,
scale: float = 10,
vocab_size: int = 1024,
min_token: int = 0,
*,
action_dim: int | None = None,
time_horizon: int | None = None,
):
self.scale = scale
self.vocab_size = vocab_size
self.min_token = min_token
# Action horizon and dimension needed during decoding. These can be specified
# in three ways (in order of priority):
# 1. passed in as kwargs to decode()
# 2. in the constructor
# 3. cached from the last time decode() was called
self.time_horizon = time_horizon
self.action_dim = action_dim
self.called_time_horizon = time_horizon
self.called_action_dim = action_dim
super().__init__(tokenizer)
self.bpe_tokenizer = self.tokenizer
def __call__(self, action_chunk: np.array) -> np.array:
assert action_chunk.ndim <= 3, "Only 3 dimensions supported: [batch, timesteps, action_dim]"
if action_chunk.ndim == 2:
action_chunk = action_chunk[None, ...]
# Cache the time horizon and action dimension for decoding
self.called_time_horizon = action_chunk.shape[-2]
self.called_action_dim = action_chunk.shape[-1]
dct_coeff = dct(action_chunk, axis=1, norm="ortho")
dct_coeff = np.around(dct_coeff * self.scale)
tokens = []
for elem in dct_coeff:
token_str = "".join(map(chr, np.maximum(elem.flatten() - self.min_token, 0).astype(int)))
tokens.append(self.bpe_tokenizer(token_str)["input_ids"])
return tokens
def decode(
self,
tokens: list[list[int]],
*,
time_horizon: int | None = None,
action_dim: int | None = None,
) -> np.array:
self.time_horizon = time_horizon or self.time_horizon or self.called_time_horizon
self.action_dim = action_dim or self.action_dim or self.called_action_dim
# Cache the time horizon and action dimension for the next call
self.called_time_horizon = self.time_horizon
self.called_action_dim = self.action_dim
assert (
self.time_horizon is not None and self.action_dim is not None
), "Tokenizer not initialized, call encode() once or pass in time_horizon and action_dim."
decoded_actions = []
for token in tokens:
try:
decoded_tokens = self.bpe_tokenizer.decode(token)
decoded_dct_coeff = np.array(list(map(ord, decoded_tokens))) + self.min_token
decoded_dct_coeff = decoded_dct_coeff.reshape(-1, self.action_dim)
assert (
decoded_dct_coeff.shape
== (
self.time_horizon,
self.action_dim,
)
), f"Decoded DCT coefficients have shape {decoded_dct_coeff.shape}, expected ({self.time_horizon}, {self.action_dim})"
except Exception as e:
print(f"Error decoding tokens: {e}")
print(f"Tokens: {token}")
decoded_dct_coeff = np.zeros((self.time_horizon, self.action_dim))
decoded_actions.append(idct(decoded_dct_coeff / self.scale, axis=0, norm="ortho"))
return np.stack(decoded_actions)
@classmethod
def fit(
cls,
action_data: list[np.array],
scale: float = 10,
vocab_size: int = 1024,
*,
time_horizon: int | None = None,
action_dim: int | None = None,
) -> "UniversalActionProcessor":
# Run DCT over all inputs
dct_tokens = [dct(a, axis=0, norm="ortho").flatten() for a in action_data]
# Quantize and find min token
max_token = int(np.around(np.concatenate(dct_tokens) * scale).max())
min_token = int(np.around(np.concatenate(dct_tokens) * scale).min())
min_vocab_size = max_token - min_token
assert (
min_vocab_size <= vocab_size
), f"Vocab size {vocab_size} is too small for the range of tokens {min_vocab_size}"
if min_vocab_size + 100 > vocab_size:
logging.warning(
f"Initial alphabet size {min_vocab_size} is almost as large as the vocab"
f"size {vocab_size}, consider increasing vocab size"
)
# Make token iterator for BPE training
def _token_iter():
for tokens in dct_tokens:
rounded_tokens = np.around(tokens * scale) - min_token
rounded_tokens = rounded_tokens.astype(int)
string = "".join(map(chr, rounded_tokens))
yield string
# Train BPE tokenizer
bpe = ByteLevelBPETokenizer()
# Set up the entire range of possible tokens as the initial alphabet
alphabet = [chr(i) for i in range(max_token - min_token + 1)]
trainer = BpeTrainer(
vocab_size=vocab_size,
min_frequency=2,
show_progress=True,
special_tokens=[],
initial_alphabet=alphabet,
max_token_length=10000,
)
# Train the inner tokenizer (don't use ByteLevelBPETokenizer.train_from_iterator()
# because it doesn't support custom alphabets)
bpe._tokenizer.train_from_iterator(_token_iter(), trainer=trainer)
return cls(
PreTrainedTokenizerFast(tokenizer_object=bpe, clean_up_tokenization_spaces=False),
scale=scale,
vocab_size=vocab_size,
min_token=min_token,
time_horizon=time_horizon,
action_dim=action_dim,
)
@classmethod
def from_pretrained_local(
cls,
pretrained_model_name_or_path: str,
*,
revision: str | None = None,
force_download: bool = False,
) -> "UniversalActionProcessor":
location = Path(
_resolve_tokenizer_location(
pretrained_model_name_or_path,
revision=revision,
force_download=force_download,
)
)
processor_config = {}
processor_config_path = location / "processor_config.json"
if processor_config_path.exists():
import json
processor_config = json.loads(processor_config_path.read_text())
tokenizer = PreTrainedTokenizerFast.from_pretrained(str(location))
return cls(
tokenizer,
scale=processor_config.get("scale", 10),
vocab_size=processor_config.get("vocab_size", 1024),
min_token=processor_config.get("min_token", 0),
action_dim=processor_config.get("action_dim"),
time_horizon=processor_config.get("time_horizon"),
)

View File

@@ -0,0 +1,561 @@
#!/usr/bin/env python
# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ruff: noqa
"""
MolmoAct2 configuration
"""
from typing import Optional, Any
from transformers import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging
logger = logging.get_logger(__name__)
class MolmoAct2VitConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MolmoAct2VisionTransformer`].
It is used to instantiate a `MolmoAct2VisionTransformer` according to the specified arguments,
defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```python
>>> from transformers import MolmoAct2VitConfig, MolmoAct2VisionTransformer
>>> # Initializing a MolmoAct2VitConfig
>>> configuration = MolmoAct2VitConfig()
>>> # Initializing a MolmoAct2VisionTransformer (with random weights)
>>> model = MolmoAct2VisionTransformer(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "molmoact2"
base_config_key = "vit_config"
def __init__(
self,
hidden_size: int = 1152,
intermediate_size: int = 4304,
num_hidden_layers: int = 27,
num_attention_heads: int = 16,
num_key_value_heads: int = 16,
head_dim: int = 72,
hidden_act: str = "gelu_pytorch_tanh",
layer_norm_eps: float = 1e-6,
image_default_input_size: tuple[int, int] = (378, 378),
image_patch_size: int = 14,
image_num_pos: int = 577,
attention_dropout: float = 0.0,
residual_dropout: float = 0.0,
initializer_range: float = 0.02,
float32_attention: bool = True,
attn_implementation: str = "eager",
**kwargs,
):
self.attn_implementation = attn_implementation
super().__init__(
attn_implementation=attn_implementation,
**kwargs
)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.head_dim = head_dim
self.hidden_act = hidden_act
self.layer_norm_eps = layer_norm_eps
self.image_default_input_size = image_default_input_size
self.image_patch_size = image_patch_size
self.image_num_pos = image_num_pos
self.attention_dropout = attention_dropout
self.residual_dropout = residual_dropout
self.initializer_range = initializer_range
self.float32_attention = float32_attention
@property
def image_num_patch(self):
h, w = self.image_default_input_size
return h // self.image_patch_size, w // self.image_patch_size
class MolmoAct2AdapterConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of MolmoAct2Adapter. With MolmoAct2VitConfig,
It is used to instantiate an MolmoAct2VisionBackbone according to the specified arguments,
defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```python
>>> from transformers import MolmoAct2VitConfig, MolmoAct2AdapterConfig, MolmoAct2VisionBackbone
>>> # Initializing a MolmoAct2VitConfig and a MolmoAct2AdapterConfig
>>> vit_config = MolmoAct2VitConfig()
>>> adapter_config = MolmoPoolingConfig()
>>> # Initializing a MolmoAct2VisionBackbone (with random weights)
>>> model = MolmoAct2VisionBackbone(vit_config, adapter_config)
>>> # Accessing the model configuration
>>> vit_configuration = model.vit_config
>>> adapter_configuration = model.adapter_config
```"""
model_type = "molmoact2"
base_config_key = "adapter_config"
def __init__(
self,
vit_layers: tuple = (-3, -9),
pooling_attention_mask: bool = False,
hidden_size: int = 1152,
num_attention_heads: int = 16,
num_key_value_heads: int = 16,
head_dim: int = 72,
float32_attention: bool = True,
attention_dropout: float = 0.0,
residual_dropout: float = 0.0,
hidden_act: str = "silu",
intermediate_size: int = 18944,
text_hidden_size: int = 3584,
image_feature_dropout: float = 0.0,
initializer_range: float = 0.02,
attn_implementation: str = "eager",
**kwargs,
):
self.attn_implementation = attn_implementation
super().__init__(
attn_implementation=attn_implementation,
**kwargs
)
self.vit_layers = vit_layers
self.pooling_attention_mask = pooling_attention_mask
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.head_dim = head_dim
self.float32_attention = float32_attention
self.attention_dropout = attention_dropout
self.residual_dropout = residual_dropout
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.text_hidden_size = text_hidden_size
self.image_feature_dropout = image_feature_dropout
self.initializer_range = initializer_range
class MolmoAct2TextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MolmoAct2TextModel`]. It is used to instantiate a
`MolmoAct2TextModel` according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```python
>>> from transformers import MolmoAct2TextConfig, MolmoAct2TextModel
>>> # Initializing a MolmoAct2TextConfig
>>> configuration = MolmoAct2TextConfig()
>>> # Initializing a MolmoAct2TextModel (with random weights)
>>> model = MolmoAct2TextModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "molmoact2_text"
base_config_key = "text_config"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"blocks.*.self_attn.att_proj": "colwise",
"blocks.*.self_attn.attn_out": "rowwise",
"blocks.*.mlp.ff_proj": "colwise",
"blocks.*.mlp.ff_out": "rowwise",
}
base_model_pp_plan = {
"wte": (["input_ids"], ["inputs_embeds"]),
"blocks": (["hidden_states", "attention_mask"], ["hidden_states"]),
"ln_f": (["hidden_states"], ["hidden_states"]),
}
def __init__(
self,
hidden_size: int = 3584,
num_attention_heads: int = 28,
num_key_value_heads: Optional[int] = 4,
head_dim: int = 128,
vocab_size: int = 152064,
additional_vocab_size: int = 128,
qkv_bias: bool = True,
num_hidden_layers: int = 48,
intermediate_size: int = 18944,
hidden_act: str = "silu",
embedding_dropout: float=0.0,
attention_dropout: float=0.0,
residual_dropout: float = 0.0,
max_position_embeddings: int = 4096,
rope_theta: float = 1000000.0,
rope_scaling: dict[str, Any] = None,
rope_scaling_layers: Optional[list[int]] = None,
use_qk_norm: bool = False,
qk_norm_type: str = "olmo",
layer_norm_eps: int = 1e-6,
norm_after: bool = False,
initializer_range: float = 0.02,
use_cache=True,
tie_word_embeddings=False,
attn_implementation: str = "eager",
**kwargs,
):
self.attn_implementation = attn_implementation
super().__init__(
tie_word_embeddings=tie_word_embeddings,
attn_implementation=attn_implementation,
**kwargs
)
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.head_dim = head_dim
self.vocab_size = vocab_size
self.additional_vocab_size = additional_vocab_size
self.qkv_bias = qkv_bias
self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.embedding_dropout = embedding_dropout
self.attention_dropout = attention_dropout
self.residual_dropout = residual_dropout
self.max_position_embeddings = max_position_embeddings
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.rope_scaling_layers = rope_scaling_layers
self.use_qk_norm = use_qk_norm
self.qk_norm_type = qk_norm_type
self.layer_norm_eps = layer_norm_eps
self.norm_after = norm_after
self.initializer_range = initializer_range
self.use_cache = use_cache
# Validate the correctness of rotary position embeddings parameters
rope_config_validation(self)
class MolmoAct2ActionExpertConfig(PretrainedConfig):
r"""Configuration for the MolmoAct2 modern action expert."""
model_type = "molmoact2_action_expert"
base_config_key = "action_expert_config"
def __init__(
self,
max_action_horizon: int = 32,
max_action_dim: int = 32,
hidden_size: int = 1024,
num_layers: int = 32,
num_heads: int = 16,
mlp_ratio: float = 8.0 / 3.0,
ffn_multiple_of: int = 256,
timestep_embed_dim: int = 256,
dropout: float = 0.0,
attn_dropout: float = 0.0,
context_layer_norm: bool = True,
qk_norm: bool = True,
qk_norm_eps: float = 1e-6,
rope: bool = True,
causal_attn: bool = False,
**kwargs,
):
super().__init__(**kwargs)
self.max_action_horizon = max_action_horizon
self.max_action_dim = max_action_dim
self.hidden_size = hidden_size
self.num_layers = num_layers
self.num_heads = num_heads
self.mlp_ratio = mlp_ratio
self.ffn_multiple_of = ffn_multiple_of
self.timestep_embed_dim = timestep_embed_dim
self.dropout = dropout
self.attn_dropout = attn_dropout
self.context_layer_norm = context_layer_norm
self.qk_norm = qk_norm
self.qk_norm_eps = qk_norm_eps
self.rope = rope
self.causal_attn = causal_attn
def to_dict(self):
output = super().to_dict()
# These are derived from the parent MolmoAct2Config for HF exports. Keeping
# them out of the public nested config avoids duplicated sources of truth.
output.pop("max_action_horizon", None)
output.pop("max_action_dim", None)
return output
class MolmoAct2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MolmoAct2ForConditionalGeneration`].
It is used to instantiate an MolmoAct2 model according to the specified arguments, defining the model architecture.
Example:
```python
>>> from transformers import MolmoAct2Config, MolmoAct2VitConfig, MolmoAct2AdapterConfig, MolmoAct2TextConfig
>>> # Initializing a MolmoAct2VitConfig
>>> vit_config = MolmoAct2VitConfig()
>>> # Initializing a MolmoAct2AdapterConfig
>>> adapter_config = MolmoAct2AdapterConfig()
>>> # Initializing a MolmoAct2TextConfig
>>> text_config = MolmoAct2TextConfig()
>>> # Initializing a MolmoAct2Config
>>> configuration = MolmoAct2Config(
>>> vit_config=vit_config,
>>> adapter_config=adapter_config,
>>> text_config=text_config,
>>> image_start_token_id=151936,
>>> image_end_token_id=151937,
>>> image_patch_id=151938,
>>> image_col_id=151939,
>>> low_res_image_start_token_id=151940,
>>> image_low_res_id=151942,
>>> frame_start_token_id=151943,
>>> frame_end_token_id=151944,
>>> )
>>> # Initializing a model
>>> model = MolmoAct2ForConditionalGeneration(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "molmoact2"
sub_configs = {
"text_config": MolmoAct2TextConfig,
"vit_config": MolmoAct2VitConfig,
"adapter_config": MolmoAct2AdapterConfig,
"action_expert_config": MolmoAct2ActionExpertConfig,
}
def __init__(
self,
vit_config: MolmoAct2VitConfig = None,
adapter_config: MolmoAct2AdapterConfig = None,
text_config: MolmoAct2TextConfig = None,
action_expert_config: MolmoAct2ActionExpertConfig = None,
image_start_token_id: int = None,
low_res_image_start_token_id: int = None,
image_end_token_id: int = None,
image_low_res_id: int = None,
image_patch_id: int = None,
image_col_id: int = None,
frame_start_token_id: int = None,
frame_end_token_id: int = None,
use_frame_special_tokens: bool = True,
initializer_range: float = 0.02,
add_action_expert: bool = True,
max_action_dim: int = 32,
max_action_horizon: int = 30,
n_obs_steps: int = 30,
action_mode: str = "both",
state_format: str = "discrete",
flow_matching_num_steps: int = 10,
flow_matching_cutoff: float = 1.0,
flow_matching_time_offset: float = 0.001,
flow_matching_time_scale: float = 0.999,
flow_matching_beta_alpha: float = 1.0,
flow_matching_beta_beta: float = 1.5,
mask_action_dim_padding: bool = True,
enable_depth_reasoning: bool = False,
depth_mode: int = 2,
num_depth_codes: int = 100,
action_expert_depth_gate: bool = False,
action_expert_depth_gate_per_layer: bool = False,
action_expert_depth_gate_init_bias: float = -4.0,
action_output_token_id: int = None,
action_start_token_id: int = None,
action_end_token_id: int = None,
action_token_start_id: int = None,
num_action_tokens: int = 0,
depth_output_token_id: int = None,
depth_start_token_id: int = None,
depth_end_token_id: int = None,
depth_token_start_id: int = None,
num_depth_tokens: int = 0,
state_start_token_id: int = None,
state_end_token_id: int = None,
state_token_start_id: int = None,
num_state_tokens: int = 0,
add_setup_tokens: bool = True,
add_control_tokens: bool = True,
norm_stats_filename: str = "norm_stats.json",
**kwargs,
):
super().__init__(**kwargs)
if vit_config is None:
self.vit_config = MolmoAct2VitConfig()
elif isinstance(vit_config, dict):
self.vit_config = MolmoAct2VitConfig(**vit_config)
else:
self.vit_config = vit_config
if adapter_config is None:
self.adapter_config = MolmoAct2AdapterConfig()
elif isinstance(adapter_config, dict):
self.adapter_config = MolmoAct2AdapterConfig(**adapter_config)
else:
self.adapter_config = adapter_config
if text_config is None:
self.text_config = MolmoAct2TextConfig()
elif isinstance(text_config, dict):
self.text_config = MolmoAct2TextConfig(**text_config)
else:
self.text_config = text_config
self.add_action_expert = bool(add_action_expert)
if not self.add_action_expert:
self.action_expert_config = None
elif action_expert_config is None:
self.action_expert_config = MolmoAct2ActionExpertConfig(
max_action_horizon=max_action_horizon,
max_action_dim=max_action_dim,
num_layers=self.text_config.num_hidden_layers,
)
elif isinstance(action_expert_config, dict):
self.action_expert_config = MolmoAct2ActionExpertConfig(**action_expert_config)
else:
self.action_expert_config = action_expert_config
if self.add_action_expert:
self.action_expert_config.max_action_dim = int(max_action_dim)
self.action_expert_config.max_action_horizon = int(max_action_horizon)
self._validate_release_action_config(
state_format=state_format,
)
self.image_start_token_id = image_start_token_id
self.low_res_image_start_token_id = low_res_image_start_token_id
self.image_end_token_id = image_end_token_id
self.image_low_res_id = image_low_res_id
self.image_high_res_id = image_patch_id
self.image_patch_id = image_patch_id
self.image_col_id = image_col_id
self.frame_start_token_id = frame_start_token_id
self.frame_end_token_id = frame_end_token_id
self.use_frame_special_tokens = use_frame_special_tokens
self.initializer_range = initializer_range
self.max_action_dim = max_action_dim
self.max_action_horizon = max_action_horizon
self.n_obs_steps = n_obs_steps
self.action_mode = action_mode
self.state_format = state_format
self.flow_matching_num_steps = flow_matching_num_steps
self.flow_matching_cutoff = flow_matching_cutoff
self.flow_matching_time_offset = flow_matching_time_offset
self.flow_matching_time_scale = flow_matching_time_scale
self.flow_matching_beta_alpha = flow_matching_beta_alpha
self.flow_matching_beta_beta = flow_matching_beta_beta
self.mask_action_dim_padding = mask_action_dim_padding
self.enable_depth_reasoning = enable_depth_reasoning
self.depth_mode = depth_mode
self.num_depth_codes = num_depth_codes
self.action_expert_depth_gate = action_expert_depth_gate
self.action_expert_depth_gate_per_layer = action_expert_depth_gate_per_layer
self.action_expert_depth_gate_init_bias = action_expert_depth_gate_init_bias
self.action_output_token_id = action_output_token_id
self.action_start_token_id = action_start_token_id
self.action_end_token_id = action_end_token_id
self.action_token_start_id = action_token_start_id
self.num_action_tokens = num_action_tokens
self.depth_output_token_id = depth_output_token_id
self.depth_start_token_id = depth_start_token_id
self.depth_end_token_id = depth_end_token_id
self.depth_token_start_id = depth_token_start_id
self.num_depth_tokens = num_depth_tokens
self.state_start_token_id = state_start_token_id
self.state_end_token_id = state_end_token_id
self.state_token_start_id = state_token_start_id
self.num_state_tokens = num_state_tokens
self.add_setup_tokens = add_setup_tokens
self.add_control_tokens = add_control_tokens
self.norm_stats_filename = norm_stats_filename
@staticmethod
def _validate_release_action_config(
*,
state_format: str,
) -> None:
if state_format != "discrete":
raise ValueError("MolmoAct2 HF export supports only state_format='discrete'.")
@property
def image_num_patch(self):
assert self.vit_config is not None
return self.vit_config.image_num_patch
@property
def num_attention_heads(self):
return self.text_config.num_attention_heads
@property
def num_key_value_heads(self):
return self.text_config.num_key_value_heads
@property
def head_dim(self):
return self.text_config.head_dim
@property
def num_hidden_layers(self):
return self.text_config.num_hidden_layers
@property
def hidden_size(self):
return self.text_config.hidden_size
@property
def vocab_size(self):
return self.text_config.vocab_size
@property
def max_position_embeddings(self):
return self.text_config.max_position_embeddings
MolmoAct2VitConfig.register_for_auto_class()
MolmoAct2AdapterConfig.register_for_auto_class()
MolmoAct2TextConfig.register_for_auto_class()
MolmoAct2ActionExpertConfig.register_for_auto_class()
MolmoAct2Config.register_for_auto_class()

View File

@@ -0,0 +1,564 @@
#!/usr/bin/env python
# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ruff: noqa
"""Image processor class for MolmoAct2"""
from typing import Optional, Union
import numpy as np
import einops
import torch
import torchvision.transforms
from transformers.image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ImageInput,
PILImageResampling,
make_flat_list_of_images,
valid_images,
to_numpy_array,
)
from transformers.image_transforms import convert_to_rgb
from transformers.processing_utils import ImagesKwargs
from transformers.image_processing_utils import BaseImageProcessor, get_size_dict
from transformers.utils import logging
from transformers.feature_extraction_utils import BatchFeature
from transformers.utils import TensorType, logging
logger = logging.get_logger(__name__)
def normalize_image(
image: np.ndarray,
image_mean: list[float],
image_std: list[float],
) -> np.ndarray:
if np.allclose(image_mean, [0.5, 0.5, 0.5]) and np.allclose(image_std, [0.5, 0.5, 0.5]):
return image * np.asarray(2.0, dtype=np.float32) - np.asarray(1.0, dtype=np.float32)
image -= np.array(image_mean, dtype=np.float32)[None, None, :]
image /= np.array(image_std, dtype=np.float32)[None, None, :]
return image
def resize_image(
image: np.ndarray,
desired_output_size: list[int],
resample: PILImageResampling,
) -> np.ndarray:
image = torch.permute(torch.from_numpy(image), [2, 0, 1])
dtype = image.dtype
if torch.is_floating_point(image):
in_min = 0.0
in_max = 1.0
resized = torchvision.transforms.Resize(
desired_output_size,
resample,
antialias=False,
)(image)
resized = torch.clip(resized, 0.0, 1.0).to(dtype)
else:
assert image.dtype == torch.uint8, "SigLIP expects float images or uint8 images, but got {}".format(image.dtype)
in_min = 0.0
in_max = 255.0
resized = torchvision.transforms.Resize(
desired_output_size,
resample,
antialias=False,
)(image)
resized = torch.clip(resized, 0, 255).to(dtype)
resized = resized.to(torch.float32)
resized = (resized - in_min) / (in_max - in_min)
resized = torch.permute(resized, [1, 2, 0]).numpy()
return resized
def select_tiling(h, w, patch_size, max_num_crops):
"""Divide in image of size [w, h] in up to max_num_patches of size patch_size"""
original_size = np.stack([h, w]) # [1, 2]
original_res = h * w
tilings = []
for i in range(1, max_num_crops + 1):
for j in range(1, max_num_crops + 1):
if i*j <= max_num_crops:
tilings.append((i, j))
# sort so argmin and argmax favour smaller tilings in the event of a tie
tilings.sort(key=lambda x: (x[0]*x[1], x[0]))
candidate_tilings = np.array(tilings, dtype=np.int32) # [n_resolutions, 2]
candidate_resolutions = candidate_tilings * patch_size # [n_resolutions, 2]
# How much we would need to scale the image to fit exactly in each tiling
original_size = np.stack([h, w], dtype=np.float32) # [1, 2]
# The original size can be zero in rare cases if the image is smaller than the margin
# In those cases letting the scale become infinite means the tiling is based on the
# other side, or falls back to the smallest tiling
with np.errstate(divide='ignore'):
required_scale_d = candidate_resolutions.astype(np.float32) / original_size,
required_scale = np.min(required_scale_d, axis=-1, keepdims=True) # [n_resolutions, 1]
if np.all(required_scale < 1):
# We are forced to downscale, so try to minimize the amount of downscaling
ix = np.argmax(required_scale)
else:
# Pick the resolution that required the least upscaling so that it most closely fits the image
required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
ix = np.argmin(required_scale)
return candidate_tilings[ix]
def build_resized_image(
image: np.ndarray,
base_image_input_size: list[int],
resample: PILImageResampling,
image_mean: list[float],
image_std: list[float],
image_patch_size: int,
) -> tuple[np.ndarray, np.ndarray]:
resized = resize_image(
image, base_image_input_size, resample,
)
resized = normalize_image(resized, image_mean, image_std)
if len(resized.shape) == 3:
resized = np.expand_dims(resized, 0)
crop_patch_w = base_image_input_size[1] // image_patch_size
crop_patch_h = base_image_input_size[0] // image_patch_size
resize_idx = np.arange(crop_patch_w*crop_patch_h).reshape([crop_patch_h, crop_patch_w])
return resized, resize_idx
def build_overlapping_crops(
image: np.ndarray,
max_crops: int,
overlap_margins: list[int],
base_image_input_size: list[int],
resample: PILImageResampling,
image_mean: list[float],
image_std: list[float],
image_patch_size: int,
) -> tuple[np.ndarray, np.ndarray]:
"""Decompose an image into a set of overlapping crops
:return crop_arr: [n_crops, h, w, 3] The crops
:return patch_idx: [overlap_patch_h, overlap_patch_w] For each patch in the resized image
the crops were extracted from, what patch in `crop_arr` it corresponds to
"""
original_image_h, original_image_w = image.shape[:2]
crop_size = base_image_input_size[0]
assert base_image_input_size[0] == base_image_input_size[1]
left_margin, right_margin = overlap_margins
total_margin_pixels = image_patch_size * (right_margin + left_margin) # pixels removed per dim
crop_patches = base_image_input_size[0] // image_patch_size # patches per crop dim
crop_window_patches = crop_patches - (right_margin + left_margin) # usable patches
crop_window_size = crop_window_patches * image_patch_size
crop_patch_w = base_image_input_size[1] // image_patch_size
crop_patch_h = base_image_input_size[0] // image_patch_size
original_image_h, original_image_w = image.shape[:2]
crop_size = base_image_input_size[0]
# Decide how to tile the image, to account for the overlap margins we compute the tiling
# as if we had an image without the margins and were using a crop size without the margins
tiling = select_tiling(
original_image_h - total_margin_pixels,
original_image_w - total_margin_pixels,
crop_window_size,
max_crops,
)
src = resize_image(
image,
[tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels],
resample,
)
src = normalize_image(src, image_mean, image_std)
# Now we have to split the image into crops, and track what patches came from
# where in `patch_idx_arr`
n_crops = tiling[0] * tiling[1]
crop_arr = np.zeros([n_crops, crop_size, crop_size, 3], dtype=src.dtype)
patch_idx_arr = np.zeros([n_crops, crop_patch_h, crop_patch_w], dtype=np.int32)
on_crop = 0
for i in range(tiling[0]):
# Slide over `src` by `crop_window_size` steps, but extract crops of size `crops_size`
# which results in overlapping crop windows
y0 = i*crop_window_size
for j in range(tiling[1]):
x0 = j*crop_window_size
crop_arr[on_crop] = src[y0:y0+crop_size, x0:x0+crop_size]
patch_idx = np.arange(crop_patch_w*crop_patch_h).reshape(crop_patch_h, crop_patch_w)
patch_idx += on_crop * crop_patch_h * crop_patch_w
# Mask out idx that are in the overlap region
if i != 0:
patch_idx[:left_margin, :] = -1
if j != 0:
patch_idx[:, :left_margin] = -1
if i != tiling[0]-1:
patch_idx[-right_margin:, :] = -1
if j != tiling[1]-1:
patch_idx[:, -right_margin:] = -1
patch_idx_arr[on_crop] = patch_idx
on_crop += 1
# `patch_idx_arr` is ordered crop-by-crop, here we transpose `patch_idx_arr`
# so it is ordered left-to-right order
patch_idx_arr = np.reshape(
patch_idx_arr,
[tiling[0], tiling[1], crop_patch_h, crop_patch_w]
)
patch_idx_arr = np.transpose(patch_idx_arr, [0, 2, 1, 3])
patch_idx_arr = np.reshape(patch_idx_arr, [-1])
# Now get the parts not in the overlap region, so it should map each patch in `src`
# to the correct patch it should come from in `crop_arr`
patch_idx_arr = patch_idx_arr[patch_idx_arr >= 0].reshape(
src.shape[0]//image_patch_size,
src.shape[1]//image_patch_size,
)
return crop_arr, patch_idx_arr
def batch_pixels_to_patches(array: np.ndarray, patch_size: int) -> np.ndarray:
"""Reshape images of [n_images, h, w, 3] -> [n_images, n_patches, pixels_per_patch]"""
if len(array.shape) == 3:
n_crops, h, w = array.shape
h_patches = h//patch_size
w_patches = w//patch_size
array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size])
array = np.transpose(array, [0, 1, 3, 2, 4])
array = np.reshape(array, [n_crops, h_patches*w_patches, patch_size*patch_size])
return array
else:
n_crops, h, w, c = array.shape
h_patches = h//patch_size
w_patches = w//patch_size
array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size, c])
array = np.transpose(array, [0, 1, 3, 2, 4, 5])
array = np.reshape(array, [n_crops, h_patches*w_patches, patch_size*patch_size*c])
return array
def arange_for_pooling(
idx_arr: np.ndarray,
pool_h: int,
pool_w: int,
) -> np.ndarray:
h_pad = pool_h * ((idx_arr.shape[0] + pool_h - 1) // pool_h) - idx_arr.shape[0]
w_pad = pool_w * ((idx_arr.shape[1] + pool_w - 1) // pool_w) - idx_arr.shape[1]
idx_arr = np.pad(idx_arr, [[h_pad//2, (h_pad+1)//2], [w_pad//2, (w_pad+1)//2]],
mode='constant',constant_values=-1)
return einops.rearrange(
idx_arr, "(h dh) (w dw) -> h w (dh dw)", dh=pool_h, dw=pool_w)
def image_to_patches_and_grids(
image: np.ndarray,
max_crops: int,
overlap_margins: list[int],
base_image_input_size: list[int],
resample: PILImageResampling,
image_mean: list[float],
image_std: list[float],
image_patch_size: int,
image_pooling_w: int,
image_pooling_h: int,
crop_mode: str = "overlap-and-resize-c2",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
:return image_grids, the shape of each (low-res, high-res) image after pooling
:return crops, the image crops to processes with the ViT
:return pooled_patch_idx, for each patch_id tokens in `image_tokens`, the indices of the
patches in `crops` to pool for that token, masked with -1
"""
if isinstance(base_image_input_size, int):
base_image_input_size = (base_image_input_size, base_image_input_size)
base_image_input_d = image_patch_size
pooling_w = image_pooling_w
pooling_h = image_pooling_h
crop_patch_w = base_image_input_size[1] // base_image_input_d
crop_patch_h = base_image_input_size[0] // base_image_input_d
if crop_mode == "resize":
resized, resize_idx = build_resized_image(
image,
base_image_input_size,
resample,
image_mean,
image_std,
image_patch_size,
)
resize_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
resized_h, resized_w = resize_idx.shape[:2]
resize_idx = resize_idx.reshape([-1, pooling_h * pooling_w])
image_grid = [np.array([resized_h, resized_w, 0, 0])]
return (
np.stack(image_grid, 0),
batch_pixels_to_patches(resized, image_patch_size),
resize_idx,
)
if crop_mode not in {"overlap-and-resize-c2", "overlap-and-resize"}:
raise ValueError(f"Unsupported MolmoAct2 image crop_mode {crop_mode!r}.")
crop_arr, patch_idx_arr = build_overlapping_crops(
image,
max_crops,
overlap_margins,
base_image_input_size,
resample,
image_mean,
image_std,
image_patch_size,
)
pooling_idx = arange_for_pooling(patch_idx_arr, pooling_h, pooling_w)
h, w = pooling_idx.shape[:2]
pooling_idx = pooling_idx.reshape([-1, pooling_h*pooling_w])
# Finally do the same for the global image
resized, resize_idx = build_resized_image(
image,
base_image_input_size,
resample,
image_mean,
image_std,
image_patch_size,
)
crop_arr = np.concatenate([resized, crop_arr], 0)
resize_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
resized_h, resized_w = resize_idx.shape[:2]
resize_idx = resize_idx.reshape([-1, pooling_h*pooling_w])
# Global image goes first, so the order of patches in previous crops gets increased
pooling_idx = np.where(
pooling_idx >= 0,
pooling_idx + crop_patch_h*crop_patch_w,
-1
)
pooling_idx = np.concatenate([resize_idx, pooling_idx])
image_grid = [np.array([resized_h, resized_w, h, w])]
return (
np.stack(image_grid, 0),
batch_pixels_to_patches(crop_arr, image_patch_size),
pooling_idx
)
class MolmoAct2ImagesKwargs(ImagesKwargs, total=False):
max_crops: Optional[int]
overlap_margins: Optional[list[int]]
crop_mode: Optional[str]
patch_size: Optional[int]
pooling_size: Optional[list[int]]
class MolmoAct2ImageProcessor(BaseImageProcessor):
r"""
Constructs a MolmoAct2 image processor that preprocesses images for the model.
Args:
size (`dict[str, int]` *optional*, defaults to `{"height": 378, "width": 378}`):
Size of the image after resizing.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
Resampling filter to use when resizing the image.
image_mean (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
image_std (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB.
max_crops (`int`, *optional*, defaults to `8`):
Maximum number of crops to use per image.
overlap_margins (`list[int]`, *optional*, defaults to `[4, 4]`):
Overlap margins to use.
patch_size (`int`, *optional*, defaults to 14):
The spatial patch size of the vision encoder.
pooling_size (`list[int]`, *optional*, defaults to `[2, 2]`):
The pooling size of the vision adapter.
"""
model_input_names = ["pixel_values", "image_token_pooling", "image_grids", "image_num_crops"]
def __init__(
self,
size: Optional[dict[str, int]] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
image_mean: Optional[Union[float, list[float]]] = None,
image_std: Optional[Union[float, list[float]]] = None,
do_convert_rgb: bool = True,
max_crops: int = 8,
overlap_margins: list[int] = [4, 4],
crop_mode: str = "overlap-and-resize-c2",
patch_size: int = 14,
pooling_size: list[int] = [2, 2],
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 378, "width": 378}
size = get_size_dict(size, default_to_square=True)
self.size = size
self.resample = resample
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.do_convert_rgb = do_convert_rgb
self.max_crops = max_crops
self.overlap_margins = overlap_margins
self.crop_mode = crop_mode
self.patch_size = patch_size
self.pooling_size = pooling_size
def preprocess(
self,
images: ImageInput,
size: Optional[dict[str, int]] = None,
resample: Optional[PILImageResampling] = None,
image_mean: Optional[Union[float, list[float]]] = None,
image_std: Optional[Union[float, list[float]]] = None,
do_convert_rgb: Optional[bool] = None,
max_crops: Optional[int] = None,
overlap_margins: Optional[list[int]] = None,
crop_mode: Optional[str] = None,
patch_size: Optional[int] = None,
pooling_size: Optional[list[int]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchFeature:
"""
Args:
images (`ImageInput`):
Image to preprocess.
size (`dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after resizing.
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
Resampling filter to use when resizing the image. This can be one of the enum `PILImageResampling`. Only
has an effect if `do_resize` is set to `True`.
image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
`True`.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
max_crops (`int`, *optional*, defaults to `self.max_crops`):
Maximum number of crops to use per image.
overlap_margins (`list[int]`, *optional*, defaults to `self.overlap_margins`):
Overlap margins to use.
patch_size (`int`, *optional*, defaults to `self.patch_size`):
The spatial patch size of the vision encoder.
pooling_size (`list[int]`, *optional*, defaults to `self.pooling_size`):
The pooling size of the vision adapter.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
Returns:
A `BatchFeature` containing the following keys:
- `pixel_values`: The preprocessed images.
- `image_token_pooling`: The indices of the patches in `crops` to pool for each token in `image_tokens`.
- `image_grids`: The image grids.
- `image_num_crops`: The number of crops for each image.
"""
if size is not None:
if "height" not in size or "width" not in size:
raise ValueError("size must contain 'height' and 'width' keys.")
else:
size = {**self.size}
base_image_input_size = [size["height"], size["width"]]
resample = resample or self.resample
image_mean = image_mean or self.image_mean
image_std = image_std or self.image_std
do_convert_rgb = do_convert_rgb or self.do_convert_rgb
max_crops = max_crops or self.max_crops
overlap_margins = overlap_margins or self.overlap_margins
crop_mode = crop_mode or self.crop_mode
patch_size = patch_size or self.patch_size
pooling_size = pooling_size or self.pooling_size
image_pooling_h, image_pooling_w = pooling_size
if images is not None:
images = self.fetch_images(images)
images = make_flat_list_of_images(images)
if images is not None and not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
if do_convert_rgb:
images = [convert_to_rgb(image) for image in images]
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
data = {}
if images is not None:
batch_grids = []
batch_crops = []
batch_pooled_patches_idx = []
batch_num_crops = []
for image in images:
image_grid, crops, pooled_idx = image_to_patches_and_grids(
image,
max_crops,
overlap_margins,
base_image_input_size,
resample,
image_mean,
image_std,
patch_size,
image_pooling_w,
image_pooling_h,
crop_mode,
)
batch_grids.append(image_grid)
batch_crops.append(crops)
batch_pooled_patches_idx.append(pooled_idx)
batch_num_crops.append(crops.shape[0])
pixel_values = np.concatenate(batch_crops, 0)
image_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
image_grids = np.concatenate(batch_grids, 0)
image_num_crops = np.array(batch_num_crops)
data.update(
pixel_values=pixel_values,
image_token_pooling=image_token_pooling,
image_grids=image_grids,
image_num_crops=image_num_crops,
)
return BatchFeature(data, tensor_type=return_tensors)
MolmoAct2ImageProcessor.register_for_auto_class()

View File

@@ -0,0 +1,786 @@
#!/usr/bin/env python
# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ruff: noqa
"""Inference utilities for MolmoAct2"""
from dataclasses import dataclass
from typing import Any, Iterable, Optional, Sequence, Tuple
import torch
from torch.nn import functional as F
from transformers.cache_utils import Cache
from transformers.configuration_utils import PretrainedConfig
@dataclass
class _ActionFlowInputs:
trajectory: torch.Tensor
context: Any
modulations: Sequence[Any]
action_dim_is_pad: Optional[torch.Tensor]
@dataclass
class _ActionFlowCudaGraph:
key: Tuple[Any, ...]
graph: torch.cuda.CUDAGraph
static_inputs: _ActionFlowInputs
output: torch.Tensor
@dataclass
class _DepthDecodeCudaGraphLayerStage:
residual: torch.Tensor
query: torch.Tensor
key: torch.Tensor
value: torch.Tensor
@dataclass
class _DepthDecodeCudaGraphPostStage:
graph: torch.cuda.CUDAGraph
attn_context: torch.Tensor
@dataclass
class _DepthDecodeCudaGraph:
cache_key: Tuple[Any, ...]
pre_graph: torch.cuda.CUDAGraph
token_ids: torch.Tensor
cos: torch.Tensor
sin: torch.Tensor
positions: torch.Tensor
stages: Sequence[_DepthDecodeCudaGraphLayerStage]
post_graphs: Sequence[_DepthDecodeCudaGraphPostStage]
output: torch.Tensor
@dataclass
class _DepthDecodeCudaGraphSpec:
eligible: bool
cache_key_prefix: Tuple[Any, ...]
num_hidden_layers: int
head_dim: int
num_attention_heads: int
def _cache_seq_len_int(past_key_values: Optional[Cache]) -> int:
if past_key_values is None:
return 0
seq_len = past_key_values.get_seq_length()
if torch.is_tensor(seq_len):
return int(seq_len.item())
return int(seq_len)
def _cache_max_len_int(past_key_values: Optional[Cache]) -> int:
if past_key_values is None:
return -1
max_len = past_key_values.get_max_cache_shape()
if torch.is_tensor(max_len):
return int(max_len.item())
return int(max_len)
def _iter_cache_key_values(
past_key_values: Cache,
) -> Iterable[Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]]:
layers = getattr(past_key_values, "layers", None)
if layers is not None:
for layer in layers:
yield getattr(layer, "keys", None), getattr(layer, "values", None)
return
for layer in past_key_values:
yield layer[0], layer[1]
class _DepthDecodeStaticLayerCache:
is_compileable = False
is_sliding = False
def __init__(self, max_cache_len: int) -> None:
self.max_cache_len = int(max_cache_len)
self.cumulative_length = 0
self.keys: Optional[torch.Tensor] = None
self.values: Optional[torch.Tensor] = None
def _allocate(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
bsz, n_heads = key_states.shape[:2]
self.keys = torch.empty(
(bsz, n_heads, self.max_cache_len, key_states.shape[-1]),
dtype=key_states.dtype,
device=key_states.device,
)
self.values = torch.empty(
(bsz, n_heads, self.max_cache_len, value_states.shape[-1]),
dtype=value_states.dtype,
device=value_states.device,
)
def update(
self,
key_states: torch.Tensor,
value_states: torch.Tensor,
*args,
**kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]:
if self.keys is None:
self._allocate(key_states, value_states)
start = self.cumulative_length
end = start + key_states.shape[-2]
if end > self.max_cache_len:
raise RuntimeError(
f"KV cache length {end} exceeds max_cache_len={self.max_cache_len}."
)
self.keys[:, :, start:end, :].copy_(key_states)
self.values[:, :, start:end, :].copy_(value_states)
self.cumulative_length = end
return self.keys[:, :, :end, :], self.values[:, :, :end, :]
def get_seq_length(self) -> int:
return self.cumulative_length
def get_max_cache_shape(self) -> int:
return -1
def reset(self) -> None:
self.cumulative_length = 0
class _DepthDecodeStaticCache(Cache):
def __init__(self, config: PretrainedConfig, max_cache_len: int) -> None:
text_config = config.get_text_config(decoder=True)
super().__init__(
layers=[
_DepthDecodeStaticLayerCache(max_cache_len=max_cache_len)
for _ in range(text_config.num_hidden_layers)
]
)
def get_seq_length(self, layer_idx: int = 0) -> int:
return self.layers[layer_idx].get_seq_length()
def get_max_cache_shape(self, layer_idx: int = 0) -> int:
return self.layers[layer_idx].get_max_cache_shape()
def reset(self) -> None:
for layer in self.layers:
layer.reset()
class ActionCudaGraphManager:
def __init__(self, model: Any) -> None:
self.model = model
self.enabled = True
self.action_flow_graph: Optional[_ActionFlowCudaGraph] = None
def set_enabled(self, enabled: bool) -> None:
self.enabled = bool(enabled)
def can_use_action_flow(self, inputs: _ActionFlowInputs) -> bool:
action_model = self.model
if not self.enabled:
return False
if action_model.training or action_model._require_action_expert().training:
return False
if inputs.trajectory.device.type != "cuda":
return False
def all_on_cuda():
yield inputs.trajectory
for k, v in inputs.context.kv_contexts:
yield k
yield v
for t in (
inputs.context.cross_mask,
inputs.context.self_mask,
inputs.context.valid_action,
inputs.action_dim_is_pad,
):
if t is not None:
yield t
if inputs.context.rope_cache is not None:
yield from inputs.context.rope_cache
for step in inputs.modulations:
yield step.conditioning
for block_modulation in step.block_modulations:
yield from block_modulation
yield from step.final_modulation
return all(t.device.type == "cuda" for t in all_on_cuda())
def run_action_flow(
self,
inputs: _ActionFlowInputs,
steps: int,
run_loop,
) -> torch.Tensor:
key = _cuda_graph_key(inputs, steps)
cache = self.action_flow_graph
if cache is None or cache.key != key:
static_inputs = _clone_static_inputs(inputs)
graph, output = _capture_cuda_graph(
lambda: run_loop(static_inputs, steps),
inputs.trajectory.device,
after_warmup=lambda: static_inputs.trajectory.copy_(inputs.trajectory),
)
cache = _ActionFlowCudaGraph(
key=key,
graph=graph,
static_inputs=static_inputs,
output=output,
)
self.action_flow_graph = cache
else:
_copy_inputs_(cache.static_inputs, inputs)
cache.graph.replay()
return cache.output.clone()
class DepthDecodeCudaGraphManager:
def __init__(self, model: Any) -> None:
self.model = model
self.backbone = model.model
self.enabled = True
self.graph: Optional[_DepthDecodeCudaGraph] = None
self.graph_spec: Optional[_DepthDecodeCudaGraphSpec] = None
def set_enabled(self, enabled: bool) -> None:
self.enabled = bool(enabled)
def make_static_cache(self, max_cache_len: int) -> _DepthDecodeStaticCache:
return _DepthDecodeStaticCache(
config=self.model.config.text_config,
max_cache_len=max_cache_len,
)
def _depth_decode_spec(self) -> _DepthDecodeCudaGraphSpec:
static = self.graph_spec
if static is None:
cfg = self.backbone.transformer.config
rotary_emb = getattr(self.backbone.transformer, "rotary_emb", None)
static = _DepthDecodeCudaGraphSpec(
eligible=(
not cfg.norm_after
and cfg.rope_scaling_layers is None
and getattr(rotary_emb, "rope_type", None) == "default"
and cfg._attn_implementation == "sdpa"
),
cache_key_prefix=(
cfg.hidden_size,
cfg.num_attention_heads,
cfg.num_key_value_heads,
cfg.head_dim,
cfg.num_hidden_layers,
cfg.use_qk_norm,
cfg.qk_norm_type,
cfg._attn_implementation,
),
num_hidden_layers=cfg.num_hidden_layers,
head_dim=cfg.head_dim,
num_attention_heads=cfg.num_attention_heads,
)
self.graph_spec = static
return static
def can_use(
self,
next_input_ids: torch.Tensor,
*,
past_key_values: Cache,
attention_bias: torch.Tensor,
) -> bool:
if (
not self.enabled
or self.model.training
or self.backbone.transformer.training
):
return False
if next_input_ids.device.type != "cuda":
return False
if (
next_input_ids.ndim != 2
or next_input_ids.shape[0] != 1
or next_input_ids.shape[1] != 1
):
return False
if not isinstance(past_key_values, _DepthDecodeStaticCache):
return False
if (
not torch.is_tensor(attention_bias)
or attention_bias.device != next_input_ids.device
):
return False
return self._depth_decode_spec().eligible
def _depth_decode_key(
self,
next_input_ids: torch.Tensor,
attention_bias: torch.Tensor,
) -> Tuple[Any, ...]:
device = next_input_ids.device
return (
self._depth_decode_spec().cache_key_prefix,
device.type,
device.index,
self.model.lm_head.weight.dtype,
attention_bias.shape[-1],
)
def _select_depth_decode_rope(
self, cos: torch.Tensor, sin: torch.Tensor, *, past_length: int
) -> None:
emb = self.backbone.transformer.rotary_emb
cos.copy_(emb._pos_cos_cache[0, :, past_length : past_length + 1, :])
sin.copy_(emb._pos_sin_cache[0, :, past_length : past_length + 1, :])
def _depth_decode_pre_layer(
self,
layer_idx: int,
hidden_states: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
block = self.backbone.transformer.blocks[layer_idx]
attention = block.self_attn
residual = hidden_states
hidden_states = block.attn_norm(hidden_states)
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, attention.head_dim)
qkv = attention.att_proj(hidden_states)
query_states, key_states, value_states = qkv.split(attention.fused_dims, dim=-1)
value_states = value_states.view(hidden_shape)
apply_qk_norm = attention.q_norm is not None and attention.k_norm is not None
norm_after_view = apply_qk_norm and attention.qk_norm_type == "qwen3"
if apply_qk_norm and not norm_after_view:
query_states = attention.q_norm(query_states)
key_states = attention.k_norm(key_states)
query_states = query_states.view(hidden_shape)
key_states = key_states.view(hidden_shape)
if norm_after_view:
query_states = attention.q_norm(query_states)
key_states = attention.k_norm(key_states)
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
query_states, key_states = _apply_rotary_pos_emb(
query_states, key_states, cos, sin
)
return residual, query_states, key_states, value_states
def _depth_decode_pre0(
self,
token_ids: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
inputs_embeds = self.model._embed_base_tokens(token_ids)
return self._depth_decode_pre_layer(0, inputs_embeds, cos, sin)
def _depth_decode_post_layer(
self,
layer_idx: int,
residual: torch.Tensor,
attn_context: torch.Tensor,
) -> torch.Tensor:
block = self.backbone.transformer.blocks[layer_idx]
attention = block.self_attn
input_shape = residual.shape[:-1]
attn_output = attn_context.reshape(*input_shape, -1).contiguous()
attn_output = attention.attn_out(attn_output)
hidden_states = residual + block.dropout(attn_output)
residual = hidden_states
hidden_states = block.ff_norm(hidden_states)
hidden_states = block.mlp(hidden_states)
hidden_states = residual + block.dropout(hidden_states)
return hidden_states
def _depth_decode_post_and_pre_next(
self,
layer_idx: int,
residual: torch.Tensor,
attn_context: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)
return self._depth_decode_pre_layer(layer_idx + 1, hidden_states, cos, sin)
def _depth_decode_last_post(
self,
layer_idx: int,
residual: torch.Tensor,
attn_context: torch.Tensor,
) -> torch.Tensor:
hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)
return self.backbone.transformer.ln_f(hidden_states)
def _build_depth_decode_graph(
self,
next_input_ids: torch.Tensor,
*,
past_length: int,
attention_bias: torch.Tensor,
) -> _DepthDecodeCudaGraph:
text_config = self.backbone.transformer.config
device = next_input_ids.device
dtype = self.model.lm_head.weight.dtype
static = self._depth_decode_spec()
num_layers = static.num_hidden_layers
head_dim = static.head_dim
max_cache_len = int(attention_bias.shape[-1])
max_rope_len = max(int(text_config.max_position_embeddings or 0), max_cache_len)
self.backbone.transformer.prepare_rope_cache(
device=device, max_seq_len=max_rope_len
)
token_ids = torch.empty((1, 1), device=device, dtype=torch.long)
cos = torch.empty((1, 1, head_dim), device=device, dtype=dtype)
sin = torch.empty_like(cos)
positions = torch.arange(max_cache_len, device=device, dtype=torch.long)
context_shape = (1, 1, static.num_attention_heads, head_dim)
token_ids.copy_(next_input_ids)
self._select_depth_decode_rope(cos, sin, past_length=past_length)
pre_graph, pre_output = _capture_cuda_graph(
lambda: self._depth_decode_pre0(token_ids, cos, sin),
device,
)
stages = [_DepthDecodeCudaGraphLayerStage(*pre_output)]
post_graphs = []
for layer_idx in range(num_layers - 1):
stage = stages[-1]
attn_context = torch.empty(context_shape, device=device, dtype=dtype)
graph, output = _capture_cuda_graph(
lambda layer_idx=layer_idx, stage=stage, attn_context=attn_context: (
self._depth_decode_post_and_pre_next(
layer_idx,
stage.residual,
attn_context,
cos,
sin,
)
),
device,
)
post_graphs.append(
_DepthDecodeCudaGraphPostStage(graph=graph, attn_context=attn_context)
)
stages.append(_DepthDecodeCudaGraphLayerStage(*output))
last_stage = stages[-1]
last_attn_context = torch.empty(context_shape, device=device, dtype=dtype)
last_graph, last_output = _capture_cuda_graph(
lambda: self._depth_decode_last_post(
num_layers - 1,
last_stage.residual,
last_attn_context,
),
device,
)
post_graphs.append(
_DepthDecodeCudaGraphPostStage(
graph=last_graph, attn_context=last_attn_context
)
)
return _DepthDecodeCudaGraph(
cache_key=self._depth_decode_key(next_input_ids, attention_bias),
pre_graph=pre_graph,
token_ids=token_ids,
cos=cos,
sin=sin,
positions=positions,
stages=tuple(stages),
post_graphs=tuple(post_graphs),
output=last_output,
)
def _get_depth_decode_graph(
self,
next_input_ids: torch.Tensor,
*,
past_length: int,
attention_bias: torch.Tensor,
) -> _DepthDecodeCudaGraph:
key = self._depth_decode_key(next_input_ids, attention_bias)
decode_graph = self.graph
if decode_graph is None or decode_graph.cache_key != key:
decode_graph = self._build_depth_decode_graph(
next_input_ids,
past_length=past_length,
attention_bias=attention_bias,
)
self.graph = decode_graph
else:
decode_graph.token_ids.copy_(next_input_ids)
self._select_depth_decode_rope(
decode_graph.cos, decode_graph.sin, past_length=past_length
)
return decode_graph
def _run_depth_decode_attention_core(
self,
layer_idx: int,
stage: _DepthDecodeCudaGraphLayerStage,
*,
past_key_values: Cache,
attention_bias: torch.Tensor,
cache_position: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
) -> torch.Tensor:
attention = self.backbone.transformer.blocks[layer_idx].self_attn
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_values.update(
stage.key,
stage.value,
layer_idx,
cache_kwargs,
)
key_states = _repeat_kv(key_states, attention.num_key_value_groups)
value_states = _repeat_kv(value_states, attention.num_key_value_groups)
attn_output = F.scaled_dot_product_attention(
stage.query,
key_states,
value_states,
attn_mask=attention_bias,
dropout_p=0.0,
is_causal=False,
)
return attn_output.transpose(1, 2)
def run(
self,
next_input_ids: torch.Tensor,
*,
past_key_values: Cache,
attention_bias: torch.Tensor,
past_length: int,
) -> Tuple[torch.Tensor, Cache]:
end = past_length + 1
decode_graph = self._get_depth_decode_graph(
next_input_ids,
past_length=past_length,
attention_bias=attention_bias,
)
cache_position = decode_graph.positions[past_length:end]
attention_bias_q = attention_bias[:, :, past_length:end, :end]
decode_graph.pre_graph.replay()
for layer_idx, post_graph in enumerate(decode_graph.post_graphs):
attn_context = self._run_depth_decode_attention_core(
layer_idx,
decode_graph.stages[layer_idx],
past_key_values=past_key_values,
attention_bias=attention_bias_q,
cache_position=cache_position,
cos=decode_graph.cos,
sin=decode_graph.sin,
)
post_graph.attn_context.copy_(attn_context)
post_graph.graph.replay()
return decode_graph.output, past_key_values
def _cuda_graph_tensor_signature(
tensor: Optional[torch.Tensor],
) -> Optional[Tuple[Any, ...]]:
if tensor is None:
return None
return (
tuple(tensor.shape),
tuple(tensor.stride()),
str(tensor.dtype),
str(tensor.device),
)
def _cuda_graph_context_signature(context: Any) -> Tuple[Any, ...]:
sig = _cuda_graph_tensor_signature
return (
tuple((sig(k), sig(v)) for k, v in context.kv_contexts),
sig(context.cross_mask),
sig(context.self_mask),
sig(context.valid_action),
None
if context.rope_cache is None
else tuple(sig(t) for t in context.rope_cache),
)
def _cuda_graph_modulation_signature(modulations: Sequence[Any]) -> Tuple[Any, ...]:
sig = _cuda_graph_tensor_signature
return tuple(
(
sig(step.conditioning),
tuple(
tuple(sig(t) for t in block_modulation)
for block_modulation in step.block_modulations
),
tuple(sig(t) for t in step.final_modulation),
)
for step in modulations
)
def _cuda_graph_key(inputs: _ActionFlowInputs, steps: int) -> Tuple[Any, ...]:
sig = _cuda_graph_tensor_signature
return (
sig(inputs.trajectory),
_cuda_graph_context_signature(inputs.context),
_cuda_graph_modulation_signature(inputs.modulations),
sig(inputs.action_dim_is_pad),
int(steps),
)
def _clone_static_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
if tensor is None:
return None
static = torch.empty_strided(
tuple(tensor.shape),
tuple(tensor.stride()),
device=tensor.device,
dtype=tensor.dtype,
)
static.copy_(tensor)
return static
def _clone_static_context(context: Any) -> Any:
rope_cache = None
if context.rope_cache is not None:
rope_cache = tuple(_clone_static_tensor(t) for t in context.rope_cache)
return context.__class__(
kv_contexts=tuple(
(_clone_static_tensor(k), _clone_static_tensor(v))
for k, v in context.kv_contexts
),
cross_mask=_clone_static_tensor(context.cross_mask),
self_mask=_clone_static_tensor(context.self_mask),
valid_action=_clone_static_tensor(context.valid_action),
rope_cache=rope_cache,
)
def _clone_static_modulations(modulations: Sequence[Any]) -> Sequence[Any]:
return tuple(
step.__class__(
conditioning=_clone_static_tensor(step.conditioning),
block_modulations=tuple(
tuple(_clone_static_tensor(t) for t in block_modulation)
for block_modulation in step.block_modulations
),
final_modulation=tuple(
_clone_static_tensor(t) for t in step.final_modulation
),
)
for step in modulations
)
def _clone_static_inputs(inputs: _ActionFlowInputs) -> _ActionFlowInputs:
return _ActionFlowInputs(
trajectory=_clone_static_tensor(inputs.trajectory),
context=_clone_static_context(inputs.context),
modulations=_clone_static_modulations(inputs.modulations),
action_dim_is_pad=_clone_static_tensor(inputs.action_dim_is_pad),
)
def _copy_context_(dst: Any, src: Any) -> None:
for (dst_k, dst_v), (src_k, src_v) in zip(dst.kv_contexts, src.kv_contexts):
dst_k.copy_(src_k)
dst_v.copy_(src_v)
if src.cross_mask is not None:
dst.cross_mask.copy_(src.cross_mask)
if src.self_mask is not None:
dst.self_mask.copy_(src.self_mask)
if src.valid_action is not None:
dst.valid_action.copy_(src.valid_action)
if src.rope_cache is not None:
for dst_tensor, src_tensor in zip(dst.rope_cache, src.rope_cache):
dst_tensor.copy_(src_tensor)
def _copy_inputs_(dst: _ActionFlowInputs, src: _ActionFlowInputs) -> None:
dst.trajectory.copy_(src.trajectory)
_copy_context_(dst.context, src.context)
if src.action_dim_is_pad is not None:
dst.action_dim_is_pad.copy_(src.action_dim_is_pad)
def _rotate_half(x: torch.Tensor) -> torch.Tensor:
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def _apply_rotary_pos_emb(
q: torch.Tensor,
k: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
unsqueeze_dim: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
cos = cos.unsqueeze(unsqueeze_dim)
sin = sin.unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (_rotate_half(q) * sin)
k_embed = (k * cos) + (_rotate_half(k) * sin)
return q_embed, k_embed
def _repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :, None, :, :].expand(
batch, num_key_value_heads, n_rep, slen, head_dim
)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
def _capture_cuda_graph(
fn,
device: torch.device,
*,
after_warmup=None,
) -> Tuple[torch.cuda.CUDAGraph, Any]:
warmup_stream = torch.cuda.Stream(device=device)
warmup_stream.wait_stream(torch.cuda.current_stream(device))
with torch.cuda.stream(warmup_stream):
fn()
torch.cuda.current_stream(device).wait_stream(warmup_stream)
if after_warmup is not None:
after_warmup()
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
output = fn()
return graph, output

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,436 @@
#!/usr/bin/env python
# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ruff: noqa
"""
Processor class for MolmoAct2.
"""
from typing import Optional, Union
import dataclasses
import numpy as np
from transformers.image_utils import ImageInput
from transformers.video_utils import VideoInput
from transformers.processing_utils import (
Unpack,
ProcessingKwargs,
ProcessorMixin,
)
from transformers.feature_extraction_utils import BatchFeature
from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
from transformers.utils import logging
from transformers import AutoTokenizer
from .image_processing_molmoact2 import MolmoAct2ImagesKwargs, MolmoAct2ImageProcessor
from .video_processing_molmoact2 import MolmoAct2VideoProcessorKwargs, MolmoAct2VideoProcessor
logger = logging.get_logger(__name__)
# Special tokens, these should be present in any tokenizer we use since the preprocessor uses them
IMAGE_PATCH_TOKEN = f"<im_patch>" # Where to insert high-res tokens
IMAGE_LOW_RES_TOKEN = f"<im_low>" # Where to insert low-res tokens
IM_START_TOKEN = f"<im_start>"
LOW_RES_IMAGE_START_TOKEN = f"<low_res_im_start>"
FRAME_START_TOKEN = f"<frame_start>"
IM_END_TOKEN = f"<im_end>"
FRAME_END_TOKEN= f"<frame_end>"
IM_COL_TOKEN = f"<im_col>"
IMAGE_PROMPT = "<|image|>"
VIDEO_PROMPT = "<|video|>"
IMAGE_TOKENS = [
IMAGE_PATCH_TOKEN,
IM_COL_TOKEN,
IM_START_TOKEN,
LOW_RES_IMAGE_START_TOKEN,
FRAME_START_TOKEN,
IM_END_TOKEN,
FRAME_END_TOKEN,
IMAGE_LOW_RES_TOKEN,
]
class MolmoAct2ProcessorKwargs(ProcessingKwargs, total=False):
"""MolmoAct2 processor kwargs"""
images_kwargs: MolmoAct2ImagesKwargs
videos_kwargs: MolmoAct2VideoProcessorKwargs
_defaults = {
"text_kwargs": {
"padding": False,
"return_mm_token_type_ids": True,
},
"videos_kwargs": {"return_metadata": True},
}
class MolmoAct2Processor(ProcessorMixin):
attributes = ["image_processor", "video_processor", "tokenizer"]
optional_attributes = [
"chat_template",
"time_mode",
"image_use_col_tokens",
"use_single_crop_col_tokens",
"use_single_crop_start_token",
"video_use_col_tokens",
"use_frame_special_tokens",
]
image_processor_class = "AutoImageProcessor"
video_processor_class = "AutoVideoProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(
self,
image_processor: MolmoAct2ImageProcessor = None,
video_processor: MolmoAct2VideoProcessor = None,
tokenizer: AutoTokenizer = None,
chat_template: Optional[str] = None,
image_use_col_tokens: Optional[bool] = True,
use_single_crop_col_tokens: Optional[bool] = None,
use_single_crop_start_token: Optional[bool] = True,
video_use_col_tokens: Optional[bool] = False,
use_frame_special_tokens: Optional[bool] = True,
**kwargs
) -> None:
super().__init__(
image_processor,
video_processor,
tokenizer,
chat_template=chat_template,
)
self.image_use_col_tokens = image_use_col_tokens
self.use_single_crop_col_tokens = use_single_crop_col_tokens
self.use_single_crop_start_token = use_single_crop_start_token
self.video_use_col_tokens = video_use_col_tokens
self.use_frame_special_tokens = use_frame_special_tokens
self.image_placeholder_token = IMAGE_PROMPT
self.video_placeholder_token = VIDEO_PROMPT
self.image_token_ids = [
tokenizer.convert_tokens_to_ids(token)
for token in IMAGE_TOKENS
]
def get_image_tokens(self, image_grid: np.ndarray):
resized_h, resized_w, height, width = image_grid
if int(height) == 0 or int(width) == 0:
per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
use_single_crop_col_tokens = (
self.image_use_col_tokens
if self.use_single_crop_col_tokens is None
else self.use_single_crop_col_tokens
)
if use_single_crop_col_tokens:
per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
joint = [
[IM_START_TOKEN],
np.tile(per_row, [resized_h]),
[IM_END_TOKEN],
]
return np.concatenate(joint)
per_row = np.full(width, IMAGE_PATCH_TOKEN)
if self.image_use_col_tokens:
per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
joint = [
[IM_START_TOKEN],
np.tile(per_row, [height]),
[IM_END_TOKEN],
]
per_row = np.full(resized_w, IMAGE_PATCH_TOKEN)
use_single_crop_col_tokens = (
self.image_use_col_tokens
if self.use_single_crop_col_tokens is None
else self.use_single_crop_col_tokens
)
image_start_token = (
LOW_RES_IMAGE_START_TOKEN
if self.use_single_crop_start_token
else IM_START_TOKEN
)
if use_single_crop_col_tokens:
per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
joint = [
[image_start_token],
np.tile(per_row, [resized_h]),
[IM_END_TOKEN],
] + joint
return np.concatenate(joint)
def get_video_string(
self,
video_grid: np.ndarray,
timestamps: np.ndarray,
):
if self.use_frame_special_tokens:
start_token_id = FRAME_START_TOKEN
end_token_id = FRAME_END_TOKEN
else:
start_token_id = IM_START_TOKEN
end_token_id = IM_END_TOKEN
num_frames, h, w = video_grid
video_string: str = ""
for frame_idx, frame_time in enumerate(timestamps):
# `per-frame-compact` time mode
prev_space = " " if frame_idx > 0 else ""
frame_prefix = prev_space + f"{frame_time:.1f} " # explicit whitespace before/after image tokens
video_string += frame_prefix
per_row = np.full(w, IMAGE_PATCH_TOKEN)
if self.video_use_col_tokens:
per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
extra_tokens = np.tile(per_row, [h])
video_tokens = [
[start_token_id],
extra_tokens,
[end_token_id],
]
video_string += "".join(np.concatenate(video_tokens, 0))
return video_string
def insert_bos(
self,
input_ids: np.ndarray,
attention_mask: np.ndarray,
bos_token_id: int,
pad_token_id: int,
):
"""
Args:
input_ids: [B, S] array with left padding
attention_mask: [B, S] array (0 for pad, 1 for valid)
bos_token_id: int
pad_token_id: int
Returns:
input_ids_out: [B, S] or [B, S+1] array with bos inserted if needed
attention_mask_out: same shape as input_ids_out
"""
need_to_expand = len(input_ids.shape) == 1
if need_to_expand:
input_ids = input_ids[None, :]
attention_mask = attention_mask[None, :]
B, S = input_ids.shape
# Handle zero-length sequence
if S == 0:
new_input_ids = np.full((B, 1), bos_token_id, dtype=input_ids.dtype)
new_attention_mask = np.ones((B, 1), dtype=attention_mask.dtype)
if need_to_expand:
new_input_ids = new_input_ids[0]
new_attention_mask = new_attention_mask[0]
return new_input_ids, new_attention_mask
first_valid_index = (attention_mask == 1).argmax(axis=-1) # [B]
bos_already_present = np.all(input_ids[np.arange(B), first_valid_index] == bos_token_id)
if bos_already_present:
if need_to_expand:
input_ids = input_ids[0]
attention_mask = attention_mask[0]
return input_ids, attention_mask
else:
new_input_ids = np.full((B, S+1), pad_token_id, dtype=input_ids.dtype)
new_attention_mask = np.zeros((B, S+1), dtype=attention_mask.dtype)
src_idx = np.tile(np.arange(S), (B, 1)) # [B, S]
valid_mask = src_idx >= first_valid_index[:, None] # [B, S]
tgt_idx = src_idx + 1 # shit right
batch_idx = np.tile(np.arange(B)[:, None], (1, S)) # [B, S]
# flatten valid_positions
flat_vals = input_ids[valid_mask]
flat_batch = batch_idx[valid_mask]
flat_tgt = tgt_idx[valid_mask]
new_input_ids[flat_batch, flat_tgt] = flat_vals
new_attention_mask[flat_batch, flat_tgt] = 1
insert_pos = first_valid_index
new_input_ids[np.arange(B), insert_pos] = bos_token_id
new_attention_mask[np.arange(B), insert_pos] = 1
if need_to_expand:
new_input_ids = new_input_ids[0]
new_attention_mask = new_attention_mask[0]
return new_input_ids, new_attention_mask
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
images: ImageInput = None,
videos: VideoInput = None,
**kwargs: Unpack[MolmoAct2ProcessorKwargs],
) -> BatchFeature:
"""
Args:
text (`str`, `list[str]`, `list[list[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
videos (`dict[str, Any]` or `list[dict[str, Any]]`):
The video or batch of videos to be prepared. Each video can be a dictionary with the following keys:
- `"frames"`: `np.ndarray` of shape (T, H, W, 3)
- `"timestamps"`: `np.ndarray` of shape (T,)
- `"sampled_fps"`: `float` (optional)
- `"sampling_augmentation"`: `str` (optional)
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
`BatchFeature`: A [`BatchFeature`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
- **image_token_pooling** -- Indices of the patches in `image_grids` to pool for each token in `image_tokens`.
Returned when `images` is not `None`.
- **image_grids** -- Grids of images. Returned when `images` is not `None`.
- **image_num_crops** -- Number of crops for each image. Returned when `images` is not `None`.
- **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
- **video_token_pooling** -- Indices of the patches in `video_grids` to pool for each token in `video_tokens`.
Returned when `videos` is not `None`.
- **video_grids** -- Grids of videos. Returned when `videos` is not `None`.
"""
output_kwargs = self._merge_kwargs(
MolmoAct2ProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
if images is not None:
image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
image_grids = image_inputs["image_grids"]
else:
image_inputs = {}
image_grids = None
if videos is not None:
videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
video_grids = videos_inputs["video_grids"]
# If user has not requested video metadata, pop it
if "return_metadata" not in kwargs:
video_metadata = videos_inputs.pop("video_metadata")
else:
video_metadata = videos_inputs["video_metadata"]
else:
videos_inputs = {}
video_grids = None
if not isinstance(text, list):
text = [text]
text = text.copy() # below lines change text in-place
if image_grids is not None:
index = 0
for i in range(len(text)):
num_images = text[i].count(self.image_placeholder_token)
image_grids_i = image_grids[index:index+num_images]
for image_grid in image_grids_i:
image_tokens = self.get_image_tokens(image_grid)
image_string = "".join(image_tokens)
text[i] = text[i].replace(self.image_placeholder_token, image_string, 1)
index += num_images
if video_grids is not None:
index = 0
for i in range(len(text)):
num_videos = text[i].count(self.video_placeholder_token)
assert num_videos in {0, 1}, "At most one video is supported for now"
video_grids_i = video_grids[index:index+num_videos]
metadata_i = video_metadata[index:index+num_videos]
for video_grid, metadata in zip(video_grids_i, metadata_i):
video_string = self.get_video_string(
video_grid,
metadata.timestamps,
)
text[i] = text[i].replace(self.video_placeholder_token, video_string, 1)
index += num_videos
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
input_ids = text_inputs["input_ids"]
attention_mask = text_inputs["attention_mask"]
input_ids = np.array(input_ids)
attention_mask = np.array(attention_mask)
bos = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
input_ids, attention_mask = self.insert_bos(
input_ids, attention_mask, bos, self.tokenizer.pad_token_id
)
if return_mm_token_type_ids:
image_tokens = np.array(self.image_token_ids).astype(input_ids.dtype)
token_type_ids = np.any(input_ids[:, :, None] == image_tokens[None, None, :], axis=-1)
text_inputs["token_type_ids"] = token_type_ids.tolist()
text_inputs["input_ids"] = input_ids.tolist()
text_inputs["attention_mask"] = attention_mask.tolist()
return BatchFeature(
data={**text_inputs, **image_inputs, **videos_inputs},
tensor_type=return_tensors,
)
def post_process_image_text_to_text(
self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
):
"""
Post-process the output of the model to decode the text.
Args:
generated_outputs (`torch.Tensor` or `np.ndarray`):
The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
or `(sequence_length,)`.
skip_special_tokens (`bool`, *optional*, defaults to `True`):
Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
**kwargs:
Additional arguments to be passed to the tokenizer's `batch_decode method`.
Returns:
`list[str]`: The decoded text.
"""
return self.tokenizer.batch_decode(
generated_outputs,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
MolmoAct2Processor.register_for_auto_class()

View File

@@ -0,0 +1,987 @@
#!/usr/bin/env python
# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ruff: noqa
"""Video processor class for MolmoAct2"""
from functools import partial
import os
import warnings
from contextlib import redirect_stdout
from io import BytesIO
from urllib.parse import urlparse
from typing import Optional, Union, Callable
import numpy as np
import requests
import einops
import torch
import torchvision.transforms
from transformers.image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ImageInput,
PILImageResampling,
SizeDict,
validate_kwargs,
)
from transformers.video_utils import (
VideoInput,
is_valid_video,
make_batched_videos,
make_batched_metadata,
VideoMetadata,
)
from transformers.processing_utils import Unpack, VideosKwargs
from transformers.video_processing_utils import BaseVideoProcessor
from transformers.utils import logging
from transformers.feature_extraction_utils import BatchFeature
from transformers.utils import (
is_av_available,
is_decord_available,
is_torchcodec_available,
is_yt_dlp_available,
TensorType,
logging,
to_numpy,
)
logger = logging.get_logger(__name__)
MAX_VIDEO_FPS = 8
def normalize_image(
image: np.ndarray,
image_mean: list[float],
image_std: list[float],
) -> np.ndarray:
if np.allclose(image_mean, [0.5, 0.5, 0.5]) and np.allclose(image_std, [0.5, 0.5, 0.5]):
return image * np.asarray(2.0, dtype=np.float32) - np.asarray(1.0, dtype=np.float32)
image -= np.array(image_mean, dtype=np.float32)[None, None, :]
image /= np.array(image_std, dtype=np.float32)[None, None, :]
return image
def resize_image(
image: np.ndarray,
desired_output_size: list[int],
resample: PILImageResampling,
) -> np.ndarray:
if len(image.shape) == 3:
is_video = False
image = torch.permute(torch.from_numpy(image), [2, 0, 1])
else:
is_video = True
image = torch.permute(torch.from_numpy(image), [0, 3, 1, 2])
dtype = image.dtype
if torch.is_floating_point(image):
in_min = 0.0
in_max = 1.0
resized = torchvision.transforms.Resize(
desired_output_size,
resample,
antialias=False,
)(image)
resized = torch.clip(resized, 0.0, 1.0).to(dtype)
else:
assert image.dtype == torch.uint8, "SigLIP expects float images or uint8 images, but got {}".format(image.dtype)
in_min = 0.0
in_max = 255.0
resized = torchvision.transforms.Resize(
desired_output_size,
resample,
antialias=False,
)(image)
resized = torch.clip(resized, 0, 255).to(dtype)
resized = resized.to(torch.float32)
resized = (resized - in_min) / (in_max - in_min)
if is_video:
resized = torch.permute(resized, [0, 2, 3, 1]).numpy()
else:
resized = torch.permute(resized, [1, 2, 0]).numpy()
return resized
def build_resized_image(
image: np.ndarray,
base_image_input_size: list[int],
resample: PILImageResampling,
image_mean: list[float],
image_std: list[float],
image_patch_size: int,
) -> tuple[np.ndarray, np.ndarray]:
resized = resize_image(
image, base_image_input_size, resample,
)
resized = normalize_image(resized, image_mean, image_std)
if len(resized.shape) == 3:
resized = np.expand_dims(resized, 0)
crop_patch_w = base_image_input_size[1] // image_patch_size
crop_patch_h = base_image_input_size[0] // image_patch_size
resize_idx = np.arange(crop_patch_w*crop_patch_h).reshape([crop_patch_h, crop_patch_w])
return resized, resize_idx
def batch_pixels_to_patches(array: np.ndarray, patch_size: int) -> np.ndarray:
"""Reshape images of [n_images, h, w, 3] -> [n_images, n_patches, pixels_per_patch]"""
if len(array.shape) == 3:
n_crops, h, w = array.shape
h_patches = h//patch_size
w_patches = w//patch_size
array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size])
array = np.transpose(array, [0, 1, 3, 2, 4])
array = np.reshape(array, [n_crops, h_patches*w_patches, patch_size*patch_size])
return array
else:
n_crops, h, w, c = array.shape
h_patches = h//patch_size
w_patches = w//patch_size
array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size, c])
array = np.transpose(array, [0, 1, 3, 2, 4, 5])
array = np.reshape(array, [n_crops, h_patches*w_patches, patch_size*patch_size*c])
return array
def arange_for_pooling(
idx_arr: np.ndarray,
pool_h: int,
pool_w: int,
) -> np.ndarray:
h_pad = pool_h * ((idx_arr.shape[0] + pool_h - 1) // pool_h) - idx_arr.shape[0]
w_pad = pool_w * ((idx_arr.shape[1] + pool_w - 1) // pool_w) - idx_arr.shape[1]
idx_arr = np.pad(idx_arr, [[h_pad//2, (h_pad+1)//2], [w_pad//2, (w_pad+1)//2]],
mode='constant',constant_values=-1)
return einops.rearrange(
idx_arr, "(h dh) (w dw) -> h w (dh dw)", dh=pool_h, dw=pool_w)
def image_to_patches_and_grids(
image: ImageInput,
base_image_input_size: list[int],
resample: PILImageResampling,
image_mean: list[float],
image_std: list[float],
image_patch_size: int,
image_pooling_w: int,
image_pooling_h: int,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
:return image_grids, the shape of each image after pooling
:return crops, the image crops to processes with the ViT
:return pooled_patch_idx, for each patch_id tokens in `image_tokens`, the indices of the
patches in `crops` to pool for that token, masked with -1
"""
if isinstance(base_image_input_size, int):
base_image_input_size = (base_image_input_size, base_image_input_size)
pooling_w = image_pooling_w
pooling_h = image_pooling_h
resized, resize_idx = build_resized_image(
image,
base_image_input_size,
resample,
image_mean,
image_std,
image_patch_size,
)
pooling_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
h, w = pooling_idx.shape[:2]
pooling_idx = pooling_idx.reshape([-1, pooling_h*pooling_w])
image_grid = [h, w]
return (
image_grid,
batch_pixels_to_patches(resized, image_patch_size),
pooling_idx,
)
def get_candidate_target_fps(
video_fps: Union[int, float],
sampling_fps: Union[int, float],
max_fps: Union[int, float] = MAX_VIDEO_FPS,
) -> list[float]:
"""
Return the subset of `video_fps` factors that remain multiples of `sampling_fps`.
Examples:
>>> get_candidate_target_fps(video_fps=6, sampling_fps=2)
[2, 6]
>>> get_candidate_target_fps(video_fps=5, sampling_fps=1)
[1, 5]
>>> get_candidate_target_fps(video_fps=2, sampling_fps=2)
[2]
>>> get_candidate_target_fps(video_fps=5, sampling_fps=2)
Traceback (most recent call last):
...
ValueError: sampling_fps=2 must divide video_fps=5 to produce consistent frame steps.
"""
video_fps = int(video_fps)
sampling_fps = int(sampling_fps)
max_fps = int(max_fps)
if sampling_fps is None:
raise ValueError("sampling_fps must be provided")
if video_fps <= 0 or sampling_fps <= 0:
raise ValueError(f"video_fps and sampling_fps must be positive (got {video_fps}, {sampling_fps})")
if video_fps % sampling_fps != 0:
raise ValueError(f"sampling_fps={sampling_fps} must divide video_fps={video_fps}.")
candidates = []
for candidate in range(sampling_fps, video_fps + 1, sampling_fps):
if candidate > max_fps:
break
if video_fps % candidate == 0:
candidates.append(float(candidate))
return candidates
def read_video_decord(
video_path,
sample_timestamps_fn: Callable,
**kwargs,
) -> np.ndarray:
"""
Decode a video using the Decord backend.
Args:
video_path (`str`):
Path to the video file.
sample_timestamps_fn (`Callable`):
A callable function that will return timestamps at which the video should be sampled.
Returns:
tuple[`np.array`, `VideoMetadata`]: A tuple containing:
- Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
- `VideoMetadata` object.
"""
# Lazy import from decord
import importlib
decord = importlib.import_module("decord")
vr = decord.VideoReader(uri=video_path, ctx=decord.cpu(0)) # decord has problems with gpu
video_fps = vr.get_avg_fps()
total_num_frames = len(vr)
time_stamps = vr.get_frame_timestamp(list(range(len(vr))))
duration = time_stamps[-1][1] - time_stamps[0][0]
metadata = VideoMetadata(
total_num_frames=int(total_num_frames),
fps=float(video_fps),
duration=float(duration),
video_backend="decord",
)
target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)
target_timestamps = np.array(target_timestamps)
offset = time_stamps[0, 0]
ix = np.searchsorted(time_stamps[:, 1], target_timestamps + offset, side='right')
ix = np.minimum(ix, len(time_stamps) - 1)
video = vr.get_batch(ix).asnumpy()
metadata.update(
{
"frames_indices": target_timestamps * video_fps,
"height": video.shape[1],
"width": video.shape[2],
}
)
return video, metadata
def read_video_torchcodec(
video_path,
sample_timestamps_fn: Callable,
**kwargs,
) -> np.ndarray:
"""
Decode a video using torchcodec decoder.
Args:
video_path (`str`):
Path to the video file.
sample_timestamps_fn (`Callable`):
A callable function that will return timestamps at which the video should be sampled.
Returns:
tuple[`np.array`, `VideoMetadata`]: A tuple containing:
- Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
- `VideoMetadata` object.
"""
# Lazy import torchcodec
import importlib
torchcodec = importlib.import_module("torchcodec")
decoder = torchcodec.decoders.VideoDecoder(
video_path,
# Interestingly `exact` mode takes less than approximate when we load the whole video
seek_mode="exact",
# Allow FFmpeg decide on the number of threads for efficiency
num_ffmpeg_threads=0,
)
# If the first frame starts at > 0, we effectively clip the video starting at that time
# since (most) video players would also skip to that time
time_offset = decoder.metadata.begin_stream_seconds_from_content
# Note this duration does assume we started playing at `time_offset`
duration = decoder.metadata.duration_seconds
metadata = VideoMetadata(
total_num_frames=decoder.metadata.num_frames,
fps=decoder.metadata.average_fps,
duration=duration,
video_backend="torchcodec",
height=decoder.metadata.height,
width=decoder.metadata.width,
)
target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)
# Floating point/rounding issues might cause `target_timestamps` to be very slightly
# out-of-bounds, to handle this we sanity check then clip them
assert all(x >= 0 for x in target_timestamps)
assert all(x < duration+1e-6 for x in target_timestamps)
# 1e-6 padding since torchcodec can throw out-of-bounds errors even if you ask for the
# exact boundary value, we should still get the first/last frame anyway
max_timestamp = decoder.metadata.end_stream_seconds_from_content - 1e-6
min_timestamp = decoder.metadata.begin_stream_seconds_from_content + 1e-6
# Note we avoid using numpy ops here to reduce floating precision issues
timestamps = [x + time_offset for x in target_timestamps]
timestamps = [max(min_timestamp, min(max_timestamp, x)) for x in timestamps]
video = decoder.get_frames_played_at(timestamps).data.numpy().transpose(0, 2, 3, 1) # Convert to THWC format
target_timestamps = np.array(target_timestamps)
metadata.frames_indices = target_timestamps * metadata.fps
return video, metadata
def read_video_pyav(
video_path,
sample_timestamps_fn: Callable,
**kwargs,
) -> np.ndarray:
"""
Decode a video using the PyAV backend.
Args:
video_path (`str`):
Path to the video file.
sample_timestamps_fn (`Callable`):
A callable function that will return timestamps at which the video should be sampled.
Returns:
tuple[`np.array`, `VideoMetadata`]: A tuple containing:
- Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
- `VideoMetadata` object.
"""
# Lazy import torchcodec
import importlib
av = importlib.import_module("av")
with av.open(video_path) as container:
video_stream = container.streams.video[0]
fps = video_stream.average_rate or video_stream.guessed_rate
it = container.decode(video=0)
frames = list(it)
stream = container.streams.video[0]
start = frames[0].pts * stream.time_base
container_end = stream.duration
if container_end is not None:
container_end *= stream.time_base
if container_end is None or container_end < frames[-1].pts:
# Some problem with stream duration, so use the frame PTS directly
# and guess the duration of the last frame
end = frames[-1].pts * stream.time_base + 1/fps
else:
end = container_end
duration = float(end - start)
metadata = VideoMetadata(
total_num_frames=len(frames),
fps=float(fps),
duration=float(duration),
video_backend="pyav",
height=video_stream.height,
width=video_stream.width,
)
target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)
offset = float(start)
target_timestamps = np.array(target_timestamps)
end_time_stamps = np.array([float(frame.pts * stream.time_base) for frame in frames[1:]] + [duration])
indices = np.searchsorted(end_time_stamps, target_timestamps + offset, side='right')
indices = np.minimum(indices, len(end_time_stamps) - 1)
video = np.stack(
[frames[i].to_ndarray(format="rgb24", channel_last=True) for i in indices],
axis=0,
)
metadata.frames_indices = target_timestamps * fps
return video, metadata
VIDEO_DECODERS = {
"decord": read_video_decord,
"torchcodec": read_video_torchcodec,
"pyav": read_video_pyav,
}
def load_video(
video: VideoInput,
backend: str = "decord",
sample_timestamps_fn: Optional[Callable] = None,
**kwargs,
):
"""
Loads `video` to a numpy array.
Args:
video (`VideoInput`):
The video to convert to the numpy array format. Can be a link to video or local path.
backend (`str`, *optional*, defaults to `"decord"`):
The backend to use when loading the video. Can be any of ["decord", "pyav", ""torchcodec"]. Defaults to "decord".
sample_timestamps_fn (`Callable`):
A callable function that will return timestamps at which the video should be sampled.
"""
# Early exit if provided an array or `PIL` frames
if not isinstance(video, str):
metadata = [None] * len(video)
return video, metadata
if urlparse(video).netloc in ["www.youtube.com", "youtube.com"]:
if not is_yt_dlp_available():
raise ImportError("To load a video from YouTube url you have to install `yt_dlp` first.")
# Lazy import from yt_dlp
import importlib
yt_dlp = importlib.import_module("yt_dlp")
buffer = BytesIO()
with redirect_stdout(buffer), yt_dlp.YoutubeDL() as f:
f.download([video])
bytes_obj = buffer.getvalue()
file_obj = BytesIO(bytes_obj)
elif video.startswith("http://") or video.startswith("https://"):
file_obj = BytesIO(requests.get(video).content)
elif os.path.isfile(video):
file_obj = video
else:
raise TypeError("Incorrect format used for video. Should be an url linking to an video or a local path.")
# can also load with decord, but not cv2/torchvision
# both will fail in case of url links
video_is_url = video.startswith("http://") or video.startswith("https://")
if video_is_url and backend == "opencv":
raise ValueError("If you are trying to load a video from URL, you cannot use 'opencv' as backend")
if (
(not is_decord_available() and backend == "decord")
or (not is_torchcodec_available() and backend == "torchcodec")
or (not is_av_available() and backend == "pyav")
):
raise ImportError(
f"You chose backend={backend} for loading the video but the required library is not found in your environment "
f"Make sure to install {backend} before loading the video."
)
video_decoder = VIDEO_DECODERS[backend]
video, metadata = video_decoder(file_obj, sample_timestamps_fn, **kwargs)
return video, metadata
def get_target_fps(
video_fps: float,
max_frames: int,
total_frames: int,
frame_sample_mode: str,
candidate_target_fps: tuple[float],
) -> float:
"""
Get the target fps that best spans the video and has the most frames sampled
"""
num_frames_sampled = 0
selected_target_fps = None
for target_fps in candidate_target_fps:
step_size = max(int(video_fps / target_fps), 1)
num_frames_sampled_at_fps = int(total_frames / step_size)
if num_frames_sampled == 0:
if "uniform" in frame_sample_mode:
if num_frames_sampled_at_fps > max_frames:
break
selected_target_fps = target_fps
num_frames_sampled = num_frames_sampled_at_fps
else:
# the candidate sampling fps increases so frame count can't decrease
assert num_frames_sampled <= num_frames_sampled_at_fps
if num_frames_sampled_at_fps > max_frames:
# choose the sampling fps that spans the video
continue
elif num_frames_sampled_at_fps > num_frames_sampled:
# both are less than max_frames, choose the one with higher density of frames sampled
selected_target_fps = target_fps
num_frames_sampled = num_frames_sampled_at_fps
return selected_target_fps
def get_frame_times_and_chosen_fps(
selected_target_fps,
total_frames,
max_frames,
video_fps
):
if selected_target_fps is None:
frame_indices = np.linspace(0, total_frames, max_frames, endpoint=False, dtype=int)
else:
step_size = max(int(video_fps / selected_target_fps), 1)
frame_indices = np.arange(0, total_frames, step_size)
if len(frame_indices) > max_frames:
frame_indices = frame_indices[:max_frames]
return selected_target_fps, frame_indices
class MolmoAct2VideoProcessorKwargs(VideosKwargs, total=False):
patch_size: Optional[int]
pooling_size: Optional[list[int]]
frame_sample_mode: Optional[str]
max_fps: Optional[int]
sampling_fps: Optional[int]
class MolmoAct2VideoProcessor(BaseVideoProcessor):
resample = PILImageResampling.BILINEAR
size = {"height": 378, "width": 378}
image_mean = IMAGENET_STANDARD_MEAN
image_std = IMAGENET_STANDARD_STD
do_resize = True
do_rescale = True
do_normalize = True
do_convert_rgb = True
patch_size = 14
pooling_size = [3, 3]
do_sample_frames = True
frame_sample_mode = "uniform_last_frame"
max_fps = 2
sampling_fps = 2
valid_kwargs = MolmoAct2VideoProcessorKwargs
model_input_names = ["pixel_values_videos", "video_token_pooling", "video_grids"]
def __init__(self, **kwargs: Unpack[MolmoAct2VideoProcessorKwargs]):
super().__init__(**kwargs)
if self.size is not None and (
self.size.get("height", None) is None or self.size.get("width", None) is None
):
raise ValueError("size must contain 'height' and 'width' keys.")
def _further_process_kwargs(
self,
size: Optional[SizeDict] = None,
**kwargs,
) -> dict:
"""
Update kwargs that need further processing before being validated
Can be overridden by subclasses to customize the processing of kwargs.
"""
if size is not None and ("height" not in size or "width" not in size):
raise ValueError("size must contain 'height' and 'width' keys.")
return super()._further_process_kwargs(size=size, **kwargs)
def sample_times(
self,
metadata: VideoMetadata,
frame_sample_mode: str,
num_frames: int,
max_fps: Optional[int] = None,
sampling_fps: Optional[int] = None,
**kwargs,
) -> np.ndarray:
"""
Time-based sampling if an array video is passed
Args:
metadata (`VideoMetadata`):
Metadata of the video containing information about total duration, fps and total number of frames.
frame_sample_mode (`str`, *optional*):
Mode to sample frames. Defaults to `self.frame_sample_mode`.
num_frames (`int`, *optional*):
Maximum number of frames to sample. Defaults to `self.num_frames`.
man_fps (`int`, *optional*):
Maximum frames per second to sample.
sampling_fps (`int`, *optional*):
Sampling frames per second. Defaults to `self.sampling_fps`.
Used when `frame_sample_mode` is `"fps"`.
"""
frame_sample_mode = frame_sample_mode or self.frame_sample_mode
num_frames = num_frames or self.num_frames
sampling_fps = sampling_fps or self.sampling_fps
duration = metadata.duration or metadata.total_num_frames / metadata.fps
if frame_sample_mode == "fps":
candidate_target_fps = get_candidate_target_fps(metadata.fps, sampling_fps)
# Try larger and larger FPSs until we hit one that can't span the video
target_fps = candidate_target_fps[0]
for candidate_fps in candidate_target_fps[1:]:
if num_frames / candidate_fps < duration:
break
target_fps = candidate_fps
times = np.arange(0, num_frames) / target_fps
times = times[times < duration]
return times
elif frame_sample_mode == "uniform_last_frame":
if max_fps is not None:
max_duration = (num_frames-1) / max_fps # -1 to include the last frame
if max_duration < duration:
times = np.linspace(
0, duration, num=num_frames, endpoint=True, dtype=np.float64
)
else:
times = np.arange(0.0, stop=duration, step=1/max_fps)
times = np.concatenate([times, [duration]], axis=0)
assert len(times) <= num_frames
else:
times = np.linspace(
0, duration, num=num_frames, endpoint=True, dtype=np.float64
)
return times
else:
raise NotImplementedError(frame_sample_mode)
def sample_frames(
self,
metadata: VideoMetadata,
frame_sample_mode: Optional[str] = None,
num_frames: Optional[int] = None,
max_fps: Optional[int] = None,
sampling_fps: Optional[int] = None,
**kwargs,
) -> np.ndarray:
"""
Frame-based sampling if an array video is passed
Args:
metadata (`VideoMetadata`):
Metadata of the video containing information about total duration, fps and total number of frames.
frame_sample_mode (`str`, *optional*):
Mode to sample frames. Defaults to `self.frame_sample_mode`.
num_frames (`int`, *optional*):
Maximum number of frames to sample. Defaults to `self.num_frames`.
max_fps (`int`, *optional*):
Maximum frames per second to sample.
sampling_fps (`int`, *optional*):
Sampling frames per second. Defaults to `self.sampling_fps`.
Used when `frame_sample_mode` is `"fps"`.
"""
frame_sample_mode = frame_sample_mode or self.frame_sample_mode
num_frames = num_frames or self.num_frames
sampling_fps = sampling_fps or self.sampling_fps
total_num_frames = metadata.total_num_frames
if frame_sample_mode == "uniform_last_frame" and max_fps is not None:
duration = total_num_frames / metadata.fps
if total_num_frames <= 2:
return np.arange(total_num_frames).astype(int)
if duration > (num_frames - 1) / max_fps: # -1 to include the last frame
# uniform fallback
indices = np.linspace(
0,
total_num_frames - 1,
num=min(num_frames, total_num_frames),
endpoint=True,
).astype(int)
return indices
else:
float_indices = np.arange(
0.0, stop=total_num_frames - 1, step=float(metadata.fps / max_fps),
)
if np.round(float_indices[-1]) != total_num_frames - 1:
float_indices = np.concatenate([float_indices, [total_num_frames - 1]], axis=0)
indices = np.round(float_indices).astype(int)
assert indices[-1] < total_num_frames
assert len(float_indices) <= num_frames
return indices
elif frame_sample_mode == "uniform_last_frame":
indices = np.linspace(
0, total_num_frames - 1, num=min(num_frames, total_num_frames), endpoint=True,
).astype(int)
return indices
elif frame_sample_mode == "fps":
candidate_target_fps = get_candidate_target_fps(metadata.fps, sampling_fps)
selected_target_fps = get_target_fps(
metadata.fps,
num_frames,
total_num_frames,
frame_sample_mode,
candidate_target_fps,
)
_, indices = get_frame_times_and_chosen_fps(
selected_target_fps,
total_num_frames,
num_frames,
metadata.fps,
)
return indices
else:
raise NotImplementedError(frame_sample_mode)
def fetch_videos(
self,
video_url_or_urls: Union[str, list[str], list[list[str]]],
sample_timestamps_fn=None
):
"""
Convert a single or a list of urls into the corresponding `np.array` objects.
If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
returned.
"""
if (
(not is_decord_available())
and (not is_torchcodec_available())
and (not is_av_available())
):
raise ImportError(
"MolmoAct2VideoProcessor requires `decord`, `torchcodec`, or `av` to be installed."
)
if is_decord_available():
backend = "decord"
elif is_torchcodec_available():
warnings.warn(
"`decord` is not installed and cannot be used to decode the video by default. "
"Falling back to `torchcodec`."
)
backend = "torchcodec"
else:
warnings.warn(
"`decord` is not installed and cannot be used to decode the video by default. "
"Falling back to `PyAV`."
)
backend = "pyav"
if isinstance(video_url_or_urls, list):
return list(zip(*[self.fetch_videos(x, sample_timestamps_fn=sample_timestamps_fn) for x in video_url_or_urls]))
else:
return load_video(video_url_or_urls, backend=backend, sample_timestamps_fn=sample_timestamps_fn)
def _decode_and_sample_videos(
self,
videos: VideoInput,
video_metadata: Union[VideoMetadata, dict],
do_sample_frames: Optional[bool] = None,
sample_indices_fn: Optional[Callable] = None,
sample_timestamps_fn: Optional[Callable] = None,
):
"""
Decode input videos and sample frames if needed.
"""
videos = make_batched_videos(videos)
video_metadata = make_batched_metadata(videos, video_metadata=video_metadata)
# Framed-based sampling if an array video is passed
# Otherwise, time-based sampling with decoding
if is_valid_video(videos[0]) and do_sample_frames:
assert video_metadata[0].fps is not None, "FPS must be provided for video input"
sampled_videos = []
sampled_metadata = []
for video, metadata in zip(videos, video_metadata):
indices = sample_indices_fn(metadata=metadata)
metadata.frames_indices = indices
sampled_videos.append(video[indices])
sampled_metadata.append(metadata)
videos = sampled_videos
video_metadata = sampled_metadata
elif not is_valid_video(videos[0]):
if sample_indices_fn is None:
logger.warning(
"do_sample_frames is False, but video array is not provided: "
"Will decode the video and sample frames using MolmoAct2's default sampling mode"
)
if isinstance(videos[0], list):
raise ValueError(
"A list of images is not supported for video input!"
)
else:
videos, video_metadata = self.fetch_videos(videos, sample_timestamps_fn=sample_timestamps_fn)
return videos, video_metadata
def _prepare_input_videos(
self,
videos: VideoInput,
**kwargs,
) -> list[np.ndarray]:
processed_videos = [to_numpy(video) for video in videos]
return processed_videos
def preprocess(
self,
videos: VideoInput,
**kwargs: Unpack[MolmoAct2VideoProcessorKwargs],
) -> BatchFeature:
validate_kwargs(
captured_kwargs=kwargs.keys(),
valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"],
)
# Set default kwargs from self. This ensures that if a kwarg is not provided
# by the user, it gets its default value from the instance, or is set to None.
for kwarg_name in self.valid_kwargs.__annotations__:
kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
do_sample_frames = kwargs.pop("do_sample_frames")
video_metadata = kwargs.pop("video_metadata")
sample_indices_fn = partial(self.sample_frames, **kwargs) if do_sample_frames else None
sample_timestamps_fn = partial(self.sample_times, **kwargs)
videos, video_metadata = self._decode_and_sample_videos(
videos,
video_metadata=video_metadata,
do_sample_frames=do_sample_frames,
sample_indices_fn=sample_indices_fn,
sample_timestamps_fn=sample_timestamps_fn,
)
videos = self._prepare_input_videos(videos=videos)
kwargs = self._further_process_kwargs(**kwargs)
return_metadata = kwargs.pop("return_metadata")
preprocessed_videos = self._preprocess(videos=videos, **kwargs)
if return_metadata:
preprocessed_videos["video_metadata"] = video_metadata
return preprocessed_videos
def _preprocess(
self,
videos: list[np.ndarray],
size: Optional[SizeDict] = None,
resample: Optional[PILImageResampling] = None,
image_mean: Optional[Union[float, list[float]]] = None,
image_std: Optional[Union[float, list[float]]] = None,
do_convert_rgb: Optional[bool] = None,
patch_size: Optional[int] = None,
pooling_size: Optional[list[int]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchFeature:
"""
Preprocess a video for the model.
Args:
videos (`VideoInput`):
Video to preprocess.
size (`SizeDict`, *optional*, defaults to `self.size`):
Size of the image after resizing.
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
Resampling filter to use when resizing the image. This can be one of the enum `PILImageResampling`. Only
has an effect if `do_resize` is set to `True`.
image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
`True`.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
patch_size (`int`, *optional*, defaults to `self.patch_size`):
The spatial patch size of the vision encoder.
pooling_size (`list[int]`, *optional*, defaults to `self.pooling_size`):
The pooling size of the vision adapter.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
Returns:
A `BatchFeature` containing the following keys:
- `pixel_values_videos`: The preprocessed videos.
- `video_token_pooling`: The indices of the patches in `crops` to pool for each token in `video_tokens`.
- `video_grids`: The video grids.
"""
if size.height is None or size.width is None:
raise ValueError("size must contain 'height' and 'width' keys.")
base_image_input_size = [size.height, size.width]
resample = resample or self.resample
image_mean = image_mean or self.image_mean
image_std = image_std or self.image_std
do_convert_rgb = do_convert_rgb or self.do_convert_rgb
patch_size = patch_size or self.patch_size
pooling_size = pooling_size or self.pooling_size
image_pooling_h, image_pooling_w = pooling_size
batch_grids = []
batch_crops = []
batch_pooled_patches_idx = []
for video in videos:
all_crops = []
pooled_patches_idx = []
for frame in video:
image_grid, crops, pooled_idx = image_to_patches_and_grids(
frame,
base_image_input_size,
resample,
image_mean,
image_std,
patch_size,
image_pooling_w,
image_pooling_h,
)
offset = sum(np.prod(x.shape[:2]) for x in all_crops)
pooled_idx_with_offset = np.where(pooled_idx >= 0, pooled_idx + offset, pooled_idx)
pooled_patches_idx.append(pooled_idx_with_offset)
all_crops.append(crops)
video_grid = np.array([len(video), image_grid[0], image_grid[1]])
all_crops = np.concatenate(all_crops, 0)
pooled_patches_idx = np.concatenate(pooled_patches_idx, 0)
batch_grids.append(video_grid)
batch_crops.append(all_crops)
batch_pooled_patches_idx.append(pooled_patches_idx)
video_grids = np.stack(batch_grids, 0)
pixel_values_videos = np.concatenate(batch_crops, 0)
video_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
data =dict(
pixel_values_videos=pixel_values_videos,
video_token_pooling=video_token_pooling,
video_grids=video_grids,
)
return BatchFeature(data, tensor_type=return_tensors)
MolmoAct2VideoProcessor.register_for_auto_class()

View File

@@ -16,6 +16,8 @@
from __future__ import annotations
import json
import os
import types
from collections import deque
from contextlib import nullcontext
@@ -24,9 +26,10 @@ from typing import TYPE_CHECKING, Any
import numpy as np
import torch
import torch.nn.functional as F # noqa: N812
import torch.utils.checkpoint
from safetensors.torch import load_file as load_safetensors_file
from torch import Tensor
from torch.distributions import Beta
from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.utils.constants import ACTION
@@ -36,10 +39,13 @@ from ..rtc.modeling_rtc import RTCProcessor
from .configuration_molmoact2 import MolmoAct2Config, _hf_token, _resolve_checkpoint_location
if TYPE_CHECKING or _transformers_available:
from transformers import AutoModelForImageTextToText, AutoProcessor
from .hf_model.action_tokenizer import UniversalActionProcessor
from .hf_model.configuration_molmoact2 import MolmoAct2Config as HFMolmoAct2Config
from .hf_model.modeling_molmoact2 import MolmoAct2ForConditionalGeneration
else:
AutoModelForImageTextToText = None
AutoProcessor = None
UniversalActionProcessor = None
HFMolmoAct2Config = None
MolmoAct2ForConditionalGeneration = None
_MODEL_INPUT_KEYS = {
"input_ids",
@@ -58,6 +64,39 @@ _MODEL_INPUT_KEYS = {
}
def _strict_load_safetensors_weights(model: torch.nn.Module, checkpoint_location: str) -> None:
index_path = os.path.join(checkpoint_location, SAFE_WEIGHTS_INDEX_NAME)
single_file_path = os.path.join(checkpoint_location, SAFE_WEIGHTS_NAME)
if os.path.isfile(index_path):
with open(index_path, encoding="utf-8") as f:
index = json.load(f)
weight_map = index["weight_map"]
loaded_keys = set(weight_map)
model_keys = set(model.state_dict())
missing_keys = sorted(model_keys - loaded_keys)
unexpected_keys = sorted(loaded_keys - model_keys)
if missing_keys or unexpected_keys:
message = ["MolmoAct2 safetensors do not match the local model implementation."]
if missing_keys:
message.append(f"Missing keys: {missing_keys[:8]}")
if unexpected_keys:
message.append(f"Unexpected keys: {unexpected_keys[:8]}")
raise RuntimeError(" ".join(message))
for shard_file in sorted(set(weight_map.values())):
state_dict = load_safetensors_file(os.path.join(checkpoint_location, shard_file), device="cpu")
model.load_state_dict(state_dict, strict=False)
del state_dict
return
if os.path.isfile(single_file_path):
state_dict = load_safetensors_file(single_file_path, device="cpu")
model.load_state_dict(state_dict, strict=True)
return
raise FileNotFoundError(
f"MolmoAct2 checkpoint at {checkpoint_location} must contain {SAFE_WEIGHTS_NAME} "
f"or {SAFE_WEIGHTS_INDEX_NAME}."
)
def _torch_dtype(dtype: str) -> torch.dtype:
if dtype == "float32":
return torch.float32
@@ -91,480 +130,6 @@ def _sample_beta_timesteps(
return time_offset + scale * samples
def _patch_batched_image_attention_bias(backbone: Any) -> None:
original = getattr(backbone, "_build_native_attention_bias", None)
if original is None:
return
original_func = getattr(original, "__func__", original)
original_globals = getattr(original_func, "__globals__", {})
cache_seq_len = original_globals.get("_cache_seq_len_int")
cache_max_len = original_globals.get("_cache_max_len_int")
if cache_seq_len is None or cache_max_len is None:
return
def _build_native_attention_bias(
self,
*,
inputs_embeds: Tensor,
attention_mask: Tensor | None,
token_type_ids: Tensor | None,
past_key_values: Any,
) -> Tensor:
if attention_mask is not None and attention_mask.ndim == 4:
return attention_mask.to(device=inputs_embeds.device)
batch_size, seq_len = inputs_embeds.shape[:2]
past_length = int(cache_seq_len(past_key_values))
current_length = past_length + int(seq_len)
max_cache_len = int(cache_max_len(past_key_values))
attention_mask_len = max_cache_len if max_cache_len > 0 else current_length
device = inputs_embeds.device
if attention_mask is None:
positions = torch.arange(attention_mask_len, device=device)
valid_mask = positions.unsqueeze(0) < current_length
valid_mask = valid_mask.expand(batch_size, -1)
elif attention_mask.ndim == 2:
valid_mask = torch.zeros((batch_size, attention_mask_len), device=device, dtype=torch.bool)
source_mask = attention_mask.to(device=device, dtype=torch.bool)
copy_len = min(int(source_mask.shape[-1]), attention_mask_len)
if copy_len > 0:
valid_mask[:, :copy_len] = source_mask[:, :copy_len]
if attention_mask_len > current_length:
valid_mask[:, current_length:] = False
else:
raise ValueError(f"Unsupported attention_mask shape for MolmoAct2: {tuple(attention_mask.shape)}")
valid_mask = valid_mask[:, None, None, :]
causal_mask = torch.tril(
torch.ones(attention_mask_len, attention_mask_len, device=device, dtype=torch.bool)
)[None, None, past_length:current_length, :attention_mask_len]
if token_type_ids is not None and past_length == 0:
causal_mask = causal_mask.expand(batch_size, -1, -1, -1).clone()
image_mask = token_type_ids.to(device=device, dtype=torch.bool)
can_attend_back = image_mask[:, :, None] & image_mask[:, None, :]
image_len = min(int(token_type_ids.shape[1]), attention_mask_len)
causal_mask[:, :, :, :image_len] = (
causal_mask[:, :, :, :image_len] | can_attend_back[:, None, :, :image_len]
)
allowed = valid_mask & causal_mask
return torch.where(
allowed,
torch.zeros((), device=device, dtype=inputs_embeds.dtype),
torch.full((), torch.finfo(inputs_embeds.dtype).min, device=device, dtype=inputs_embeds.dtype),
)
backbone._build_native_attention_bias = types.MethodType(_build_native_attention_bias, backbone)
def _patch_leaf_safe_input_embedding_update(backbone: Any) -> None:
if getattr(backbone, "_lerobot_leaf_safe_input_embedding_update_patched", False):
return
if not callable(getattr(backbone, "build_input_embeddings", None)):
return
def _build_input_embeddings(
self,
input_ids: Tensor,
images: Tensor | None = None,
token_pooling: Tensor | None = None,
) -> tuple[Tensor, Tensor | None]:
input_ids = input_ids * (input_ids != -1).to(input_ids.dtype)
x = self.transformer.wte(input_ids)
image_features = None
if images is not None:
image_features = self.vision_backbone(images, token_pooling).to(x.device)
is_image_patch = input_ids.reshape(-1) == self.config.image_patch_id
if is_image_patch.sum() != len(image_features):
raise RuntimeError(
f"Expected {int(is_image_patch.sum())} image patch embeddings, got {len(image_features)}."
)
flat_x = x.reshape(-1, x.shape[-1]).clone()
flat_x[is_image_patch] = flat_x[is_image_patch] + image_features
x = flat_x.reshape_as(x)
x = self.transformer.emb_drop(x)
return x, image_features
backbone.build_input_embeddings = types.MethodType(_build_input_embeddings, backbone)
backbone._lerobot_leaf_safe_input_embedding_update_patched = True
def _patch_memory_efficient_vision_backbone(backbone: Any, *, gradient_checkpointing: bool) -> None:
vision_backbone = getattr(backbone, "vision_backbone", None)
if vision_backbone is None or getattr(
vision_backbone, "_lerobot_memory_efficient_vision_backbone_patched", False
):
return
image_vit = getattr(vision_backbone, "image_vit", None)
transformer = getattr(image_vit, "transformer", None)
resblocks = getattr(transformer, "resblocks", None)
if image_vit is None or transformer is None or resblocks is None:
return
if not hasattr(vision_backbone, "vit_layers"):
return
def _encode_image(self, images: Tensor) -> Tensor:
batch_size, num_crops, num_patches, patch_dim = images.shape
images = images.view(batch_size * num_crops, num_patches, patch_dim)
x = self.image_vit.patch_embedding(images)
x = self.image_vit.add_pos_emb(x, self.image_vit.config.image_num_patch)
needed_layers = {int(layer) for layer in self.vit_layers}
selected_features: dict[int, Tensor] = {}
use_checkpoint = bool(
self._lerobot_vision_gradient_checkpointing and self.training and torch.is_grad_enabled()
)
for layer_idx, block in enumerate(self.image_vit.transformer.resblocks):
if use_checkpoint:
x = torch.utils.checkpoint.checkpoint(block, x, use_reentrant=False)
else:
x = block(x)
if layer_idx in needed_layers:
selected_features[layer_idx] = x
if len(selected_features) != len(needed_layers):
missing = sorted(needed_layers - set(selected_features))
raise RuntimeError(f"MolmoAct2 vision backbone did not produce requested layers: {missing}.")
image_features = torch.cat([selected_features[int(layer)] for layer in self.vit_layers], dim=-1)
if self.num_prefix_tokens > 0:
image_features = image_features[:, 1:]
image_features = image_features.view(batch_size, num_crops, num_patches, -1)
return image_features
vision_backbone.encode_image = types.MethodType(_encode_image, vision_backbone)
vision_backbone._lerobot_vision_gradient_checkpointing = bool(gradient_checkpointing)
vision_backbone._lerobot_memory_efficient_vision_backbone_patched = True
def _patch_training_kv_collection(backbone: Any) -> None:
"""Expose per-layer VLM KV tensors without enabling HF autoregressive cache."""
if getattr(backbone, "_lerobot_training_kv_collection_patched", False):
return
transformer = getattr(backbone, "transformer", None)
blocks = getattr(transformer, "blocks", None)
if transformer is None or blocks is None:
raise RuntimeError("MolmoAct2 checkpoint does not expose a patchable text transformer.")
original_transformer_forward = transformer.forward
from transformers.masking_utils import create_causal_mask
from transformers.modeling_outputs import BaseModelOutputWithPast
def _patch_attention(attention: torch.nn.Module) -> None:
if getattr(attention, "_lerobot_training_kv_collection_patched", False):
return
original_attention_forward = attention.forward
original_attention_func = getattr(original_attention_forward, "__func__", original_attention_forward)
attention_globals = getattr(original_attention_func, "__globals__", {})
apply_rotary_pos_emb = attention_globals.get("apply_rotary_pos_emb")
repeat_kv = attention_globals.get("repeat_kv")
eager_attention_forward = attention_globals.get("eager_attention_forward")
all_attention_functions = attention_globals.get("ALL_ATTENTION_FUNCTIONS")
if (
apply_rotary_pos_emb is None
or repeat_kv is None
or eager_attention_forward is None
or all_attention_functions is None
):
raise RuntimeError("MolmoAct2 attention internals changed; cannot patch KV collection.")
def _attention_forward(
self,
hidden_states: Tensor,
position_embeddings: tuple[Tensor, Tensor],
attention_mask: Tensor | None,
past_key_values: Any | None = None,
cache_position: Tensor | None = None,
**kwargs,
):
collect_layer_kv_states = bool(kwargs.pop("collect_layer_kv_states", False))
if not collect_layer_kv_states:
return original_attention_forward(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
past_key_values=past_key_values,
cache_position=cache_position,
**kwargs,
)
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim)
qkv = self.att_proj(hidden_states)
query_states, key_states, value_states = qkv.split(self.fused_dims, dim=-1)
value_states = value_states.view(hidden_shape)
if self.q_norm is not None and self.k_norm is not None and self.qk_norm_type != "qwen3":
query_states = self.q_norm(query_states)
key_states = self.k_norm(key_states)
query_states = query_states.view(hidden_shape)
key_states = key_states.view(hidden_shape)
if self.q_norm is not None and self.k_norm is not None and self.qk_norm_type == "qwen3":
query_states = self.q_norm(query_states)
key_states = self.k_norm(key_states)
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_values is not None:
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_values.update(
key_states, value_states, self.layer_idx, cache_kwargs
)
collected_key_states = key_states
collected_value_states = value_states
dropout_p = 0.0 if not self.training else self.attention_dropout
if self.config._attn_implementation == "sdpa" and (
attention_mask is None or torch.is_tensor(attention_mask)
):
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_output = F.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=attention_mask,
dropout_p=dropout_p,
is_causal=attention_mask is None,
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_weights = None
else:
attention_interface = eager_attention_forward
if self.config._attn_implementation != "eager":
attention_interface = all_attention_functions[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
query_states,
key_states,
value_states,
attention_mask,
dropout=dropout_p,
scaling=self.scaling,
**kwargs,
)
attn_output = attn_output.reshape(*input_shape, -1).contiguous()
attn_output = self.attn_out(attn_output)
return attn_output, attn_weights, collected_key_states, collected_value_states
attention.forward = types.MethodType(_attention_forward, attention)
attention._lerobot_training_kv_collection_patched = True
def _patch_decoder_layer(layer: torch.nn.Module) -> None:
if getattr(layer, "_lerobot_training_kv_collection_patched", False):
return
_patch_attention(layer.self_attn)
original_layer_forward = layer.forward
is_post_norm = "PostNorm" in layer.__class__.__name__
def _decoder_layer_forward(
self,
hidden_states: Tensor,
position_embeddings: tuple[Tensor, Tensor],
attention_mask: Tensor | None = None,
position_ids: Tensor | None = None,
past_key_values: Any | None = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Tensor | None = None,
**kwargs,
):
collect_layer_kv_states = bool(kwargs.pop("collect_layer_kv_states", False))
if not collect_layer_kv_states:
return original_layer_forward(
hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
**kwargs,
)
residual = hidden_states
attn_input = hidden_states if is_post_norm else self.attn_norm(hidden_states)
attn_output, self_attn_weights, key_states, value_states = self.self_attn(
hidden_states=attn_input,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
collect_layer_kv_states=True,
**kwargs,
)
if is_post_norm:
attn_output = self.attn_norm(attn_output)
hidden_states = residual + self.dropout(attn_output)
residual = hidden_states
if is_post_norm:
hidden_states = self.mlp(hidden_states)
hidden_states = self.ff_norm(hidden_states)
else:
hidden_states = self.ff_norm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + self.dropout(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
return outputs + (key_states, value_states)
layer.forward = types.MethodType(_decoder_layer_forward, layer)
layer._lerobot_training_kv_collection_patched = True
for block in blocks:
_patch_decoder_layer(block)
def _transformer_forward(
self,
input_ids: Tensor | None = None,
attention_mask: Tensor | None = None,
position_ids: Tensor | None = None,
past_key_values: Any | None = None,
inputs_embeds: Tensor | None = None,
use_cache: bool | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
cache_position: Tensor | None = None,
**kwargs,
):
collect_layer_kv_states = bool(kwargs.pop("collect_layer_kv_states", False))
if not collect_layer_kv_states:
return original_transformer_forward(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
cache_position=cache_position,
**kwargs,
)
if past_key_values is not None:
raise ValueError("collect_layer_kv_states only supports full-sequence training forwards.")
output_attentions = (
output_attentions if output_attentions is not None else self.config.output_attentions
)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = False
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
if inputs_embeds is None:
input_ids = input_ids * (input_ids != -1).to(input_ids.dtype)
inputs_embeds = self.wte(input_ids)
if cache_position is None:
cache_position = torch.arange(
0,
inputs_embeds.shape[1],
device=inputs_embeds.device,
)
if position_ids is None:
position_ids = cache_position.unsqueeze(0)
if torch.is_tensor(attention_mask) and attention_mask.ndim == 4:
causal_mask_mapping = attention_mask
elif not isinstance(causal_mask_mapping := attention_mask, dict):
mask_kwargs = {
"config": self.config,
"input_embeds": inputs_embeds,
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": None,
"position_ids": position_ids,
}
causal_mask_mapping = create_causal_mask(**mask_kwargs)
hidden_states = inputs_embeds
if self.config.rope_scaling_layers is not None:
position_embeddings_mapping = {
"default": self.rotary_embs["default"](hidden_states, position_ids),
"scaling": self.rotary_embs["scaling"](hidden_states, position_ids),
}
else:
position_embeddings = self.rotary_emb(hidden_states, position_ids)
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
collected_kv_states = []
for layer_idx, decoder_block in enumerate(self.blocks[: self.config.num_hidden_layers]):
if output_hidden_states:
all_hidden_states += (hidden_states,)
if self.config.rope_scaling_layers is not None:
position_embeddings_i = (
position_embeddings_mapping["scaling"]
if layer_idx in self.config.rope_scaling_layers
else position_embeddings_mapping["default"]
)
else:
position_embeddings_i = position_embeddings
layer_outputs = decoder_block(
hidden_states,
position_embeddings=position_embeddings_i,
attention_mask=causal_mask_mapping,
position_ids=position_ids,
past_key_values=None,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
collect_layer_kv_states=True,
**kwargs,
)
hidden_states = layer_outputs[0]
output_idx = 1
if output_attentions:
all_self_attns += (layer_outputs[output_idx],)
output_idx += 1
collected_kv_states.append((layer_outputs[output_idx], layer_outputs[output_idx + 1]))
hidden_states = self.ln_f(hidden_states)
if output_hidden_states:
all_hidden_states += (hidden_states,)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=tuple(collected_kv_states),
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
transformer.forward = types.MethodType(_transformer_forward, transformer)
backbone._lerobot_training_kv_collection_patched = True
class MolmoAct2Policy(PreTrainedPolicy):
config_class = MolmoAct2Config
@@ -604,13 +169,22 @@ class MolmoAct2Policy(PreTrainedPolicy):
force_download=bool(self.config.checkpoint_force_download),
)
model_dtype = _torch_dtype(self.config.model_dtype)
self.model = AutoModelForImageTextToText.from_pretrained(
if HFMolmoAct2Config is None or MolmoAct2ForConditionalGeneration is None:
raise RuntimeError("transformers is required to load MolmoAct2 checkpoints.")
hf_config = HFMolmoAct2Config.from_pretrained(
checkpoint_location,
trust_remote_code=self.config.trust_remote_code,
token=_hf_token(),
)
self.model = MolmoAct2ForConditionalGeneration.from_pretrained(
checkpoint_location,
config=hf_config,
dtype=model_dtype,
low_cpu_mem_usage=True,
token=_hf_token(),
)
# Keep Hub loading limited to local code plus safetensors, and verify the
# local implementation exactly matches the checkpoint key space.
_strict_load_safetensors_weights(self.model, checkpoint_location)
hf_max_action_dim = int(getattr(self.model.config, "max_action_dim", -1))
if hf_max_action_dim != int(self.config.expected_max_action_dim):
raise ValueError(
@@ -642,13 +216,6 @@ class MolmoAct2Policy(PreTrainedPolicy):
self._freeze_input_embeddings()
if self.config.train_action_expert_only:
self._freeze_non_action_expert_parameters()
_patch_batched_image_attention_bias(self._backbone())
_patch_leaf_safe_input_embedding_update(self._backbone())
_patch_memory_efficient_vision_backbone(
self._backbone(),
gradient_checkpointing=bool(self.config.gradient_checkpointing),
)
_patch_training_kv_collection(self._backbone())
if self.config.gradient_checkpointing:
self._enable_gradient_checkpointing()
self.train(self.training)
@@ -703,6 +270,9 @@ class MolmoAct2Policy(PreTrainedPolicy):
transformer = getattr(self._backbone(), "transformer", None)
if transformer is not None:
transformer.gradient_checkpointing = True
vision_backbone = getattr(self._backbone(), "vision_backbone", None)
if vision_backbone is not None:
vision_backbone.gradient_checkpointing = True
def _freeze_non_action_expert_parameters(self) -> None:
trainable_params = 0
@@ -914,10 +484,10 @@ class MolmoAct2Policy(PreTrainedPolicy):
if self.action_tokenizer is None:
require_package("transformers", extra="molmoact2")
self.action_tokenizer = AutoProcessor.from_pretrained(
if UniversalActionProcessor is None:
raise RuntimeError("transformers is required to load MolmoAct2 action tokenizer.")
self.action_tokenizer = UniversalActionProcessor.from_pretrained_local(
self.config.discrete_action_tokenizer,
trust_remote_code=self.config.trust_remote_code,
token=_hf_token(),
)
return self.action_tokenizer

View File

@@ -57,9 +57,18 @@ from lerobot.utils.import_utils import _transformers_available, require_package
from .configuration_molmoact2 import MolmoAct2Config, infer_molmoact2_max_sequence_length
if TYPE_CHECKING or _transformers_available:
from transformers import AutoProcessor
from transformers import Qwen2Tokenizer
from .hf_model.action_tokenizer import UniversalActionProcessor
from .hf_model.image_processing_molmoact2 import MolmoAct2ImageProcessor
from .hf_model.processing_molmoact2 import MolmoAct2Processor
from .hf_model.video_processing_molmoact2 import MolmoAct2VideoProcessor
else:
AutoProcessor = None
Qwen2Tokenizer = None
UniversalActionProcessor = None
MolmoAct2ImageProcessor = None
MolmoAct2Processor = None
MolmoAct2VideoProcessor = None
ACTION_OUTPUT_TOKEN = "<action_output>" # nosec B105
ACTION_START_TOKEN = "<action_start>" # nosec B105
@@ -106,6 +115,7 @@ def _resolve_checkpoint_location(
repo_type="model",
revision=revision,
force_download=force_download,
ignore_patterns=["*.py", "*.pyc", "__pycache__/*"],
token=_hf_token(),
)
@@ -169,6 +179,59 @@ def _load_hf_norm_stats_for_tag(
return {ACTION: numeric_stats(action_stats), OBS_STATE: numeric_stats(state_stats)}, metadata
def _strip_processor_config(config: dict[str, Any], *metadata_keys: str) -> dict[str, Any]:
return {
key: value
for key, value in config.items()
if key not in {"auto_map", "processor_class", *metadata_keys}
}
def _load_local_molmoact2_processor(checkpoint_location: str) -> Any:
if (
Qwen2Tokenizer is None
or MolmoAct2ImageProcessor is None
or MolmoAct2Processor is None
or MolmoAct2VideoProcessor is None
):
raise RuntimeError("transformers is required to load MolmoAct2 processor.")
checkpoint_path = Path(checkpoint_location)
processor_config_path = checkpoint_path / "processor_config.json"
if not processor_config_path.exists():
raise FileNotFoundError(f"MolmoAct2 checkpoint is missing {processor_config_path}.")
processor_config = json.loads(processor_config_path.read_text())
image_config = _strip_processor_config(
dict(processor_config.get("image_processor") or {}),
"image_processor_type",
)
video_config = _strip_processor_config(
dict(processor_config.get("video_processor") or {}),
"video_processor_type",
)
image_processor = MolmoAct2ImageProcessor(**image_config)
video_processor = MolmoAct2VideoProcessor(**video_config)
tokenizer = Qwen2Tokenizer.from_pretrained(
checkpoint_location,
token=_hf_token(),
)
chat_template_path = checkpoint_path / "chat_template.jinja"
chat_template = chat_template_path.read_text() if chat_template_path.exists() else None
return MolmoAct2Processor(
image_processor=image_processor,
video_processor=video_processor,
tokenizer=tokenizer,
chat_template=chat_template,
image_use_col_tokens=processor_config.get("image_use_col_tokens", True),
use_single_crop_col_tokens=processor_config.get("use_single_crop_col_tokens"),
use_single_crop_start_token=processor_config.get("use_single_crop_start_token", True),
video_use_col_tokens=processor_config.get("video_use_col_tokens", False),
use_frame_special_tokens=processor_config.get("use_frame_special_tokens", True),
)
def _to_numpy(value: Any) -> np.ndarray:
if isinstance(value, np.ndarray):
return value
@@ -497,7 +560,6 @@ class MolmoAct2PackInputsProcessorStep(ProcessorStep):
checkpoint_path: str
checkpoint_revision: str | None = None
checkpoint_force_download: bool = False
trust_remote_code: bool = True
action_mode: str = "both"
discrete_action_tokenizer: str = "allenai/MolmoAct2-FAST-Tokenizer"
image_keys: list[str] = field(default_factory=list)
@@ -521,18 +583,13 @@ class MolmoAct2PackInputsProcessorStep(ProcessorStep):
revision=self.checkpoint_revision,
force_download=bool(self.checkpoint_force_download),
)
self.processor = AutoProcessor.from_pretrained(
checkpoint_location,
trust_remote_code=self.trust_remote_code,
use_fast=False,
token=_hf_token(),
)
self.processor = _load_local_molmoact2_processor(checkpoint_location)
self.action_processor = None
if self.action_mode in {"discrete", "both"}:
self.action_processor = AutoProcessor.from_pretrained(
if UniversalActionProcessor is None:
raise RuntimeError("transformers is required to load MolmoAct2 action tokenizer.")
self.action_processor = UniversalActionProcessor.from_pretrained_local(
self.discrete_action_tokenizer,
trust_remote_code=self.trust_remote_code,
token=_hf_token(),
)
self._action_start_id = _single_token_id(self.processor.tokenizer, ACTION_START_TOKEN)
self._action_end_id = _single_token_id(self.processor.tokenizer, ACTION_END_TOKEN)
@@ -544,7 +601,6 @@ class MolmoAct2PackInputsProcessorStep(ProcessorStep):
"checkpoint_path": self.checkpoint_path,
"checkpoint_revision": self.checkpoint_revision,
"checkpoint_force_download": self.checkpoint_force_download,
"trust_remote_code": self.trust_remote_code,
"action_mode": self.action_mode,
"discrete_action_tokenizer": self.discrete_action_tokenizer,
"image_keys": list(self.image_keys),
@@ -878,7 +934,6 @@ def make_molmoact2_pre_post_processors(
checkpoint_path=config.checkpoint_path,
checkpoint_revision=config.checkpoint_revision,
checkpoint_force_download=config.checkpoint_force_download,
trust_remote_code=config.trust_remote_code,
action_mode=config.action_mode,
discrete_action_tokenizer=config.discrete_action_tokenizer,
image_keys=image_keys,

View File

@@ -28,6 +28,7 @@ import torch.nn.functional as F # noqa: N812
from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature
from lerobot.policies import get_policy_class, make_policy_config
from lerobot.policies.molmoact2 import (
configuration_molmoact2 as molmoact2_config,
modeling_molmoact2 as molmoact2_modeling,
processor_molmoact2 as molmoact2_processor,
)
@@ -67,6 +68,21 @@ def test_molmoact2_policy_registration():
assert get_policy_class("molmoact2") is MolmoAct2Policy
def test_molmoact2_checkpoint_download_ignores_remote_python(monkeypatch):
download_kwargs = {}
def fake_snapshot_download(**kwargs):
download_kwargs.update(kwargs)
return "/tmp/downloaded-molmoact2"
monkeypatch.setattr(molmoact2_config, "snapshot_download", fake_snapshot_download)
checkpoint_location = molmoact2_config._resolve_checkpoint_location("allenai/MolmoAct2")
assert checkpoint_location == "/tmp/downloaded-molmoact2"
assert download_kwargs["ignore_patterns"] == ["*.py", "*.pyc", "__pycache__/*"]
def test_molmoact2_scheduler_decay_steps_auto_match_training_steps():
param = torch.nn.Parameter(torch.ones(()))
optimizer = torch.optim.AdamW([param], lr=0.001)
@@ -550,18 +566,31 @@ def test_load_hf_model_accepts_max_action_horizon_schema(monkeypatch):
resolved_kwargs.update(kwargs)
return checkpoint_path
config_kwargs = {}
model_kwargs = {}
class DummyHFConfig:
@classmethod
def from_pretrained(cls, *args, **kwargs):
del args
config_kwargs.update(kwargs)
return SimpleNamespace()
class DummyMolmoAct2ForConditionalGeneration:
@classmethod
def from_pretrained(cls, *args, **kwargs):
del args
model_kwargs.update(kwargs)
return loaded_model
monkeypatch.setattr(molmoact2_modeling, "_resolve_checkpoint_location", fake_resolve_checkpoint_location)
monkeypatch.setattr(molmoact2_modeling, "_patch_batched_image_attention_bias", lambda backbone: None)
monkeypatch.setattr(molmoact2_modeling, "_patch_leaf_safe_input_embedding_update", lambda backbone: None)
monkeypatch.setattr(molmoact2_modeling, "_patch_training_kv_collection", lambda backbone: None)
from transformers import AutoModelForImageTextToText
monkeypatch.setattr(molmoact2_modeling, "HFMolmoAct2Config", DummyHFConfig)
monkeypatch.setattr(
AutoModelForImageTextToText,
"from_pretrained",
lambda *args, **kwargs: loaded_model,
molmoact2_modeling,
"MolmoAct2ForConditionalGeneration",
DummyMolmoAct2ForConditionalGeneration,
)
monkeypatch.setattr(molmoact2_modeling, "_strict_load_safetensors_weights", lambda *args: None)
policy = object.__new__(MolmoAct2Policy)
torch.nn.Module.__init__(policy)
policy.config = MolmoAct2Config(
@@ -580,6 +609,8 @@ def test_load_hf_model_accepts_max_action_horizon_schema(monkeypatch):
assert policy.model.config.max_action_horizon == 10
assert policy._generation_action_horizon() == 10
assert resolved_kwargs == {"revision": "main", "force_download": True}
assert "trust_remote_code" not in config_kwargs
assert "trust_remote_code" not in model_kwargs
def test_load_hf_model_chunk_size_overrides_larger_than_checkpoint_horizon(monkeypatch):
@@ -605,17 +636,26 @@ def test_load_hf_model_chunk_size_overrides_larger_than_checkpoint_horizon(monke
"_resolve_checkpoint_location",
lambda checkpoint_path, **kwargs: checkpoint_path,
)
monkeypatch.setattr(molmoact2_modeling, "_patch_batched_image_attention_bias", lambda backbone: None)
monkeypatch.setattr(molmoact2_modeling, "_patch_leaf_safe_input_embedding_update", lambda backbone: None)
monkeypatch.setattr(molmoact2_modeling, "_patch_training_kv_collection", lambda backbone: None)
from transformers import AutoModelForImageTextToText
class DummyHFConfig:
@classmethod
def from_pretrained(cls, *args, **kwargs):
del args, kwargs
return SimpleNamespace()
class DummyMolmoAct2ForConditionalGeneration:
@classmethod
def from_pretrained(cls, *args, **kwargs):
del args, kwargs
return loaded_model
monkeypatch.setattr(molmoact2_modeling, "HFMolmoAct2Config", DummyHFConfig)
monkeypatch.setattr(
AutoModelForImageTextToText,
"from_pretrained",
lambda *args, **kwargs: loaded_model,
molmoact2_modeling,
"MolmoAct2ForConditionalGeneration",
DummyMolmoAct2ForConditionalGeneration,
)
monkeypatch.setattr(molmoact2_modeling, "_strict_load_safetensors_weights", lambda *args: None)
policy = object.__new__(MolmoAct2Policy)
torch.nn.Module.__init__(policy)
policy.config = MolmoAct2Config(
@@ -649,13 +689,25 @@ def test_load_hf_model_rejects_legacy_action_horizon_schema(monkeypatch):
lambda checkpoint_path, **kwargs: checkpoint_path,
)
from transformers import AutoModelForImageTextToText
class DummyHFConfig:
@classmethod
def from_pretrained(cls, *args, **kwargs):
del args, kwargs
return SimpleNamespace()
class DummyMolmoAct2ForConditionalGeneration:
@classmethod
def from_pretrained(cls, *args, **kwargs):
del args, kwargs
return DummyLoadedModel()
monkeypatch.setattr(molmoact2_modeling, "HFMolmoAct2Config", DummyHFConfig)
monkeypatch.setattr(
AutoModelForImageTextToText,
"from_pretrained",
lambda *args, **kwargs: DummyLoadedModel(),
molmoact2_modeling,
"MolmoAct2ForConditionalGeneration",
DummyMolmoAct2ForConditionalGeneration,
)
monkeypatch.setattr(molmoact2_modeling, "_strict_load_safetensors_weights", lambda *args: None)
policy = object.__new__(MolmoAct2Policy)
torch.nn.Module.__init__(policy)
policy.config = MolmoAct2Config(
@@ -1133,7 +1185,6 @@ def test_discrete_predict_action_chunk_uses_hf_cached_generation_path():
output_features={ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(2,))},
discrete_generation_max_steps=None,
discrete_action_tokenizer="unused",
trust_remote_code=True,
chunk_size=2,
n_action_steps=1,
rtc_config=None,
@@ -1221,7 +1272,6 @@ def test_discrete_predict_action_chunk_uses_graph_backed_ar_decode_when_enabled(
output_features={ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(2,))},
discrete_generation_max_steps=None,
discrete_action_tokenizer="unused",
trust_remote_code=True,
chunk_size=2,
n_action_steps=1,
rtc_config=None,