Merge remote-tracking branch 'origin/main' into feat/language-annotation-pipeline

# Conflicts: # uv.lock
2026-06-04 04:41:24 +00:00 · 2026-06-02 17:36:07 +02:00
parent 518e191337 b8ad81bf39
commit 3662c41b85
46 changed files with 17465 additions and 18 deletions
--- a/src/lerobot/configs/parser.py
+++ b/src/lerobot/configs/parser.py
@@ -255,8 +255,7 @@ def extract_path_fields_from_config(config_path: str, path_fields: list[str]) ->
            remaining = config_data[field]
            if remaining:
                _config_yaml_overrides[field] = _flatten_to_cli_args(remaining)
-            else:
-                del config_data[field]
+            del config_data[field]
            modified = True

    if not modified:
@@ -311,7 +310,13 @@ def wrap(config_path: Path | None = None) -> Callable[[F], F]:
                    cli_args = filter_arg("config_path", cli_args)
                    cfg = argtype.from_pretrained(config_path_cli, cli_args=cli_args)
                else:
-                    cfg = draccus.parse(config_class=argtype, config_path=config_path, args=cli_args)
+                    if config_path_cli:
+                        cli_args = filter_arg("config_path", cli_args)
+                    cfg = draccus.parse(
+                        config_class=argtype,
+                        config_path=config_path_cli or config_path,
+                        args=cli_args,
+                    )
            response = fn(cfg, *args, **kwargs)
            return response

--- a/src/lerobot/policies/__init__.py
+++ b/src/lerobot/policies/__init__.py
@@ -20,6 +20,7 @@ from .eo1.configuration_eo1 import EO1Config as EO1Config
 from .factory import get_policy_class, make_policy, make_policy_config, make_pre_post_processors
 from .gaussian_actor.configuration_gaussian_actor import GaussianActorConfig as GaussianActorConfig
 from .groot.configuration_groot import GrootConfig as GrootConfig
+from .molmoact2.configuration_molmoact2 import MolmoAct2Config as MolmoAct2Config
 from .multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig as MultiTaskDiTConfig
 from .pi0.configuration_pi0 import PI0Config as PI0Config
 from .pi0_fast.configuration_pi0_fast import PI0FastConfig as PI0FastConfig
@@ -43,6 +44,7 @@ __all__ = [
    "EO1Config",
    "GaussianActorConfig",
    "GrootConfig",
+    "MolmoAct2Config",
    "MultiTaskDiTConfig",
    "PI0Config",
    "PI0FastConfig",
--- a/src/lerobot/policies/factory.py
+++ b/src/lerobot/policies/factory.py
@@ -49,6 +49,7 @@ from .diffusion.configuration_diffusion import DiffusionConfig
 from .eo1.configuration_eo1 import EO1Config
 from .gaussian_actor.configuration_gaussian_actor import GaussianActorConfig
 from .groot.configuration_groot import GrootConfig
+from .molmoact2.configuration_molmoact2 import MolmoAct2Config
 from .multi_task_dit.configuration_multi_task_dit import MultiTaskDiTConfig
 from .pi0.configuration_pi0 import PI0Config
 from .pi05.configuration_pi05 import PI05Config
@@ -88,7 +89,8 @@ def get_policy_class(name: str) -> type[PreTrainedPolicy]:

    Args:
        name: The name of the policy. Supported names are "tdmpc", "diffusion", "act",
-            "multi_task_dit", "vqbet", "pi0", "pi05", "gaussian_actor", "smolvla", "wall_x".
+            "multi_task_dit", "vqbet", "pi0", "pi05", "gaussian_actor", "smolvla", "wall_x",
+            "molmoact2".
    Returns:
        The policy class corresponding to the given name.

@@ -151,6 +153,10 @@ def get_policy_class(name: str) -> type[PreTrainedPolicy]:
        from .eo1.modeling_eo1 import EO1Policy

        return EO1Policy
+    elif name == "molmoact2":
+        from .molmoact2.modeling_molmoact2 import MolmoAct2Policy
+
+        return MolmoAct2Policy
    else:
        try:
            return _get_policy_cls_from_policy_name(name=name)
@@ -168,7 +174,7 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:
    Args:
        policy_type: The type of the policy. Supported types include "tdmpc",
                     "multi_task_dit", "diffusion", "act", "vqbet", "pi0", "pi05", "gaussian_actor",
-                     "smolvla", "wall_x".
+                     "smolvla", "wall_x", "molmoact2".
        **kwargs: Keyword arguments to be passed to the configuration class constructor.

    Returns:
@@ -203,6 +209,8 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:
        return WallXConfig(**kwargs)
    elif policy_type == "eo1":
        return EO1Config(**kwargs)
+    elif policy_type == "molmoact2":
+        return MolmoAct2Config(**kwargs)
    else:
        try:
            config_cls = PreTrainedConfig.get_choice_class(policy_type)
@@ -231,6 +239,7 @@ class ProcessorConfigKwargs(TypedDict, total=False):
    preprocessor_overrides: dict[str, Any] | None
    postprocessor_overrides: dict[str, Any] | None
    dataset_stats: dict[str, dict[str, torch.Tensor]] | None
+    dataset_meta: Any | None


 def make_pre_post_processors(
@@ -414,6 +423,15 @@ def make_pre_post_processors(
            dataset_stats=kwargs.get("dataset_stats"),
        )

+    elif isinstance(policy_cfg, MolmoAct2Config):
+        from .molmoact2.processor_molmoact2 import make_molmoact2_pre_post_processors
+
+        processors = make_molmoact2_pre_post_processors(
+            config=policy_cfg,
+            dataset_stats=kwargs.get("dataset_stats"),
+            dataset_meta=kwargs.get("dataset_meta"),
+        )
+
    else:
        try:
            processors = _make_processors_from_policy_config(
@@ -499,6 +517,10 @@ def make_policy(
        action_names = ds_meta.features.get(ACTION, {}).get("names")
        if action_names is not None:
            cfg.action_feature_names = list(action_names)
+    if ds_meta is not None:
+        set_dataset_feature_metadata = getattr(cfg, "set_dataset_feature_metadata", None)
+        if callable(set_dataset_feature_metadata):
+            set_dataset_feature_metadata(ds_meta.features)

    kwargs["config"] = cfg

--- a/src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py
+++ b/src/lerobot/policies/groot/eagle2_hg_model/modeling_eagle2_5_vl.py
@@ -60,6 +60,7 @@ class Eagle25VLPreTrainedModel(PreTrainedModel):
        "SiglipEncoderLayer",
    ]
    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
    _supports_flash_attn_2 = True
    _supports_cache_class = True
    _supports_static_cache = True
--- a/src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py
+++ b/src/lerobot/policies/groot/eagle2_hg_model/processing_eagle2_5_vl.py
@@ -124,7 +124,6 @@ class Eagle25VLProcessor(ProcessorMixin):
        "videos_kwargs",
        "text_kwargs",
    ]
-    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
--- a/src/lerobot/policies/groot/processor_groot.py
+++ b/src/lerobot/policies/groot/processor_groot.py
@@ -206,7 +206,11 @@ def _build_eagle_processor(tokenizer_assets_repo: str = DEFAULT_TOKENIZER_ASSETS
            "Vendor files are copied during model creation. Create the policy/model first, "
            "or call ensure_eagle_cache_ready() before building processors."
        )
-    proc = AutoProcessor.from_pretrained(str(cache_dir), trust_remote_code=True, use_fast=True)
+    proc = AutoProcessor.from_pretrained(
+        str(cache_dir),
+        trust_remote_code=True,
+        fix_mistral_regex=False,
+    )
    proc.tokenizer.padding_side = "left"
    return proc

--- a/src/lerobot/policies/molmoact2/README.md
+++ b/src/lerobot/policies/molmoact2/README.md
@@ -0,0 +1 @@
+../../../../docs/source/policy_molmoact2_README.md
--- a/src/lerobot/policies/molmoact2/__init__.py
+++ b/src/lerobot/policies/molmoact2/__init__.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_molmoact2 import MolmoAct2Config
+from .modeling_molmoact2 import MolmoAct2Policy
+from .processor_molmoact2 import make_molmoact2_pre_post_processors
+
+__all__ = ["MolmoAct2Config", "MolmoAct2Policy", "make_molmoact2_pre_post_processors"]
--- a/src/lerobot/policies/molmoact2/configuration_molmoact2.py
+++ b/src/lerobot/policies/molmoact2/configuration_molmoact2.py
@@ -0,0 +1,519 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import json
+import math
+import os
+from contextlib import suppress
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from huggingface_hub import snapshot_download
+
+from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature, PreTrainedConfig
+from lerobot.optim import (
+    AdamWConfig,
+    CosineDecayWithWarmupSchedulerConfig,
+    LRSchedulerConfig,
+    OptimizerConfig,
+)
+from lerobot.utils.constants import ACTION, OBS_STATE
+
+from ..rtc.configuration_rtc import RTCConfig
+
+# Token-budget constants consumed by infer_molmoact2_max_sequence_length().
+# Image count substituted when the caller passes fewer than one image (and used as the
+# last-resort default in MolmoAct2Config.inferred_max_sequence_length).
+MOLMOACT2_DEFAULT_NUM_IMAGES = 2
+# Tokens budgeted per image; multiplied by the image count.
+MOLMOACT2_IMAGE_TOKENS_PER_IMAGE = 196
+# The next three values are summed together with the state dimension to form the prompt budget.
+MOLMOACT2_FIXED_PROMPT_TOKEN_BUDGET = 80
+MOLMOACT2_TASK_TOKEN_BUDGET = 32
+MOLMOACT2_SEQUENCE_LENGTH_MARGIN = 32
+# The total budget is rounded up to a multiple of this value.
+MOLMOACT2_SEQUENCE_LENGTH_MULTIPLE = 64
+# Added once to the action budget when discrete action tokens are included.
+MOLMOACT2_DISCRETE_ACTION_WRAPPER_TOKENS = 4
+# Lower bound on the per-step discrete action token estimate.
+MOLMOACT2_MIN_DISCRETE_ACTION_TOKENS_PER_STEP = 6
+# Per-dimension factor; ceil(action_dim * factor) is the per-step estimate before the lower bound.
+MOLMOACT2_DISCRETE_ACTION_TOKENS_PER_DIM = 0.95
+
+
+def _hf_token() -> str | None:
+    """Return the Hugging Face token found in the environment, if any.
+
+    A non-empty ``HF_TOKEN`` takes precedence; otherwise the value of
+    ``HF_ACCESS_TOKEN`` is returned as-is (``None`` when it is unset).
+    """
+    primary = os.environ.get("HF_TOKEN")
+    if primary:
+        return primary
+    return os.environ.get("HF_ACCESS_TOKEN")
+
+
+def _resolve_checkpoint_location(
+    checkpoint_path: str,
+    *,
+    revision: str | None = None,
+    force_download: bool = False,
+) -> str:
+    """Turn ``checkpoint_path`` into a path that exists on the local filesystem.
+
+    Args:
+        checkpoint_path: Either a local path (``~`` is expanded) or a Hub model repo id.
+        revision: Forwarded to ``snapshot_download`` when a download is needed.
+        force_download: Forwarded to ``snapshot_download`` when a download is needed.
+
+    Returns:
+        The expanded local path when it exists, otherwise the value returned by
+        ``snapshot_download`` for the repo.
+
+    Raises:
+        ValueError: If ``checkpoint_path`` is empty or only whitespace.
+    """
+    normalized = str(checkpoint_path or "").strip()
+    if normalized == "":
+        raise ValueError("MolmoAct2 policy requires `checkpoint_path`.")
+
+    candidate = Path(normalized).expanduser()
+    if not candidate.exists():
+        # Nothing on disk under that name: treat it as a Hub model repo id.
+        # Python sources and bytecode in the repo are excluded from the download.
+        return snapshot_download(
+            repo_id=normalized,
+            repo_type="model",
+            revision=revision,
+            force_download=force_download,
+            ignore_patterns=["*.py", "*.pyc", "__pycache__/*"],
+            token=_hf_token(),
+        )
+    return str(candidate)
+
+
+def _load_hf_norm_metadata_for_tag(
+    checkpoint_path: str,
+    *,
+    revision: str | None,
+    force_download: bool,
+    norm_tag: str | None,
+) -> dict[str, Any]:
+    """Load the metadata entry for ``norm_tag`` from a checkpoint's norm-stats file.
+
+    The stats filename defaults to ``norm_stats.json`` and may be overridden by a
+    ``norm_stats_filename`` entry in the checkpoint's ``config.json``.
+
+    Args:
+        checkpoint_path: Local path or Hub repo id, resolved via
+            ``_resolve_checkpoint_location``.
+        revision: Forwarded to the checkpoint resolver.
+        force_download: Forwarded to the checkpoint resolver.
+        norm_tag: Key to look up under ``metadata_by_tag``. Empty or ``None``
+            short-circuits to an empty dict without touching the checkpoint.
+
+    Returns:
+        The metadata dict stored for ``norm_tag``, or ``{}`` when no tag was given.
+
+    Raises:
+        FileNotFoundError: If the norm-stats file is missing from the checkpoint.
+        ValueError: If the stats file has no ``metadata_by_tag`` mapping or the
+            tag is not a key of that mapping.
+    """
+    norm_tag = str(norm_tag or "").strip()
+    if not norm_tag:
+        return {}
+    checkpoint_location = Path(
+        _resolve_checkpoint_location(
+            checkpoint_path,
+            revision=revision,
+            force_download=force_download,
+        )
+    )
+    norm_stats_filename = "norm_stats.json"
+    config_path = checkpoint_location / "config.json"
+    if config_path.exists():
+        # Best effort: an unreadable or malformed config.json keeps the default filename.
+        with suppress(OSError, UnicodeDecodeError, json.JSONDecodeError):
+            # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
+            config_payload = json.loads(config_path.read_text(encoding="utf-8"))
+            # A non-object top level (list, string, ...) has no .get(); keep the default.
+            if isinstance(config_payload, dict):
+                norm_stats_filename = str(config_payload.get("norm_stats_filename") or norm_stats_filename)
+    stats_path = checkpoint_location / norm_stats_filename
+    if not stats_path.exists():
+        raise FileNotFoundError(
+            f"MolmoAct2 HF checkpoint is missing {norm_stats_filename!r}; cannot resolve norm_tag={norm_tag!r}."
+        )
+    payload = json.loads(stats_path.read_text(encoding="utf-8"))
+    # Guard against a non-object top level so the caller gets the ValueError below
+    # rather than an AttributeError from .get().
+    metadata_by_tag = payload.get("metadata_by_tag") if isinstance(payload, dict) else None
+    if not isinstance(metadata_by_tag, dict):
+        raise ValueError(f"MolmoAct2 norm stats file {stats_path} has no metadata_by_tag mapping.")
+    metadata = metadata_by_tag.get(norm_tag)
+    if not isinstance(metadata, dict):
+        available = sorted(str(tag) for tag in metadata_by_tag)
+        raise ValueError(f"Unknown MolmoAct2 norm_tag={norm_tag!r}. Available tags: {available}.")
+    return metadata
+
+
+@LRSchedulerConfig.register_subclass("molmoact2_cosine_decay_with_warmup")
+@dataclass
+class MolmoAct2CosineDecayWithWarmupSchedulerConfig(CosineDecayWithWarmupSchedulerConfig):
+    """Cosine-with-warmup scheduler config whose decay length may follow the run length.
+
+    The generic LeRobot cosine scheduler carries an explicit integer decay length.
+    Here ``num_decay_steps`` may also be ``None``, meaning "decay over exactly the
+    number of training steps of this run". That number only becomes known in
+    ``build()``, so the substitution happens there.
+    """
+
+    num_decay_steps: int | None
+
+    def build(self, optimizer, num_training_steps: int):
+        """Build the scheduler through the generic cosine config.
+
+        A ``None`` decay length is replaced by ``num_training_steps`` before delegating.
+        """
+        decay_steps = self.num_decay_steps
+        if decay_steps is None:
+            decay_steps = num_training_steps
+        generic_config = CosineDecayWithWarmupSchedulerConfig(
+            peak_lr=self.peak_lr,
+            decay_lr=self.decay_lr,
+            num_warmup_steps=self.num_warmup_steps,
+            num_decay_steps=decay_steps,
+        )
+        return generic_config.build(optimizer, num_training_steps=num_training_steps)
+
+
+def _round_up(value: int, multiple: int) -> int:
+    """Return the smallest multiple of ``multiple`` that is >= ``value``.
+
+    Intended for a positive ``multiple``. The ceiling is computed with floor
+    division on the negated value, so integer inputs are handled exactly;
+    going through float division would lose precision for large values.
+
+    Raises:
+        ZeroDivisionError: If ``multiple`` is zero.
+    """
+    return int(-(-value // multiple) * multiple)
+
+
+def infer_molmoact2_max_sequence_length(
+    *,
+    num_images: int,
+    state_dim: int,
+    action_dim: int,
+    action_horizon: int,
+    include_discrete_action: bool,
+) -> int:
+    """Estimate the padded text/image sequence cap from the MolmoAct2 token budgets.
+
+    Out-of-range inputs are replaced rather than rejected: fewer than one image
+    falls back to the default image count, a negative state dimension becomes 0,
+    and action dimension / horizon are raised to at least 1.
+
+    Returns:
+        Image, prompt and (optionally) discrete-action token budgets summed and
+        rounded up to ``MOLMOACT2_SEQUENCE_LENGTH_MULTIPLE``.
+    """
+    image_count = num_images if num_images >= 1 else MOLMOACT2_DEFAULT_NUM_IMAGES
+    state_width = max(state_dim, 0)
+    action_width = max(action_dim, 1)
+    horizon = max(action_horizon, 1)
+
+    # Image tokens first, then the prompt budget (fixed part + task + one per state dim + margin).
+    total_tokens = image_count * MOLMOACT2_IMAGE_TOKENS_PER_IMAGE
+    total_tokens += (
+        MOLMOACT2_FIXED_PROMPT_TOKEN_BUDGET
+        + MOLMOACT2_TASK_TOKEN_BUDGET
+        + state_width
+        + MOLMOACT2_SEQUENCE_LENGTH_MARGIN
+    )
+
+    if include_discrete_action:
+        # Per-step estimate scales with the action dimension but never drops below the minimum.
+        per_step_tokens = max(
+            MOLMOACT2_MIN_DISCRETE_ACTION_TOKENS_PER_STEP,
+            math.ceil(action_width * MOLMOACT2_DISCRETE_ACTION_TOKENS_PER_DIM),
+        )
+        total_tokens += MOLMOACT2_DISCRETE_ACTION_WRAPPER_TOKENS + horizon * per_step_tokens
+
+    return _round_up(total_tokens, MOLMOACT2_SEQUENCE_LENGTH_MULTIPLE)
+
+
+@PreTrainedConfig.register_subclass("molmoact2")
+@dataclass
+class MolmoAct2Config(PreTrainedConfig):
+    """MolmoAct2 policy backed by the converted HF checkpoint implementation.
+
+    Registered as the ``"molmoact2"`` choice of ``PreTrainedConfig``. Option
+    combinations and value ranges are checked in ``__post_init__``.
+    """
+
+    # Local path or Hub model repo id of the checkpoint, plus download options
+    # (these three values are what apply_norm_tag_metadata passes to the checkpoint resolver).
+    checkpoint_path: str = "allenai/MolmoAct2"
+    checkpoint_revision: str | None = None
+    checkpoint_force_download: bool = False
+
+    # chunk_size drives action_delta_indices; n_action_steps may not exceed it.
+    n_obs_steps: int = 1
+    chunk_size: int = 30
+    n_action_steps: int = 30
+
+    # action_mode is one of "continuous", "discrete", "both";
+    # inference_action_mode is None, "continuous" or "discrete" (validated in __post_init__).
+    action_mode: str = "both"
+    inference_action_mode: str | None = None
+    discrete_action_tokenizer: str = "allenai/MolmoAct2-FAST-Tokenizer"
+    discrete_generation_max_steps: int | None = None
+    # Key into the checkpoint's norm-stats "metadata_by_tag" mapping; see apply_norm_tag_metadata.
+    norm_tag: str | None = None
+
+    setup_type: str = ""
+    control_mode: str = ""
+    image_keys: list[str] = field(default_factory=list)
+    normalize_language: bool = True
+    add_setup_tokens: bool = True
+    add_control_tokens: bool = True
+    normalize_gripper: bool = False
+    num_state_tokens: int = 256
+    # Leave unset for the default MolmoAct2 sequence budget inferred from the fixed
+    # image/prompt/state/action token layout. Override only for unusual long prompts.
+    max_sequence_length: int | None = None
+
+    # Fixed by released MolmoAct2 checkpoints. We validate this at model load.
+    expected_max_action_dim: int = 32
+
+    # Flow-matching training knobs copied from the original MolmoAct2 training path.
+    num_flow_timesteps: int = 8
+    flow_matching_cutoff: float = 1.0
+    flow_matching_time_offset: float = 0.001
+    flow_matching_time_scale: float = 0.999
+    flow_matching_beta_alpha: float = 1.0
+    flow_matching_beta_beta: float = 1.5
+    num_inference_steps: int | None = None
+    mask_action_dim_padding: bool = True
+    enable_inference_cuda_graph: bool = True
+    # MolmoAct2-local eval option. When enabled, stochastic continuous action
+    # generation uses a rollout-local generator derived from eval_seed.
+    per_episode_seed: bool = False
+    eval_seed: int | None = None
+    rtc_config: RTCConfig | None = None
+
+    # Default is full finetuning with gradients from the action expert flowing into the VLM.
+    enable_lora_vlm: bool = False
+    lora_rank: int = 64
+    lora_alpha: int = 16
+    lora_dropout: float = 0.05
+    lora_bias: str = "none"
+    enable_lora_action_expert: bool = False
+    enable_knowledge_insulation: bool = False
+    freeze_embedding: bool = True
+    train_action_expert_only: bool = False
+    gradient_checkpointing: bool = False
+
+    model_dtype: str = "bfloat16"
+    softmax_auxiliary_loss: bool = True
+    softmax_auxiliary_loss_scale: float = 1e-4
+    discrete_loss_token_weighting: str = "root_subsegments_root_tokens"
+
+    # Optimizer settings. NOTE: get_optimizer_preset() only forwards optimizer_lr;
+    # the vit/connector/action-expert learning rates are not consumed inside this class.
+    optimizer_lr: float = 1e-5
+    optimizer_vit_lr: float = 5e-6
+    optimizer_connector_lr: float = 5e-6
+    optimizer_action_expert_lr: float = 5e-5
+    optimizer_betas: tuple[float, float] = (0.9, 0.95)
+    optimizer_eps: float = 1e-6
+    optimizer_weight_decay: float = 0.0
+    optimizer_grad_clip_norm: float = 1.0
+
+    # Scheduler settings. scheduler_decay_steps=None is handed to the MolmoAct2
+    # scheduler config, which then decays over the run's training steps.
+    scheduler_warmup_steps: int = 200
+    scheduler_decay_steps: int | None = None
+    scheduler_decay_lr: float = 1e-6
+
+    normalization_mapping: dict[str, NormalizationMode] = field(
+        default_factory=lambda: {
+            "VISUAL": NormalizationMode.IDENTITY,
+            "STATE": NormalizationMode.QUANTILES,
+            "ACTION": NormalizationMode.QUANTILES,
+        }
+    )
+
+    input_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    output_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    # Filled by set_dataset_feature_metadata with the "names" entries of the action/state features.
+    dataset_feature_names: dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        """Validate option values and combinations after dataclass initialization.
+
+        Raises:
+            ValueError: On the first check that fails; checks run in the order written below.
+        """
+        super().__post_init__()
+        if self.action_mode not in {"continuous", "discrete", "both"}:
+            raise ValueError(
+                f"Unsupported action_mode={self.action_mode!r}. "
+                "Expected one of {'continuous', 'discrete', 'both'}."
+            )
+        if self.inference_action_mode not in {None, "continuous", "discrete"}:
+            raise ValueError(
+                f"Unsupported inference_action_mode={self.inference_action_mode!r}. "
+                "Expected one of {None, 'continuous', 'discrete'}."
+            )
+        # A single-mode training setup cannot serve the opposite inference mode.
+        if self.inference_action_mode == "continuous" and self.action_mode == "discrete":
+            raise ValueError("MolmoAct2 action_mode='discrete' cannot run continuous inference.")
+        if self.inference_action_mode == "discrete" and self.action_mode == "continuous":
+            raise ValueError("MolmoAct2 action_mode='continuous' cannot run discrete inference.")
+        if self.train_action_expert_only and self.action_mode != "continuous":
+            raise ValueError("MolmoAct2 train_action_expert_only requires action_mode='continuous'.")
+        if self.train_action_expert_only and self.enable_lora_vlm:
+            raise ValueError("MolmoAct2 train_action_expert_only is incompatible with enable_lora_vlm.")
+        if self.enable_lora_action_expert and not self.enable_lora_vlm:
+            raise ValueError("MolmoAct2 enable_lora_action_expert requires enable_lora_vlm.")
+        if self.chunk_size < 1:
+            raise ValueError(f"chunk_size must be >= 1, got {self.chunk_size}.")
+        if self.n_action_steps < 1:
+            raise ValueError(f"n_action_steps must be >= 1, got {self.n_action_steps}.")
+        if self.n_action_steps > self.chunk_size:
+            raise ValueError(
+                f"n_action_steps ({self.n_action_steps}) cannot exceed chunk_size ({self.chunk_size})."
+            )
+        if self.expected_max_action_dim != 32:
+            raise ValueError("MolmoAct2 released checkpoints use expected_max_action_dim=32.")
+        if self.model_dtype not in {"float32", "bfloat16", "float16"}:
+            raise ValueError(
+                f"Unsupported model_dtype={self.model_dtype!r}. Expected 'float32', 'bfloat16', or 'float16'."
+            )
+        if self.lora_rank < 1:
+            raise ValueError(f"lora_rank must be >= 1, got {self.lora_rank}.")
+        if self.lora_alpha < 1:
+            raise ValueError(f"lora_alpha must be >= 1, got {self.lora_alpha}.")
+        if not 0 <= self.lora_dropout <= 1:
+            raise ValueError(f"lora_dropout must be in [0, 1], got {self.lora_dropout}.")
+        if self.lora_bias not in {"none", "all", "lora_only"}:
+            raise ValueError(
+                f"Unsupported lora_bias={self.lora_bias!r}. Expected one of 'none', 'all', or 'lora_only'."
+            )
+        if self.discrete_loss_token_weighting not in {
+            "none",
+            "token",
+            "root_tokens",
+            "root_subsegments",
+            "root_subsegments_root_tokens",
+        }:
+            raise ValueError(
+                f"Unsupported discrete_loss_token_weighting={self.discrete_loss_token_weighting!r}."
+            )
+        if self.discrete_generation_max_steps is not None and self.discrete_generation_max_steps < 1:
+            raise ValueError(
+                f"discrete_generation_max_steps must be >= 1 or None, got {self.discrete_generation_max_steps}."
+            )
+        if self.max_sequence_length is not None and self.max_sequence_length < 1:
+            raise ValueError(f"max_sequence_length must be >= 1 or None, got {self.max_sequence_length}.")
+
+    def inferred_max_sequence_length(
+        self,
+        *,
+        num_images: int | None = None,
+        state_dim: int | None = None,
+        action_dim: int | None = None,
+        action_horizon: int | None = None,
+        include_discrete_action: bool | None = None,
+    ) -> int:
+        """Return the sequence-length cap to use for this config.
+
+        An explicit ``max_sequence_length`` always wins. Otherwise every argument
+        left as ``None`` is filled from the config and the result is computed by
+        ``infer_molmoact2_max_sequence_length``.
+        """
+        if self.max_sequence_length is not None:
+            return int(self.max_sequence_length)
+
+        if num_images is None:
+            # Prefer the explicit key list, then the declared visual features, then the default.
+            num_images = len(self.image_keys) or len(self.image_features) or MOLMOACT2_DEFAULT_NUM_IMAGES
+        if state_dim is None:
+            state_feature = self.robot_state_feature
+            state_dim = int(state_feature.shape[0]) if state_feature is not None else 0
+        if action_dim is None:
+            action_feature = self.action_feature
+            action_dim = (
+                int(action_feature.shape[0]) if action_feature is not None else self.expected_max_action_dim
+            )
+        if action_horizon is None:
+            action_horizon = self.chunk_size
+        if include_discrete_action is None:
+            include_discrete_action = self.action_mode in {"discrete", "both"}
+
+        return infer_molmoact2_max_sequence_length(
+            num_images=int(num_images),
+            state_dim=int(state_dim),
+            action_dim=int(action_dim),
+            action_horizon=int(action_horizon),
+            include_discrete_action=bool(include_discrete_action),
+        )
+
+    @property
+    def observation_delta_indices(self) -> None:
+        """Always ``None`` for this policy."""
+        return None
+
+    @property
+    def action_delta_indices(self) -> list[int]:
+        """Indices ``0 .. chunk_size - 1``."""
+        return list(range(self.chunk_size))
+
+    @property
+    def reward_delta_indices(self) -> None:
+        """Always ``None`` for this policy."""
+        return None
+
+    def get_optimizer_preset(self) -> OptimizerConfig:
+        """Return an ``AdamWConfig`` built from the ``optimizer_*`` fields of this config."""
+        return AdamWConfig(
+            lr=self.optimizer_lr,
+            betas=self.optimizer_betas,
+            eps=self.optimizer_eps,
+            weight_decay=self.optimizer_weight_decay,
+            grad_clip_norm=self.optimizer_grad_clip_norm,
+        )
+
+    def get_scheduler_preset(self) -> LRSchedulerConfig | None:
+        """Return the MolmoAct2 cosine scheduler config; the peak LR is ``optimizer_lr``."""
+        return MolmoAct2CosineDecayWithWarmupSchedulerConfig(
+            peak_lr=self.optimizer_lr,
+            decay_lr=self.scheduler_decay_lr,
+            num_warmup_steps=self.scheduler_warmup_steps,
+            num_decay_steps=self.scheduler_decay_steps,
+        )
+
+    def set_dataset_feature_metadata(self, features: dict[str, Any]) -> None:
+        """Record the ``"names"`` entries of the action and state dataset features.
+
+        ``dataset_feature_names`` is reset first; a key is stored only when the
+        corresponding feature is a dict with a non-``None`` ``"names"`` value.
+        A non-dict ``features`` argument leaves the mapping empty.
+        """
+        self.dataset_feature_names = {}
+        for key in (ACTION, OBS_STATE):
+            feature = features.get(key) if isinstance(features, dict) else None
+            if isinstance(feature, dict) and feature.get("names") is not None:
+                self.dataset_feature_names[key] = feature["names"]
+
+    def validate_features(self) -> None:
+        """Validate and set up MolmoAct2 input and output features.
+
+        Raises:
+            ValueError: If ``input_features`` contains no visual feature.
+        """
+        image_features = [key for key, feat in self.input_features.items() if feat.type == FeatureType.VISUAL]
+        if not image_features:
+            raise ValueError(
+                "MolmoAct2 policy requires at least one visual input feature. "
+                "No features of type FeatureType.VISUAL found in input_features."
+            )
+
+        # Missing state input: register a zero-width state feature.
+        if OBS_STATE not in self.input_features:
+            state_feature = PolicyFeature(
+                type=FeatureType.STATE,
+                shape=(0,),
+            )
+            self.input_features[OBS_STATE] = state_feature
+
+        # Missing action output: register one sized to expected_max_action_dim.
+        if ACTION not in self.output_features:
+            action_feature = PolicyFeature(
+                type=FeatureType.ACTION,
+                shape=(self.expected_max_action_dim,),
+            )
+            self.output_features[ACTION] = action_feature
+
+    def apply_norm_tag_metadata(self) -> None:
+        """Apply per-tag metadata from the checkpoint's norm-stats file to this config.
+
+        Does nothing when ``norm_tag`` is empty. ``action_horizon`` and
+        ``n_action_steps`` from the metadata always overwrite ``chunk_size`` and
+        ``n_action_steps``; ``setup_type`` and ``control_mode`` are only filled in
+        when they are currently empty.
+        """
+        if not str(self.norm_tag or "").strip():
+            return
+        metadata = _load_hf_norm_metadata_for_tag(
+            self.checkpoint_path,
+            revision=self.checkpoint_revision,
+            force_download=bool(self.checkpoint_force_download),
+            norm_tag=self.norm_tag,
+        )
+        # NOTE(review): the two assignments below are not followed by the range checks from
+        # __post_init__ (e.g. n_action_steps <= chunk_size) — confirm callers re-validate.
+        if metadata.get("action_horizon") is not None:
+            self.chunk_size = int(metadata["action_horizon"])
+        if metadata.get("n_action_steps") is not None:
+            self.n_action_steps = int(metadata["n_action_steps"])
+        if not self.setup_type and metadata.get("setup_type") is not None:
+            self.setup_type = str(metadata["setup_type"])
+        if not self.control_mode and metadata.get("control_mode") is not None:
+            self.control_mode = str(metadata["control_mode"])
+
+    def saved_policy_action_mode(self) -> str | None:
+        """Read ``action_mode`` from ``<pretrained_path>/config.json``, if available.
+
+        Returns ``None`` when ``pretrained_path`` is unset, the file is missing or
+        unreadable, or the stored value is not one of the three known modes.
+        """
+        pretrained_path = getattr(self, "pretrained_path", None)
+        if pretrained_path is None:
+            return None
+        config_path = Path(pretrained_path) / "config.json"
+        if not config_path.exists():
+            return None
+        try:
+            mode = json.loads(config_path.read_text()).get("action_mode")
+        except (OSError, json.JSONDecodeError):
+            return None
+        if mode in {"continuous", "discrete", "both"}:
+            return str(mode)
+        return None
+
+    def training_action_mode(self, saved_policy_action_mode: str | None = None) -> str:
+        """Return ``saved_policy_action_mode`` when given (truthy), else ``self.action_mode``."""
+        return saved_policy_action_mode or self.action_mode
+
+    def validate_inference_action_mode(self, saved_policy_action_mode: str | None = None) -> None:
+        """Check the configured ``inference_action_mode`` against the training mode.
+
+        No check is made when ``inference_action_mode`` is ``None``.
+
+        Raises:
+            ValueError: If the requested inference mode is the opposite single mode
+                of the training mode.
+        """
+        requested_mode = self.inference_action_mode
+        if requested_mode is None:
+            return
+        training_mode = self.training_action_mode(saved_policy_action_mode)
+        if requested_mode == "continuous" and training_mode == "discrete":
+            raise ValueError(
+                "MolmoAct2 checkpoint was trained with action_mode='discrete' and cannot run "
+                "continuous inference."
+            )
+        if requested_mode == "discrete" and training_mode == "continuous":
+            raise ValueError(
+                "MolmoAct2 checkpoint was trained with action_mode='continuous' and cannot run "
+                "discrete inference. Train with action_mode='both' or action_mode='discrete' first."
+            )
+
+    def validate_checkpoint_action_mode(
+        self,
+        checkpoint_action_mode: str,
+        *,
+        has_action_expert: bool,
+    ) -> None:
+        """Check that the base checkpoint can support this config's ``action_mode``.
+
+        ``"both"`` needs a ``"both"`` checkpoint, ``"discrete"`` needs a
+        ``"discrete"`` or ``"both"`` checkpoint, and ``"continuous"`` / ``"both"``
+        need ``has_action_expert`` to be true. For ``"continuous"`` the checkpoint's
+        own mode is not inspected here.
+
+        Raises:
+            ValueError: If any of the above requirements is not met.
+        """
+        if self.action_mode == "both" and checkpoint_action_mode != "both":
+            raise ValueError(
+                f"action_mode='both' requires checkpoint action_mode='both', got {checkpoint_action_mode!r}."
+            )
+        if self.action_mode == "discrete" and checkpoint_action_mode not in {"discrete", "both"}:
+            raise ValueError(
+                f"action_mode='discrete' requires checkpoint action_mode in {{'discrete', 'both'}}, "
+                f"got {checkpoint_action_mode!r}."
+            )
+        if self.action_mode in {"continuous", "both"} and not has_action_expert:
+            raise ValueError("Continuous MolmoAct2 training requires an action expert checkpoint.")
+
+    def resolve_inference_action_mode(
+        self,
+        requested_mode: str | None,
+        saved_policy_action_mode: str | None = None,
+    ) -> str:
+        """Return the inference mode to use, falling back to ``inference_action_mode``.
+
+        Unlike ``validate_inference_action_mode``, a mode must be available here:
+        if both ``requested_mode`` and ``inference_action_mode`` are ``None`` this raises.
+
+        Raises:
+            ValueError: If no mode is set, the mode is not ``"continuous"`` or
+                ``"discrete"``, or it is the opposite single mode of the training mode.
+        """
+        training_mode = self.training_action_mode(saved_policy_action_mode)
+        if requested_mode is None:
+            requested_mode = self.inference_action_mode
+        if requested_mode is None:
+            raise ValueError(
+                "MolmoAct2 inference requires `inference_action_mode` to be set explicitly "
+                "to either 'continuous' or 'discrete'."
+            )
+        if requested_mode not in {"continuous", "discrete"}:
+            raise ValueError("MolmoAct2 inference_action_mode must be either 'continuous' or 'discrete'.")
+        if requested_mode == "continuous" and training_mode == "discrete":
+            raise ValueError("MolmoAct2 action_mode='discrete' checkpoint cannot run continuous inference.")
+        if requested_mode == "discrete" and training_mode == "continuous":
+            raise ValueError("MolmoAct2 action_mode='continuous' checkpoint cannot run discrete inference.")
+        return requested_mode
--- a/src/lerobot/policies/molmoact2/hf_model/__init__.py
+++ b/src/lerobot/policies/molmoact2/hf_model/__init__.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
--- a/src/lerobot/policies/molmoact2/hf_model/action_tokenizer.py
+++ b/src/lerobot/policies/molmoact2/hf_model/action_tokenizer.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
+
+import logging
+import os
+from pathlib import Path
+from typing import ClassVar
+
+import numpy as np
+from tokenizers import ByteLevelBPETokenizer
+from tokenizers.trainers import BpeTrainer
+from huggingface_hub import snapshot_download
+from transformers import PreTrainedTokenizerFast
+from transformers.processing_utils import ProcessorMixin
+
+
+def _hf_token() -> str | None:
+    """Return the Hugging Face access token found in the environment, if any.
+
+    ``HF_TOKEN`` takes precedence; ``HF_ACCESS_TOKEN`` is consulted only when
+    ``HF_TOKEN`` is unset or empty.
+    """
+    primary = os.environ.get("HF_TOKEN")
+    if primary:
+        return primary
+    return os.environ.get("HF_ACCESS_TOKEN")
+
+
+def _resolve_tokenizer_location(
+    tokenizer_path: str,
+    *,
+    revision: str | None = None,
+    force_download: bool = False,
+) -> str:
+    """Resolve ``tokenizer_path`` to a location on disk.
+
+    Args:
+        tokenizer_path: Either a filesystem path (``~`` is expanded) or a
+            value that is handed to ``snapshot_download`` as a model repo id.
+        revision: Forwarded to ``snapshot_download`` when a download happens.
+        force_download: Forwarded to ``snapshot_download`` when a download happens.
+
+    Returns:
+        The expanded local path when it exists, otherwise whatever
+        ``snapshot_download`` returns.
+    """
+    path_text = str(tokenizer_path)
+    candidate = Path(path_text).expanduser()
+    if not candidate.exists():
+        # Not a local path: treat the value as a Hub model repo id. Python
+        # sources and bytecode caches are excluded from the snapshot.
+        return snapshot_download(
+            repo_id=path_text,
+            repo_type="model",
+            revision=revision,
+            force_download=force_download,
+            ignore_patterns=["*.py", "*.pyc", "__pycache__/*"],
+            token=_hf_token(),
+        )
+    return str(candidate)
+
+
+class UniversalActionProcessor(ProcessorMixin):
+    """Tokenize action chunks via DCT, quantization and a BPE tokenizer.
+
+    Encoding (``__call__``) applies an orthonormal DCT along the time axis,
+    multiplies by ``scale``, rounds, shifts by ``min_token``, maps every
+    integer to a character with ``chr`` and runs the resulting string through
+    the BPE tokenizer. ``decode`` reverses those steps.
+    """
+
+    attributes: ClassVar[list[str]] = ["tokenizer"]
+    tokenizer_class: str = "AutoTokenizer"
+
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerFast,
+        scale: float = 10,
+        vocab_size: int = 1024,
+        min_token: int = 0,
+        *,
+        action_dim: int | None = None,
+        time_horizon: int | None = None,
+    ):
+        """Store the quantization parameters and wrap ``tokenizer``.
+
+        Args:
+            tokenizer: BPE tokenizer used to encode/decode the character strings.
+            scale: Multiplier applied to DCT coefficients before rounding.
+            vocab_size: Stored on the instance; not otherwise used in this class.
+            min_token: Offset subtracted from quantized coefficients when
+                encoding and added back when decoding.
+            action_dim: Optional action dimension used by ``decode``.
+            time_horizon: Optional number of timesteps used by ``decode``.
+        """
+        self.scale = scale
+        self.vocab_size = vocab_size
+        self.min_token = min_token
+
+        # Action horizon and dimension needed during decoding. These can be specified
+        # in three ways (in order of priority):
+        # 1. passed in as kwargs to decode()
+        # 2. in the constructor
+        # 3. cached from the last time decode() was called
+        self.time_horizon = time_horizon
+        self.action_dim = action_dim
+        self.called_time_horizon = time_horizon
+        self.called_action_dim = action_dim
+
+        super().__init__(tokenizer)
+        self.bpe_tokenizer = self.tokenizer
+
+    def __call__(self, action_chunk: np.ndarray) -> list[list[int]]:
+        """Encode one or more action chunks into BPE token ids.
+
+        Args:
+            action_chunk: Array of shape ``[timesteps, action_dim]`` or
+                ``[batch, timesteps, action_dim]``.
+
+        Returns:
+            One list of token ids per batch element.
+        """
+        from scipy.fft import dct
+
+        assert action_chunk.ndim <= 3, "Only 3 dimensions supported: [batch, timesteps, action_dim]"
+        if action_chunk.ndim == 2:
+            action_chunk = action_chunk[None, ...]
+
+        # Cache the time horizon and action dimension for decoding
+        self.called_time_horizon = action_chunk.shape[-2]
+        self.called_action_dim = action_chunk.shape[-1]
+
+        dct_coeff = dct(action_chunk, axis=1, norm="ortho")
+        dct_coeff = np.around(dct_coeff * self.scale)
+        tokens = []
+        for elem in dct_coeff:
+            # Values below min_token are clamped to 0 so chr() never receives a negative code point.
+            token_str = "".join(map(chr, np.maximum(elem.flatten() - self.min_token, 0).astype(int)))
+            tokens.append(self.bpe_tokenizer(token_str)["input_ids"])
+        return tokens
+
+    def decode(
+        self,
+        tokens: list[list[int]],
+        *,
+        time_horizon: int | None = None,
+        action_dim: int | None = None,
+    ) -> np.ndarray:
+        """Decode token id sequences back into action chunks.
+
+        Args:
+            tokens: One token id sequence per action chunk.
+            time_horizon: Number of timesteps per chunk; falls back to the
+                constructor value, then to the last cached value.
+            action_dim: Action dimension; same fallback order as ``time_horizon``.
+
+        Returns:
+            Array stacking one ``(time_horizon, action_dim)`` chunk per input
+            sequence. A sequence that cannot be decoded to the expected shape
+            is logged and replaced by an all-zero coefficient block.
+
+        Raises:
+            AssertionError: If neither horizon nor dimension can be resolved.
+        """
+        from scipy.fft import idct
+
+        self.time_horizon = time_horizon or self.time_horizon or self.called_time_horizon
+        self.action_dim = action_dim or self.action_dim or self.called_action_dim
+
+        # Cache the time horizon and action dimension for the next call
+        self.called_time_horizon = self.time_horizon
+        self.called_action_dim = self.action_dim
+
+        assert self.time_horizon is not None and self.action_dim is not None, (
+            "Tokenizer not initialized, call encode() once or pass in time_horizon and action_dim."
+        )
+
+        decoded_actions = []
+        for token in tokens:
+            try:
+                decoded_tokens = self.bpe_tokenizer.decode(token)
+                decoded_dct_coeff = np.array(list(map(ord, decoded_tokens))) + self.min_token
+                decoded_dct_coeff = decoded_dct_coeff.reshape(-1, self.action_dim)
+                assert decoded_dct_coeff.shape == (
+                    self.time_horizon,
+                    self.action_dim,
+                ), (
+                    f"Decoded DCT coefficients have shape {decoded_dct_coeff.shape}, expected ({self.time_horizon}, {self.action_dim})"
+                )
+            except Exception as e:
+                # Best-effort decoding: a malformed sequence must not abort the whole
+                # batch, so report it through logging (as fit() does) and fall back to zeros.
+                logging.warning("Error decoding tokens: %s", e)
+                logging.warning("Tokens: %s", token)
+                decoded_dct_coeff = np.zeros((self.time_horizon, self.action_dim))
+            decoded_actions.append(idct(decoded_dct_coeff / self.scale, axis=0, norm="ortho"))
+        return np.stack(decoded_actions)
+
+    @classmethod
+    def fit(
+        cls,
+        action_data: list[np.ndarray],
+        scale: float = 10,
+        vocab_size: int = 1024,
+        *,
+        time_horizon: int | None = None,
+        action_dim: int | None = None,
+    ) -> "UniversalActionProcessor":
+        """Train a BPE tokenizer on ``action_data`` and return a processor using it.
+
+        Args:
+            action_data: Action chunks; the DCT is taken along axis 0 of each.
+            scale: Multiplier applied to DCT coefficients before rounding.
+            vocab_size: Target vocabulary size of the BPE tokenizer.
+            time_horizon: Forwarded to the constructor.
+            action_dim: Forwarded to the constructor.
+
+        Raises:
+            AssertionError: If the quantized value range exceeds ``vocab_size``.
+        """
+        from scipy.fft import dct
+
+        # Run DCT over all inputs and quantize each chunk once; the result is
+        # reused both for the range statistics and for the BPE training strings.
+        quantized_chunks = [
+            np.around(dct(a, axis=0, norm="ortho").flatten() * scale) for a in action_data
+        ]
+        all_quantized = np.concatenate(quantized_chunks)
+
+        # Find the token range
+        max_token = int(all_quantized.max())
+        min_token = int(all_quantized.min())
+        # NOTE(review): the alphabet built below holds max_token - min_token + 1
+        # symbols, one more than min_vocab_size — confirm whether this check
+        # should use the inclusive count.
+        min_vocab_size = max_token - min_token
+
+        assert min_vocab_size <= vocab_size, (
+            f"Vocab size {vocab_size} is too small for the range of tokens {min_vocab_size}"
+        )
+        if min_vocab_size + 100 > vocab_size:
+            logging.warning(
+                f"Initial alphabet size {min_vocab_size} is almost as large as the vocab "
+                f"size {vocab_size}, consider increasing vocab size"
+            )
+
+        # Make token iterator for BPE training
+        def _token_iter():
+            for quantized in quantized_chunks:
+                shifted = (quantized - min_token).astype(int)
+                yield "".join(map(chr, shifted))
+
+        # Train BPE tokenizer
+        bpe = ByteLevelBPETokenizer()
+
+        # Set up the entire range of possible tokens as the initial alphabet
+        alphabet = [chr(i) for i in range(max_token - min_token + 1)]
+        trainer = BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=2,
+            show_progress=True,
+            special_tokens=[],
+            initial_alphabet=alphabet,
+            max_token_length=10000,
+        )
+
+        # Train the inner tokenizer (don't use ByteLevelBPETokenizer.train_from_iterator()
+        # because it doesn't support custom alphabets)
+        bpe._tokenizer.train_from_iterator(_token_iter(), trainer=trainer)
+
+        return cls(
+            PreTrainedTokenizerFast(tokenizer_object=bpe, clean_up_tokenization_spaces=False),
+            scale=scale,
+            vocab_size=vocab_size,
+            min_token=min_token,
+            time_horizon=time_horizon,
+            action_dim=action_dim,
+        )
+
+    @classmethod
+    def from_pretrained_local(
+        cls,
+        pretrained_model_name_or_path: str,
+        *,
+        revision: str | None = None,
+        force_download: bool = False,
+    ) -> "UniversalActionProcessor":
+        """Build a processor from a local directory or a downloaded snapshot.
+
+        The location is resolved with ``_resolve_tokenizer_location``. If it
+        contains ``processor_config.json``, ``scale``, ``vocab_size``,
+        ``min_token``, ``action_dim`` and ``time_horizon`` are read from it;
+        missing keys fall back to 10, 1024, 0, ``None`` and ``None``.
+        """
+        location = Path(
+            _resolve_tokenizer_location(
+                pretrained_model_name_or_path,
+                revision=revision,
+                force_download=force_download,
+            )
+        )
+        processor_config = {}
+        processor_config_path = location / "processor_config.json"
+        if processor_config_path.exists():
+            import json
+
+            processor_config = json.loads(processor_config_path.read_text())
+        tokenizer = PreTrainedTokenizerFast.from_pretrained(str(location))
+        return cls(
+            tokenizer,
+            scale=processor_config.get("scale", 10),
+            vocab_size=processor_config.get("vocab_size", 1024),
+            min_token=processor_config.get("min_token", 0),
+            action_dim=processor_config.get("action_dim"),
+            time_horizon=processor_config.get("time_horizon"),
+        )
--- a/src/lerobot/policies/molmoact2/hf_model/configuration_molmoact2.py
+++ b/src/lerobot/policies/molmoact2/hf_model/configuration_molmoact2.py
@@ -0,0 +1,553 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
+
+"""
+MolmoAct2 configuration
+"""
+
+from typing import Optional, Any
+
+from transformers import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class MolmoAct2VitConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MolmoAct2VisionTransformer`].
+    It is used to instantiate a `MolmoAct2VisionTransformer` according to the specified arguments,
+    defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Example:
+    ```python
+    >>> from transformers import MolmoAct2VitConfig, MolmoAct2VisionTransformer
+
+    >>> # Initializing a MolmoAct2VitConfig
+    >>> configuration = MolmoAct2VitConfig()
+
+    >>> # Initializing a MolmoAct2VisionTransformer (with random weights)
+    >>> model = MolmoAct2VisionTransformer(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "molmoact2"
+    # Key under which this config is nested inside the composite MolmoAct2Config.
+    base_config_key = "vit_config"
+
+    def __init__(
+        self,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        num_hidden_layers: int = 27,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int = 16,
+        head_dim: int = 72,
+        hidden_act: str = "gelu_pytorch_tanh",
+        layer_norm_eps: float = 1e-6,
+        image_default_input_size: tuple[int, int] = (378, 378),
+        image_patch_size: int = 14,
+        image_num_pos: int = 577,
+        attention_dropout: float = 0.0,
+        residual_dropout: float = 0.0,
+        initializer_range: float = 0.02,
+        float32_attention: bool = True,
+        attn_implementation: str = "eager",
+        **kwargs,
+    ):
+        # attn_implementation is stored on the instance before the base
+        # initializer runs and is also forwarded to it as a keyword argument.
+        self.attn_implementation = attn_implementation
+        super().__init__(attn_implementation=attn_implementation, **kwargs)
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.image_default_input_size = image_default_input_size
+        self.image_patch_size = image_patch_size
+        self.image_num_pos = image_num_pos
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.initializer_range = initializer_range
+        self.float32_attention = float32_attention
+
+    @property
+    def image_num_patch(self) -> tuple[int, int]:
+        """Patch-grid size: default input height and width, each floor-divided by the patch size."""
+        h, w = self.image_default_input_size
+        return h // self.image_patch_size, w // self.image_patch_size
+
+
+class MolmoAct2AdapterConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of MolmoAct2Adapter. Together with
+    MolmoAct2VitConfig, it is used to instantiate a MolmoAct2VisionBackbone according to the specified
+    arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Example:
+
+    ```python
+    >>> from transformers import MolmoAct2VitConfig, MolmoAct2AdapterConfig, MolmoAct2VisionBackbone
+
+    >>> # Initializing a MolmoAct2VitConfig and a MolmoAct2AdapterConfig
+    >>> vit_config = MolmoAct2VitConfig()
+    >>> adapter_config = MolmoAct2AdapterConfig()
+
+    >>> # Initializing a MolmoAct2VisionBackbone (with random weights)
+    >>> model = MolmoAct2VisionBackbone(vit_config, adapter_config)
+
+    >>> # Accessing the model configuration
+    >>> vit_configuration = model.vit_config
+    >>> adapter_configuration = model.adapter_config
+    ```"""
+
+    model_type = "molmoact2"
+    # Key under which this config is nested inside the composite MolmoAct2Config.
+    base_config_key = "adapter_config"
+
+    def __init__(
+        self,
+        vit_layers: tuple = (-3, -9),
+        pooling_attention_mask: bool = False,
+        hidden_size: int = 1152,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int = 16,
+        head_dim: int = 72,
+        float32_attention: bool = True,
+        attention_dropout: float = 0.0,
+        residual_dropout: float = 0.0,
+        hidden_act: str = "silu",
+        intermediate_size: int = 18944,
+        text_hidden_size: int = 3584,
+        image_feature_dropout: float = 0.0,
+        initializer_range: float = 0.02,
+        attn_implementation: str = "eager",
+        **kwargs,
+    ):
+        # attn_implementation is stored on the instance before the base
+        # initializer runs and is also forwarded to it as a keyword argument.
+        self.attn_implementation = attn_implementation
+        super().__init__(attn_implementation=attn_implementation, **kwargs)
+        self.vit_layers = vit_layers
+        self.pooling_attention_mask = pooling_attention_mask
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.float32_attention = float32_attention
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.text_hidden_size = text_hidden_size
+        self.image_feature_dropout = image_feature_dropout
+        self.initializer_range = initializer_range
+
+
+class MolmoAct2TextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MolmoAct2TextModel`]. It is used to instantiate a
+    `MolmoAct2TextModel` according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Example:
+    ```python
+    >>> from transformers import MolmoAct2TextConfig, MolmoAct2TextModel
+
+    >>> # Initializing a MolmoAct2TextConfig
+    >>> configuration = MolmoAct2TextConfig()
+
+    >>> # Initializing a MolmoAct2TextModel (with random weights)
+    >>> model = MolmoAct2TextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "molmoact2_text"
+    # Key under which this config is nested inside the composite MolmoAct2Config.
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Module-name patterns mapped to column-wise / row-wise sharding styles.
+    base_model_tp_plan = {
+        "blocks.*.self_attn.att_proj": "colwise",
+        "blocks.*.self_attn.attn_out": "rowwise",
+        "blocks.*.mlp.ff_proj": "colwise",
+        "blocks.*.mlp.ff_out": "rowwise",
+    }
+    # Per-module (input names, output names) pairs.
+    base_model_pp_plan = {
+        "wte": (["input_ids"], ["inputs_embeds"]),
+        "blocks": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "ln_f": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        hidden_size: int = 3584,
+        num_attention_heads: int = 28,
+        num_key_value_heads: int | None = 4,
+        head_dim: int = 128,
+        vocab_size: int = 152064,
+        additional_vocab_size: int = 128,
+        qkv_bias: bool = True,
+        num_hidden_layers: int = 48,
+        intermediate_size: int = 18944,
+        hidden_act: str = "silu",
+        embedding_dropout: float = 0.0,
+        attention_dropout: float = 0.0,
+        residual_dropout: float = 0.0,
+        max_position_embeddings: int = 4096,
+        rope_theta: float = 1000000.0,
+        rope_scaling: dict[str, Any] | None = None,
+        rope_scaling_layers: list[int] | None = None,
+        use_qk_norm: bool = False,
+        qk_norm_type: str = "olmo",
+        layer_norm_eps: float = 1e-6,
+        norm_after: bool = False,
+        initializer_range: float = 0.02,
+        use_cache: bool = True,
+        tie_word_embeddings: bool = False,
+        attn_implementation: str = "eager",
+        **kwargs,
+    ):
+        # attn_implementation is stored on the instance before the base
+        # initializer runs and is also forwarded to it as a keyword argument.
+        self.attn_implementation = attn_implementation
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings, attn_implementation=attn_implementation, **kwargs
+        )
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        # A missing KV-head count falls back to one KV head per attention head.
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.vocab_size = vocab_size
+        self.additional_vocab_size = additional_vocab_size
+        self.qkv_bias = qkv_bias
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.embedding_dropout = embedding_dropout
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.rope_scaling_layers = rope_scaling_layers
+        self.use_qk_norm = use_qk_norm
+        self.qk_norm_type = qk_norm_type
+        self.layer_norm_eps = layer_norm_eps
+        self.norm_after = norm_after
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+
+        # Validate the correctness of rotary position embeddings parameters
+        rope_config_validation(self)
+
+
+class MolmoAct2ActionExpertConfig(PretrainedConfig):
+    r"""Configuration for the MolmoAct2 modern action expert.
+
+    Stores the action expert's hyperparameters as plain attributes. The
+    ``max_action_horizon`` and ``max_action_dim`` attributes are kept on the
+    instance but are omitted from the dictionary produced by ``to_dict``.
+    """
+
+    model_type = "molmoact2_action_expert"
+    # Key under which this config is nested inside the composite MolmoAct2Config.
+    base_config_key = "action_expert_config"
+
+    def __init__(
+        self,
+        max_action_horizon: int = 32,
+        max_action_dim: int = 32,
+        hidden_size: int = 1024,
+        num_layers: int = 32,
+        num_heads: int = 16,
+        mlp_ratio: float = 8.0 / 3.0,
+        ffn_multiple_of: int = 256,
+        timestep_embed_dim: int = 256,
+        dropout: float = 0.0,
+        attn_dropout: float = 0.0,
+        context_layer_norm: bool = True,
+        qk_norm: bool = True,
+        qk_norm_eps: float = 1e-6,
+        rope: bool = True,
+        causal_attn: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.max_action_horizon = max_action_horizon
+        self.max_action_dim = max_action_dim
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.ffn_multiple_of = ffn_multiple_of
+        self.timestep_embed_dim = timestep_embed_dim
+        self.dropout = dropout
+        self.attn_dropout = attn_dropout
+        self.context_layer_norm = context_layer_norm
+        self.qk_norm = qk_norm
+        self.qk_norm_eps = qk_norm_eps
+        self.rope = rope
+        self.causal_attn = causal_attn
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the base-class dictionary without ``max_action_horizon`` and ``max_action_dim``."""
+        output = super().to_dict()
+        # These are derived from the parent MolmoAct2Config for HF exports. Keeping
+        # them out of the public nested config avoids duplicated sources of truth.
+        output.pop("max_action_horizon", None)
+        output.pop("max_action_dim", None)
+        return output
+
+
+class MolmoAct2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MolmoAct2ForConditionalGeneration`].
+    It is used to instantiate a MolmoAct2 model according to the specified arguments, defining the model architecture.
+
+    Example:
+
+    ```python
+    >>> from transformers import MolmoAct2Config, MolmoAct2VitConfig, MolmoAct2AdapterConfig, MolmoAct2TextConfig
+
+    >>> # Initializing a MolmoAct2VitConfig
+    >>> vit_config = MolmoAct2VitConfig()
+
+    >>> # Initializing a MolmoAct2AdapterConfig
+    >>> adapter_config = MolmoAct2AdapterConfig()
+
+    >>> # Initializing a MolmoAct2TextConfig
+    >>> text_config = MolmoAct2TextConfig()
+
+    >>> # Initializing a MolmoAct2Config
+    >>> configuration = MolmoAct2Config(
+    >>>     vit_config=vit_config,
+    >>>     adapter_config=adapter_config,
+    >>>     text_config=text_config,
+    >>>     image_start_token_id=151936,
+    >>>     image_end_token_id=151937,
+    >>>     image_patch_id=151938,
+    >>>     image_col_id=151939,
+    >>>     low_res_image_start_token_id=151940,
+    >>>     image_low_res_id=151942,
+    >>>     frame_start_token_id=151943,
+    >>>     frame_end_token_id=151944,
+    >>> )
+
+    >>> # Initializing a model
+    >>> model = MolmoAct2ForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "molmoact2"
+    # Nested config attributes and the class used to build each of them.
+    sub_configs = {
+        "text_config": MolmoAct2TextConfig,
+        "vit_config": MolmoAct2VitConfig,
+        "adapter_config": MolmoAct2AdapterConfig,
+        "action_expert_config": MolmoAct2ActionExpertConfig,
+    }
+
+    def __init__(
+        self,
+        vit_config: MolmoAct2VitConfig | dict | None = None,
+        adapter_config: MolmoAct2AdapterConfig | dict | None = None,
+        text_config: MolmoAct2TextConfig | dict | None = None,
+        action_expert_config: MolmoAct2ActionExpertConfig | dict | None = None,
+        image_start_token_id: int | None = None,
+        low_res_image_start_token_id: int | None = None,
+        image_end_token_id: int | None = None,
+        image_low_res_id: int | None = None,
+        image_patch_id: int | None = None,
+        image_col_id: int | None = None,
+        frame_start_token_id: int | None = None,
+        frame_end_token_id: int | None = None,
+        use_frame_special_tokens: bool = True,
+        initializer_range: float = 0.02,
+        add_action_expert: bool = True,
+        max_action_dim: int = 32,
+        max_action_horizon: int = 30,
+        n_obs_steps: int = 30,
+        action_mode: str = "both",
+        state_format: str = "discrete",
+        flow_matching_num_steps: int = 10,
+        flow_matching_cutoff: float = 1.0,
+        flow_matching_time_offset: float = 0.001,
+        flow_matching_time_scale: float = 0.999,
+        flow_matching_beta_alpha: float = 1.0,
+        flow_matching_beta_beta: float = 1.5,
+        mask_action_dim_padding: bool = True,
+        enable_depth_reasoning: bool = False,
+        depth_mode: int = 2,
+        num_depth_codes: int = 100,
+        action_expert_depth_gate: bool = False,
+        action_expert_depth_gate_per_layer: bool = False,
+        action_expert_depth_gate_init_bias: float = -4.0,
+        action_output_token_id: int | None = None,
+        action_start_token_id: int | None = None,
+        action_end_token_id: int | None = None,
+        action_token_start_id: int | None = None,
+        num_action_tokens: int = 0,
+        depth_output_token_id: int | None = None,
+        depth_start_token_id: int | None = None,
+        depth_end_token_id: int | None = None,
+        depth_token_start_id: int | None = None,
+        num_depth_tokens: int = 0,
+        state_start_token_id: int | None = None,
+        state_end_token_id: int | None = None,
+        state_token_start_id: int | None = None,
+        num_state_tokens: int = 0,
+        add_setup_tokens: bool = True,
+        add_control_tokens: bool = True,
+        norm_stats_filename: str = "norm_stats.json",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # Each sub-config may be None (build with defaults), a dict (expanded
+        # into the matching config class) or an already-built config object
+        # (stored as-is).
+        if vit_config is None:
+            self.vit_config = MolmoAct2VitConfig()
+        elif isinstance(vit_config, dict):
+            self.vit_config = MolmoAct2VitConfig(**vit_config)
+        else:
+            self.vit_config = vit_config
+        if adapter_config is None:
+            self.adapter_config = MolmoAct2AdapterConfig()
+        elif isinstance(adapter_config, dict):
+            self.adapter_config = MolmoAct2AdapterConfig(**adapter_config)
+        else:
+            self.adapter_config = adapter_config
+        if text_config is None:
+            self.text_config = MolmoAct2TextConfig()
+        elif isinstance(text_config, dict):
+            self.text_config = MolmoAct2TextConfig(**text_config)
+        else:
+            self.text_config = text_config
+        self.add_action_expert = bool(add_action_expert)
+        if not self.add_action_expert:
+            # Without an action expert any supplied action_expert_config is ignored.
+            self.action_expert_config = None
+        elif action_expert_config is None:
+            # The default expert uses as many layers as the text model.
+            self.action_expert_config = MolmoAct2ActionExpertConfig(
+                max_action_horizon=max_action_horizon,
+                max_action_dim=max_action_dim,
+                num_layers=self.text_config.num_hidden_layers,
+            )
+        elif isinstance(action_expert_config, dict):
+            self.action_expert_config = MolmoAct2ActionExpertConfig(**action_expert_config)
+        else:
+            self.action_expert_config = action_expert_config
+        if self.add_action_expert:
+            # The top-level max_action_dim / max_action_horizon always overwrite
+            # the values on the nested config; a caller-supplied config object
+            # is modified in place here.
+            self.action_expert_config.max_action_dim = int(max_action_dim)
+            self.action_expert_config.max_action_horizon = int(max_action_horizon)
+            # state_format is only validated when an action expert is present.
+            self._validate_release_action_config(
+                state_format=state_format,
+            )
+        self.image_start_token_id = image_start_token_id
+        self.low_res_image_start_token_id = low_res_image_start_token_id
+        self.image_end_token_id = image_end_token_id
+        self.image_low_res_id = image_low_res_id
+        # image_high_res_id is stored as an alias of image_patch_id.
+        self.image_high_res_id = image_patch_id
+        self.image_patch_id = image_patch_id
+        self.image_col_id = image_col_id
+        self.frame_start_token_id = frame_start_token_id
+        self.frame_end_token_id = frame_end_token_id
+        self.use_frame_special_tokens = use_frame_special_tokens
+        self.initializer_range = initializer_range
+        self.max_action_dim = max_action_dim
+        self.max_action_horizon = max_action_horizon
+        self.n_obs_steps = n_obs_steps
+        self.action_mode = action_mode
+        self.state_format = state_format
+        self.flow_matching_num_steps = flow_matching_num_steps
+        self.flow_matching_cutoff = flow_matching_cutoff
+        self.flow_matching_time_offset = flow_matching_time_offset
+        self.flow_matching_time_scale = flow_matching_time_scale
+        self.flow_matching_beta_alpha = flow_matching_beta_alpha
+        self.flow_matching_beta_beta = flow_matching_beta_beta
+        self.mask_action_dim_padding = mask_action_dim_padding
+        self.enable_depth_reasoning = enable_depth_reasoning
+        self.depth_mode = depth_mode
+        self.num_depth_codes = num_depth_codes
+        self.action_expert_depth_gate = action_expert_depth_gate
+        self.action_expert_depth_gate_per_layer = action_expert_depth_gate_per_layer
+        self.action_expert_depth_gate_init_bias = action_expert_depth_gate_init_bias
+        self.action_output_token_id = action_output_token_id
+        self.action_start_token_id = action_start_token_id
+        self.action_end_token_id = action_end_token_id
+        self.action_token_start_id = action_token_start_id
+        self.num_action_tokens = num_action_tokens
+        self.depth_output_token_id = depth_output_token_id
+        self.depth_start_token_id = depth_start_token_id
+        self.depth_end_token_id = depth_end_token_id
+        self.depth_token_start_id = depth_token_start_id
+        self.num_depth_tokens = num_depth_tokens
+        self.state_start_token_id = state_start_token_id
+        self.state_end_token_id = state_end_token_id
+        self.state_token_start_id = state_token_start_id
+        self.num_state_tokens = num_state_tokens
+        self.add_setup_tokens = add_setup_tokens
+        self.add_control_tokens = add_control_tokens
+        self.norm_stats_filename = norm_stats_filename
+
+    @staticmethod
+    def _validate_release_action_config(
+        *,
+        state_format: str,
+    ) -> None:
+        """Raise ``ValueError`` unless ``state_format`` is ``"discrete"``."""
+        if state_format != "discrete":
+            raise ValueError("MolmoAct2 HF export supports only state_format='discrete'.")
+
+    @property
+    def image_num_patch(self):
+        """Patch-grid size, delegated to ``vit_config``."""
+        assert self.vit_config is not None
+        return self.vit_config.image_num_patch
+
+    # The properties below expose text-model hyperparameters at the top level
+    # by reading them from ``text_config``.
+    @property
+    def num_attention_heads(self):
+        return self.text_config.num_attention_heads
+
+    @property
+    def num_key_value_heads(self):
+        return self.text_config.num_key_value_heads
+
+    @property
+    def head_dim(self):
+        return self.text_config.head_dim
+
+    @property
+    def num_hidden_layers(self):
+        return self.text_config.num_hidden_layers
+
+    @property
+    def hidden_size(self):
+        return self.text_config.hidden_size
+
+    @property
+    def vocab_size(self):
+        return self.text_config.vocab_size
+
+    @property
+    def max_position_embeddings(self):
+        return self.text_config.max_position_embeddings
+
+
+# Call register_for_auto_class() on every config class defined in this module,
+# in definition order.
+for _config_cls in (
+    MolmoAct2VitConfig,
+    MolmoAct2AdapterConfig,
+    MolmoAct2TextConfig,
+    MolmoAct2ActionExpertConfig,
+    MolmoAct2Config,
+):
+    _config_cls.register_for_auto_class()
+del _config_cls  # keep the loop variable out of the module namespace
--- a/src/lerobot/policies/molmoact2/hf_model/image_processing_molmoact2.py
+++ b/src/lerobot/policies/molmoact2/hf_model/image_processing_molmoact2.py
@@ -0,0 +1,564 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
+
+"""Image processor class for MolmoAct2"""
+
+from typing import Optional, Union
+import numpy as np
+import einops
+import torch
+import torchvision.transforms
+
+from transformers.image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ImageInput,
+    PILImageResampling,
+    make_flat_list_of_images,
+    valid_images,
+    to_numpy_array,
+)
+from transformers.image_transforms import convert_to_rgb
+from transformers.processing_utils import ImagesKwargs
+from transformers.image_processing_utils import BaseImageProcessor, get_size_dict
+from transformers.utils import logging
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.utils import TensorType, logging
+
+
+# Module-level logger obtained from the transformers logging utilities.
+logger = logging.get_logger(__name__)
+
+
+def normalize_image(
+    image: np.ndarray,
+    image_mean: list[float],
+    image_std: list[float],
+) -> np.ndarray:
+    """Normalize a channels-last image with per-channel mean and std.
+
+    The input array is never modified: both code paths return a new array.
+    (The previous general path normalized ``image`` in place, which mutated
+    the caller's data and failed outright for non-float inputs.)
+
+    Args:
+        image: Array whose last axis is the channel axis; the statistics are
+            broadcast as ``[1, 1, c]``.
+        image_mean: Per-channel mean to subtract.
+        image_std: Per-channel standard deviation to divide by.
+
+    Returns:
+        The normalized image as a new array.
+    """
+    if np.allclose(image_mean, [0.5, 0.5, 0.5]) and np.allclose(image_std, [0.5, 0.5, 0.5]):
+        # Fast path: (x - 0.5) / 0.5 is the same as 2x - 1.
+        return image * np.asarray(2.0, dtype=np.float32) - np.asarray(1.0, dtype=np.float32)
+    mean = np.array(image_mean, dtype=np.float32)[None, None, :]
+    std = np.array(image_std, dtype=np.float32)[None, None, :]
+    return (image - mean) / std
+
+
+def resize_image(
+    image: np.ndarray,
+    desired_output_size: list[int],
+    resample: PILImageResampling,
+) -> np.ndarray:
+    """Resize a channels-last image and rescale its values to float32 in ``[0, 1]``.
+
+    Float inputs are treated as already lying in ``[0, 1]``; any other input
+    must be ``uint8`` and is divided by 255 after resizing.
+
+    Args:
+        image: ``[h, w, c]`` array (float or uint8).
+        desired_output_size: Target ``[height, width]`` handed to torchvision's ``Resize``.
+        resample: Interpolation mode handed to torchvision's ``Resize``.
+
+    Returns:
+        The resized image as a float32 ``[h, w, c]`` array.
+    """
+    tensor = torch.from_numpy(image).permute(2, 0, 1)  # channels first for torchvision
+    source_dtype = tensor.dtype
+    is_float = torch.is_floating_point(tensor)
+    if not is_float:
+        assert tensor.dtype == torch.uint8, "SigLIP expects float images or uint8 images, but got {}".format(
+            tensor.dtype
+        )
+
+    resizer = torchvision.transforms.Resize(
+        desired_output_size,
+        resample,
+        antialias=False,
+    )
+    if is_float:
+        value_min, value_max = 0.0, 1.0
+        out = torch.clip(resizer(tensor), 0.0, 1.0)
+    else:
+        value_min, value_max = 0.0, 255.0
+        out = torch.clip(resizer(tensor), 0, 255)
+
+    # Round-trip through the source dtype before converting to float32.
+    out = out.to(source_dtype).to(torch.float32)
+    out = (out - value_min) / (value_max - value_min)
+
+    return out.permute(1, 2, 0).numpy()
+
+
+def select_tiling(h, w, patch_size, max_num_crops):
+    """Pick the crop grid that best fits an image of height ``h`` and width ``w``.
+
+    Every ``(rows, cols)`` grid with ``rows * cols <= max_num_crops`` is a
+    candidate; each one covers ``rows * patch_size`` by ``cols * patch_size``.
+    If every candidate forces a downscale, the one that downscales the least
+    is chosen. Otherwise the candidate needing the least upscaling wins.
+
+    Args:
+        h: Image height.
+        w: Image width.
+        patch_size: Side length covered by one tile of the grid.
+        max_num_crops: Maximum number of tiles; must be at least 1.
+
+    Returns:
+        An int32 array ``[rows, cols]``.
+
+    Raises:
+        ValueError: If ``max_num_crops`` is smaller than 1.
+    """
+    if max_num_crops < 1:
+        raise ValueError(f"max_num_crops must be >= 1, got {max_num_crops}.")
+
+    tilings = []
+    for i in range(1, max_num_crops + 1):
+        for j in range(1, max_num_crops + 1):
+            if i * j <= max_num_crops:
+                tilings.append((i, j))
+    # sort so argmin and argmax favour smaller tilings in the event of a tie
+    tilings.sort(key=lambda x: (x[0] * x[1], x[0]))
+    candidate_tilings = np.array(tilings, dtype=np.int32)  # [n_resolutions, 2]
+    candidate_resolutions = candidate_tilings * patch_size  # [n_resolutions, 2]
+
+    # How much we would need to scale the image to fit exactly in each tiling
+    original_size = np.array([h, w], dtype=np.float32)  # [2]
+
+    # The original size can be zero in rare cases if the image is smaller than the margin
+    # In those cases letting the scale become infinite means the tiling is based on the
+    # other side, or falls back to the smallest tiling
+    with np.errstate(divide="ignore"):
+        required_scale_d = candidate_resolutions.astype(np.float32) / original_size
+    # The limiting side decides the scale of each candidate.
+    required_scale = np.min(required_scale_d, axis=-1)  # [n_resolutions]
+    if np.all(required_scale < 1):
+        # We are forced to downscale, so try to minimize the amount of downscaling
+        ix = np.argmax(required_scale)
+    else:
+        # Pick the resolution that required the least upscaling so that it most closely fits the image
+        required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
+        ix = np.argmin(required_scale)
+    return candidate_tilings[ix]
+
+
+def build_resized_image(
+    image: np.ndarray,
+    base_image_input_size: list[int],
+    resample: PILImageResampling,
+    image_mean: list[float],
+    image_std: list[float],
+    image_patch_size: int,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Resize and normalize the whole image into a single crop.
+
+    Returns:
+        A pair ``(crops, patch_index)``: the normalized image with a leading
+        crop axis, and a ``[patches_h, patches_w]`` grid numbering its patches
+        in row-major order.
+    """
+    normalized = normalize_image(
+        resize_image(image, base_image_input_size, resample),
+        image_mean,
+        image_std,
+    )
+    if normalized.ndim == 3:
+        # Add the crop axis so the result stacks with other crops.
+        normalized = np.expand_dims(normalized, 0)
+    patches_w = base_image_input_size[1] // image_patch_size
+    patches_h = base_image_input_size[0] // image_patch_size
+    patch_index = np.arange(patches_w * patches_h).reshape([patches_h, patches_w])
+    return normalized, patch_index
+
+
+def build_overlapping_crops(
+    image: np.ndarray,
+    max_crops: int,
+    overlap_margins: list[int],
+    base_image_input_size: list[int],
+    resample: PILImageResampling,
+    image_mean: list[float],
+    image_std: list[float],
+    image_patch_size: int,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Decompose an image into a set of overlapping crops
+
+    The image is resized to fit a tiling chosen by `select_tiling`, then square
+    crops of side `base_image_input_size[0]` are cut while stepping by the crop
+    size minus the overlap margins, so neighbouring crops overlap. Patches in
+    the overlap are assigned to exactly one crop.
+
+    :param overlap_margins: `(left, right)` margins, in patches, that a crop
+                            shares with its neighbours; a margin of 0 is allowed
+    :return crop_arr: [n_crops, h, w, 3] The crops
+    :return patch_idx: [overlap_patch_h, overlap_patch_w] For each patch in the resized image
+                        the crops were extracted from, what patch in `crop_arr` it corresponds to
+    """
+    original_image_h, original_image_w = image.shape[:2]
+    crop_size = base_image_input_size[0]
+    assert base_image_input_size[0] == base_image_input_size[1]
+
+    left_margin, right_margin = overlap_margins
+    total_margin_pixels = image_patch_size * (right_margin + left_margin)  # pixels removed per dim
+    crop_patches = base_image_input_size[0] // image_patch_size  # patches per crop dim
+    crop_window_patches = crop_patches - (right_margin + left_margin)  # usable patches
+    crop_window_size = crop_window_patches * image_patch_size
+    crop_patch_w = base_image_input_size[1] // image_patch_size
+    crop_patch_h = base_image_input_size[0] // image_patch_size
+
+    # Decide how to tile the image, to account for the overlap margins we compute the tiling
+    # as if we had an image without the margins and were using a crop size without the margins
+    tiling = select_tiling(
+        original_image_h - total_margin_pixels,
+        original_image_w - total_margin_pixels,
+        crop_window_size,
+        max_crops,
+    )
+
+    src = resize_image(
+        image,
+        [
+            tiling[0] * crop_window_size + total_margin_pixels,
+            tiling[1] * crop_window_size + total_margin_pixels,
+        ],
+        resample,
+    )
+    src = normalize_image(src, image_mean, image_std)
+
+    # Now we have to split the image into crops, and track what patches came from
+    # where in `patch_idx_arr`
+    n_crops = tiling[0] * tiling[1]
+    crop_arr = np.zeros([n_crops, crop_size, crop_size, 3], dtype=src.dtype)
+    patch_idx_arr = np.zeros([n_crops, crop_patch_h, crop_patch_w], dtype=np.int32)
+    on_crop = 0
+    for i in range(tiling[0]):
+        # Slide over `src` by `crop_window_size` steps, but extract crops of size `crops_size`
+        # which results in overlapping crop windows
+        y0 = i * crop_window_size
+        for j in range(tiling[1]):
+            x0 = j * crop_window_size
+            crop_arr[on_crop] = src[y0 : y0 + crop_size, x0 : x0 + crop_size]
+            patch_idx = np.arange(crop_patch_w * crop_patch_h).reshape(crop_patch_h, crop_patch_w)
+            patch_idx += on_crop * crop_patch_h * crop_patch_w
+
+            # Mask out idx that are in the overlap region
+            if i != 0:
+                patch_idx[:left_margin, :] = -1
+            if j != 0:
+                patch_idx[:, :left_margin] = -1
+            # The trailing margin is addressed from the front of the axis: a
+            # `[-right_margin:]` slice would select the WHOLE axis when the
+            # margin is 0 and mask every patch of the crop.
+            if i != tiling[0] - 1:
+                patch_idx[crop_patch_h - right_margin :, :] = -1
+            if j != tiling[1] - 1:
+                patch_idx[:, crop_patch_w - right_margin :] = -1
+            patch_idx_arr[on_crop] = patch_idx
+            on_crop += 1
+
+    # `patch_idx_arr` is ordered crop-by-crop, here we transpose `patch_idx_arr`
+    # so it is ordered left-to-right order
+    patch_idx_arr = np.reshape(patch_idx_arr, [tiling[0], tiling[1], crop_patch_h, crop_patch_w])
+    patch_idx_arr = np.transpose(patch_idx_arr, [0, 2, 1, 3])
+    patch_idx_arr = np.reshape(patch_idx_arr, [-1])
+
+    # Now get the parts not in the overlap region, so it should map each patch in `src`
+    # to the correct patch it should come from in `crop_arr`
+    patch_idx_arr = patch_idx_arr[patch_idx_arr >= 0].reshape(
+        src.shape[0] // image_patch_size,
+        src.shape[1] // image_patch_size,
+    )
+    return crop_arr, patch_idx_arr
+
+
+def batch_pixels_to_patches(array: np.ndarray, patch_size: int) -> np.ndarray:
+    """Cut each image in a batch into flattened, non-overlapping square patches.
+
+    Accepts ``[n_images, h, w]`` or ``[n_images, h, w, c]`` input and returns
+    ``[n_images, n_patches, pixels_per_patch]`` with patches in row-major order.
+    """
+    if array.ndim == 3:
+        count, height, width = array.shape
+        rows, cols = height // patch_size, width // patch_size
+        tiles = array.reshape(count, rows, patch_size, cols, patch_size)
+        # Bring the two patch-grid axes together ahead of the in-patch axes.
+        tiles = tiles.transpose(0, 1, 3, 2, 4)
+        return tiles.reshape(count, rows * cols, patch_size * patch_size)
+
+    count, height, width, channels = array.shape
+    rows, cols = height // patch_size, width // patch_size
+    tiles = array.reshape(count, rows, patch_size, cols, patch_size, channels)
+    tiles = tiles.transpose(0, 1, 3, 2, 4, 5)
+    return tiles.reshape(count, rows * cols, patch_size * patch_size * channels)
+
+
+def arange_for_pooling(
+    idx_arr: np.ndarray,
+    pool_h: int,
+    pool_w: int,
+) -> np.ndarray:
+    """Group a 2-D patch-index grid into ``pool_h`` x ``pool_w`` pooling windows.
+
+    The grid is padded with ``-1`` (split between both sides, the extra cell
+    going to the end) until each side is a multiple of the pooling size.
+
+    Returns:
+        An array of shape ``[h, w, pool_h * pool_w]`` listing the indices in each window.
+    """
+    rows, cols = idx_arr.shape[0], idx_arr.shape[1]
+    # Amount needed to round each side up to the next multiple of the pool size.
+    pad_rows = -rows % pool_h
+    pad_cols = -cols % pool_w
+    padded = np.pad(
+        idx_arr,
+        [
+            [pad_rows // 2, (pad_rows + 1) // 2],
+            [pad_cols // 2, (pad_cols + 1) // 2],
+        ],
+        mode="constant",
+        constant_values=-1,
+    )
+    return einops.rearrange(padded, "(h dh) (w dw) -> h w (dh dw)", dh=pool_h, dw=pool_w)
+
+
+def image_to_patches_and_grids(
+    image: np.ndarray,
+    max_crops: int,
+    overlap_margins: list[int],
+    base_image_input_size: list[int],
+    resample: PILImageResampling,
+    image_mean: list[float],
+    image_std: list[float],
+    image_patch_size: int,
+    image_pooling_w: int,
+    image_pooling_h: int,
+    crop_mode: str = "overlap-and-resize-c2",
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Turn one image into ViT patches plus the pooling layout of its tokens.
+
+    With ``crop_mode="resize"`` only a single resized view is produced. The
+    overlap modes add overlapping crops after that global view.
+
+    :return image_grids, the shape of each (low-res, high-res) image after pooling
+    :return crops, the image crops to processes with the ViT
+    :return pooled_patch_idx, for each patch_id tokens in `image_tokens`, the indices of the
+                                patches in `crops` to pool for that token, masked with -1
+    """
+    if isinstance(base_image_input_size, int):
+        base_image_input_size = (base_image_input_size, base_image_input_size)
+
+    patches_per_crop_w = base_image_input_size[1] // image_patch_size
+    patches_per_crop_h = base_image_input_size[0] // image_patch_size
+    window = image_pooling_h * image_pooling_w
+
+    def _global_view():
+        """Build the resized full image and its pooled patch-index table."""
+        view, view_idx = build_resized_image(
+            image,
+            base_image_input_size,
+            resample,
+            image_mean,
+            image_std,
+            image_patch_size,
+        )
+        view_idx = arange_for_pooling(view_idx, image_pooling_h, image_pooling_w)
+        grid_h, grid_w = view_idx.shape[:2]
+        return view, view_idx.reshape([-1, window]), grid_h, grid_w
+
+    if crop_mode == "resize":
+        resized, resize_idx, resized_h, resized_w = _global_view()
+        # No high-res crops in this mode, so the second grid is 0 x 0.
+        grids = np.stack([np.array([resized_h, resized_w, 0, 0])], 0)
+        return grids, batch_pixels_to_patches(resized, image_patch_size), resize_idx
+
+    if crop_mode not in {"overlap-and-resize-c2", "overlap-and-resize"}:
+        raise ValueError(f"Unsupported MolmoAct2 image crop_mode {crop_mode!r}.")
+
+    crop_arr, patch_idx_arr = build_overlapping_crops(
+        image,
+        max_crops,
+        overlap_margins,
+        base_image_input_size,
+        resample,
+        image_mean,
+        image_std,
+        image_patch_size,
+    )
+    pooling_idx = arange_for_pooling(patch_idx_arr, image_pooling_h, image_pooling_w)
+    h, w = pooling_idx.shape[:2]
+    pooling_idx = pooling_idx.reshape([-1, window])
+
+    # Finally do the same for the global image
+    resized, resize_idx, resized_h, resized_w = _global_view()
+    crop_arr = np.concatenate([resized, crop_arr], 0)
+
+    # Global image goes first, so the order of patches in previous crops gets increased
+    global_patch_count = patches_per_crop_h * patches_per_crop_w
+    pooling_idx = np.where(pooling_idx >= 0, pooling_idx + global_patch_count, -1)
+    pooling_idx = np.concatenate([resize_idx, pooling_idx])
+    grids = np.stack([np.array([resized_h, resized_w, h, w])], 0)
+
+    return (grids, batch_pixels_to_patches(crop_arr, image_patch_size), pooling_idx)
+
+
+class MolmoAct2ImagesKwargs(ImagesKwargs, total=False):
+    """Optional MolmoAct2-specific image keyword arguments layered on `ImagesKwargs`.
+
+    The field names mirror the keyword arguments accepted by
+    `MolmoAct2ImageProcessor`; every field may be omitted (``total=False``).
+    """
+
+    # Maximum number of crops per image.
+    max_crops: int | None
+    # Overlap margins between neighbouring crops.
+    overlap_margins: list[int] | None
+    # Name of the cropping strategy.
+    crop_mode: str | None
+    # Spatial patch size of the vision encoder.
+    patch_size: int | None
+    # Pooling size of the vision adapter.
+    pooling_size: list[int] | None
+
+
+class MolmoAct2ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a MolmoAct2 image processor that preprocesses images for the model.
+
+    Args:
+        size (`dict[str, int]` *optional*, defaults to `{"height": 378, "width": 378}`):
+            Size of the image after resizing.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use when resizing the image.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        image_std (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        max_crops (`int`, *optional*, defaults to `8`):
+            Maximum number of crops to use per image.
+        overlap_margins (`list[int]`, *optional*, defaults to `[4, 4]`):
+            Overlap margins to use.
+        crop_mode (`str`, *optional*, defaults to `"overlap-and-resize-c2"`):
+            Cropping strategy. `"resize"` produces a single resized view; `"overlap-and-resize-c2"` and
+            `"overlap-and-resize"` add overlapping crops to that view.
+        patch_size (`int`, *optional*, defaults to 14):
+            The spatial patch size of the vision encoder.
+        pooling_size (`list[int]`, *optional*, defaults to `[2, 2]`):
+            The pooling size of the vision adapter.
+    """
+
+    model_input_names = ["pixel_values", "image_token_pooling", "image_grids", "image_num_crops"]
+
+    def __init__(
+        self,
+        size: dict[str, int] | None = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        image_mean: float | list[float] | None = None,
+        image_std: float | list[float] | None = None,
+        do_convert_rgb: bool = True,
+        max_crops: int = 8,
+        overlap_margins: list[int] = [4, 4],
+        crop_mode: str = "overlap-and-resize-c2",
+        patch_size: int = 14,
+        pooling_size: list[int] = [2, 2],
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 378, "width": 378}
+        size = get_size_dict(size, default_to_square=True)
+        self.size = size
+
+        self.resample = resample
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+        self.do_convert_rgb = do_convert_rgb
+
+        self.max_crops = max_crops
+        # Store copies of the list arguments: the signature defaults are shared
+        # objects, so keeping them directly would let one instance's mutation
+        # leak into every other instance built with the defaults.
+        self.overlap_margins = list(overlap_margins) if overlap_margins is not None else [4, 4]
+        self.crop_mode = crop_mode
+        self.patch_size = patch_size
+        self.pooling_size = list(pooling_size) if pooling_size is not None else [2, 2]
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        size: dict[str, int] | None = None,
+        resample: PILImageResampling | None = None,
+        image_mean: float | list[float] | None = None,
+        image_std: float | list[float] | None = None,
+        do_convert_rgb: bool | None = None,
+        max_crops: int | None = None,
+        overlap_margins: list[int] | None = None,
+        crop_mode: str | None = None,
+        patch_size: int | None = None,
+        pooling_size: list[int] | None = None,
+        return_tensors: str | TensorType | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. When `None`, an empty `BatchFeature` is returned.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing. Must contain `"height"` and `"width"`.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use when resizing the image. This can be one of the enum `PILImageResampling`.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            max_crops (`int`, *optional*, defaults to `self.max_crops`):
+                Maximum number of crops to use per image.
+            overlap_margins (`list[int]`, *optional*, defaults to `self.overlap_margins`):
+                Overlap margins to use.
+            crop_mode (`str`, *optional*, defaults to `self.crop_mode`):
+                Cropping strategy, one of `"resize"`, `"overlap-and-resize-c2"` or `"overlap-and-resize"`.
+            patch_size (`int`, *optional*, defaults to `self.patch_size`):
+                The spatial patch size of the vision encoder.
+            pooling_size (`list[int]`, *optional*, defaults to `self.pooling_size`):
+                The pooling size of the vision adapter.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+
+        Returns:
+            A `BatchFeature` containing the following keys:
+                - `pixel_values`: The preprocessed images.
+                - `image_token_pooling`: The indices of the patches in `crops` to pool for each token in `image_tokens`.
+                - `image_grids`: The image grids.
+                - `image_num_crops`: The number of crops for each image.
+
+        Raises:
+            ValueError: If `size` lacks the `"height"`/`"width"` keys or the images are of an unsupported type.
+        """
+        if size is not None:
+            if "height" not in size or "width" not in size:
+                raise ValueError("size must contain 'height' and 'width' keys.")
+        else:
+            size = {**self.size}
+
+        base_image_input_size = [size["height"], size["width"]]
+
+        # Fall back to the instance defaults only when an argument is omitted.
+        # Truthiness checks would silently discard legitimate falsy overrides
+        # such as `do_convert_rgb=False` or `PILImageResampling.NEAREST` (== 0).
+        resample = self.resample if resample is None else resample
+        image_mean = self.image_mean if image_mean is None else image_mean
+        image_std = self.image_std if image_std is None else image_std
+        do_convert_rgb = self.do_convert_rgb if do_convert_rgb is None else do_convert_rgb
+
+        max_crops = self.max_crops if max_crops is None else max_crops
+        overlap_margins = self.overlap_margins if overlap_margins is None else overlap_margins
+        crop_mode = self.crop_mode if crop_mode is None else crop_mode
+        patch_size = self.patch_size if patch_size is None else patch_size
+        pooling_size = self.pooling_size if pooling_size is None else pooling_size
+
+        image_pooling_h, image_pooling_w = pooling_size
+
+        if images is None:
+            # Nothing to process: return an empty feature set instead of
+            # failing later while iterating over `None`.
+            return BatchFeature({}, tensor_type=return_tensors)
+
+        images = self.fetch_images(images)
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        batch_grids = []
+        batch_crops = []
+        batch_pooled_patches_idx = []
+        batch_num_crops = []
+
+        for image in images:
+            image_grid, crops, pooled_idx = image_to_patches_and_grids(
+                image,
+                max_crops,
+                overlap_margins,
+                base_image_input_size,
+                resample,
+                image_mean,
+                image_std,
+                patch_size,
+                image_pooling_w,
+                image_pooling_h,
+                crop_mode,
+            )
+            batch_grids.append(image_grid)
+            batch_crops.append(crops)
+            batch_pooled_patches_idx.append(pooled_idx)
+            batch_num_crops.append(crops.shape[0])
+
+        # Crops and pooling tables of all images are concatenated along axis 0;
+        # `image_num_crops` records how many crops belong to each image.
+        data = {
+            "pixel_values": np.concatenate(batch_crops, 0),
+            "image_token_pooling": np.concatenate(batch_pooled_patches_idx, 0),
+            "image_grids": np.concatenate(batch_grids, 0),
+            "image_num_crops": np.array(batch_num_crops),
+        }
+
+        return BatchFeature(data, tensor_type=return_tensors)
+
+
+# Register the image processor through the Hugging Face
+# `register_for_auto_class()` hook at import time (presumably so exported
+# checkpoints can resolve it as custom code; confirm against the transformers docs).
+MolmoAct2ImageProcessor.register_for_auto_class()
--- a/src/lerobot/policies/molmoact2/hf_model/inference.py
+++ b/src/lerobot/policies/molmoact2/hf_model/inference.py
@@ -0,0 +1,748 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
+
+"""Inference utilities for MolmoAct2"""
+
+from dataclasses import dataclass
+from typing import Any, Optional, Tuple
+from collections.abc import Iterable, Sequence
+
+import torch
+from torch.nn import functional as F
+from transformers.cache_utils import Cache
+from transformers.configuration_utils import PretrainedConfig
+
+
+@dataclass
+class _ActionFlowInputs:
+    """Inputs handed to the action-flow `run_loop` callable."""
+
+    # Trajectory tensor; its device decides CUDA-graph eligibility and it is
+    # re-copied into the static buffer after graph warmup.
+    trajectory: torch.Tensor
+    # Context object; this file reads `kv_contexts`, `cross_mask`, `self_mask`,
+    # `valid_action` and `rope_cache` from it.
+    context: Any
+    # Per-step modulation objects exposing `conditioning`, `block_modulations`
+    # and `final_modulation`.
+    modulations: Sequence[Any]
+    # Optional tensor; presumably flags padded action dimensions (TODO confirm).
+    action_dim_is_pad: torch.Tensor | None
+
+
+@dataclass
+class _ActionFlowCudaGraph:
+    """A captured action-flow CUDA graph together with its static buffers."""
+
+    # Key the capture was made for; a different key triggers a re-capture.
+    key: tuple[Any, ...]
+    # The captured graph that gets replayed.
+    graph: torch.cuda.CUDAGraph
+    # Static input buffers that new inputs are copied into before a replay.
+    static_inputs: _ActionFlowInputs
+    # Output tensor produced by the capture; cloned after each replay.
+    output: torch.Tensor
+
+
+@dataclass
+class _DepthDecodeCudaGraphLayerStage:
+    """Per-layer tensors of a depth-decode CUDA graph.
+
+    NOTE(review): presumably static buffers shared between the captured
+    stages of one transformer layer. Confirm against the capture code.
+    """
+
+    residual: torch.Tensor
+    query: torch.Tensor
+    key: torch.Tensor
+    value: torch.Tensor
+
+
+@dataclass
+class _DepthDecodeCudaGraphPostStage:
+    """A captured graph paired with an attention-context tensor.
+
+    NOTE(review): presumably the post-attention part of one layer, with
+    `attn_context` as its static input. Confirm against the capture code.
+    """
+
+    graph: torch.cuda.CUDAGraph
+    attn_context: torch.Tensor
+
+
+@dataclass
+class _DepthDecodeCudaGraph:
+    """Container for the captured depth-decode graphs and their tensors.
+
+    NOTE(review): the tensor fields are presumably static buffers refreshed
+    before each replay. Confirm against the capture/replay code.
+    """
+
+    # Key identifying the configuration this capture belongs to.
+    cache_key: tuple[Any, ...]
+    pre_graph: torch.cuda.CUDAGraph
+    token_ids: torch.Tensor
+    cos: torch.Tensor
+    sin: torch.Tensor
+    positions: torch.Tensor
+    # One entry per `_DepthDecodeCudaGraphLayerStage`.
+    stages: Sequence[_DepthDecodeCudaGraphLayerStage]
+    # One entry per `_DepthDecodeCudaGraphPostStage`.
+    post_graphs: Sequence[_DepthDecodeCudaGraphPostStage]
+    output: torch.Tensor
+
+
+@dataclass
+class _DepthDecodeCudaGraphSpec:
+    """Config-derived facts about the backbone, computed once and cached."""
+
+    # Whether the transformer config satisfies the conditions checked when the
+    # spec is built (norm placement, RoPE settings, SDPA attention).
+    eligible: bool
+    # Tuple of config values used as the leading part of a graph cache key.
+    cache_key_prefix: tuple[Any, ...]
+    num_hidden_layers: int
+    head_dim: int
+    num_attention_heads: int
+
+
+def _cache_seq_len_int(past_key_values: Cache | None) -> int:
+    """Return the cache's current sequence length as a plain ``int`` (0 without a cache)."""
+    if past_key_values is None:
+        return 0
+    length = past_key_values.get_seq_length()
+    # The cache may report the length as a tensor; unwrap it to a Python int.
+    return int(length.item()) if torch.is_tensor(length) else int(length)
+
+
+def _cache_max_len_int(past_key_values: Cache | None) -> int:
+    """Return the cache's maximum length as a plain ``int`` (-1 without a cache)."""
+    if past_key_values is None:
+        return -1
+    capacity = past_key_values.get_max_cache_shape()
+    # The cache may report the capacity as a tensor; unwrap it to a Python int.
+    return int(capacity.item()) if torch.is_tensor(capacity) else int(capacity)
+
+
+def _iter_cache_key_values(
+    past_key_values: Cache,
+) -> Iterable[tuple[torch.Tensor | None, torch.Tensor | None]]:
+    """Yield one ``(keys, values)`` pair per cache layer.
+
+    Caches exposing a ``layers`` attribute are read through each layer's
+    ``keys``/``values`` attributes (``None`` when missing); anything else is
+    iterated directly and indexed as ``layer[0], layer[1]``.
+    """
+    layers = getattr(past_key_values, "layers", None)
+    if layers is None:
+        for entry in past_key_values:
+            yield entry[0], entry[1]
+        return
+    for layer in layers:
+        yield getattr(layer, "keys", None), getattr(layer, "values", None)
+
+
+class _DepthDecodeStaticLayerCache:
+    """Fixed-capacity key/value buffer for one layer, filled front to back.
+
+    Buffers are allocated on the first `update` call, using the shape, dtype
+    and device of the incoming states. `reset` only rewinds the write
+    position; the buffers themselves are kept.
+    """
+
+    is_compileable = False
+    is_sliding = False
+
+    def __init__(self, max_cache_len: int) -> None:
+        self.max_cache_len = int(max_cache_len)
+        self.cumulative_length = 0  # number of positions written so far
+        self.keys: torch.Tensor | None = None
+        self.values: torch.Tensor | None = None
+
+    def _allocate(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
+        """Create the key and value buffers, sized from the first states seen."""
+        batch, heads = key_states.shape[:2]
+
+        def _buffer_like(states: torch.Tensor) -> torch.Tensor:
+            return torch.empty(
+                (batch, heads, self.max_cache_len, states.shape[-1]),
+                dtype=states.dtype,
+                device=states.device,
+            )
+
+        self.keys = _buffer_like(key_states)
+        self.values = _buffer_like(value_states)
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        *args,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Append the new states and return views over everything stored so far.
+
+        Raises:
+            RuntimeError: If the write would exceed ``max_cache_len``.
+        """
+        if self.keys is None:
+            self._allocate(key_states, value_states)
+        write_from = self.cumulative_length
+        write_to = write_from + key_states.shape[-2]
+        if write_to > self.max_cache_len:
+            raise RuntimeError(f"KV cache length {write_to} exceeds max_cache_len={self.max_cache_len}.")
+        span = slice(write_from, write_to)
+        self.keys[:, :, span, :].copy_(key_states)
+        self.values[:, :, span, :].copy_(value_states)
+        self.cumulative_length = write_to
+        filled = slice(None, write_to)
+        return self.keys[:, :, filled, :], self.values[:, :, filled, :]
+
+    def get_seq_length(self) -> int:
+        """Return how many positions have been written."""
+        return self.cumulative_length
+
+    def get_max_cache_shape(self) -> int:
+        """Always report -1, regardless of ``max_cache_len``."""
+        return -1
+
+    def reset(self) -> None:
+        """Rewind the write position to the start of the buffers."""
+        self.cumulative_length = 0
+
+
+class _DepthDecodeStaticCache(Cache):
+    """`Cache` made of one `_DepthDecodeStaticLayerCache` per decoder layer."""
+
+    def __init__(self, config: PretrainedConfig, max_cache_len: int) -> None:
+        decoder_config = config.get_text_config(decoder=True)
+        layer_caches = [
+            _DepthDecodeStaticLayerCache(max_cache_len=max_cache_len)
+            for _ in range(decoder_config.num_hidden_layers)
+        ]
+        super().__init__(layers=layer_caches)
+
+    def get_seq_length(self, layer_idx: int = 0) -> int:
+        """Return the number of positions stored in layer ``layer_idx``."""
+        layer = self.layers[layer_idx]
+        return layer.get_seq_length()
+
+    def get_max_cache_shape(self, layer_idx: int = 0) -> int:
+        """Return what layer ``layer_idx`` reports as its maximum cache shape."""
+        layer = self.layers[layer_idx]
+        return layer.get_max_cache_shape()
+
+    def reset(self) -> None:
+        """Rewind every layer cache."""
+        for layer_cache in self.layers:
+            layer_cache.reset()
+
+
+class ActionCudaGraphManager:
+    """Captures the action-flow loop as a CUDA graph and replays it.
+
+    A single capture is kept at a time; it is replaced whenever the graph key
+    computed for the inputs and step count changes.
+    """
+
+    def __init__(self, model: Any) -> None:
+        self.model = model
+        self.enabled = True
+        self.action_flow_graph: _ActionFlowCudaGraph | None = None
+
+    def set_enabled(self, enabled: bool) -> None:
+        """Turn CUDA-graph usage on or off."""
+        self.enabled = bool(enabled)
+
+    def can_use_action_flow(self, inputs: _ActionFlowInputs) -> bool:
+        """Tell whether ``inputs`` may run through a CUDA graph.
+
+        Requires the manager to be enabled, the model and its action expert to
+        be in eval mode, and every tensor reachable from ``inputs`` to live on
+        a CUDA device.
+        """
+        if not self.enabled:
+            return False
+        model = self.model
+        if model.training or model._require_action_expert().training:
+            return False
+        if inputs.trajectory.device.type != "cuda":
+            return False
+
+        def _graph_tensors():
+            """Yield every tensor the graph would read from ``inputs``."""
+            context = inputs.context
+            yield inputs.trajectory
+            for key_tensor, value_tensor in context.kv_contexts:
+                yield key_tensor
+                yield value_tensor
+            optional_tensors = (
+                context.cross_mask,
+                context.self_mask,
+                context.valid_action,
+                inputs.action_dim_is_pad,
+            )
+            yield from (tensor for tensor in optional_tensors if tensor is not None)
+            if context.rope_cache is not None:
+                yield from context.rope_cache
+            for step in inputs.modulations:
+                yield step.conditioning
+                for block_modulation in step.block_modulations:
+                    yield from block_modulation
+                yield from step.final_modulation
+
+        return all(tensor.device.type == "cuda" for tensor in _graph_tensors())
+
+    def run_action_flow(
+        self,
+        inputs: _ActionFlowInputs,
+        steps: int,
+        run_loop,
+    ) -> torch.Tensor:
+        """Run ``run_loop`` through a CUDA graph and return a copy of its output.
+
+        On a key match the new inputs are copied into the existing static
+        buffers; otherwise the loop is captured afresh on cloned inputs.
+        Either way the graph is then replayed.
+        """
+        graph_key = _cuda_graph_key(inputs, steps)
+        entry = self.action_flow_graph
+        if entry is not None and entry.key == graph_key:
+            _copy_inputs_(entry.static_inputs, inputs)
+        else:
+            static_inputs = _clone_static_inputs(inputs)
+            graph, output = _capture_cuda_graph(
+                lambda: run_loop(static_inputs, steps),
+                inputs.trajectory.device,
+                # Re-copy the caller's trajectory into the static buffer after
+                # warmup (presumably because warmup runs overwrite it; confirm
+                # in `_capture_cuda_graph`).
+                after_warmup=lambda: static_inputs.trajectory.copy_(inputs.trajectory),
+            )
+            entry = _ActionFlowCudaGraph(
+                key=graph_key,
+                graph=graph,
+                static_inputs=static_inputs,
+                output=output,
+            )
+            self.action_flow_graph = entry
+
+        entry.graph.replay()
+        # Clone so later replays cannot overwrite the returned tensor.
+        return entry.output.clone()
+
+
+class DepthDecodeCudaGraphManager:
+    def __init__(self, model: Any) -> None:
+        """Bind the manager to ``model``; enabled by default, nothing captured yet."""
+        self.model = model
+        self.backbone = model.model
+        # `graph_spec` is filled lazily by `_depth_decode_spec`.
+        self.graph: _DepthDecodeCudaGraph | None = None
+        self.graph_spec: _DepthDecodeCudaGraphSpec | None = None
+        self.enabled = True
+
+    def set_enabled(self, enabled: bool) -> None:
+        self.enabled = bool(enabled)
+
+    def make_static_cache(self, max_cache_len: int) -> _DepthDecodeStaticCache:
+        return _DepthDecodeStaticCache(
+            config=self.model.config.text_config,
+            max_cache_len=max_cache_len,
+        )
+
+    def _depth_decode_spec(self) -> _DepthDecodeCudaGraphSpec:
+        static = self.graph_spec
+        if static is None:
+            cfg = self.backbone.transformer.config
+            rotary_emb = getattr(self.backbone.transformer, "rotary_emb", None)
+            static = _DepthDecodeCudaGraphSpec(
+                eligible=(
+                    not cfg.norm_after
+                    and cfg.rope_scaling_layers is None
+                    and getattr(rotary_emb, "rope_type", None) == "default"
+                    and cfg._attn_implementation == "sdpa"
+                ),
+                cache_key_prefix=(
+                    cfg.hidden_size,
+                    cfg.num_attention_heads,
+                    cfg.num_key_value_heads,
+                    cfg.head_dim,
+                    cfg.num_hidden_layers,
+                    cfg.use_qk_norm,
+                    cfg.qk_norm_type,
+                    cfg._attn_implementation,
+                ),
+                num_hidden_layers=cfg.num_hidden_layers,
+                head_dim=cfg.head_dim,
+                num_attention_heads=cfg.num_attention_heads,
+            )
+            self.graph_spec = static
+        return static
+
+    def can_use(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+    ) -> bool:
+        if not self.enabled or self.model.training or self.backbone.transformer.training:
+            return False
+        if next_input_ids.device.type != "cuda":
+            return False
+        if next_input_ids.ndim != 2 or next_input_ids.shape[0] != 1 or next_input_ids.shape[1] != 1:
+            return False
+        if not isinstance(past_key_values, _DepthDecodeStaticCache):
+            return False
+        if not torch.is_tensor(attention_bias) or attention_bias.device != next_input_ids.device:
+            return False
+        return self._depth_decode_spec().eligible
+
+    def _depth_decode_key(
+        self,
+        next_input_ids: torch.Tensor,
+        attention_bias: torch.Tensor,
+    ) -> tuple[Any, ...]:
+        device = next_input_ids.device
+        return (
+            self._depth_decode_spec().cache_key_prefix,
+            device.type,
+            device.index,
+            self.model.lm_head.weight.dtype,
+            attention_bias.shape[-1],
+        )
+
+    def _select_depth_decode_rope(self, cos: torch.Tensor, sin: torch.Tensor, *, past_length: int) -> None:
+        emb = self.backbone.transformer.rotary_emb
+        cos.copy_(emb._pos_cos_cache[0, :, past_length : past_length + 1, :])
+        sin.copy_(emb._pos_sin_cache[0, :, past_length : past_length + 1, :])
+
+    def _depth_decode_pre_layer(
+        self,
+        layer_idx: int,
+        hidden_states: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        block = self.backbone.transformer.blocks[layer_idx]
+        attention = block.self_attn
+        residual = hidden_states
+        hidden_states = block.attn_norm(hidden_states)
+
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, attention.head_dim)
+        qkv = attention.att_proj(hidden_states)
+        query_states, key_states, value_states = qkv.split(attention.fused_dims, dim=-1)
+        value_states = value_states.view(hidden_shape)
+
+        apply_qk_norm = attention.q_norm is not None and attention.k_norm is not None
+        norm_after_view = apply_qk_norm and attention.qk_norm_type == "qwen3"
+
+        if apply_qk_norm and not norm_after_view:
+            query_states = attention.q_norm(query_states)
+            key_states = attention.k_norm(key_states)
+
+        query_states = query_states.view(hidden_shape)
+        key_states = key_states.view(hidden_shape)
+
+        if norm_after_view:
+            query_states = attention.q_norm(query_states)
+            key_states = attention.k_norm(key_states)
+
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        query_states, key_states = _apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        return residual, query_states, key_states, value_states
+
+    def _depth_decode_pre0(
+        self,
+        token_ids: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        inputs_embeds = self.model._embed_base_tokens(token_ids)
+        return self._depth_decode_pre_layer(0, inputs_embeds, cos, sin)
+
+    def _depth_decode_post_layer(
+        self,
+        layer_idx: int,
+        residual: torch.Tensor,
+        attn_context: torch.Tensor,
+    ) -> torch.Tensor:
+        block = self.backbone.transformer.blocks[layer_idx]
+        attention = block.self_attn
+        input_shape = residual.shape[:-1]
+        attn_output = attn_context.reshape(*input_shape, -1).contiguous()
+        attn_output = attention.attn_out(attn_output)
+        hidden_states = residual + block.dropout(attn_output)
+
+        residual = hidden_states
+        hidden_states = block.ff_norm(hidden_states)
+        hidden_states = block.mlp(hidden_states)
+        hidden_states = residual + block.dropout(hidden_states)
+        return hidden_states
+
+    def _depth_decode_post_and_pre_next(
+        self,
+        layer_idx: int,
+        residual: torch.Tensor,
+        attn_context: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)
+        return self._depth_decode_pre_layer(layer_idx + 1, hidden_states, cos, sin)
+
+    def _depth_decode_last_post(
+        self,
+        layer_idx: int,
+        residual: torch.Tensor,
+        attn_context: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states = self._depth_decode_post_layer(layer_idx, residual, attn_context)
+        return self.backbone.transformer.ln_f(hidden_states)
+
+    def _build_depth_decode_graph(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_length: int,
+        attention_bias: torch.Tensor,
+    ) -> _DepthDecodeCudaGraph:
+        text_config = self.backbone.transformer.config
+        device = next_input_ids.device
+        dtype = self.model.lm_head.weight.dtype
+        static = self._depth_decode_spec()
+        num_layers = static.num_hidden_layers
+        head_dim = static.head_dim
+        max_cache_len = int(attention_bias.shape[-1])
+        max_rope_len = max(int(text_config.max_position_embeddings or 0), max_cache_len)
+        self.backbone.transformer.prepare_rope_cache(device=device, max_seq_len=max_rope_len)
+
+        token_ids = torch.empty((1, 1), device=device, dtype=torch.long)
+        cos = torch.empty((1, 1, head_dim), device=device, dtype=dtype)
+        sin = torch.empty_like(cos)
+        positions = torch.arange(max_cache_len, device=device, dtype=torch.long)
+        context_shape = (1, 1, static.num_attention_heads, head_dim)
+
+        token_ids.copy_(next_input_ids)
+        self._select_depth_decode_rope(cos, sin, past_length=past_length)
+
+        pre_graph, pre_output = _capture_cuda_graph(
+            lambda: self._depth_decode_pre0(token_ids, cos, sin),
+            device,
+        )
+        stages = [_DepthDecodeCudaGraphLayerStage(*pre_output)]
+        post_graphs = []
+        for layer_idx in range(num_layers - 1):
+            stage = stages[-1]
+            attn_context = torch.empty(context_shape, device=device, dtype=dtype)
+            graph, output = _capture_cuda_graph(
+                lambda layer_idx=layer_idx, stage=stage, attn_context=attn_context: (
+                    self._depth_decode_post_and_pre_next(
+                        layer_idx,
+                        stage.residual,
+                        attn_context,
+                        cos,
+                        sin,
+                    )
+                ),
+                device,
+            )
+            post_graphs.append(_DepthDecodeCudaGraphPostStage(graph=graph, attn_context=attn_context))
+            stages.append(_DepthDecodeCudaGraphLayerStage(*output))
+
+        last_stage = stages[-1]
+        last_attn_context = torch.empty(context_shape, device=device, dtype=dtype)
+        last_graph, last_output = _capture_cuda_graph(
+            lambda: self._depth_decode_last_post(
+                num_layers - 1,
+                last_stage.residual,
+                last_attn_context,
+            ),
+            device,
+        )
+        post_graphs.append(_DepthDecodeCudaGraphPostStage(graph=last_graph, attn_context=last_attn_context))
+        return _DepthDecodeCudaGraph(
+            cache_key=self._depth_decode_key(next_input_ids, attention_bias),
+            pre_graph=pre_graph,
+            token_ids=token_ids,
+            cos=cos,
+            sin=sin,
+            positions=positions,
+            stages=tuple(stages),
+            post_graphs=tuple(post_graphs),
+            output=last_output,
+        )
+
+    def _get_depth_decode_graph(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_length: int,
+        attention_bias: torch.Tensor,
+    ) -> _DepthDecodeCudaGraph:
+        key = self._depth_decode_key(next_input_ids, attention_bias)
+        decode_graph = self.graph
+        if decode_graph is None or decode_graph.cache_key != key:
+            decode_graph = self._build_depth_decode_graph(
+                next_input_ids,
+                past_length=past_length,
+                attention_bias=attention_bias,
+            )
+            self.graph = decode_graph
+        else:
+            decode_graph.token_ids.copy_(next_input_ids)
+            self._select_depth_decode_rope(decode_graph.cos, decode_graph.sin, past_length=past_length)
+        return decode_graph
+
+    def _run_depth_decode_attention_core(
+        self,
+        layer_idx: int,
+        stage: _DepthDecodeCudaGraphLayerStage,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+        cache_position: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> torch.Tensor:
+        attention = self.backbone.transformer.blocks[layer_idx].self_attn
+        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+        key_states, value_states = past_key_values.update(
+            stage.key,
+            stage.value,
+            layer_idx,
+            cache_kwargs,
+        )
+        key_states = _repeat_kv(key_states, attention.num_key_value_groups)
+        value_states = _repeat_kv(value_states, attention.num_key_value_groups)
+        attn_output = F.scaled_dot_product_attention(
+            stage.query,
+            key_states,
+            value_states,
+            attn_mask=attention_bias,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        return attn_output.transpose(1, 2)
+
+    def run(
+        self,
+        next_input_ids: torch.Tensor,
+        *,
+        past_key_values: Cache,
+        attention_bias: torch.Tensor,
+        past_length: int,
+    ) -> tuple[torch.Tensor, Cache]:
+        end = past_length + 1
+        decode_graph = self._get_depth_decode_graph(
+            next_input_ids,
+            past_length=past_length,
+            attention_bias=attention_bias,
+        )
+        cache_position = decode_graph.positions[past_length:end]
+        attention_bias_q = attention_bias[:, :, past_length:end, :end]
+
+        decode_graph.pre_graph.replay()
+
+        for layer_idx, post_graph in enumerate(decode_graph.post_graphs):
+            attn_context = self._run_depth_decode_attention_core(
+                layer_idx,
+                decode_graph.stages[layer_idx],
+                past_key_values=past_key_values,
+                attention_bias=attention_bias_q,
+                cache_position=cache_position,
+                cos=decode_graph.cos,
+                sin=decode_graph.sin,
+            )
+            post_graph.attn_context.copy_(attn_context)
+            post_graph.graph.replay()
+
+        return decode_graph.output, past_key_values
+
+
+def _cuda_graph_tensor_signature(
+    tensor: torch.Tensor | None,
+) -> tuple[Any, ...] | None:
+    """Describe a tensor's layout as a hashable tuple.
+
+    Returns ``(shape, strides, dtype, device)`` with dtype and device rendered
+    as strings, or ``None`` when ``tensor`` is ``None``.
+    """
+    if tensor is not None:
+        shape = tuple(tensor.shape)
+        strides = tuple(tensor.stride())
+        return (shape, strides, str(tensor.dtype), str(tensor.device))
+    return None
+
+
+def _cuda_graph_context_signature(context: Any) -> tuple[Any, ...]:
+    """Collect the layout signatures of every tensor held by ``context``.
+
+    The result covers the key/value pairs in ``kv_contexts``, the three
+    optional masks, and the optional ``rope_cache`` tensors (``None`` when the
+    context has no rope cache).
+    """
+    kv_signature = tuple(
+        (_cuda_graph_tensor_signature(key), _cuda_graph_tensor_signature(value))
+        for key, value in context.kv_contexts
+    )
+    cross_signature = _cuda_graph_tensor_signature(context.cross_mask)
+    self_signature = _cuda_graph_tensor_signature(context.self_mask)
+    valid_signature = _cuda_graph_tensor_signature(context.valid_action)
+    if context.rope_cache is None:
+        rope_signature = None
+    else:
+        rope_signature = tuple(_cuda_graph_tensor_signature(t) for t in context.rope_cache)
+    return (kv_signature, cross_signature, self_signature, valid_signature, rope_signature)
+
+
+def _cuda_graph_modulation_signature(modulations: Sequence[Any]) -> tuple[Any, ...]:
+    """Collect the layout signatures of every tensor in the per-step modulations.
+
+    Each step contributes a triple: its conditioning tensor, one tuple per
+    block modulation, and a tuple for the final modulation.
+    """
+    per_step = []
+    for step in modulations:
+        conditioning = _cuda_graph_tensor_signature(step.conditioning)
+        blocks = tuple(
+            tuple(_cuda_graph_tensor_signature(t) for t in block_modulation)
+            for block_modulation in step.block_modulations
+        )
+        final = tuple(_cuda_graph_tensor_signature(t) for t in step.final_modulation)
+        per_step.append((conditioning, blocks, final))
+    return tuple(per_step)
+
+
+def _cuda_graph_key(inputs: _ActionFlowInputs, steps: int) -> tuple[Any, ...]:
+    """Build the cache key for an action-flow CUDA graph.
+
+    The key is made of tensor layout signatures (not values) for every input,
+    plus the step count, so a graph is reused only when all layouts match.
+    """
+    trajectory_signature = _cuda_graph_tensor_signature(inputs.trajectory)
+    context_signature = _cuda_graph_context_signature(inputs.context)
+    modulation_signature = _cuda_graph_modulation_signature(inputs.modulations)
+    pad_signature = _cuda_graph_tensor_signature(inputs.action_dim_is_pad)
+    return (
+        trajectory_signature,
+        context_signature,
+        modulation_signature,
+        pad_signature,
+        int(steps),
+    )
+
+
+def _clone_static_tensor(tensor: torch.Tensor | None) -> torch.Tensor | None:
+    """Copy ``tensor`` into a fresh buffer with the same shape, strides, dtype and device.
+
+    ``None`` is passed through unchanged.
+    """
+    if tensor is None:
+        return None
+    # copy_ returns the destination, so allocation and fill chain together.
+    return torch.empty_strided(
+        tuple(tensor.shape),
+        tuple(tensor.stride()),
+        device=tensor.device,
+        dtype=tensor.dtype,
+    ).copy_(tensor)
+
+
+def _clone_static_context(context: Any) -> Any:
+    """Build a new context of the same class whose tensors are static clones.
+
+    Optional fields that are ``None`` on ``context`` stay ``None`` on the copy.
+    """
+    if context.rope_cache is None:
+        static_rope = None
+    else:
+        static_rope = tuple(_clone_static_tensor(t) for t in context.rope_cache)
+    static_kv = tuple(
+        (_clone_static_tensor(key), _clone_static_tensor(value)) for key, value in context.kv_contexts
+    )
+    return context.__class__(
+        kv_contexts=static_kv,
+        cross_mask=_clone_static_tensor(context.cross_mask),
+        self_mask=_clone_static_tensor(context.self_mask),
+        valid_action=_clone_static_tensor(context.valid_action),
+        rope_cache=static_rope,
+    )
+
+
+def _clone_static_modulations(modulations: Sequence[Any]) -> Sequence[Any]:
+    """Return a tuple of per-step modulations whose tensors are static clones.
+
+    Each step is rebuilt with its own class from cloned conditioning, block
+    modulation and final modulation tensors.
+    """
+    static_steps = []
+    for step in modulations:
+        static_steps.append(
+            step.__class__(
+                conditioning=_clone_static_tensor(step.conditioning),
+                block_modulations=tuple(
+                    tuple(_clone_static_tensor(t) for t in block_modulation)
+                    for block_modulation in step.block_modulations
+                ),
+                final_modulation=tuple(_clone_static_tensor(t) for t in step.final_modulation),
+            )
+        )
+    return tuple(static_steps)
+
+
+def _clone_static_inputs(inputs: _ActionFlowInputs) -> _ActionFlowInputs:
+    """Clone every tensor in ``inputs`` into static buffers for graph capture."""
+    static_trajectory = _clone_static_tensor(inputs.trajectory)
+    static_context = _clone_static_context(inputs.context)
+    static_modulations = _clone_static_modulations(inputs.modulations)
+    static_pad_mask = _clone_static_tensor(inputs.action_dim_is_pad)
+    return _ActionFlowInputs(
+        trajectory=static_trajectory,
+        context=static_context,
+        modulations=static_modulations,
+        action_dim_is_pad=static_pad_mask,
+    )
+
+
+def _copy_context_(dst: Any, src: Any) -> None:
+    """Copy the tensor contents of ``src`` into the matching tensors of ``dst`` in place.
+
+    Optional fields are copied only when present on ``src``; ``dst`` is then
+    expected to hold a tensor for the same field.
+    """
+    for dst_pair, src_pair in zip(dst.kv_contexts, src.kv_contexts):
+        dst_key, dst_value = dst_pair
+        src_key, src_value = src_pair
+        dst_key.copy_(src_key)
+        dst_value.copy_(src_value)
+
+    for field_name in ("cross_mask", "self_mask", "valid_action"):
+        incoming = getattr(src, field_name)
+        if incoming is not None:
+            getattr(dst, field_name).copy_(incoming)
+
+    incoming_rope = src.rope_cache
+    if incoming_rope is not None:
+        for dst_tensor, src_tensor in zip(dst.rope_cache, incoming_rope):
+            dst_tensor.copy_(src_tensor)
+
+
+def _copy_inputs_(dst: _ActionFlowInputs, src: _ActionFlowInputs) -> None:
+    """Refresh the static buffers in ``dst`` from ``src`` in place.
+
+    Copies the trajectory, the context tensors and, when present on ``src``,
+    the ``action_dim_is_pad`` tensor.
+
+    NOTE(review): ``src.modulations`` is not copied, so ``dst.modulations``
+    keeps the values it was created with. Presumably modulations are constant
+    for a given cache key - confirm against the caller.
+    """
+    dst.trajectory.copy_(src.trajectory)
+    _copy_context_(dst.context, src.context)
+    pad_mask = src.action_dim_is_pad
+    if pad_mask is not None:
+        dst.action_dim_is_pad.copy_(pad_mask)
+
+
+def _rotate_half(x: torch.Tensor) -> torch.Tensor:
+    """Split the last dimension in two halves ``(a, b)`` and return ``(-b, a)`` concatenated."""
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def _apply_rotary_pos_emb(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Apply rotary position embedding to the query and key tensors.
+
+    Args:
+        q: Query tensor.
+        k: Key tensor.
+        cos: Cosine table; a singleton axis is inserted at ``unsqueeze_dim``
+            so it broadcasts against ``q`` and ``k``.
+        sin: Sine table, treated like ``cos``.
+        unsqueeze_dim: Axis at which ``cos`` and ``sin`` are unsqueezed.
+
+    Returns:
+        The rotated ``(q, k)`` pair.
+    """
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+
+    def rotate(states: torch.Tensor) -> torch.Tensor:
+        return (states * cos_b) + (_rotate_half(states) * sin_b)
+
+    return rotate(q), rotate(k)
+
+
+def _repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """Repeat each key/value head ``n_rep`` times along the head axis.
+
+    Takes a 4-D ``(batch, kv_heads, seq_len, head_dim)`` tensor and returns
+    ``(batch, kv_heads * n_rep, seq_len, head_dim)``; the input is returned
+    unchanged when ``n_rep`` is 1.
+    """
+    batch, kv_heads, seq_len, head_dim = hidden_states.shape
+    if n_rep != 1:
+        expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
+        return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
+    return hidden_states
+
+
+def _capture_cuda_graph(
+    fn,
+    device: torch.device,
+    *,
+    after_warmup=None,
+) -> tuple[torch.cuda.CUDAGraph, Any]:
+    """Warm ``fn`` up once on a side stream, then capture it into a CUDA graph.
+
+    Args:
+        fn: Zero-argument callable to capture. It is called twice: once for
+            the warm-up and once under ``torch.cuda.graph``.
+        device: CUDA device whose streams are used for the warm-up.
+        after_warmup: Optional zero-argument callable run after the warm-up
+            and before the capture.
+
+    Returns:
+        ``(graph, output)`` where ``output`` is whatever ``fn`` returned
+        during capture.
+    """
+    # Warm up on a side stream that first waits for work already queued on
+    # the current stream; the current stream then waits for the warm-up.
+    warmup_stream = torch.cuda.Stream(device=device)
+    warmup_stream.wait_stream(torch.cuda.current_stream(device))
+    with torch.cuda.stream(warmup_stream):
+        fn()
+    torch.cuda.current_stream(device).wait_stream(warmup_stream)
+    if after_warmup is not None:
+        after_warmup()
+
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        output = fn()
+    return graph, output
--- a/src/lerobot/policies/molmoact2/hf_model/modeling_molmoact2.py
+++ b/src/lerobot/policies/molmoact2/hf_model/modeling_molmoact2.py
--- a/src/lerobot/policies/molmoact2/hf_model/processing_molmoact2.py
+++ b/src/lerobot/policies/molmoact2/hf_model/processing_molmoact2.py
@@ -0,0 +1,431 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
+
+"""
+Processor class for MolmoAct2.
+"""
+
+from typing import Optional, Union
+import dataclasses
+
+import numpy as np
+
+from transformers.image_utils import ImageInput
+from transformers.video_utils import VideoInput
+from transformers.processing_utils import (
+    Unpack,
+    ProcessingKwargs,
+    ProcessorMixin,
+)
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
+from transformers.utils import logging
+
+from transformers import AutoTokenizer
+from .image_processing_molmoact2 import MolmoAct2ImagesKwargs, MolmoAct2ImageProcessor
+from .video_processing_molmoact2 import MolmoAct2VideoProcessorKwargs, MolmoAct2VideoProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+# Special tokens. Every tokenizer used with this processor must define them,
+# because the preprocessor writes them into the prompt text.
+IMAGE_PATCH_TOKEN = "<im_patch>"  # Where to insert high-res tokens
+IMAGE_LOW_RES_TOKEN = "<im_low>"  # Where to insert low-res tokens
+IM_START_TOKEN = "<im_start>"
+LOW_RES_IMAGE_START_TOKEN = "<low_res_im_start>"
+FRAME_START_TOKEN = "<frame_start>"
+IM_END_TOKEN = "<im_end>"
+FRAME_END_TOKEN = "<frame_end>"
+IM_COL_TOKEN = "<im_col>"
+# Placeholders a caller puts in the prompt where an image or video belongs.
+IMAGE_PROMPT = "<|image|>"
+VIDEO_PROMPT = "<|video|>"
+
+# All image-related special tokens, in the order their ids are looked up.
+IMAGE_TOKENS = [
+    IMAGE_PATCH_TOKEN,
+    IM_COL_TOKEN,
+    IM_START_TOKEN,
+    LOW_RES_IMAGE_START_TOKEN,
+    FRAME_START_TOKEN,
+    IM_END_TOKEN,
+    FRAME_END_TOKEN,
+    IMAGE_LOW_RES_TOKEN,
+]
+
+
+class MolmoAct2ProcessorKwargs(ProcessingKwargs, total=False):
+    """Keyword-argument schema for ``MolmoAct2Processor``.
+
+    Declares the image and video kwargs types and the processor's defaults.
+    """
+
+    images_kwargs: MolmoAct2ImagesKwargs
+    videos_kwargs: MolmoAct2VideoProcessorKwargs
+    # Defaults: no text padding, multimodal token type ids returned, and video
+    # metadata requested from the video processor.
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+            "return_mm_token_type_ids": True,
+        },
+        "videos_kwargs": {"return_metadata": True},
+    }
+
+
+class MolmoAct2Processor(ProcessorMixin):
+    attributes = ["image_processor", "video_processor", "tokenizer"]
+    optional_attributes = [
+        "chat_template",
+        "time_mode",
+        "image_use_col_tokens",
+        "use_single_crop_col_tokens",
+        "use_single_crop_start_token",
+        "video_use_col_tokens",
+        "use_frame_special_tokens",
+    ]
+    image_processor_class = "AutoImageProcessor"
+    video_processor_class = "AutoVideoProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor: MolmoAct2ImageProcessor = None,
+        video_processor: MolmoAct2VideoProcessor = None,
+        tokenizer: AutoTokenizer = None,
+        chat_template: str | None = None,
+        image_use_col_tokens: bool | None = True,
+        use_single_crop_col_tokens: bool | None = None,
+        use_single_crop_start_token: bool | None = True,
+        video_use_col_tokens: bool | None = False,
+        use_frame_special_tokens: bool | None = True,
+        **kwargs,
+    ) -> None:
+        """Set up the processor and record the token-layout options.
+
+        Args:
+            image_processor: Image processor forwarded to the base class.
+            video_processor: Video processor forwarded to the base class.
+            tokenizer: Tokenizer forwarded to the base class; also used here
+                to resolve the ids of ``IMAGE_TOKENS``.
+            chat_template: Chat template forwarded to the base class.
+            image_use_col_tokens: Append a column token to each image patch row.
+            use_single_crop_col_tokens: Column-token setting for the resized
+                single crop; ``None`` falls back to ``image_use_col_tokens``.
+            use_single_crop_start_token: Start the resized single crop with
+                the low-res start token instead of the regular one.
+            video_use_col_tokens: Append a column token to each video patch row.
+            use_frame_special_tokens: Wrap video frames in the frame
+                start/end tokens instead of the image ones.
+            **kwargs: Accepted but not used by this constructor.
+        """
+        super().__init__(
+            image_processor,
+            video_processor,
+            tokenizer,
+            chat_template=chat_template,
+        )
+        # Prompt placeholders that mark where an image or video is spliced in.
+        self.image_placeholder_token = IMAGE_PROMPT
+        self.video_placeholder_token = VIDEO_PROMPT
+
+        # Token-layout switches read by get_image_tokens / get_video_string.
+        self.image_use_col_tokens = image_use_col_tokens
+        self.use_single_crop_col_tokens = use_single_crop_col_tokens
+        self.use_single_crop_start_token = use_single_crop_start_token
+        self.video_use_col_tokens = video_use_col_tokens
+        self.use_frame_special_tokens = use_frame_special_tokens
+
+        self.image_token_ids = list(map(tokenizer.convert_tokens_to_ids, IMAGE_TOKENS))
+
+    def get_image_tokens(self, image_grid: np.ndarray):
+        """Build the array of special-token strings for one image.
+
+        Args:
+            image_grid: Four values unpacked as
+                ``(resized_h, resized_w, height, width)``.
+
+        Returns:
+            A 1-D array of token strings. When ``height`` or ``width`` is 0,
+            only the resized grid is emitted, opened by the regular start
+            token. Otherwise the resized grid comes first (opened by the
+            low-res start token when ``use_single_crop_start_token`` is set),
+            followed by the ``height`` x ``width`` grid.
+        """
+        resized_h, resized_w, height, width = image_grid
+
+        def grid_parts(start_token, rows, cols, with_col_token):
+            # One grid: start token, `rows` copies of a patch row, end token.
+            row_tokens = np.full(cols, IMAGE_PATCH_TOKEN)
+            if with_col_token:
+                row_tokens = np.concatenate([row_tokens, [IM_COL_TOKEN]], 0)
+            return [[start_token], np.tile(row_tokens, [rows]), [IM_END_TOKEN]]
+
+        # The resized grid has its own column-token switch, defaulting to the
+        # image-wide one when unset.
+        if self.use_single_crop_col_tokens is None:
+            resized_col_tokens = self.image_use_col_tokens
+        else:
+            resized_col_tokens = self.use_single_crop_col_tokens
+
+        if int(height) == 0 or int(width) == 0:
+            return np.concatenate(grid_parts(IM_START_TOKEN, resized_h, resized_w, resized_col_tokens))
+
+        full_grid = grid_parts(IM_START_TOKEN, height, width, self.image_use_col_tokens)
+        if self.use_single_crop_start_token:
+            resized_start = LOW_RES_IMAGE_START_TOKEN
+        else:
+            resized_start = IM_START_TOKEN
+        resized_grid = grid_parts(resized_start, resized_h, resized_w, resized_col_tokens)
+
+        return np.concatenate(resized_grid + full_grid)
+
+    def get_video_string(
+        self,
+        video_grid: np.ndarray,
+        timestamps: np.ndarray,
+    ):
+        """Build the prompt text that stands in for one video.
+
+        Each frame contributes its timestamp formatted with one decimal and a
+        trailing space (preceded by a space for every frame after the first),
+        followed by the frame's start token, ``h`` rows of ``w`` patch tokens
+        (each row ending in a column token when ``video_use_col_tokens`` is
+        set) and the end token.
+
+        Args:
+            video_grid: Three values unpacked as ``(num_frames, h, w)``; only
+                ``h`` and ``w`` are used.
+            timestamps: One timestamp per frame to emit.
+
+        Returns:
+            The concatenated string for all frames (empty for no timestamps).
+        """
+        if self.use_frame_special_tokens:
+            start_token, end_token = FRAME_START_TOKEN, FRAME_END_TOKEN
+        else:
+            start_token, end_token = IM_START_TOKEN, IM_END_TOKEN
+
+        _, h, w = video_grid
+
+        # The token block is identical for every frame, so build it once.
+        per_row = np.full(w, IMAGE_PATCH_TOKEN)
+        if self.video_use_col_tokens:
+            per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
+        frame_tokens = "".join(
+            np.concatenate([[start_token], np.tile(per_row, [h]), [end_token]], 0)
+        )
+
+        pieces = []
+        for frame_idx, frame_time in enumerate(timestamps):
+            # `per-frame-compact` time mode: explicit whitespace before/after image tokens
+            prev_space = " " if frame_idx > 0 else ""
+            pieces.append(f"{prev_space}{frame_time:.1f} ")
+            pieces.append(frame_tokens)
+
+        return "".join(pieces)
+
+    def insert_bos(
+        self,
+        input_ids: np.ndarray,
+        attention_mask: np.ndarray,
+        bos_token_id: int,
+        pad_token_id: int,
+    ):
+        """Ensure every sequence starts with the BOS token.
+
+        Rows are assumed to be left-padded. If every row already has BOS at
+        its first attended position, the inputs are returned unchanged.
+        Otherwise all rows grow by one position: rows that lack BOS get it
+        inserted in front of their first attended token, while rows that
+        already have it receive one extra leading pad instead of a second BOS.
+
+        Args:
+            input_ids: [B, S] (or [S]) array with left padding
+            attention_mask: array of the same shape (0 for pad, 1 for valid)
+            bos_token_id: id of the BOS token
+            pad_token_id: id used to fill padding positions
+        Returns:
+            input_ids_out: [B, S] or [B, S+1] array with bos inserted if needed
+                (1-D when the input was 1-D)
+            attention_mask_out: same shape as input_ids_out
+        """
+
+        need_to_expand = len(input_ids.shape) == 1
+        if need_to_expand:
+            input_ids = input_ids[None, :]
+            attention_mask = attention_mask[None, :]
+
+        B, S = input_ids.shape
+        rows = np.arange(B)
+
+        if S == 0:
+            # Zero-length sequences become a single BOS token.
+            new_input_ids = np.full((B, 1), bos_token_id, dtype=input_ids.dtype)
+            new_attention_mask = np.ones((B, 1), dtype=attention_mask.dtype)
+        else:
+            first_valid_index = (attention_mask == 1).argmax(axis=-1)  # [B]
+            needs_bos = input_ids[rows, first_valid_index] != bos_token_id  # [B]
+
+            if not needs_bos.any():
+                new_input_ids, new_attention_mask = input_ids, attention_mask
+            else:
+                new_input_ids = np.full((B, S + 1), pad_token_id, dtype=input_ids.dtype)
+                new_attention_mask = np.zeros((B, S + 1), dtype=attention_mask.dtype)
+
+                src_idx = np.tile(np.arange(S), (B, 1))  # [B, S]
+                valid_mask = src_idx >= first_valid_index[:, None]  # [B, S]
+                batch_idx = np.tile(rows[:, None], (1, S))  # [B, S]
+
+                # Shift everything from the first attended token one slot right.
+                flat_batch = batch_idx[valid_mask]
+                flat_tgt = src_idx[valid_mask] + 1
+                new_input_ids[flat_batch, flat_tgt] = input_ids[valid_mask]
+                new_attention_mask[flat_batch, flat_tgt] = 1
+
+                # Fill the freed slot with BOS only where it is missing; rows
+                # that already start with BOS keep a masked pad there.
+                bos_rows = rows[needs_bos]
+                bos_cols = first_valid_index[needs_bos]
+                new_input_ids[bos_rows, bos_cols] = bos_token_id
+                new_attention_mask[bos_rows, bos_cols] = 1
+
+        if need_to_expand:
+            new_input_ids = new_input_ids[0]
+            new_attention_mask = new_attention_mask[0]
+
+        return new_input_ids, new_attention_mask
+
+    def __call__(
+        self,
+        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
+        images: ImageInput = None,
+        videos: VideoInput = None,
+        **kwargs: Unpack[MolmoAct2ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            videos (`dict[str, Any]` or `list[dict[str, Any]]`):
+                The video or batch of videos to be prepared. Each video can be a dictionary with the following keys:
+                - `"frames"`: `np.ndarray` of shape (T, H, W, 3)
+                - `"timestamps"`: `np.ndarray` of shape (T,)
+                - `"sampled_fps"`: `float` (optional)
+                - `"sampling_augmentation"`: `str` (optional)
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            `BatchFeature`: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **image_token_pooling** -- Indices of the patches in `image_grids` to pool for each token in `image_tokens`.
+              Returned when `images` is not `None`.
+            - **image_grids** -- Grids of images. Returned when `images` is not `None`.
+            - **image_num_crops** -- Number of crops for each image. Returned when `images` is not `None`.
+            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
+            - **video_token_pooling** -- Indices of the patches in `video_grids` to pool for each token in `video_tokens`.
+              Returned when `videos` is not `None`.
+            - **video_grids** -- Grids of videos. Returned when `videos` is not `None`.
+        """
+
+        output_kwargs = self._merge_kwargs(
+            MolmoAct2ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        if images is not None:
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+            image_grids = image_inputs["image_grids"]
+        else:
+            image_inputs = {}
+            image_grids = None
+
+        if videos is not None:
+            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
+            video_grids = videos_inputs["video_grids"]
+            # If user has not requested video metadata, pop it
+            if "return_metadata" not in kwargs:
+                video_metadata = videos_inputs.pop("video_metadata")
+            else:
+                video_metadata = videos_inputs["video_metadata"]
+        else:
+            videos_inputs = {}
+            video_grids = None
+
+        if not isinstance(text, list):
+            text = [text]
+
+        text = text.copy()  # below lines change text in-place
+
+        if image_grids is not None:
+            index = 0
+            for i in range(len(text)):
+                num_images = text[i].count(self.image_placeholder_token)
+                image_grids_i = image_grids[index : index + num_images]
+                for image_grid in image_grids_i:
+                    image_tokens = self.get_image_tokens(image_grid)
+                    image_string = "".join(image_tokens)
+                    text[i] = text[i].replace(self.image_placeholder_token, image_string, 1)
+                index += num_images
+
+        if video_grids is not None:
+            index = 0
+            for i in range(len(text)):
+                num_videos = text[i].count(self.video_placeholder_token)
+                assert num_videos in {0, 1}, "At most one video is supported for now"
+                video_grids_i = video_grids[index : index + num_videos]
+                metadata_i = video_metadata[index : index + num_videos]
+                for video_grid, metadata in zip(video_grids_i, metadata_i):
+                    video_string = self.get_video_string(
+                        video_grid,
+                        metadata.timestamps,
+                    )
+                    text[i] = text[i].replace(self.video_placeholder_token, video_string, 1)
+                index += num_videos
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+
+        input_ids = text_inputs["input_ids"]
+        attention_mask = text_inputs["attention_mask"]
+
+        input_ids = np.array(input_ids)
+        attention_mask = np.array(attention_mask)
+
+        bos = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
+        input_ids, attention_mask = self.insert_bos(
+            input_ids, attention_mask, bos, self.tokenizer.pad_token_id
+        )
+
+        if return_mm_token_type_ids:
+            image_tokens = np.array(self.image_token_ids).astype(input_ids.dtype)
+            token_type_ids = np.any(input_ids[:, :, None] == image_tokens[None, None, :], axis=-1)
+            text_inputs["token_type_ids"] = token_type_ids.tolist()
+
+        text_inputs["input_ids"] = input_ids.tolist()
+        text_inputs["attention_mask"] = attention_mask.tolist()
+
+        return BatchFeature(
+            data={**text_inputs, **image_inputs, **videos_inputs},
+            tensor_type=return_tensors,
+        )
+
+    def post_process_image_text_to_text(
+        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
+    ):
+        """
+        Decode generated token ids back into text with the tokenizer's `batch_decode`.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+            skip_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
+            **kwargs:
+                Additional arguments to be passed to the tokenizer's `batch_decode` method.
+
+        Returns:
+            `list[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(
+            generated_outputs,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+
+MolmoAct2Processor.register_for_auto_class()
--- a/src/lerobot/policies/molmoact2/hf_model/video_processing_molmoact2.py
+++ b/src/lerobot/policies/molmoact2/hf_model/video_processing_molmoact2.py
@@ -0,0 +1,997 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The Allen Institute for Artificial Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa
+
+"""Video processor class for MolmoAct2"""
+
+from functools import partial
+import os
+import warnings
+from contextlib import redirect_stdout
+from io import BytesIO
+from urllib.parse import urlparse
+from typing import Optional, Union
+from collections.abc import Callable
+
+import numpy as np
+import requests
+import einops
+import torch
+import torchvision.transforms
+
+from transformers.image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ImageInput,
+    PILImageResampling,
+    SizeDict,
+    validate_kwargs,
+)
+from transformers.video_utils import (
+    VideoInput,
+    is_valid_video,
+    make_batched_videos,
+    make_batched_metadata,
+    VideoMetadata,
+)
+from transformers.processing_utils import Unpack, VideosKwargs
+from transformers.video_processing_utils import BaseVideoProcessor
+from transformers.utils import logging
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.utils import (
+    is_av_available,
+    is_decord_available,
+    is_torchcodec_available,
+    is_yt_dlp_available,
+    TensorType,
+    logging,
+    to_numpy,
+)
+
+
+logger = logging.get_logger(__name__)
+
+MAX_VIDEO_FPS = 8  # default `max_fps` cap applied by `get_candidate_target_fps`
+
+
+def normalize_image(
+    image: np.ndarray,
+    image_mean: list[float],
+    image_std: list[float],
+) -> np.ndarray:  # Per-channel (image - mean) / std on channel-last data; always returns a new array.
+    if np.allclose(image_mean, [0.5, 0.5, 0.5]) and np.allclose(image_std, [0.5, 0.5, 0.5]):
+        return image * np.asarray(2.0, dtype=np.float32) - np.asarray(1.0, dtype=np.float32)  # fast path: x*2 - 1
+    mean = np.array(image_mean, dtype=np.float32)[None, None, :]  # broadcast over the trailing channel axis
+    std = np.array(image_std, dtype=np.float32)[None, None, :]
+    return (image - mean) / std  # out-of-place, like the fast path, so the caller's array is never mutated
+
+
+def resize_image(
+    image: np.ndarray,
+    desired_output_size: list[int],
+    resample: PILImageResampling,
+) -> np.ndarray:  # Resizes a 3-D image or a 4-D clip (channel-last) and returns float32 rescaled to [0, 1].
+    if len(image.shape) == 3:
+        is_video = False
+        image = torch.permute(torch.from_numpy(image), [2, 0, 1])  # HWC -> CHW
+    else:
+        is_video = True
+        image = torch.permute(torch.from_numpy(image), [0, 3, 1, 2])  # THWC -> TCHW
+    dtype = image.dtype
+    if torch.is_floating_point(image):
+        in_min = 0.0  # float input is clipped to [0, 1] below, so the final rescale is the identity
+        in_max = 1.0
+        resized = torchvision.transforms.Resize(
+            desired_output_size,
+            resample,
+            antialias=False,
+        )(image)
+        resized = torch.clip(resized, 0.0, 1.0).to(dtype)
+    else:
+        assert image.dtype == torch.uint8, "SigLIP expects float images or uint8 images, but got {}".format(
+            image.dtype
+        )
+        in_min = 0.0  # uint8 input is mapped from [0, 255] to [0, 1] by the final rescale
+        in_max = 255.0
+        resized = torchvision.transforms.Resize(
+            desired_output_size,
+            resample,
+            antialias=False,
+        )(image)
+        resized = torch.clip(resized, 0, 255).to(dtype)
+
+    resized = resized.to(torch.float32)
+    resized = (resized - in_min) / (in_max - in_min)
+
+    if is_video:
+        resized = torch.permute(resized, [0, 2, 3, 1]).numpy()  # back to THWC
+    else:
+        resized = torch.permute(resized, [1, 2, 0]).numpy()  # back to HWC
+
+    return resized
+
+
+def build_resized_image(
+    image: np.ndarray,
+    base_image_input_size: list[int],
+    resample: PILImageResampling,
+    image_mean: list[float],
+    image_std: list[float],
+    image_patch_size: int,
+) -> tuple[np.ndarray, np.ndarray]:  # Returns (resized + normalized pixels, 2-D grid of patch indices).
+    resized = resize_image(
+        image,
+        base_image_input_size,
+        resample,
+    )
+    resized = normalize_image(resized, image_mean, image_std)
+    if len(resized.shape) == 3:
+        resized = np.expand_dims(resized, 0)  # single image -> leading axis of size one
+    crop_patch_w = base_image_input_size[1] // image_patch_size  # size is indexed as (height, width)
+    crop_patch_h = base_image_input_size[0] // image_patch_size
+    resize_idx = np.arange(crop_patch_w * crop_patch_h).reshape([crop_patch_h, crop_patch_w])  # row-major ids
+    return resized, resize_idx
+
+
+def batch_pixels_to_patches(array: np.ndarray, patch_size: int) -> np.ndarray:
+    """Split [n, h, w] or [n, h, w, c] arrays into row-major patches: [n, n_patches, patch_size * patch_size (* c)]"""
+    if len(array.shape) == 3:
+        n_crops, h, w = array.shape
+        h_patches = h // patch_size
+        w_patches = w // patch_size
+        array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size])
+        array = np.transpose(array, [0, 1, 3, 2, 4])  # bring the two patch-grid axes together
+        array = np.reshape(array, [n_crops, h_patches * w_patches, patch_size * patch_size])
+        return array
+    else:
+        n_crops, h, w, c = array.shape
+        h_patches = h // patch_size
+        w_patches = w // patch_size
+        array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size, c])
+        array = np.transpose(array, [0, 1, 3, 2, 4, 5])  # bring the two patch-grid axes together
+        array = np.reshape(array, [n_crops, h_patches * w_patches, patch_size * patch_size * c])
+        return array
+
+
+def arange_for_pooling(
+    idx_arr: np.ndarray,
+    pool_h: int,
+    pool_w: int,
+) -> np.ndarray:  # Groups a 2-D index grid into pool_h x pool_w windows: [h, w, pool_h * pool_w], padding = -1.
+    h_pad = pool_h * ((idx_arr.shape[0] + pool_h - 1) // pool_h) - idx_arr.shape[0]  # rows needed to reach a multiple
+    w_pad = pool_w * ((idx_arr.shape[1] + pool_w - 1) // pool_w) - idx_arr.shape[1]  # cols needed to reach a multiple
+    idx_arr = np.pad(
+        idx_arr,
+        [[h_pad // 2, (h_pad + 1) // 2], [w_pad // 2, (w_pad + 1) // 2]],  # split padding; odd remainder goes last
+        mode="constant",
+        constant_values=-1,
+    )
+    return einops.rearrange(idx_arr, "(h dh) (w dw) -> h w (dh dw)", dh=pool_h, dw=pool_w)
+
+
+def image_to_patches_and_grids(
+    image: ImageInput,
+    base_image_input_size: list[int],
+    resample: PILImageResampling,
+    image_mean: list[float],
+    image_std: list[float],
+    image_patch_size: int,
+    image_pooling_w: int,
+    image_pooling_h: int,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    :return image_grids, the shape `[h, w]` of each image after pooling
+    :return crops, the image crops to process with the ViT
+    :return pooled_patch_idx, for each patch_id tokens in `image_tokens`, the indices of the
+                                patches in `crops` to pool for that token, masked with -1
+    """
+    if isinstance(base_image_input_size, int):
+        base_image_input_size = (base_image_input_size, base_image_input_size)  # a bare int means a square size
+
+    pooling_w = image_pooling_w
+    pooling_h = image_pooling_h
+
+    resized, resize_idx = build_resized_image(
+        image,
+        base_image_input_size,
+        resample,
+        image_mean,
+        image_std,
+        image_patch_size,
+    )
+    pooling_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
+    h, w = pooling_idx.shape[:2]
+    pooling_idx = pooling_idx.reshape([-1, pooling_h * pooling_w])  # one row of patch ids per pooled token
+    image_grid = [h, w]
+    return (
+        image_grid,
+        batch_pixels_to_patches(resized, image_patch_size),
+        pooling_idx,
+    )
+
+
+def get_candidate_target_fps(
+    video_fps: int | float,
+    sampling_fps: int | float,
+    max_fps: int | float = MAX_VIDEO_FPS,
+) -> list[float]:
+    """
+    Return, ascending, the divisors of `video_fps` that are multiples of `sampling_fps` and at most `max_fps`.
+
+    Examples:
+        >>> get_candidate_target_fps(video_fps=6, sampling_fps=2)
+        [2.0, 6.0]
+        >>> get_candidate_target_fps(video_fps=5, sampling_fps=1)
+        [1.0, 5.0]
+        >>> get_candidate_target_fps(video_fps=2, sampling_fps=2)
+        [2.0]
+        >>> get_candidate_target_fps(video_fps=5, sampling_fps=2)
+        Traceback (most recent call last):
+            ...
+        ValueError: sampling_fps=2 must divide video_fps=5.
+    """
+    if sampling_fps is None:  # checked before the int() casts, which would raise TypeError on None
+        raise ValueError("sampling_fps must be provided")
+
+    video_fps = int(video_fps)  # all rates are truncated to whole numbers before the divisibility checks
+    sampling_fps = int(sampling_fps)
+    max_fps = int(max_fps)
+    if video_fps <= 0 or sampling_fps <= 0:
+        raise ValueError(f"video_fps and sampling_fps must be positive (got {video_fps}, {sampling_fps})")
+    if video_fps % sampling_fps != 0:
+        raise ValueError(f"sampling_fps={sampling_fps} must divide video_fps={video_fps}.")
+
+    candidates = []
+    for candidate in range(sampling_fps, video_fps + 1, sampling_fps):
+        if candidate > max_fps:
+            break
+        if video_fps % candidate == 0:
+            candidates.append(float(candidate))
+
+    return candidates
+
+
+def read_video_decord(
+    video_path,
+    sample_timestamps_fn: Callable,
+    **kwargs,
+) -> tuple[np.ndarray, VideoMetadata]:
+    """
+    Decode a video using the Decord backend.
+
+    Args:
+        video_path (`str`):
+            Path to the video file.
+        sample_timestamps_fn (`Callable`):
+            A callable function that will return timestamps at which the video should be sampled.
+
+    Returns:
+        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
+            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
+            - `VideoMetadata` object.
+    """
+    # Lazy import from decord
+    import importlib
+
+    decord = importlib.import_module("decord")
+
+    vr = decord.VideoReader(uri=video_path, ctx=decord.cpu(0))  # decord has problems with gpu
+    video_fps = vr.get_avg_fps()
+    total_num_frames = len(vr)
+    time_stamps = vr.get_frame_timestamp(list(range(len(vr))))  # used below as per-frame (start, end) pairs
+    duration = time_stamps[-1][1] - time_stamps[0][0]
+
+    metadata = VideoMetadata(
+        total_num_frames=int(total_num_frames),
+        fps=float(video_fps),
+        duration=float(duration),
+        video_backend="decord",
+    )
+
+    target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)
+    target_timestamps = np.array(target_timestamps)
+    offset = time_stamps[0, 0]  # targets are relative to the first frame's start time
+
+    ix = np.searchsorted(time_stamps[:, 1], target_timestamps + offset, side="right")  # first frame ending after t
+    ix = np.minimum(ix, len(time_stamps) - 1)  # clamp targets at/after the final end time to the last frame
+
+    video = vr.get_batch(ix).asnumpy()
+    metadata.update(
+        {
+            "frames_indices": target_timestamps * video_fps,
+            "height": video.shape[1],
+            "width": video.shape[2],
+        }
+    )
+    return video, metadata
+
+
+def read_video_torchcodec(
+    video_path,
+    sample_timestamps_fn: Callable,
+    **kwargs,
+) -> tuple[np.ndarray, VideoMetadata]:
+    """
+    Decode a video using torchcodec decoder.
+
+    Args:
+        video_path (`str`):
+            Path to the video file.
+        sample_timestamps_fn (`Callable`):
+            A callable function that will return timestamps at which the video should be sampled.
+
+    Returns:
+        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
+            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
+            - `VideoMetadata` object.
+    """
+    # Lazy import torchcodec
+    import importlib
+
+    torchcodec = importlib.import_module("torchcodec")
+
+    decoder = torchcodec.decoders.VideoDecoder(
+        video_path,
+        # Interestingly `exact` mode takes less time than `approximate` when we load the whole video
+        seek_mode="exact",
+        # Allow FFmpeg to decide on the number of threads for efficiency
+        num_ffmpeg_threads=0,
+    )
+    # If the first frame starts at > 0, we effectively clip the video starting at that time
+    # since (most) video players would also skip to that time
+    time_offset = decoder.metadata.begin_stream_seconds_from_content
+    # Note this duration does assume we started playing at `time_offset`
+    duration = decoder.metadata.duration_seconds
+
+    metadata = VideoMetadata(
+        total_num_frames=decoder.metadata.num_frames,
+        fps=decoder.metadata.average_fps,
+        duration=duration,
+        video_backend="torchcodec",
+        height=decoder.metadata.height,
+        width=decoder.metadata.width,
+    )
+
+    target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)
+
+    # Floating point/rounding issues might cause `target_timestamps` to be very slightly
+    # out-of-bounds, to handle this we sanity check then clip them
+    assert all(x >= 0 for x in target_timestamps)
+    assert all(x < duration + 1e-6 for x in target_timestamps)
+    # 1e-6 padding since torchcodec can throw out-of-bounds errors even if you ask for the
+    # exact boundary value, we should still get the first/last frame anyway
+    max_timestamp = decoder.metadata.end_stream_seconds_from_content - 1e-6
+    min_timestamp = decoder.metadata.begin_stream_seconds_from_content + 1e-6
+    # Note we avoid using numpy ops here to reduce floating precision issues
+    timestamps = [x + time_offset for x in target_timestamps]  # relative targets -> absolute stream time
+    timestamps = [max(min_timestamp, min(max_timestamp, x)) for x in timestamps]
+
+    video = (
+        decoder.get_frames_played_at(timestamps).data.numpy().transpose(0, 2, 3, 1)
+    )  # Convert to THWC format
+    target_timestamps = np.array(target_timestamps)
+    metadata.frames_indices = target_timestamps * metadata.fps
+
+    return video, metadata
+
+
+def read_video_pyav(
+    video_path,
+    sample_timestamps_fn: Callable,
+    **kwargs,
+) -> tuple[np.ndarray, VideoMetadata]:
+    """
+    Decode a video using the PyAV backend.
+
+    Args:
+        video_path (`str`):
+            Path to the video file.
+        sample_timestamps_fn (`Callable`):
+            A callable function that will return timestamps at which the video should be sampled.
+
+    Returns:
+        tuple[`np.array`, `VideoMetadata`]: A tuple containing:
+            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
+            - `VideoMetadata` object.
+    """
+    # Lazy import av
+    import importlib
+
+    av = importlib.import_module("av")
+
+    with av.open(video_path) as container:
+        video_stream = container.streams.video[0]
+        fps = video_stream.average_rate or video_stream.guessed_rate
+        it = container.decode(video=0)
+        frames = list(it)  # every frame is decoded up front so the sampled indices can be looked up freely
+
+        stream = container.streams.video[0]
+        start = frames[0].pts * stream.time_base  # seconds
+        container_end = stream.duration
+        if container_end is not None:
+            container_end *= stream.time_base  # ticks -> seconds
+        if container_end is None or container_end < frames[-1].pts * stream.time_base:  # compare seconds to seconds
+            # Some problem with stream duration, so use the frame PTS directly
+            # and guess the duration of the last frame
+            end = frames[-1].pts * stream.time_base + 1 / fps
+        else:
+            end = container_end
+        duration = float(end - start)
+
+        metadata = VideoMetadata(
+            total_num_frames=len(frames),
+            fps=float(fps),
+            duration=float(duration),
+            video_backend="pyav",
+            height=video_stream.height,
+            width=video_stream.width,
+        )
+
+        target_timestamps = sample_timestamps_fn(metadata=metadata, **kwargs)
+        offset = float(start)
+
+        target_timestamps = np.array(target_timestamps)
+        end_time_stamps = np.array([float(frame.pts * stream.time_base) for frame in frames[1:]] + [float(end)])
+        indices = np.searchsorted(end_time_stamps, target_timestamps + offset, side="right")  # absolute time lookup
+        indices = np.minimum(indices, len(end_time_stamps) - 1)
+
+        video = np.stack(
+            [frames[i].to_ndarray(format="rgb24", channel_last=True) for i in indices],
+            axis=0,
+        )
+
+        metadata.frames_indices = target_timestamps * float(fps)  # plain float keeps a numeric (non-object) array
+
+        return video, metadata
+
+
+VIDEO_DECODERS = {  # backend name -> reader; each reader returns (frames, VideoMetadata)
+    "decord": read_video_decord,
+    "torchcodec": read_video_torchcodec,
+    "pyav": read_video_pyav,
+}
+
+
+def load_video(
+    video: VideoInput,
+    backend: str = "decord",
+    sample_timestamps_fn: Callable | None = None,
+    **kwargs,
+):
+    """
+    Loads `video` to a numpy array and returns it together with its metadata as `(video, metadata)`.
+
+    Args:
+        video (`VideoInput`):
+            The video to convert to the numpy array format. Can be a link to video or local path.
+        backend (`str`, *optional*, defaults to `"decord"`):
+            The backend to use when loading the video. Can be any of ["decord", "pyav", "torchcodec"]. Defaults to "decord".
+        sample_timestamps_fn (`Callable`):
+            A callable function that will return timestamps at which the video should be sampled.
+    """
+
+    # Early exit if provided an array or `PIL` frames
+    if not isinstance(video, str):
+        metadata = [None] * len(video)  # already-decoded input: returned unchanged, one `None` per element
+        return video, metadata
+
+    if urlparse(video).netloc in ["www.youtube.com", "youtube.com"]:
+        if not is_yt_dlp_available():
+            raise ImportError("To load a video from YouTube url you have  to install `yt_dlp` first.")
+        # Lazy import from yt_dlp
+        import importlib
+
+        yt_dlp = importlib.import_module("yt_dlp")
+
+        buffer = BytesIO()
+        with redirect_stdout(buffer), yt_dlp.YoutubeDL() as f:
+            f.download([video])
+        bytes_obj = buffer.getvalue()
+        file_obj = BytesIO(bytes_obj)
+    elif video.startswith("http://") or video.startswith("https://"):
+        file_obj = BytesIO(requests.get(video, timeout=10).content)  # whole response body is held in memory
+    elif os.path.isfile(video):
+        file_obj = video
+    else:
+        raise TypeError(
+            "Incorrect format used for video. Should be an url linking to an video or a local path."
+        )
+
+    # can also load with decord, but not cv2/torchvision
+    # both will fail in case of url links
+    video_is_url = video.startswith("http://") or video.startswith("https://")
+    if video_is_url and backend == "opencv":
+        raise ValueError("If you are trying to load a video from URL, you cannot use 'opencv' as backend")
+
+    if (
+        (not is_decord_available() and backend == "decord")
+        or (not is_torchcodec_available() and backend == "torchcodec")
+        or (not is_av_available() and backend == "pyav")
+    ):
+        raise ImportError(
+            f"You chose backend={backend} for loading the video but the required library is not found in your environment "
+            f"Make sure to install {backend} before loading the video."
+        )
+
+    video_decoder = VIDEO_DECODERS[backend]  # NOTE(review): KeyError for names not in VIDEO_DECODERS, e.g. "opencv"
+    video, metadata = video_decoder(file_obj, sample_timestamps_fn, **kwargs)
+    return video, metadata
+
+
+def get_target_fps(
+    video_fps: float,
+    max_frames: int,
+    total_frames: int,
+    frame_sample_mode: str,
+    candidate_target_fps: tuple[float],
+) -> float:
+    """
+    Get the target fps that best spans the video and has the most frames sampled (`None` if none is selected)
+    """
+    num_frames_sampled = 0
+    selected_target_fps = None
+    for target_fps in candidate_target_fps:
+        step_size = max(int(video_fps / target_fps), 1)  # stride in source frames, never below 1
+        num_frames_sampled_at_fps = int(total_frames / step_size)
+        if num_frames_sampled == 0:  # nothing selected yet (or the selection so far yields zero frames)
+            if "uniform" in frame_sample_mode:
+                if num_frames_sampled_at_fps > max_frames:
+                    break  # leaves `selected_target_fps` untouched, so `None` is returned on the first candidate
+            selected_target_fps = target_fps
+            num_frames_sampled = num_frames_sampled_at_fps
+
+        else:
+            # the candidate sampling fps increases so frame count can't decrease
+            assert num_frames_sampled <= num_frames_sampled_at_fps
+            if num_frames_sampled_at_fps > max_frames:
+                # choose the sampling fps that spans the video
+                continue
+
+            elif num_frames_sampled_at_fps > num_frames_sampled:
+                # both are less than max_frames, choose the one with higher density of frames sampled
+                selected_target_fps = target_fps
+                num_frames_sampled = num_frames_sampled_at_fps
+    return selected_target_fps
+
+
+def get_frame_times_and_chosen_fps(selected_target_fps, total_frames, max_frames, video_fps):
+    if selected_target_fps is None:  # no usable rate: spread `max_frames` indices evenly over the video
+        frame_indices = np.linspace(0, total_frames, max_frames, endpoint=False, dtype=int)
+    else:
+        step_size = max(int(video_fps / selected_target_fps), 1)  # stride in source frames, never below 1
+        frame_indices = np.arange(0, total_frames, step_size)
+    if len(frame_indices) > max_frames:
+        frame_indices = frame_indices[:max_frames]  # keep the earliest frames when over budget
+    return selected_target_fps, frame_indices  # the fps is passed through unchanged
+
+
+class MolmoAct2VideoProcessorKwargs(VideosKwargs, total=False):  # extra keys on top of `VideosKwargs`, all optional
+    patch_size: int | None
+    pooling_size: list[int] | None
+    frame_sample_mode: str | None  # "uniform_last_frame" and "fps" are the modes handled by the sampling methods
+    max_fps: int | None
+    sampling_fps: int | None
+
+
+class MolmoAct2VideoProcessor(BaseVideoProcessor):
+    resample = PILImageResampling.BILINEAR
+    size = {"height": 378, "width": 378}
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    patch_size = 14
+    pooling_size = [3, 3]
+    do_sample_frames = True
+    frame_sample_mode = "uniform_last_frame"
+    max_fps = 2
+    sampling_fps = 2
+    valid_kwargs = MolmoAct2VideoProcessorKwargs
+    model_input_names = ["pixel_values_videos", "video_token_pooling", "video_grids"]
+
+    def __init__(self, **kwargs: Unpack[MolmoAct2VideoProcessorKwargs]):
+        super().__init__(**kwargs)
+        if self.size is not None and (  # fail fast when the configured size lacks either dimension
+            self.size.get("height", None) is None or self.size.get("width", None) is None
+        ):
+            raise ValueError("size must contain 'height' and 'width' keys.")
+
+    def _further_process_kwargs(
+        self,
+        size: SizeDict | None = None,
+        **kwargs,
+    ) -> dict:
+        """
+        Update kwargs that need further processing before being validated.
+        Rejects a `size` without 'height'/'width', then defers to the base class; subclasses may override.
+        """
+        if size is not None and ("height" not in size or "width" not in size):
+            raise ValueError("size must contain 'height' and 'width' keys.")
+
+        return super()._further_process_kwargs(size=size, **kwargs)
+
+    def sample_times(
+        self,
+        metadata: VideoMetadata,
+        frame_sample_mode: str | None = None,
+        num_frames: int | None = None,
+        max_fps: int | None = None,
+        sampling_fps: int | None = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Time-based sampling: return the timestamps (relative to the video start) at which to sample frames.
+        Args:
+            metadata (`VideoMetadata`):
+                Metadata of the video containing information about total duration, fps and total number of frames.
+            frame_sample_mode (`str`, *optional*):
+                Mode to sample frames. Defaults to `self.frame_sample_mode`.
+            num_frames (`int`, *optional*):
+                Maximum number of frames to sample. Defaults to `self.num_frames`.
+            max_fps (`int`, *optional*):
+                Maximum frames per second to sample.
+            sampling_fps (`int`, *optional*):
+                Sampling frames per second. Defaults to `self.sampling_fps`.
+                Used when `frame_sample_mode` is `"fps"`.
+        """
+        frame_sample_mode = frame_sample_mode or self.frame_sample_mode
+        num_frames = num_frames or self.num_frames
+        sampling_fps = sampling_fps or self.sampling_fps
+
+        duration = metadata.duration or metadata.total_num_frames / metadata.fps  # derive it when not recorded
+        if frame_sample_mode == "fps":
+            candidate_target_fps = get_candidate_target_fps(metadata.fps, sampling_fps)
+            # Try larger and larger FPSs until we hit one that can't span the video
+            target_fps = candidate_target_fps[0]
+            for candidate_fps in candidate_target_fps[1:]:
+                if num_frames / candidate_fps < duration:
+                    break
+                target_fps = candidate_fps
+            times = np.arange(0, num_frames) / target_fps
+            times = times[times < duration]  # drop timestamps that fall past the end of the video
+            return times
+        elif frame_sample_mode == "uniform_last_frame":
+            if max_fps is not None:
+                max_duration = (num_frames - 1) / max_fps  # -1 to include the last frame
+                if max_duration < duration:  # too long for `max_fps` spacing: spread `num_frames` evenly instead
+                    times = np.linspace(0, duration, num=num_frames, endpoint=True, dtype=np.float64)
+                else:
+                    times = np.arange(0.0, stop=duration, step=1 / max_fps)
+                    times = np.concatenate([times, [duration]], axis=0)  # always finish on the final timestamp
+                    assert len(times) <= num_frames
+            else:
+                times = np.linspace(0, duration, num=num_frames, endpoint=True, dtype=np.float64)
+            return times
+        else:
+            raise NotImplementedError(frame_sample_mode)
+
+    def sample_frames(
+        self,
+        metadata: VideoMetadata,
+        frame_sample_mode: str | None = None,
+        num_frames: int | None = None,
+        max_fps: int | None = None,
+        sampling_fps: int | None = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Frame-based sampling: return the integer indices of the frames to keep.
+        Args:
+            metadata (`VideoMetadata`):
+                Metadata of the video containing information about total duration, fps and total number of frames.
+            frame_sample_mode (`str`, *optional*):
+                Mode to sample frames. Defaults to `self.frame_sample_mode`.
+            num_frames (`int`, *optional*):
+                Maximum number of frames to sample. Defaults to `self.num_frames`.
+            max_fps (`int`, *optional*):
+                Maximum frames per second to sample.
+            sampling_fps (`int`, *optional*):
+                Sampling frames per second. Defaults to `self.sampling_fps`.
+                Used when `frame_sample_mode` is `"fps"`.
+        """
+        frame_sample_mode = frame_sample_mode or self.frame_sample_mode
+        num_frames = num_frames or self.num_frames
+        sampling_fps = sampling_fps or self.sampling_fps
+
+        total_num_frames = metadata.total_num_frames
+        if frame_sample_mode == "uniform_last_frame" and max_fps is not None:
+            duration = total_num_frames / metadata.fps
+            if total_num_frames <= 2:
+                return np.arange(total_num_frames).astype(int)  # at most two frames: keep them all
+            if duration > (num_frames - 1) / max_fps:  # -1 to include the last frame
+                # uniform fallback
+                indices = np.linspace(
+                    0,
+                    total_num_frames - 1,
+                    num=min(num_frames, total_num_frames),
+                    endpoint=True,
+                ).astype(int)
+                return indices
+            else:
+                float_indices = np.arange(
+                    0.0,
+                    stop=total_num_frames - 1,
+                    step=float(metadata.fps / max_fps),  # source frames between consecutive samples
+                )
+                if np.round(float_indices[-1]) != total_num_frames - 1:  # make sure the last frame is included
+                    float_indices = np.concatenate([float_indices, [total_num_frames - 1]], axis=0)
+                indices = np.round(float_indices).astype(int)
+                assert indices[-1] < total_num_frames
+                assert len(float_indices) <= num_frames
+                return indices
+        elif frame_sample_mode == "uniform_last_frame":
+            indices = np.linspace(  # no fps cap: evenly spaced indices from the first to the last frame
+                0,
+                total_num_frames - 1,
+                num=min(num_frames, total_num_frames),
+                endpoint=True,
+            ).astype(int)
+            return indices
+        elif frame_sample_mode == "fps":
+            candidate_target_fps = get_candidate_target_fps(metadata.fps, sampling_fps)
+            selected_target_fps = get_target_fps(
+                metadata.fps,
+                num_frames,
+                total_num_frames,
+                frame_sample_mode,
+                candidate_target_fps,
+            )
+            _, indices = get_frame_times_and_chosen_fps(
+                selected_target_fps,
+                total_num_frames,
+                num_frames,
+                metadata.fps,
+            )
+            return indices
+        else:
+            raise NotImplementedError(frame_sample_mode)
+
+    def fetch_videos(self, video_url_or_urls: str | list[str] | list[list[str]], sample_timestamps_fn=None):
+        """
+        Load one video, or a (possibly nested) list of videos, through `load_video`.
+
+        The decoding backend is chosen by availability, preferring `decord`, then `torchcodec`, then
+        `PyAV`; a warning is emitted whenever `decord` is not the one used.
+
+        A single url yields whatever `load_video` returns for it. A list is fetched element by
+        element and the per-element results are transposed with `zip`.
+
+        Raises:
+            ImportError: If none of the supported decoding backends is installed.
+        """
+        if is_decord_available():
+            backend = "decord"
+        elif is_torchcodec_available():
+            warnings.warn(
+                "`decord` is not installed and cannot be used to decode the video by default. "
+                "Falling back to `torchcodec`."
+            )
+            backend = "torchcodec"
+        elif is_av_available():
+            warnings.warn(
+                "`decord` is not installed and cannot be used to decode the video by default. "
+                "Falling back to `PyAV`."
+            )
+            backend = "pyav"
+        else:
+            raise ImportError(
+                "MolmoAct2VideoProcessor requires `decord`, `torchcodec`, or `av` to be installed."
+            )
+
+        if not isinstance(video_url_or_urls, list):
+            return load_video(video_url_or_urls, backend=backend, sample_timestamps_fn=sample_timestamps_fn)
+
+        # Recurse per element, then transpose the per-element results.
+        fetched = [
+            self.fetch_videos(item, sample_timestamps_fn=sample_timestamps_fn) for item in video_url_or_urls
+        ]
+        return list(zip(*fetched))
+
+    def _decode_and_sample_videos(
+        self,
+        videos: VideoInput,
+        video_metadata: VideoMetadata | dict,
+        do_sample_frames: bool | None = None,
+        sample_indices_fn: Callable | None = None,
+        sample_timestamps_fn: Callable | None = None,
+    ):
+        """
+        Batch the inputs, then either sub-sample array videos or decode non-array inputs.
+
+        Array videos are sampled by frame index (only when `do_sample_frames` is truthy); anything
+        else is decoded through `self.fetch_videos`, which samples by timestamp.
+
+        Returns:
+            A `(videos, video_metadata)` pair.
+
+        Raises:
+            ValueError: If a non-array input is a list of images.
+        """
+        videos = make_batched_videos(videos)
+        video_metadata = make_batched_metadata(videos, video_metadata=video_metadata)
+        first_is_array = is_valid_video(videos[0])
+
+        if first_is_array:
+            if not do_sample_frames:
+                # Array input with sampling disabled: pass everything through untouched.
+                return videos, video_metadata
+
+            # Frame-based sampling on already-decoded arrays.
+            assert video_metadata[0].fps is not None, "FPS must be provided for video input"
+            kept_videos = []
+            kept_metadata = []
+            for clip, clip_meta in zip(videos, video_metadata):
+                picked = sample_indices_fn(metadata=clip_meta)
+                clip_meta.frames_indices = picked
+                kept_videos.append(clip[picked])
+                kept_metadata.append(clip_meta)
+            return kept_videos, kept_metadata
+
+        # Not an array: time-based sampling happens while decoding.
+        if sample_indices_fn is None:
+            logger.warning(
+                "do_sample_frames is False, but video array is not provided: "
+                "Will decode the video and sample frames using MolmoAct2's default sampling mode"
+            )
+        if isinstance(videos[0], list):
+            raise ValueError("A list of images is not supported for video input!")
+
+        decoded_videos, decoded_metadata = self.fetch_videos(videos, sample_timestamps_fn=sample_timestamps_fn)
+        return decoded_videos, decoded_metadata
+
+    def _prepare_input_videos(
+        self,
+        videos: VideoInput,
+        **kwargs,
+    ) -> list[np.ndarray]:
+        """Convert each entry of `videos` with `to_numpy`; extra keyword arguments are ignored."""
+        return list(map(to_numpy, videos))
+
+    def preprocess(
+        self,
+        videos: VideoInput,
+        **kwargs: Unpack[MolmoAct2VideoProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Validate the call kwargs, decode/sample the videos and run `_preprocess` on them.
+
+        Args:
+            videos (`VideoInput`):
+                Videos to process; forwarded to `_decode_and_sample_videos`.
+            **kwargs:
+                Per-call overrides. Only keys annotated on `self.valid_kwargs` plus
+                `return_tensors` are accepted; missing keys fall back to the attribute of the same
+                name on the instance, or `None`.
+
+        Returns:
+            The `BatchFeature` produced by `_preprocess`, with an extra `video_metadata` entry when
+            `return_metadata` is truthy.
+        """
+        validate_kwargs(
+            captured_kwargs=kwargs.keys(),
+            valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"],
+        )
+
+        # Set default kwargs from self. This ensures that if a kwarg is not provided
+        # by the user, it gets its default value from the instance, or is set to None.
+        for kwarg_name in self.valid_kwargs.__annotations__:
+            kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
+
+        # These two are consumed here and must be removed before the remaining kwargs are bound
+        # into the sampling callables below.
+        do_sample_frames = kwargs.pop("do_sample_frames")
+        video_metadata = kwargs.pop("video_metadata")
+
+        # Index-based sampler is only built when frame sampling is requested; the timestamp-based
+        # sampler is always built and is the one handed to the decoding path.
+        sample_indices_fn = partial(self.sample_frames, **kwargs) if do_sample_frames else None
+        sample_timestamps_fn = partial(self.sample_times, **kwargs)
+        videos, video_metadata = self._decode_and_sample_videos(
+            videos,
+            video_metadata=video_metadata,
+            do_sample_frames=do_sample_frames,
+            sample_indices_fn=sample_indices_fn,
+            sample_timestamps_fn=sample_timestamps_fn,
+        )
+        videos = self._prepare_input_videos(videos=videos)
+
+        kwargs = self._further_process_kwargs(**kwargs)
+
+        # `return_metadata` is not a `_preprocess` argument, so strip it before forwarding.
+        return_metadata = kwargs.pop("return_metadata")
+        preprocessed_videos = self._preprocess(videos=videos, **kwargs)
+        if return_metadata:
+            preprocessed_videos["video_metadata"] = video_metadata
+        return preprocessed_videos
+
+    def _preprocess(
+        self,
+        videos: list[np.ndarray],
+        size: SizeDict | None = None,
+        resample: PILImageResampling | None = None,
+        image_mean: float | list[float] | None = None,
+        image_std: float | list[float] | None = None,
+        do_convert_rgb: bool | None = None,
+        patch_size: int | None = None,
+        pooling_size: list[int] | None = None,
+        return_tensors: str | TensorType | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess a video for the model.
+        Args:
+            videos (`VideoInput`):
+                Video to preprocess.
+            size (`SizeDict`, *optional*, defaults to `self.size`):
+                Size of the image after resizing.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use when resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB. Accepted for interface compatibility; this method does not
+                read it.
+            patch_size (`int`, *optional*, defaults to `self.patch_size`):
+                The spatial patch size of the vision encoder.
+            pooling_size (`list[int]`, *optional*, defaults to `self.pooling_size`):
+                The pooling size of the vision adapter.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+
+        Returns:
+            A `BatchFeature` containing the following keys:
+                - `pixel_values_videos`: The preprocessed videos.
+                - `video_token_pooling`: The indices of the patches in `crops` to pool for each token in `video_tokens`.
+                - `video_grids`: The video grids.
+
+        Raises:
+            ValueError: If `size` lacks a height or width, or if a video contains no frames.
+        """
+        if size.height is None or size.width is None:
+            raise ValueError("size must contain 'height' and 'width' keys.")
+
+        base_image_input_size = [size.height, size.width]
+
+        resample = resample or self.resample
+        image_mean = image_mean or self.image_mean
+        image_std = image_std or self.image_std
+
+        patch_size = patch_size or self.patch_size
+        pooling_size = pooling_size or self.pooling_size
+
+        image_pooling_h, image_pooling_w = pooling_size
+
+        batch_grids = []
+        batch_crops = []
+        batch_pooled_patches_idx = []
+
+        for video in videos:
+            # Fail early with a clear message: without frames there is no grid to report and
+            # nothing to concatenate.
+            if len(video) == 0:
+                raise ValueError("Cannot preprocess a video that contains no frames.")
+
+            all_crops = []
+            pooled_patches_idx = []
+            # Running number of patches contributed by the frames already processed for this video;
+            # kept incrementally instead of re-summing every previous crop on each frame.
+            offset = 0
+
+            for frame in video:
+                image_grid, crops, pooled_idx = image_to_patches_and_grids(
+                    frame,
+                    base_image_input_size,
+                    resample,
+                    image_mean,
+                    image_std,
+                    patch_size,
+                    image_pooling_w,
+                    image_pooling_h,
+                )
+                # Shift valid (non-negative) patch indices so they address the concatenated crops of
+                # the whole video; negative entries are left as they are.
+                pooled_idx_with_offset = np.where(pooled_idx >= 0, pooled_idx + offset, pooled_idx)
+                pooled_patches_idx.append(pooled_idx_with_offset)
+                all_crops.append(crops)
+                offset = offset + np.prod(crops.shape[:2])
+
+            # The grid of the last processed frame is used for the whole video.
+            video_grid = np.array([len(video), image_grid[0], image_grid[1]])
+            all_crops = np.concatenate(all_crops, 0)
+            pooled_patches_idx = np.concatenate(pooled_patches_idx, 0)
+
+            batch_grids.append(video_grid)
+            batch_crops.append(all_crops)
+            batch_pooled_patches_idx.append(pooled_patches_idx)
+
+        video_grids = np.stack(batch_grids, 0)
+        pixel_values_videos = np.concatenate(batch_crops, 0)
+        video_token_pooling = np.concatenate(batch_pooled_patches_idx, 0)
+
+        data = dict(
+            pixel_values_videos=pixel_values_videos,
+            video_token_pooling=video_token_pooling,
+            video_grids=video_grids,
+        )
+
+        return BatchFeature(data, tensor_type=return_tensors)
+
+
+# Import-time side effect: registers this processor class with the auto-class machinery.
+# NOTE(review): presumably this lets checkpoints that ship this file as custom code resolve the
+# class automatically — confirm against the base class documentation.
+MolmoAct2VideoProcessor.register_for_auto_class()
--- a/src/lerobot/policies/molmoact2/modeling_molmoact2.py
+++ b/src/lerobot/policies/molmoact2/modeling_molmoact2.py
--- a/src/lerobot/policies/molmoact2/processor_molmoact2.py
+++ b/src/lerobot/policies/molmoact2/processor_molmoact2.py
--- a/src/lerobot/rewards/init.py
+++ b/src/lerobot/rewards/init.py
@@ -20,12 +20,16 @@ from .factory import (
    make_reward_pre_post_processors as make_reward_pre_post_processors,
 )
 from .pretrained import PreTrainedRewardModel as PreTrainedRewardModel
+from .robometer.configuration_robometer import RobometerConfig as RobometerConfig
 from .sarm.configuration_sarm import SARMConfig as SARMConfig
+from .topreward.configuration_topreward import TOPRewardConfig as TOPRewardConfig

 __all__ = [
    # Configuration classes
    "RewardClassifierConfig",
+    "RobometerConfig",
    "SARMConfig",
+    "TOPRewardConfig",
    # Base class
    "PreTrainedRewardModel",
    # Factory functions
--- a/src/lerobot/rewards/factory.py
+++ b/src/lerobot/rewards/factory.py
@@ -25,7 +25,9 @@ from lerobot.processor import PolicyAction, PolicyProcessorPipeline

 from .classifier.configuration_classifier import RewardClassifierConfig
 from .pretrained import PreTrainedRewardModel
+from .robometer.configuration_robometer import RobometerConfig
 from .sarm.configuration_sarm import SARMConfig
+from .topreward.configuration_topreward import TOPRewardConfig


 def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]:
@@ -37,7 +39,7 @@ def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]:

    Args:
        name: The name of the reward model. Supported names are "reward_classifier",
-              "sarm".
+              "sarm", "robometer", "topreward".

    Returns:
        The reward model class corresponding to the given name.
@@ -53,6 +55,14 @@ def get_reward_model_class(name: str) -> type[PreTrainedRewardModel]:
        from lerobot.rewards.sarm.modeling_sarm import SARMRewardModel

        return SARMRewardModel
+    elif name == "robometer":
+        from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+
+        return RobometerRewardModel
+    elif name == "topreward":
+        from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
+
+        return TOPRewardModel
    else:
        try:
            return _get_reward_model_cls_from_name(name=name)
@@ -69,7 +79,7 @@ def make_reward_model_config(reward_type: str, **kwargs) -> RewardModelConfig:

    Args:
        reward_type: The type of the reward model. Supported types include
-                     "reward_classifier", "sarm".
+                     "reward_classifier", "sarm", "robometer", "topreward".
        **kwargs: Keyword arguments to be passed to the configuration class constructor.

    Returns:
@@ -82,6 +92,10 @@ def make_reward_model_config(reward_type: str, **kwargs) -> RewardModelConfig:
        return RewardClassifierConfig(**kwargs)
    elif reward_type == "sarm":
        return SARMConfig(**kwargs)
+    elif reward_type == "robometer":
+        return RobometerConfig(**kwargs)
+    elif reward_type == "topreward":
+        return TOPRewardConfig(**kwargs)
    else:
        try:
            config_cls = RewardModelConfig.get_choice_class(reward_type)
@@ -161,6 +175,21 @@ def make_reward_pre_post_processors(
            dataset_stats=kwargs.get("dataset_stats"),
            dataset_meta=kwargs.get("dataset_meta"),
        )
+    elif isinstance(reward_cfg, RobometerConfig):
+        from lerobot.rewards.robometer.processor_robometer import make_robometer_pre_post_processors
+
+        return make_robometer_pre_post_processors(
+            config=reward_cfg,
+            dataset_stats=kwargs.get("dataset_stats"),
+        )
+
+    elif isinstance(reward_cfg, TOPRewardConfig):
+        from lerobot.rewards.topreward.processor_topreward import make_topreward_pre_post_processors
+
+        return make_topreward_pre_post_processors(
+            config=reward_cfg,
+            dataset_stats=kwargs.get("dataset_stats"),
+        )

    else:
        try:
--- a/src/lerobot/rewards/robometer/init.py
+++ b/src/lerobot/rewards/robometer/init.py
@@ -0,0 +1,19 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_robometer import RobometerConfig
+from .modeling_robometer import RobometerRewardModel
+from .processor_robometer import make_robometer_pre_post_processors
+
+__all__ = ["RobometerConfig", "RobometerRewardModel", "make_robometer_pre_post_processors"]
--- a/src/lerobot/rewards/robometer/compute_rabc_weights.py
+++ b/src/lerobot/rewards/robometer/compute_rabc_weights.py
@@ -0,0 +1,320 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Compute per-frame Robometer progress curves for a LeRobot dataset.
+
+For each episode, builds per-frame sub-samples using the frame-steps
+strategy from the Robometer eval server: for each original frame ``t``,
+linspace-subsample ``[0, t]`` into ``K`` frames (default 4, matching
+``NUM_SUBSAMPLED_FRAMES`` in the eval server), run one forward through
+the Robometer processor + model, and keep the last-frame progress value.
+All sub-samples are the same size ``K`` so they batch cleanly.
+
+The parquet uses the same schema as SARM's
+:mod:`lerobot.rewards.sarm.compute_rabc_weights` so existing consumers —
+:class:`lerobot.rewards.sarm.rabc.RABCWeights` (which reads
+``progress_sparse``) and the progress-overlay script in
+``examples/dataset/create_progress_videos.py`` — work without modification.
+
+Usage:
+    # Dense per-frame progress for one episode
+    python -m lerobot.rewards.robometer.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --reward-model-path lerobot/Robometer-4B \\
+        --episodes 0
+
+    # All episodes with batching
+    python -m lerobot.rewards.robometer.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --reward-model-path lerobot/Robometer-4B \\
+        --batch-size 16
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
+import torch
+from tqdm import tqdm
+
+from lerobot.datasets import LeRobotDataset
+from lerobot.rewards.robometer.configuration_robometer import RobometerConfig
+from lerobot.rewards.robometer.modeling_robometer import RobometerRewardModel
+from lerobot.rewards.robometer.processor_robometer import RobometerEncoderProcessorStep
+from lerobot.types import TransitionKey
+
+# File name of the progress parquet. It is written under the dataset root when no explicit output
+# path is given, and reused as the in-repo path when uploading to the Hub.
+DEFAULT_OUTPUT_FILENAME = "robometer_progress.parquet"
+
+# Upstream Robometer eval server uses K=4 for frame-steps sub-samples.
+DEFAULT_NUM_SUBSAMPLED_FRAMES = 4
+
+
+def get_reward_model_path_from_parquet(parquet_path: Path) -> str | None:
+    """Return the ``reward_model_path`` stored in the parquet schema metadata, if any.
+
+    Best-effort lookup: a missing file, missing metadata key, or any error while reading the
+    file all yield ``None``.
+    """
+    if not parquet_path.exists():
+        return None
+    try:
+        schema_metadata = pq.read_metadata(parquet_path).schema.to_arrow_schema().metadata
+        encoded_path = (schema_metadata or {}).get(b"reward_model_path")
+        return None if encoded_path is None else encoded_path.decode()
+    except Exception:  # nosec B110
+        return None
+
+
+def _resolve_task(sample: dict[str, Any], default: str) -> str:
+    """Return ``sample["task"]`` when it is a non-empty string, otherwise ``default``."""
+    candidate = sample.get("task")
+    return candidate if isinstance(candidate, str) and candidate else default
+
+
+def _build_subsample_indices(num_frames: int, num_subsampled_frames: int) -> list[np.ndarray]:
+    """Frame-steps linspace expansion.
+
+    For each ``t in [0, num_frames - 1]`` returns ``num_subsampled_frames``
+    indices from ``np.linspace(0, t, num_subsampled_frames)``, rounded to
+    ``int64``. Each entry is a fixed-size array so the model can batch them.
+
+    With two or more sub-sampled frames the first frame and frame ``t`` are
+    always included. With exactly one, the single index is ``t`` itself, so
+    the current frame is still the last one of its sub-sample.
+
+    Args:
+        num_frames: Number of frames in the episode; one sub-sample is built per frame.
+        num_subsampled_frames: Number of indices per sub-sample; must be at least 1.
+
+    Returns:
+        A list of ``num_frames`` arrays, each of length ``num_subsampled_frames``.
+
+    Raises:
+        ValueError: If ``num_subsampled_frames`` is smaller than 1.
+    """
+    if num_subsampled_frames < 1:
+        raise ValueError(f"num_subsampled_frames must be >= 1, got {num_subsampled_frames}.")
+    if num_subsampled_frames == 1:
+        # np.linspace(0, t, 1) collapses to [0], which would drop the current frame entirely.
+        return [np.array([t], dtype=np.int64) for t in range(num_frames)]
+    return [np.linspace(0, t, num_subsampled_frames).round().astype(np.int64) for t in range(num_frames)]
+
+
+def compute_robometer_progress(
+    dataset_repo_id: str,
+    reward_model_path: str,
+    output_path: str | None = None,
+    device: str = "cuda",
+    batch_size: int = 32,
+    num_subsampled_frames: int = DEFAULT_NUM_SUBSAMPLED_FRAMES,
+    episodes: list[int] | None = None,
+    image_key: str | None = None,
+) -> Path:
+    """Run Robometer over a dataset and write per-frame progress to a parquet file.
+
+    For every frame ``t`` of every selected episode, a fixed-size sub-sample of
+    frames ``[0, t]`` is encoded and scored; the reward returned by the model
+    is stored as that frame's progress value.
+
+    Args:
+        dataset_repo_id: Dataset repo id or local path, passed to ``LeRobotDataset``.
+        reward_model_path: Robometer checkpoint repo id or local path. Also stored in the
+            parquet schema metadata under ``reward_model_path``.
+        output_path: Destination parquet path. Defaults to ``DEFAULT_OUTPUT_FILENAME``
+            under the dataset root.
+        device: Device the model and the encoded batches are moved to.
+        batch_size: Number of sub-samples scored per forward pass.
+        num_subsampled_frames: Number of frames in each sub-sample.
+        episodes: Episode indices to process; all episodes when ``None``.
+        image_key: Observation key of the camera to score; defaults to the config's key.
+
+    Returns:
+        Path of the written parquet file, with columns ``index``, ``episode_index``,
+        ``frame_index`` and ``progress_sparse``.
+    """
+    logging.info(f"Loading Robometer: {reward_model_path}")
+    config = RobometerConfig(pretrained_path=reward_model_path, device=device)
+    if image_key is not None:
+        config.image_key = image_key
+    model = RobometerRewardModel.from_pretrained(reward_model_path, config=config)
+    model.to(device).eval()
+
+    encoder = RobometerEncoderProcessorStep(
+        base_model_id=config.base_model_id,
+        image_key=config.image_key,
+        task_key=config.task_key,
+        default_task=config.default_task,
+        max_frames=num_subsampled_frames,
+        use_multi_image=config.use_multi_image,
+        use_per_frame_progress_token=config.use_per_frame_progress_token,
+    )
+
+    # From here on use the key the config actually resolved to (override or default).
+    image_key = config.image_key
+
+    logging.info(f"Loading dataset: {dataset_repo_id}")
+    dataset = LeRobotDataset(dataset_repo_id, download_videos=True)
+    logging.info(f"Dataset: {dataset.num_episodes} episodes, {dataset.num_frames} frames")
+
+    episode_indices = list(range(dataset.num_episodes)) if episodes is None else episodes
+    logging.info(f"Processing {len(episode_indices)} episode(s)")
+
+    all_index: list[int] = []
+    all_episode: list[int] = []
+    all_frame: list[int] = []
+    all_progress: list[float] = []
+
+    for episode_idx in tqdm(episode_indices, desc="Episodes"):
+        ep = dataset.meta.episodes[episode_idx]
+        ep_start = int(ep["dataset_from_index"])
+        ep_end = int(ep["dataset_to_index"])
+        num_frames = ep_end - ep_start
+        if num_frames <= 0:
+            continue
+
+        first_sample = dataset[ep_start]
+        task = _resolve_task(first_sample, default=config.default_task or "perform the task")
+
+        # The whole episode is held in memory so sub-samples can be gathered by index.
+        ep_frames = torch.stack([dataset[ep_start + i][image_key] for i in range(num_frames)])
+
+        sub_indices = _build_subsample_indices(num_frames, num_subsampled_frames)
+
+        progress_per_frame = np.zeros(num_frames, dtype=np.float32)
+
+        for start in tqdm(range(0, num_frames, batch_size), desc=f"  Ep {episode_idx}", leave=False):
+            end = min(start + batch_size, num_frames)
+            frames_batch = torch.stack([ep_frames[sub_indices[i]] for i in range(start, end)])
+
+            transition = {
+                TransitionKey.OBSERVATION: {image_key: frames_batch},
+                TransitionKey.COMPLEMENTARY_DATA: {"task": task},
+            }
+            encoded = encoder(transition)
+            obs = encoded[TransitionKey.OBSERVATION]
+            batch = {
+                key: value.to(device) if isinstance(value, torch.Tensor) else value
+                for key, value in obs.items()
+            }
+
+            with torch.no_grad():
+                rewards = model.compute_reward(batch)
+            # Cast to float32 first: the destination buffer is float32 anyway, and NumPy cannot
+            # represent bfloat16, so `.numpy()` would fail on a bfloat16 reward tensor.
+            progress_per_frame[start:end] = rewards.float().cpu().numpy()
+
+        all_index.extend(range(ep_start, ep_end))
+        all_episode.extend([episode_idx] * num_frames)
+        all_frame.extend(range(num_frames))
+        all_progress.extend(progress_per_frame.tolist())
+
+        if device.startswith("cuda"):
+            torch.cuda.empty_cache()
+
+    table = pa.table(
+        {
+            "index": np.asarray(all_index, dtype=np.int64),
+            "episode_index": np.asarray(all_episode, dtype=np.int64),
+            "frame_index": np.asarray(all_frame, dtype=np.int64),
+            "progress_sparse": np.asarray(all_progress, dtype=np.float32),
+        }
+    ).replace_schema_metadata({b"reward_model_path": reward_model_path.encode()})
+
+    out = Path(dataset.root) / DEFAULT_OUTPUT_FILENAME if output_path is None else Path(output_path)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    pq.write_table(table, out)
+    logging.info(f"Saved {len(table)} frame values to {out}")
+
+    progress_arr = np.asarray(all_progress, dtype=np.float32)
+    if progress_arr.size:
+        logging.info(
+            f"Progress: mean={float(progress_arr.mean()):.4f}, "
+            f"std={float(progress_arr.std()):.4f}, "
+            f"min={float(progress_arr.min()):.4f}, "
+            f"max={float(progress_arr.max()):.4f}"
+        )
+    return out
+
+
+def main():
+    """Command-line entry point: parse arguments, compute progress, optionally upload the parquet.
+
+    When ``--reward-model-path`` is omitted, the checkpoint path is recovered from the metadata of
+    an existing progress parquet under the dataset root; a ``ValueError`` is raised if none is found.
+    """
+    cli = argparse.ArgumentParser(
+        description="Compute per-frame Robometer progress curves for RA-BC weighting.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Dense per-frame progress for one episode
+    python -m lerobot.rewards.robometer.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --reward-model-path lerobot/Robometer-4B \\
+        --episodes 0
+
+    # All episodes, smaller batches for memory-constrained GPUs
+    python -m lerobot.rewards.robometer.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --reward-model-path lerobot/Robometer-4B \\
+        --batch-size 16
+        """,
+    )
+    cli.add_argument(
+        "--dataset-repo-id", type=str, required=True, help="HuggingFace dataset repo id or local path."
+    )
+    cli.add_argument(
+        "--reward-model-path", type=str, default=None, help="Robometer checkpoint repo id or local path."
+    )
+    cli.add_argument("--output-path", type=str, default=None, help="Output parquet path.")
+    cli.add_argument("--device", type=str, default="cuda", help="Device to use (default: cuda).")
+    cli.add_argument(
+        "--batch-size", type=int, default=32, help="Sub-samples per Qwen forward (default: 32)."
+    )
+    cli.add_argument(
+        "--num-subsampled-frames",
+        type=int,
+        default=DEFAULT_NUM_SUBSAMPLED_FRAMES,
+        help=f"Frames per sub-sample (default: {DEFAULT_NUM_SUBSAMPLED_FRAMES}, matches eval server).",
+    )
+    cli.add_argument(
+        "--episodes", type=int, nargs="+", default=None, help="Process only these episode indices."
+    )
+    cli.add_argument(
+        "--image-key", type=str, default=None, help="Image observation key (default: from config)."
+    )
+    cli.add_argument(
+        "--push-to-hub", action="store_true", help="Upload to the dataset repo on HuggingFace Hub."
+    )
+
+    args = cli.parse_args()
+
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+    reward_model_path = args.reward_model_path
+    if reward_model_path is None:
+        # No checkpoint given: look for one recorded by a previous run next to the dataset.
+        probe_dataset = LeRobotDataset(args.dataset_repo_id, download_videos=False)
+        existing_parquet = Path(probe_dataset.root) / DEFAULT_OUTPUT_FILENAME
+        reward_model_path = get_reward_model_path_from_parquet(existing_parquet)
+        if not reward_model_path:
+            raise ValueError(
+                "--reward-model-path is required (no existing parquet with model metadata found)."
+            )
+        logging.info(f"Using reward model from parquet metadata: {reward_model_path}")
+
+    output_path = compute_robometer_progress(
+        dataset_repo_id=args.dataset_repo_id,
+        reward_model_path=reward_model_path,
+        output_path=args.output_path,
+        device=args.device,
+        batch_size=args.batch_size,
+        num_subsampled_frames=args.num_subsampled_frames,
+        episodes=args.episodes,
+        image_key=args.image_key,
+    )
+
+    print(f"\nRobometer progress saved to: {output_path}")
+
+    # Location advertised in the training hint: the local file, or the Hub copy after an upload.
+    progress_location = output_path
+    if args.push_to_hub:
+        from huggingface_hub import HfApi
+
+        hub_client = HfApi()
+        hub_path = DEFAULT_OUTPUT_FILENAME
+
+        print(f"\nUploading to Hub: {args.dataset_repo_id}/{hub_path}")
+        hub_client.upload_file(
+            path_or_fileobj=str(output_path),
+            path_in_repo=hub_path,
+            repo_id=args.dataset_repo_id,
+            repo_type="dataset",
+        )
+        print(
+            "Successfully uploaded to: "
+            f"https://huggingface.co/datasets/{args.dataset_repo_id}/blob/main/{hub_path}"
+        )
+        progress_location = f"hf://datasets/{args.dataset_repo_id}/{hub_path}"
+
+    print("\nTo use in training, add to your config:")
+    print("  use_rabc: true")
+    print(f"  rabc_progress_path: {progress_location}")
+    print("  rabc_head_mode: sparse")
+
+
+# Run the CLI only when the module is executed directly (e.g. via `python -m`), not on import.
+if __name__ == "__main__":
+    main()
--- a/src/lerobot/rewards/robometer/configuration_robometer.py
+++ b/src/lerobot/rewards/robometer/configuration_robometer.py
@@ -0,0 +1,158 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from copy import deepcopy
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+
+from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.utils.constants import OBS_IMAGES
+from lerobot.utils.import_utils import _transformers_available, require_package
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers import AutoConfig, AutoTokenizer
+else:
+    AutoConfig = None  # type: ignore[assignment]
+    AutoTokenizer = None  # type: ignore[assignment]
+
+
+# Special tokens Robometer adds to the Qwen-VL tokenizer at construction time.
+# The order is part of the data contract: upstream resized ``embed_tokens``
+# after adding these tokens in this exact order, so changing the set or order
+# would silently misalign the saved embedding rows with their token ids.
+# ``<|reward_token|>`` and ``<|sim_token|>`` are leftover from earlier upstream
+# heads (never read at inference) but still occupy rows the checkpoint expects.
+ROBOMETER_SPECIAL_TOKENS = (
+    "<|split_token|>",
+    "<|reward_token|>",
+    "<|pref_token|>",
+    "<|sim_token|>",
+    "<|prog_token|>",
+)
+
+
+@RewardModelConfig.register_subclass("robometer")
+@dataclass
+class RobometerConfig(RewardModelConfig):
+    """Robometer reward-model config; ``__post_init__`` validates it and fills ``vlm_config``."""
+
+    pretrained_path: str | None = "lerobot/Robometer-4B"
+    image_key: str = OBS_IMAGES + ".top"
+    task_key: str = "task"
+    default_task: str | None = None
+
+    max_frames: int | None = 8  # must be >= 1 when not None
+    reward_output: str = "progress"  # "progress" or "success"
+    success_threshold: float = 0.5
+
+    license: str | None = "apache-2.0"
+    tags: list[str] | None = field(
+        default_factory=lambda: ["reward-model", "vision-language", "qwen3-vl", "zero-shot"]
+    )
+
+    base_model_id: str = "Qwen/Qwen3-VL-4B-Instruct"
+    torch_dtype: str = "bfloat16"
+    use_multi_image: bool = True
+    use_per_frame_progress_token: bool = True  # requires use_multi_image=True
+    average_temporal_patches: bool = True
+    frame_pooling: str = "mean"  # "mean" | "boundary" | "attention"
+    frame_pooling_attn_temperature: float = 1.0  # must be > 0
+    progress_loss_type: str = "discrete"  # "l1" | "l2" | "discrete"
+    progress_discrete_bins: int = 10  # must be >= 2 when progress_loss_type is "discrete"
+
+    # Serialised Qwen backbone config (post-resize). Left empty, it is filled by
+    # ``__post_init__`` from ``base_model_id`` with the special tokens added to the
+    # vocab size. Saved into ``config.json`` automatically by the base ``_save_pretrained``.
+    vlm_config: dict[str, Any] = field(default_factory=dict)
+
+    input_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    output_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    normalization_mapping: dict[str, NormalizationMode] = field(
+        default_factory=lambda: {
+            "VISUAL": NormalizationMode.IDENTITY,
+            "REWARD": NormalizationMode.IDENTITY,
+        }
+    )
+
+    def __post_init__(self) -> None:
+        """Validate options (``ValueError`` if invalid), add default features, fill an empty ``vlm_config``."""
+        super().__post_init__()
+        if self.reward_output not in {"progress", "success"}:
+            raise ValueError(f"reward_output must be 'progress' or 'success', got {self.reward_output!r}")
+        if self.max_frames is not None and self.max_frames < 1:
+            raise ValueError(f"max_frames must be >= 1, got {self.max_frames}")
+        if self.frame_pooling not in {"mean", "boundary", "attention"}:
+            raise ValueError(f"frame_pooling must be mean/boundary/attention; got {self.frame_pooling!r}")
+        if self.frame_pooling_attn_temperature <= 0:
+            raise ValueError("frame_pooling_attn_temperature must be > 0")
+        if self.progress_loss_type not in {"l1", "l2", "discrete"}:
+            raise ValueError(f"progress_loss_type must be l1/l2/discrete; got {self.progress_loss_type!r}")
+        if self.progress_loss_type == "discrete" and self.progress_discrete_bins < 2:
+            raise ValueError(f"progress_discrete_bins must be >= 2, got {self.progress_discrete_bins}")
+        if self.use_per_frame_progress_token and not self.use_multi_image:
+            raise ValueError("use_per_frame_progress_token=True requires use_multi_image=True")
+
+        if self.image_key not in self.input_features:
+            self.input_features[self.image_key] = PolicyFeature(shape=(3, 224, 224), type=FeatureType.VISUAL)
+        self.output_features.setdefault("progress", PolicyFeature(shape=(1,), type=FeatureType.REWARD))
+        self.output_features.setdefault("success", PolicyFeature(shape=(1,), type=FeatureType.REWARD))
+
+        # Fill ``vlm_config`` deterministically. For ``Qwen/Qwen3-VL-4B-Instruct`` the vocab becomes
+        # ``len(tokenizer) + 5 = 151,669 + 5 = 151,674``, matching the published ``Robometer-4B`` checkpoint.
+        if not self.vlm_config:
+            require_package("transformers", extra="robometer")
+            vlm = AutoConfig.from_pretrained(self.base_model_id).to_dict()
+            tokenizer = AutoTokenizer.from_pretrained(self.base_model_id)
+            text_config = vlm.get("text_config")
+            if not isinstance(text_config, dict):
+                raise ValueError(
+                    f"Backbone config for {self.base_model_id!r} has no nested `text_config`; "
+                    "Robometer expects a Qwen-VL-style config."
+                )
+            text_config["vocab_size"] = len(tokenizer) + len(ROBOMETER_SPECIAL_TOKENS)
+            self.vlm_config = vlm
+
+    @property
+    def use_discrete_progress(self) -> bool:
+        """Whether the progress head outputs distribution logits over bins."""
+        return self.progress_loss_type.lower() == "discrete"
+
+    @property
+    def vlm_backbone_config(self):
+        """Reconstruct the Qwen backbone config from :attr:`vlm_config`."""
+        require_package("transformers", extra="robometer")
+        config_dict = deepcopy(self.vlm_config)
+        model_type = config_dict.pop("model_type", None)
+        if model_type is None:
+            raise ValueError("vlm_config must include `model_type` to reconstruct the backbone config")
+        return AutoConfig.for_model(model_type, **config_dict)
+
+    @property
+    def observation_delta_indices(self) -> list[int] | None:
+        return None
+
+    @property
+    def action_delta_indices(self) -> None:
+        return None
+
+    @property
+    def reward_delta_indices(self) -> None:
+        return None
+
+    def validate_features(self) -> None:
+        if self.image_key not in self.input_features:
+            raise ValueError(f"Robometer requires image input feature {self.image_key!r}")
--- a/src/lerobot/rewards/robometer/modeling_robometer.py
+++ b/src/lerobot/rewards/robometer/modeling_robometer.py
@@ -0,0 +1,481 @@
+# Copyright 2026 Anthony Liang, Yigit Korkmaz, Stephen Tu, Erdem Bıyık, Jesse Zhang
+# and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""ROBOMETER: Scaling General-Purpose Robotic Reward Models via Trajectory Comparisons.
+
+Paper:         https://arxiv.org/abs/2603.02115
+Project:       https://robometer.github.io
+Original code: https://github.com/aliang8/robometer
+Model:         https://huggingface.co/robometer/Robometer-4B
+
+Robometer is a general-purpose, video-language-input reward model built on
+``Qwen/Qwen3-VL-4B-Instruct``. It is trained with a dual reward-prediction
+objective:
+
+- A frame-level progress loss anchoring reward magnitude on expert data.
+- A trajectory-comparison preference loss imposing global ordering constraints
+  across trajectories sharing the same instruction.
+
+To support downstream RL it also predicts a frame-level binary success. The
+training prompt inserts three learnable tokens:
+
+- ``<|prog_token|>`` after each frame to read per-frame progress and success.
+- ``<|pref_token|>`` at the end to read pairwise preference (training-only).
+- ``<|split_token|>`` between two trajectories in preference samples
+  (training-only).
+
+Progress is modeled as a categorical distribution over ``progress_discrete_bins``
+uniformly-spaced centers in ``[0, 1]`` (C51-style), and the continuous estimate
+is recovered as the softmax-weighted mean of those centers — see
+:func:`convert_bins_to_continuous`.
+
+This LeRobot port is **inference-only**: the preference head is preserved in
+the state dict for byte-equivalence with the published ``Robometer-4B``
+checkpoint but is not queried by :meth:`RobometerRewardModel.compute_reward`,
+which returns the last-frame progress (clamped to ``[0, 1]``) or a 0/1 success flag
+(sigmoid probability > ``success_threshold``) per :attr:`RobometerConfig.reward_output`.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any
+
+import torch
+from torch import Tensor, nn
+
+from lerobot.rewards.pretrained import PreTrainedRewardModel
+from lerobot.rewards.robometer.configuration_robometer import RobometerConfig
+from lerobot.utils.constants import OBS_PREFIX
+from lerobot.utils.import_utils import _transformers_available, require_package
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers import AutoModelForImageTextToText
+else:
+    AutoModelForImageTextToText = None  # type: ignore[assignment]
+
+logger = logging.getLogger(__name__)
+
+# Namespace for Robometer's pre-encoded Qwen-VL observation tensors.
+ROBOMETER_FEATURE_PREFIX = f"{OBS_PREFIX}robometer."
+ROBOMETER_QWEN_INPUT_KEYS = (
+    "input_ids",
+    "attention_mask",
+    "pixel_values",
+    "pixel_values_videos",
+    "image_grid_thw",
+    "video_grid_thw",
+    "second_per_grid_ts",
+    "mm_token_type_ids",
+)
+ROBOMETER_METADATA_KEYS = (
+    "prog_token_id",
+    "vision_start_token_id",
+    "vision_end_token_id",
+    "video_merge_size",
+)
+ROBOMETER_INPUT_KEYS = ROBOMETER_QWEN_INPUT_KEYS + ROBOMETER_METADATA_KEYS
+
+
+def convert_bins_to_continuous(bin_logits: Tensor) -> Tensor:
+    """Reduce discrete progress logits to their expected value in ``[0, 1]``.
+
+    The last axis holds one logit per bin. Bin centers are ``num_bins`` evenly
+    spaced points from 0 to 1; the result is the softmax-probability-weighted
+    sum of those centers, with the last axis removed.
+    """
+    num_bins = bin_logits.size(-1)
+    centers = torch.linspace(0.0, 1.0, num_bins, device=bin_logits.device, dtype=bin_logits.dtype)
+    weights = bin_logits.softmax(dim=-1)
+    return (weights * centers).sum(dim=-1)
+
+
+def _squeeze_last_safe(x: Tensor) -> Tensor:
+    """Squeeze the final axis iff it is a singleton and ``x`` has rank >= 2."""
+    return x if x.ndim <= 1 or x.shape[-1] != 1 else x.squeeze(-1)
+
+
+def _torch_dtype(name: str) -> torch.dtype:
+    """Resolve a name such as ``"bfloat16"`` to its ``torch.dtype``; reject non-dtype attributes."""
+    if not isinstance((candidate := getattr(torch, name, None)), torch.dtype):
+        raise ValueError(f"Unknown torch dtype: {name!r}")
+    return candidate
+
+
+class RobometerPredictionHead(nn.Sequential):
+    """MLP head (Linear -> LayerNorm -> GELU -> Dropout -> Linear [-> Sigmoid]) for Robometer outputs."""
+
+    def __init__(self, hidden_dim: int, output_size: int, *, dropout: float, with_sigmoid: bool) -> None:
+        # Positional order fixes the ``nn.Sequential`` child names ("0" ... "4"/"5") used in state dicts.
+        bottleneck = hidden_dim // 2
+        trunk: tuple[nn.Module, ...] = (
+            nn.Linear(hidden_dim, bottleneck),
+            nn.LayerNorm(bottleneck),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(bottleneck, output_size),
+        )
+        super().__init__(*trunk, *((nn.Sigmoid(),) if with_sigmoid else ()))
+
+
+def decode_progress_outputs(
+    progress_logits: Tensor | None,
+    success_logits: Tensor | None,
+    *,
+    is_discrete_mode: bool,
+) -> dict[str, list[list[float]]]:
+    """Turn batched head outputs into nested Python lists of per-frame floats.
+
+    Args:
+        progress_logits: progress head output, iterated sample by sample along
+            dim 0; ``(B, T)`` in continuous mode or ``(B, T, num_bins)`` in
+            discrete mode. ``None`` yields an empty ``progress_pred``.
+        success_logits: ``(B, T)`` raw success logits; a sigmoid maps them to
+            probabilities. ``None`` yields an empty ``success_probs``.
+        is_discrete_mode: when True, each sample's bin logits are reduced with
+            :func:`convert_bins_to_continuous`; otherwise values pass through.
+
+    Returns:
+        ``{"progress_pred": ..., "success_probs": ...}`` where each value holds
+        one flat list of floats per sample.
+    """
+
+    def _to_cpu_float(sample: Tensor) -> Tensor:
+        # Detached float32 copy on CPU, ready for ``tolist``.
+        return sample.detach().float().cpu()
+
+    progress_pred: list[list[float]] = []
+    if progress_logits is not None:
+        reduce = convert_bins_to_continuous if is_discrete_mode else (lambda sample: sample)
+        progress_pred = [reduce(_to_cpu_float(row)).flatten().tolist() for row in progress_logits]
+    success_probs: list[list[float]] = []
+    if success_logits is not None:
+        success_probs = [torch.sigmoid(_to_cpu_float(row)).flatten().tolist() for row in success_logits]
+    return {"progress_pred": progress_pred, "success_probs": success_probs}
+
+
+class RobometerRewardModel(PreTrainedRewardModel):
+    """Robometer (RBM) reward model — inference-only LeRobot port.
+
+    Wraps a Qwen-VL backbone (default: ``Qwen/Qwen3-VL-4B-Instruct``) with three
+    prediction heads from the paper (progress, success, preference). At
+    inference time only the progress and success heads are queried; the
+    preference head is kept on the module so the published ``Robometer-4B``
+    safetensors load unchanged.
+    """
+
+    name = "robometer"
+    config_class = RobometerConfig
+
+    def __init__(self, config: RobometerConfig, *, dropout: float = 0.1) -> None:
+        """Build the Qwen-VL backbone and the three Robometer heads.
+
+        Args:
+            config: model configuration (backbone id, dtype name, head options).
+            dropout: dropout probability passed to each prediction head.
+
+        Raises:
+            ValueError: if ``config.torch_dtype`` does not name a ``torch.dtype``.
+            AttributeError: if the backbone config exposes no ``hidden_size``.
+        """
+        require_package("transformers", extra="robometer")
+        super().__init__(config)
+        self.config = config
+
+        # ``pretrained_path is None``: load the base Qwen weights, then resize the embeddings to
+        # ``vlm_config["text_config"]["vocab_size"]``. Otherwise: build the architecture from
+        # ``vlm_config`` only (no Qwen weight download); weights are expected to be loaded afterwards.
+        torch_dtype = _torch_dtype(config.torch_dtype)
+        if config.pretrained_path is None:
+            self.model = AutoModelForImageTextToText.from_pretrained(
+                config.base_model_id,
+                dtype=torch_dtype,
+                trust_remote_code=True,
+            )
+            target_vocab = config.vlm_config["text_config"]["vocab_size"]
+            self.model.resize_token_embeddings(target_vocab)
+        else:
+            self.model = AutoModelForImageTextToText.from_config(
+                config.vlm_backbone_config,
+                dtype=torch_dtype,
+                trust_remote_code=True,
+            )
+
+        # All Qwen-VL backbones Robometer supports expose `text_config.hidden_size`.
+        # Falls back to the top-level `hidden_size` so future non-multimodal
+        # variants would still resolve.
+        backbone_config = self.model.config
+        text_config = getattr(backbone_config, "text_config", None)
+        hidden_size = getattr(text_config, "hidden_size", None) if text_config is not None else None
+        if hidden_size is None:
+            hidden_size = getattr(backbone_config, "hidden_size", None)
+        if hidden_size is None:
+            raise AttributeError(
+                f"Could not infer hidden_size from backbone config of {config.base_model_id}"
+            )
+        hidden_dim = int(hidden_size)
+
+        # Robometer's three prediction heads + frame-pool attention.
+        progress_output = config.progress_discrete_bins if config.use_discrete_progress else 1
+        self.progress_head = RobometerPredictionHead(
+            hidden_dim,
+            progress_output,
+            dropout=dropout,
+            with_sigmoid=not config.use_discrete_progress,
+        )
+        self.preference_head = RobometerPredictionHead(hidden_dim, 1, dropout=dropout, with_sigmoid=False)
+        self.success_head = RobometerPredictionHead(hidden_dim, 1, dropout=dropout, with_sigmoid=False)
+        self.frame_pool_attn = nn.Linear(hidden_dim, 1, bias=False)
+
+        # Match the dtype of the loaded base model so weight loading is a no-op cast.
+        model_dtype = next(self.model.parameters()).dtype
+        self.progress_head.to(dtype=model_dtype)
+        self.preference_head.to(dtype=model_dtype)
+        self.success_head.to(dtype=model_dtype)
+        self.frame_pool_attn.to(dtype=model_dtype)
+
+    def compute_reward(self, batch: dict[str, Tensor]) -> Tensor:
+        """Return one float32 reward per sample, read from the last frame's prediction.
+
+        ``batch`` must hold the ``ROBOMETER_FEATURE_PREFIX``-prefixed encoder outputs. The model is
+        switched to eval mode and run without gradients. The reward is the progress clamped to
+        ``[0, 1]``, or in ``"success"`` mode 1.0 if the probability exceeds ``success_threshold`` else 0.0.
+
+        Raises:
+            KeyError: if the prefixed ``input_ids`` entry is missing from ``batch``.
+        """
+        inputs: dict[str, Any] = {}
+        for key in ROBOMETER_INPUT_KEYS:
+            prefixed = f"{ROBOMETER_FEATURE_PREFIX}{key}"
+            if prefixed in batch:
+                inputs[key] = batch[prefixed]
+        if "input_ids" not in inputs:
+            raise KeyError(
+                f"Robometer batch missing pre-encoded inputs (expected "
+                f"`{ROBOMETER_FEATURE_PREFIX}input_ids`). Make sure the "
+                "RobometerEncoderProcessorStep ran before `compute_reward`."
+            )
+
+        device = next(self.model.parameters()).device
+        inputs = {key: (val.to(device) if hasattr(val, "to") else val) for key, val in inputs.items()}
+
+        self.eval()
+        with torch.no_grad():
+            progress_logits, success_logits = self._compute_rbm_logits(inputs)
+        decoded = decode_progress_outputs(
+            progress_logits, success_logits, is_discrete_mode=self.config.use_discrete_progress
+        )
+        want_success = self.config.reward_output == "success"
+        per_sample = decoded["success_probs"] if want_success else decoded["progress_pred"]
+        last = torch.stack([torch.as_tensor(frames, dtype=torch.float32)[-1] for frames in per_sample])
+        # Success is binarised at the threshold; progress is clamped to [0, 1] as in upstream Robometer.
+        rewards = (last > self.config.success_threshold).float() if want_success else last.clamp(0.0, 1.0)
+        return rewards.to(self.config.device or "cpu")
+
+    def _compute_rbm_logits(
+        self,
+        inputs: dict[str, Any],
+    ) -> tuple[Tensor, Tensor]:
+        """Forward the Qwen-VL backbone and read per-frame head outputs.
+
+        ``inputs`` is the encoded batch from :class:`RobometerEncoderProcessorStep`.
+        Its Robometer metadata entries (``prog_token_id``, ``vision_start_token_id``,
+        ``vision_end_token_id``, ``video_merge_size``) are popped in place; everything
+        left is passed to the backbone as keyword arguments.
+
+        Frame embeddings are taken, in order of precedence, from ``<|prog_token|>``
+        positions (``use_per_frame_progress_token``), from vision start/end token pairs
+        (``use_multi_image``), or from the video grid.
+
+        Returns:
+            ``(progress_logits, success_logits)``: progress is ``(B, T)`` (continuous) or
+            ``(B, T, num_bins)`` (discrete); success is ``(B, T)`` raw logits.
+
+        Raises:
+            KeyError: if a metadata entry needed by the selected mode is missing.
+            ValueError: if video mode is selected without ``video_grid_thw``.
+        """
+        cfg = self.config
+        prog_token_id = inputs.pop("prog_token_id", None)
+        start_id = inputs.pop("vision_start_token_id", None)
+        end_id = inputs.pop("vision_end_token_id", None)
+        # NOTE(review): 14 is only a fallback for batches without ``video_merge_size``;
+        # confirm it matches the merge size the encoder step would have supplied.
+        merge_size = inputs.pop("video_merge_size", 14)
+
+        # Qwen3-VL doesn't reliably populate `last_hidden_state`, so request every layer's
+        # hidden states and keep the final one (mirrors the `is_qwen3` path of upstream
+        # Robometer's `RBM.forward_qwen`); fall back only when none were returned.
+        outputs = self.model(**inputs, output_hidden_states=True, return_dict=True)
+        all_hidden = getattr(outputs, "hidden_states", None)
+        hidden_state = all_hidden[-1] if all_hidden else outputs.last_hidden_state
+        input_ids = inputs["input_ids"]
+
+        if cfg.use_per_frame_progress_token:
+            if prog_token_id is None:
+                raise KeyError("`prog_token_id` missing in batch (run RobometerEncoderProcessorStep first)")
+            return self._process_token_extraction(hidden_state, input_ids, prog_token_id=prog_token_id)
+
+        if cfg.use_multi_image:
+            if start_id is None or end_id is None:
+                raise KeyError(
+                    "`vision_start_token_id` / `vision_end_token_id` missing in batch "
+                    "(run RobometerEncoderProcessorStep first)"
+                )
+            return self._process_multi_image_frames(
+                hidden_state, input_ids, start_id=start_id, end_id=end_id
+            )
+
+        video_grid_thw = inputs.get("video_grid_thw")
+        if video_grid_thw is None:
+            raise ValueError("video_grid_thw is required for video-mode Robometer inference")
+        if start_id is None:
+            raise KeyError("`vision_start_token_id` missing in batch")
+        return self._process_video_frames(
+            hidden_state, input_ids, video_grid_thw, start_id=start_id, merge_size=merge_size
+        )
+
+    def _apply_heads_to_hidden_states(self, frame_embeddings: Tensor) -> tuple[Tensor, Tensor]:
+        """Apply the progress and success heads; size-1 trailing dims are squeezed (not for discrete bins)."""
+        progress = self.progress_head(frame_embeddings)
+        if not self.config.use_discrete_progress:
+            progress = _squeeze_last_safe(progress)
+        return progress, _squeeze_last_safe(self.success_head(frame_embeddings))
+
+    def _process_token_extraction(
+        self,
+        hidden_state: Tensor,
+        input_ids: Tensor,
+        *,
+        prog_token_id: int,
+    ) -> tuple[Tensor, Tensor]:
+        """Read per-frame progress/success at every ``<|prog_token|>`` position.
+
+        Heads run per sample on the hidden states where ``input_ids == prog_token_id`` and the
+        outputs are stacked along a new leading dim. Raises ``ValueError`` when the token is
+        absent from the whole batch or from any single sequence.
+        """
+        is_prog = input_ids == prog_token_id
+        if not is_prog.any():
+            raise ValueError("`<|prog_token|>` not found in any sequence")
+
+        progress_list, success_list = [], []
+        for i, sample_mask in enumerate(is_prog):
+            if not sample_mask.any():
+                raise ValueError("`<|prog_token|>` missing in a sequence")
+            progress, success = self._apply_heads_to_hidden_states(hidden_state[i][sample_mask])
+            progress_list.append(progress)
+            success_list.append(success)
+        return torch.stack(progress_list), torch.stack(success_list)
+
+    def _process_multi_image_frames(
+        self,
+        hidden_state: Tensor,
+        input_ids: Tensor,
+        *,
+        start_id: int,
+        end_id: int,
+    ) -> tuple[Tensor, Tensor]:
+        """Per-frame progress/success in multi-image mode.
+
+        Each sample's frames are pooled from its vision start/end token spans, passed through
+        the heads, and the per-sample outputs are stacked along a new leading dim.
+        """
+        progress_list: list[Tensor] = []
+        success_list: list[Tensor] = []
+        for i, seq_ids in enumerate(input_ids):
+            pooled = self._extract_hidden_states_from_token_pairs(hidden_state[i], seq_ids, start_id, end_id)
+            progress, success = self._apply_heads_to_hidden_states(pooled)
+            progress_list.append(progress)
+            success_list.append(success)
+        return torch.stack(progress_list), torch.stack(success_list)
+
+    def _extract_hidden_states_from_token_pairs(
+        self,
+        hidden_state: Tensor,
+        input_ids: Tensor,
+        start_id: int,
+        end_id: int,
+    ) -> Tensor:
+        """Pool one embedding per ``start_id``/``end_id`` span of a single sequence.
+
+        Tokens strictly between each pair are pooled per ``config.frame_pooling`` (``mean``,
+        ``boundary`` = last inner token, else attention-weighted); an empty span averages its two
+        delimiter states. Raises ``ValueError`` on missing, mismatched or mis-ordered delimiters.
+        """
+        starts = (input_ids == start_id).nonzero(as_tuple=True)[0]
+        ends = (input_ids == end_id).nonzero(as_tuple=True)[0]
+        if starts.numel() == 0:
+            raise ValueError("`<|vision_start|>` not found in sequence")
+        if starts.numel() != ends.numel():
+            raise ValueError(
+                f"Mismatched vision token counts: {starts.numel()} start vs "
+                f"{ends.numel()} end"
+            )
+
+        cfg = self.config
+        pooled: list[Tensor] = []
+        for lo, hi in zip(starts.tolist(), ends.tolist(), strict=True):
+            if lo >= hi:
+                raise ValueError(f"Invalid vision token pair: start={lo} end={hi}")
+            inner = hidden_state[lo + 1 : hi]
+            if inner.shape[0] == 0:
+                frame = (hidden_state[lo] + hidden_state[hi]) / 2.0
+            elif cfg.frame_pooling == "mean":
+                frame = inner.mean(dim=0)
+            elif cfg.frame_pooling == "boundary":
+                frame = inner[-1]
+            else:  # attention
+                scores = self.frame_pool_attn(inner).squeeze(-1) / cfg.frame_pooling_attn_temperature
+                frame = (torch.softmax(scores, dim=0).unsqueeze(-1) * inner).sum(dim=0)
+            pooled.append(frame)
+        return torch.stack(pooled)
+
+    def _process_video_frames(
+        self,
+        hidden_state: Tensor,
+        input_ids: Tensor,
+        video_grid_thw: Tensor,
+        *,
+        start_id: int,
+        merge_size: int,
+    ) -> tuple[Tensor, Tensor]:
+        """Per-frame progress/success in video mode.
+
+        For each sample, ``t`` consecutive windows of ``(h * w) // merge_size**2`` hidden states are
+        read starting at the first ``start_id`` position (grid taken from ``video_grid_thw``). A window
+        is mean-pooled when ``average_temporal_patches`` is set; otherwise the state right after it is used.
+        """
+        progress_list: list[Tensor] = []
+        success_list: list[Tensor] = []
+        for i, seq_ids in enumerate(input_ids):
+            seq_hidden = hidden_state[i]
+            starts = (seq_ids == start_id).nonzero(as_tuple=True)[0]
+            if starts.numel() == 0:
+                raise ValueError("`<|vision_start|>` not found in sequence")
+            t_dim, h_dim, w_dim = map(int, video_grid_thw[i].tolist())
+            tokens_per_frame = (h_dim * w_dim) // (merge_size**2)
+
+            # NOTE(review): windows start at the start token itself, not the token after it; confirm.
+            first = starts[0].item()
+            windows = [first + k * tokens_per_frame for k in range(t_dim)]
+            if self.config.average_temporal_patches:
+                frames = [seq_hidden[lo : lo + tokens_per_frame].mean(dim=0) for lo in windows]
+            else:
+                frames = [seq_hidden[lo + tokens_per_frame] for lo in windows]
+            progress, success = self._apply_heads_to_hidden_states(torch.stack(frames))
+            progress_list.append(progress)
+            success_list.append(success)
+        return torch.stack(progress_list), torch.stack(success_list)
--- a/src/lerobot/rewards/robometer/processor_robometer.py
+++ b/src/lerobot/rewards/robometer/processor_robometer.py
@@ -0,0 +1,338 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Robometer pre/post processing pipelines."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import torch
+from PIL import Image
+from torch import Tensor
+
+from lerobot.configs import PipelineFeatureType, PolicyFeature
+from lerobot.processor import (
+    AddBatchDimensionProcessorStep,
+    DeviceProcessorStep,
+    PolicyAction,
+    PolicyProcessorPipeline,
+    ProcessorStep,
+    ProcessorStepRegistry,
+    policy_action_to_transition,
+)
+from lerobot.rewards.robometer.configuration_robometer import (
+    ROBOMETER_SPECIAL_TOKENS,
+    RobometerConfig,
+)
+from lerobot.rewards.robometer.modeling_robometer import ROBOMETER_FEATURE_PREFIX
+from lerobot.types import EnvTransition, TransitionKey
+from lerobot.utils.constants import (
+    OBS_IMAGES,
+    POLICY_POSTPROCESSOR_DEFAULT_NAME,
+    POLICY_PREPROCESSOR_DEFAULT_NAME,
+)
+from lerobot.utils.import_utils import _transformers_available, require_package
+
+# Import `AutoProcessor` only when transformers is installed (or during static
+# type checking); otherwise bind the name to None so this module still imports.
+if TYPE_CHECKING or _transformers_available:
+    from transformers import AutoProcessor
+else:
+    AutoProcessor = None
+
+# Prompt template for the progress query; the ``{task}`` placeholder is filled
+# with the task description through ``str.format``.
+PROGRESS_PROMPT = (
+    "The task for the robot is '{task}'. Given the trajectory video, predict "
+    "the task progress at each frame, how far along the robot is towards "
+    "completing the task, a float between 0 and 1, where 0 is the starting "
+    "state and 1 is when the task is completed. If the robot is not "
+    "performing the same task, predict 0 progress."
+)
+
+
+def _frames_to_pil(frames: np.ndarray) -> list[Image.Image]:
+    """Convert ``(T, H, W, C)`` frames to a list of PIL images.
+
+    Args:
+        frames: Channel-last frame stack. Non-``uint8`` input is clipped to
+            ``[0, 255]`` and cast to ``uint8``. A single-channel stack
+            (``C == 1``) is expanded to three identical channels.
+
+    Returns:
+        One PIL image per frame, in the original frame order.
+
+    Raises:
+        ValueError: If ``frames`` is not 4-dimensional.
+    """
+    if frames.ndim != 4:
+        raise ValueError(f"Expected (T,H,W,C) frames; got shape {frames.shape}")
+    if frames.dtype != np.uint8:
+        frames = np.clip(frames, 0, 255).astype(np.uint8)
+    if frames.shape[-1] == 1:
+        # `Image.fromarray` rejects (H, W, 1) arrays, so grayscale input would
+        # crash here. Replicate the channel to get regular RGB frames.
+        frames = frames[..., [0, 0, 0]]
+    return [Image.fromarray(frame) for frame in frames]
+
+
+def _video_to_numpy(video: Tensor, *, max_frames: int | None) -> np.ndarray:
+    """Convert one trajectory tensor to a ``(T, H, W, C) uint8`` numpy array.
+
+    Args:
+        video: A 4-D frame stack, either channel-first ``(T, C, H, W)`` or
+            channel-last ``(T, H, W, C)`` with 1 or 3 channels. The layout is
+            detected from the size of dimension 1, so channel-first wins when
+            both interpretations are possible.
+        max_frames: When not ``None``, only the last ``max_frames`` frames are
+            kept. Because the crop is the slice ``video[-max_frames:]``, a
+            value of 0 keeps every frame.
+
+    Returns:
+        A channel-last ``uint8`` array. Floating-point input whose maximum is
+        at most 1.0 is treated as ``[0, 1]``-normalised and scaled by 255;
+        all floating-point values are rounded to the nearest integer before
+        being clipped to ``[0, 255]``.
+
+    Raises:
+        ValueError: If neither dimension 1 nor the last dimension has size 1
+            or 3.
+    """
+    if max_frames is not None:
+        video = video[-max_frames:]
+    if video.shape[1] in (1, 3):
+        video = video.permute(0, 2, 3, 1)
+    elif video.shape[-1] not in (1, 3):
+        raise ValueError(f"Expected channel dim of size 1 or 3, got shape {tuple(video.shape)}")
+
+    video = video.detach().cpu()
+    if video.dtype == torch.bfloat16:
+        # NumPy has no bfloat16 dtype, so `Tensor.numpy()` would raise here.
+        video = video.float()
+    array = video.numpy()
+    if np.issubdtype(array.dtype, np.floating):
+        if array.size > 0 and array.max() <= 1.0:
+            array = array * 255.0
+        # Round rather than truncate: `v / 255 * 255` can land just below `v`
+        # in floating point, and a plain cast would then yield `v - 1`.
+        array = np.rint(array)
+    return np.clip(array, 0, 255).astype(np.uint8)
+
+
+def _expand_tasks(task: Any, *, batch_size: int, default: str | None) -> list[str]:
+    """Turn a task specification into one task string per batch element.
+
+    Args:
+        task: ``None``, a single string, or a list/tuple of strings.
+        batch_size: Number of task strings to return.
+        default: Used in place of ``task`` when ``task`` is ``None``.
+
+    Returns:
+        A list of ``batch_size`` strings. A single string, or a sequence of
+        length 1, is repeated to fill the batch.
+
+    Raises:
+        KeyError: If both ``task`` and ``default`` are ``None``.
+        TypeError: If ``task`` is neither a string nor a list/tuple of strings.
+        ValueError: If a sequence has a length other than 1 or ``batch_size``.
+    """
+    resolved = default if task is None else task
+    if resolved is None:
+        raise KeyError("Robometer expected a task description in complementary data")
+
+    if isinstance(resolved, str):
+        return [resolved] * batch_size
+
+    candidates = list(resolved) if isinstance(resolved, tuple) else resolved
+    is_string_list = isinstance(candidates, list) and all(isinstance(entry, str) for entry in candidates)
+    if not is_string_list:
+        raise TypeError(f"Robometer task must be a string or list of strings, got {type(candidates)}")
+
+    count = len(candidates)
+    if count == 1 and batch_size > 1:
+        return candidates * batch_size
+    if count != batch_size:
+        raise ValueError(f"Expected {batch_size} tasks, got {count}")
+    return candidates
+
+
+@dataclass
+@ProcessorStepRegistry.register(name="robometer_encoder")
+class RobometerEncoderProcessorStep(ProcessorStep):
+    """Encode raw frames + task into Qwen-VL tensors for the Robometer model.
+
+    Loads a :class:`~transformers.AutoProcessor` matching ``base_model_id`` and
+    registers Robometer's special tokens on the tokenizer. The matching
+    embedding resize happens model-side in
+    :meth:`RobometerRewardModel.__init__`.
+
+    At call time the step reads:
+
+    - ``observation[image_key]``: ``(B, T, C, H, W)`` or ``(B, C, H, W)`` frames.
+    - ``complementary_data[task_key]``: a string or list of strings.
+
+    and writes ``observation[f"{ROBOMETER_FEATURE_PREFIX}<name>"]`` for:
+
+    - the Qwen-VL processor outputs: ``input_ids``, ``attention_mask``,
+      ``pixel_values``, ``image_grid_thw``, ``video_grid_thw``, ...
+    - Robometer-specific token ids consumed by the model heads:
+      ``prog_token_id``, ``vision_start_token_id``, ``vision_end_token_id``,
+      ``video_merge_size``.
+
+    Attributes:
+        base_model_id: Identifier passed to ``AutoProcessor.from_pretrained``.
+        image_key: Observation key holding the frames.
+        task_key: Complementary-data key holding the task description.
+        default_task: Task used when ``task_key`` is absent or ``None``.
+        max_frames: Keep only the last ``max_frames`` frames of each sample
+            (``None`` keeps all of them).
+        use_multi_image: Feed each frame as a separate image entry instead of
+            a single video entry.
+        use_per_frame_progress_token: In multi-image mode, append a
+            ``<|prog_token|>`` text entry after every image.
+        max_length: Forwarded to the processor as ``max_length``; note that
+            the call also passes ``truncation=False``.
+    """
+
+    base_model_id: str = "Qwen/Qwen3-VL-4B-Instruct"
+    image_key: str = OBS_IMAGES + ".top"
+    task_key: str = "task"
+    default_task: str | None = None
+    max_frames: int | None = 8
+    use_multi_image: bool = True
+    use_per_frame_progress_token: bool = True
+    max_length: int = 1024
+
+    _processor: Any = field(default=None, init=False, repr=False)
+
+    def __post_init__(self) -> None:
+        """Load the processor and register Robometer's special tokens."""
+        require_package("transformers", extra="robometer")
+        require_package("qwen-vl-utils", extra="robometer", import_name="qwen_vl_utils")
+
+        self._processor = AutoProcessor.from_pretrained(
+            self.base_model_id,
+            trust_remote_code=True,
+            do_sample_frames=False,
+            padding_side="right",
+        )
+
+        # Register Robometer's special tokens on the tokenizer. The matching
+        # embedding resize happens model-side in `RobometerRewardModel.__init__`.
+        tokenizer = self._processor.tokenizer
+        # Qwen tokenizers may not define a pad token, but batched prompts/videos
+        # require padding, so reuse EOS as the padding token.
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        # `get_vocab()` builds the full vocabulary dict, so take one snapshot
+        # instead of rebuilding it for every candidate token.
+        known_tokens = tokenizer.get_vocab()
+        for token in ROBOMETER_SPECIAL_TOKENS:
+            if token not in known_tokens:
+                # Tokens are added one call at a time, in declaration order.
+                tokenizer.add_special_tokens({"additional_special_tokens": [token]})
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition:
+        """Encode the frames and task of ``transition`` and attach the result.
+
+        Returns a copy of ``transition`` whose observation is a copy of the
+        input observation extended with the prefixed encoder outputs.
+
+        Raises:
+            ValueError: If the observation is not a dict, or the frames are
+                neither 4- nor 5-dimensional.
+            KeyError: If ``image_key`` is missing from the observation.
+        """
+        observation = transition.get(TransitionKey.OBSERVATION)
+        complementary = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}
+        if not isinstance(observation, dict):
+            raise ValueError("RobometerEncoderProcessorStep requires an observation dict")
+
+        if self.image_key not in observation:
+            raise KeyError(f"Robometer expected image key {self.image_key!r} in observation")
+
+        frames = observation[self.image_key]
+        tensor = frames.detach().cpu() if isinstance(frames, Tensor) else torch.as_tensor(frames)
+        if tensor.ndim == 4:
+            # (B, C, H, W): treat every sample as a one-frame trajectory.
+            tensor = tensor.unsqueeze(1)
+        elif tensor.ndim != 5:
+            raise ValueError(
+                f"Expected Robometer frames with shape (B,C,H,W) or (B,T,C,H,W); got {tuple(tensor.shape)}"
+            )
+
+        batch_size = tensor.shape[0]
+        tasks = _expand_tasks(
+            complementary.get(self.task_key, self.default_task),
+            batch_size=batch_size,
+            default=self.default_task,
+        )
+
+        samples = [
+            (_video_to_numpy(tensor[i], max_frames=self.max_frames), tasks[i]) for i in range(batch_size)
+        ]
+        encoded = self.encode_samples(samples)
+
+        new_observation = dict(observation)
+        for key, value in encoded.items():
+            new_observation[f"{ROBOMETER_FEATURE_PREFIX}{key}"] = value
+
+        new_transition = transition.copy()
+        new_transition[TransitionKey.OBSERVATION] = new_observation
+        return new_transition
+
+    def encode_samples(self, samples: list[tuple[np.ndarray, str]]) -> dict[str, Tensor]:
+        """Run the Qwen-VL processor on a list of ``(frames, task)`` samples.
+
+        Besides the processor outputs, the returned mapping carries the ids of
+        ``<|prog_token|>``, ``<|vision_start|>`` and ``<|vision_end|>`` plus
+        ``video_merge_size``.
+        """
+        from qwen_vl_utils import process_vision_info
+
+        conversations = [self._build_conversation(frames, task) for frames, task in samples]
+
+        texts = [
+            self._processor.apply_chat_template(
+                msg,
+                tokenize=False,
+                add_generation_prompt=False,
+                add_vision_id=True,
+                enable_thinking=False,
+                fps=1,
+            )
+            for msg in conversations
+        ]
+
+        process_kwargs: dict[str, Any] = {
+            "return_video_kwargs": True,
+            "return_video_metadata": True,
+        }
+        image_processor = getattr(self._processor, "image_processor", None)
+        if image_processor is not None and hasattr(image_processor, "patch_size"):
+            process_kwargs["image_patch_size"] = image_processor.patch_size
+
+        image_inputs, video_inputs, video_kwargs = process_vision_info(conversations, **process_kwargs)
+
+        videos: list[Any] | None = None
+        video_metadatas: list[Any] | None = None
+        if video_inputs:
+            # Each entry is either a bare video or a `(video, metadata)` pair;
+            # split the pairs into two parallel lists.
+            if isinstance(video_inputs[0], tuple) and len(video_inputs[0]) == 2:
+                videos_seq, metadatas_seq = zip(*video_inputs, strict=False)
+                videos = list(videos_seq)
+                video_metadatas = list(metadatas_seq)
+            else:
+                videos = list(video_inputs)
+
+        processor_kwargs: dict[str, Any] = {
+            "text": texts,
+            "images": image_inputs,
+            "padding": True,
+            "truncation": False,
+            "max_length": self.max_length,
+            "return_tensors": "pt",
+            "do_resize": False,
+        }
+        if videos is not None:
+            processor_kwargs["videos"] = videos
+        if video_metadatas is not None:
+            processor_kwargs["video_metadata"] = video_metadatas
+        if video_kwargs:
+            processor_kwargs.update(video_kwargs)
+
+        encoded = self._processor(**processor_kwargs)
+
+        # Write Robometer-specific token ids and the video patch merge size into
+        # the encoded batch so `RobometerRewardModel` doesn't need its own
+        # tokenizer at inference (EO1-style separation: the processor owns the
+        # tokenizer, the model owns the backbone and heads).
+        tokenizer = self._processor.tokenizer
+        encoded["prog_token_id"] = tokenizer.convert_tokens_to_ids("<|prog_token|>")
+        encoded["vision_start_token_id"] = tokenizer.convert_tokens_to_ids("<|vision_start|>")
+        encoded["vision_end_token_id"] = tokenizer.convert_tokens_to_ids("<|vision_end|>")
+
+        # Prefer the video processor's merge size and fall back to the image
+        # processor's. The last-resort default is 2, Qwen-VL's spatial merge
+        # size (14 is the patch size and would give wrong per-frame counts).
+        merge_size = None
+        for component_name in ("video_processor", "image_processor"):
+            component = getattr(self._processor, component_name, None)
+            merge_size = getattr(component, "merge_size", None)
+            if merge_size is not None:
+                break
+        encoded["video_merge_size"] = int(merge_size) if merge_size is not None else 2
+        return encoded
+
+    def _build_conversation(self, frames: np.ndarray, task: str) -> list[dict[str, Any]]:
+        """Build a single-turn user message holding the prompt and the frames.
+
+        In multi-image mode every frame becomes its own image entry, each
+        optionally followed by a ``<|prog_token|>`` text entry; otherwise all
+        frames are passed as one video entry.
+        """
+        pil_frames = _frames_to_pil(frames)
+        prompt = PROGRESS_PROMPT.format(task=task)
+        content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
+
+        if self.use_multi_image:
+            for image in pil_frames:
+                content.append({"type": "image", "image": image})
+                if self.use_per_frame_progress_token:
+                    content.append({"type": "text", "text": "<|prog_token|>"})
+        else:
+            content.append({"type": "video", "video": pil_frames, "sample_fps": 1.0})
+
+        return [{"role": "user", "content": content}]
+
+    def transform_features(
+        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
+    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
+        """Return ``features`` unchanged."""
+        return features
+
+    def get_config(self) -> dict[str, Any]:
+        """Return the public constructor arguments as a plain dict."""
+        return {
+            "base_model_id": self.base_model_id,
+            "image_key": self.image_key,
+            "task_key": self.task_key,
+            "default_task": self.default_task,
+            "max_frames": self.max_frames,
+            "use_multi_image": self.use_multi_image,
+            "use_per_frame_progress_token": self.use_per_frame_progress_token,
+            "max_length": self.max_length,
+        }
+
+
+def make_robometer_pre_post_processors(
+    config: RobometerConfig,
+    dataset_stats: dict[str, dict[str, Any]] | None = None,
+) -> tuple[
+    PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
+    PolicyProcessorPipeline[PolicyAction, PolicyAction],
+]:
+    """Build the Robometer preprocessing and postprocessing pipelines.
+
+    The preprocessor chains three steps: add a batch dimension, encode frames
+    and task with :class:`RobometerEncoderProcessorStep`, then move the result
+    to ``config.device`` (``"cpu"`` when the device is unset). The
+    postprocessor is a pipeline with no steps.
+
+    Args:
+        config: Robometer configuration supplying the encoder settings.
+        dataset_stats: Accepted for signature compatibility and ignored.
+
+    Returns:
+        ``(preprocessor, postprocessor)``.
+    """
+    # Robometer has its own normalisation inside the Qwen-VL processor, so the
+    # dataset statistics are intentionally discarded.
+    del dataset_stats
+
+    add_batch_step = AddBatchDimensionProcessorStep()
+    # NOTE(review): `max_length` is not forwarded, so the encoder keeps its own
+    # default; confirm that is intended.
+    encoder_step = RobometerEncoderProcessorStep(
+        base_model_id=config.base_model_id,
+        image_key=config.image_key,
+        task_key=config.task_key,
+        default_task=config.default_task,
+        max_frames=config.max_frames,
+        use_multi_image=config.use_multi_image,
+        use_per_frame_progress_token=config.use_per_frame_progress_token,
+    )
+    device_step = DeviceProcessorStep(device=config.device or "cpu")
+
+    preprocessor = PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
+        steps=[add_batch_step, encoder_step, device_step],
+        name=POLICY_PREPROCESSOR_DEFAULT_NAME,
+    )
+    # NOTE(review): only `to_transition` is overridden here; verify that the
+    # pipeline's default output conversion returns the action unchanged.
+    postprocessor = PolicyProcessorPipeline(
+        name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
+        to_transition=policy_action_to_transition,
+    )
+    return preprocessor, postprocessor
--- a/src/lerobot/rewards/topreward/init.py
+++ b/src/lerobot/rewards/topreward/init.py
@@ -0,0 +1,19 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_topreward import TOPRewardConfig
+from .modeling_topreward import TOPRewardModel
+from .processor_topreward import make_topreward_pre_post_processors
+
+__all__ = ["TOPRewardConfig", "TOPRewardModel", "make_topreward_pre_post_processors"]
--- a/src/lerobot/rewards/topreward/compute_rabc_weights.py
+++ b/src/lerobot/rewards/topreward/compute_rabc_weights.py
@@ -0,0 +1,353 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Compute per-frame TOPReward progress curves for a LeRobot dataset.
+
+For each episode, scores trajectory prefixes of increasing length using
+the TOPReward reward model, min-max normalises the raw log-prob rewards per episode,
+and writes a parquet file with one row per frame.
+
+The parquet uses the same schema as SARM's :mod:`lerobot.rewards.sarm.compute_rabc_weights`.
+
+Usage:
+    # Sparse-dense mode (15 anchors per episode, matches upstream)
+    python -m lerobot.rewards.topreward.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --num-samples 15
+
+    # Use a different VLM backbone
+    python -m lerobot.rewards.topreward.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --vlm-name Qwen/Qwen3-VL-4B-Instruct
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
+import torch
+from tqdm import tqdm
+
+from lerobot.datasets import LeRobotDataset
+from lerobot.rewards.topreward.configuration_topreward import TOPRewardConfig
+from lerobot.rewards.topreward.modeling_topreward import TOPRewardModel
+from lerobot.rewards.topreward.processor_topreward import TOPRewardEncoderProcessorStep
+from lerobot.types import TransitionKey
+
+# File name of the progress parquet: used as the default output name under the
+# dataset root and as the path inside the Hub dataset repo when uploading.
+DEFAULT_OUTPUT_FILENAME = "topreward_progress.parquet"
+
+
+def get_reward_model_path_from_parquet(parquet_path: Path) -> str | None:
+    """Read ``reward_model_path`` from parquet metadata if available.
+
+    This is a best-effort lookup: a missing file, a file that cannot be parsed
+    or a file without the metadata key all yield ``None``. Read failures are
+    logged at debug level instead of being dropped silently.
+
+    Args:
+        parquet_path: Location of the parquet file to inspect.
+
+    Returns:
+        The decoded ``reward_model_path`` schema-metadata value, or ``None``.
+    """
+    if not parquet_path.exists():
+        return None
+    try:
+        metadata = pq.read_metadata(parquet_path).schema.to_arrow_schema().metadata
+        if metadata and b"reward_model_path" in metadata:
+            return metadata[b"reward_model_path"].decode()
+    except Exception as exc:  # nosec B110
+        # Deliberately broad: any unreadable file counts as "no metadata".
+        logging.debug("Could not read reward_model_path from %s: %s", parquet_path, exc)
+        return None
+    return None
+
+
+def _resolve_task(sample: dict[str, Any], default: str) -> str:
+    """Return ``sample["task"]`` when it is a non-empty string, else ``default``."""
+    candidate = sample.get("task")
+    return candidate if isinstance(candidate, str) and candidate else default
+
+
+def normalize_rewards(rewards: list[float] | np.ndarray) -> np.ndarray:
+    """Rescale raw log-prob rewards to ``[0, 1]`` with min-max normalisation.
+
+    Degenerate inputs are handled explicitly: an empty input stays empty, a
+    single value maps to ``1.0``, and a constant sequence maps to all ones.
+    The result is always ``float32``.
+    """
+    values = np.asarray(rewards, dtype=np.float64)
+    count = values.size
+    if count == 0:
+        return values.astype(np.float32)
+    if count == 1:
+        return np.array([1.0], dtype=np.float32)
+
+    lowest = values.min()
+    highest = values.max()
+    if highest == lowest:
+        # No spread to normalise over.
+        return np.ones_like(values, dtype=np.float32)
+
+    scaled = (values - lowest) / (highest - lowest)
+    return scaled.astype(np.float32)
+
+
+def compute_instruction_rewards_for_prefixes(
+    model: TOPRewardModel,
+    encoder: TOPRewardEncoderProcessorStep,
+    dataset: LeRobotDataset,
+    ep_start: int,
+    num_frames: int,
+    task: str,
+    image_key: str,
+    num_samples: int | None,
+    device: str,
+) -> np.ndarray:
+    """Score an episode via prefix sweep and return a per-frame normalised curve.
+
+    A set of prefix lengths is chosen (every length, or ``num_samples``
+    evenly spaced ones), each prefix is encoded and scored with
+    ``model.compute_reward``, the raw scores are min-max normalised over the
+    episode, and the anchors are linearly interpolated to one value per frame.
+
+    Args:
+        model: Reward model exposing ``compute_reward(batch)``.
+        encoder: Callable mapping a transition to an encoded transition.
+        dataset: Indexable dataset; ``dataset[i][image_key]`` is one frame.
+        ep_start: Dataset index of the episode's first frame.
+        num_frames: Number of frames in the episode.
+        task: Task description passed to the encoder.
+        image_key: Observation key of the camera stream to score.
+        num_samples: Number of anchor prefixes; ``None`` (or any value of at
+            least ``num_frames``) scores every prefix.
+        device: Device the encoded tensors are moved to before scoring.
+
+    Returns:
+        A ``float32`` array of length ``num_frames`` (empty when
+        ``num_frames`` is not positive).
+
+    Raises:
+        ValueError: If ``num_samples`` is given but smaller than 1.
+    """
+    # Fail early with a clear message, before any frame is decoded.
+    if num_samples is not None and num_samples < 1:
+        raise ValueError(f"num_samples must be a positive integer or None, got {num_samples}")
+    if num_frames <= 0:
+        return np.zeros(0, dtype=np.float32)
+
+    if num_samples is None or num_samples >= num_frames:
+        prefix_lengths = np.arange(1, num_frames + 1, dtype=np.int64)
+    else:
+        # Rounding can make neighbouring anchors collide; keep each length once.
+        prefix_lengths = np.unique(np.linspace(1, num_frames, num_samples).round().astype(np.int64))
+
+    episode_frames = torch.stack([dataset[ep_start + i][image_key] for i in range(num_frames)])
+    rewards: list[float] = []
+    for length in prefix_lengths:
+        frames = episode_frames[: int(length)].unsqueeze(0)  # (1, T, C, H, W)
+
+        transition = {
+            TransitionKey.OBSERVATION: {image_key: frames},
+            TransitionKey.COMPLEMENTARY_DATA: {"task": task},
+        }
+        encoded = encoder(transition)
+        obs = encoded[TransitionKey.OBSERVATION]
+        batch = {
+            key: value.to(device) if isinstance(value, torch.Tensor) else value for key, value in obs.items()
+        }
+
+        with torch.no_grad():
+            reward = model.compute_reward(batch)
+        rewards.append(float(reward.item()))
+
+    normalized_rewards = normalize_rewards(rewards)
+
+    if prefix_lengths.shape[0] == num_frames:
+        # Dense sweep: there is already one value per frame.
+        return normalized_rewards
+
+    return np.interp(
+        np.arange(1, num_frames + 1, dtype=np.float64),
+        prefix_lengths.astype(np.float64),
+        normalized_rewards.astype(np.float64),
+    ).astype(np.float32)
+
+
+def compute_topreward_progress(
+    dataset_repo_id: str,
+    reward_model_path: str | None = None,
+    vlm_name: str | None = None,
+    output_path: str | None = None,
+    device: str = "cuda",
+    num_samples: int | None = None,
+    fps: float | None = None,
+    episodes: list[int] | None = None,
+) -> Path:
+    """Run TOPReward over a dataset and write per-frame progress.
+
+    Args:
+        dataset_repo_id: Identifier passed to ``LeRobotDataset``.
+        reward_model_path: Optional location to load the model (and its
+            config) from; when ``None`` a fresh ``TOPRewardConfig`` is built.
+        vlm_name: Optional override for the config's ``vlm_name``.
+        output_path: Destination parquet path; defaults to
+            ``<dataset root>/topreward_progress.parquet``.
+        device: Device the model and the encoded batches are moved to.
+        num_samples: Anchor prefixes per episode; ``None`` scores every prefix.
+        fps: Optional override for the config's ``fps``; applied whether or
+            not ``reward_model_path`` is given.
+        episodes: Episode indices to process; ``None`` processes all.
+
+    Returns:
+        The path of the written parquet file. It holds one row per processed
+        frame with the columns ``index``, ``episode_index``, ``frame_index``
+        and ``progress_sparse``.
+    """
+    if reward_model_path is not None:
+        logging.info(f"Loading TOPReward config from: {reward_model_path}")
+        model = TOPRewardModel.from_pretrained(reward_model_path)
+        config = model.config
+        config.device = device
+        # Honour the fps override here as well; it used to be applied only
+        # when the config was built from scratch.
+        if fps is not None and fps != config.fps:
+            logging.info(f"Overriding fps from config: {config.fps} -> {fps}")
+            config.fps = fps
+        if vlm_name is not None and vlm_name != config.vlm_name:
+            logging.info(f"Overriding vlm_name from config: {config.vlm_name} -> {vlm_name}")
+            config.vlm_name = vlm_name
+            # A different backbone requires rebuilding the model.
+            model = TOPRewardModel(config)
+    else:
+        config_kwargs: dict[str, Any] = {"device": device}
+        if vlm_name is not None:
+            config_kwargs["vlm_name"] = vlm_name
+        if fps is not None:
+            config_kwargs["fps"] = fps
+        config = TOPRewardConfig(**config_kwargs)
+        logging.info(f"Constructing TOPReward with VLM: {config.vlm_name}")
+        model = TOPRewardModel(config)
+
+    model.to(device).eval()
+
+    encoder = TOPRewardEncoderProcessorStep(
+        vlm_name=config.vlm_name,
+        image_key=config.image_key,
+        task_key=config.task_key,
+        default_task=config.default_task,
+        max_frames=None,  # no tail-crop: we control prefix length explicitly
+        fps=config.fps,
+        prompt_prefix=config.prompt_prefix,
+        prompt_suffix_template=config.prompt_suffix_template,
+        add_chat_template=config.add_chat_template,
+        max_length=config.max_input_length,
+    )
+
+    image_key = config.image_key
+
+    logging.info(f"Loading dataset: {dataset_repo_id}")
+    dataset = LeRobotDataset(dataset_repo_id, download_videos=True)
+    logging.info(f"Dataset: {dataset.num_episodes} episodes, {dataset.num_frames} frames")
+
+    episode_indices = list(range(dataset.num_episodes)) if episodes is None else episodes
+    logging.info(f"Processing {len(episode_indices)} episode(s)")
+
+    # Column accumulators for the output table, one entry per frame.
+    all_index: list[int] = []
+    all_episode: list[int] = []
+    all_frame: list[int] = []
+    all_progress: list[float] = []
+
+    for episode_idx in tqdm(episode_indices, desc="Episodes"):
+        ep = dataset.meta.episodes[episode_idx]
+        ep_start = int(ep["dataset_from_index"])
+        ep_end = int(ep["dataset_to_index"])
+        num_frames = ep_end - ep_start
+        if num_frames <= 0:
+            continue
+
+        # The task is taken from the episode's first frame only.
+        first_sample = dataset[ep_start]
+        task = _resolve_task(first_sample, default=config.default_task or "perform the task")
+
+        per_frame = compute_instruction_rewards_for_prefixes(
+            model=model,
+            encoder=encoder,
+            dataset=dataset,
+            ep_start=ep_start,
+            num_frames=num_frames,
+            task=task,
+            image_key=image_key,
+            num_samples=num_samples,
+            device=device,
+        )
+
+        all_index.extend(range(ep_start, ep_end))
+        all_episode.extend([episode_idx] * num_frames)
+        all_frame.extend(range(num_frames))
+        all_progress.extend(float(value) for value in per_frame)
+
+        if device.startswith("cuda"):
+            torch.cuda.empty_cache()
+
+    table = pa.table(
+        {
+            "index": np.asarray(all_index, dtype=np.int64),
+            "episode_index": np.asarray(all_episode, dtype=np.int64),
+            "frame_index": np.asarray(all_frame, dtype=np.int64),
+            "progress_sparse": np.asarray(all_progress, dtype=np.float32),
+        }
+    )
+
+    # Record provenance in the schema metadata so readers can recover it.
+    schema_metadata: dict[bytes, bytes] = {b"vlm_name": config.vlm_name.encode()}
+    if reward_model_path is not None:
+        schema_metadata[b"reward_model_path"] = reward_model_path.encode()
+    table = table.replace_schema_metadata(schema_metadata)
+
+    out = Path(dataset.root) / DEFAULT_OUTPUT_FILENAME if output_path is None else Path(output_path)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    pq.write_table(table, out)
+    logging.info(f"Saved {len(table)} frame values to {out}")
+
+    progress_arr = np.asarray(all_progress, dtype=np.float32)
+    if progress_arr.size:
+        logging.info(
+            f"Progress: mean={float(progress_arr.mean()):.4f}, "
+            f"std={float(progress_arr.std()):.4f}, "
+            f"min={float(progress_arr.min()):.4f}, "
+            f"max={float(progress_arr.max()):.4f}"
+        )
+    return out
+
+
+def _build_arg_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser for the TOPReward progress script."""
+    usage_examples = """
+Examples:
+    # Sparse-dense mode (matches upstream TOPReward num_samples=15)
+    python -m lerobot.rewards.topreward.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --num-samples 15
+
+    # Use a smaller VLM
+    python -m lerobot.rewards.topreward.compute_rabc_weights \\
+        --dataset-repo-id lerobot/libero_10_image \\
+        --vlm-name Qwen/Qwen3-VL-4B-Instruct
+        """
+    parser = argparse.ArgumentParser(
+        description="Compute per-frame TOPReward progress curves for RA-BC weighting.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=usage_examples,
+    )
+    add = parser.add_argument
+    add("--dataset-repo-id", type=str, required=True, help="HuggingFace dataset repo id or local path.")
+    add("--reward-model-path", type=str, default=None, help="Optional TOPReward LeRobot config.")
+    add("--vlm-name", type=str, default=None, help="Override the VLM backbone (HF Hub id).")
+    add("--output-path", type=str, default=None, help="Output parquet path.")
+    add("--device", type=str, default="cuda", help="Device to use (default: cuda).")
+    add(
+        "--num-samples",
+        type=int,
+        default=None,
+        help="Anchor prefix samples per episode. None = dense. 15 matches upstream.",
+    )
+    add(
+        "--episodes",
+        type=int,
+        nargs="+",
+        default=None,
+        help="Process only these episode indices (e.g. --episodes 0 or --episodes 0 5 10).",
+    )
+    add("--fps", type=float, default=None, help="Override TOPRewardConfig.fps.")
+    add("--push-to-hub", action="store_true", help="Upload to the dataset repo on HuggingFace Hub.")
+    return parser
+
+
+def main():
+    """Parse CLI arguments, compute the progress parquet and optionally upload it.
+
+    After the file is written (and uploaded when ``--push-to-hub`` is set),
+    the config snippet needed to use it in training is printed.
+    """
+    args = _build_arg_parser().parse_args()
+
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+    output_path = compute_topreward_progress(
+        dataset_repo_id=args.dataset_repo_id,
+        reward_model_path=args.reward_model_path,
+        vlm_name=args.vlm_name,
+        output_path=args.output_path,
+        device=args.device,
+        num_samples=args.num_samples,
+        fps=args.fps,
+        episodes=args.episodes,
+    )
+
+    print(f"\nTOPReward progress saved to: {output_path}")
+
+    # Where the training config should point: the local file by default, or
+    # the Hub location once the upload has succeeded.
+    progress_location = f"{output_path}"
+
+    if args.push_to_hub:
+        from huggingface_hub import HfApi
+
+        hub_path = DEFAULT_OUTPUT_FILENAME
+        repo_id = args.dataset_repo_id
+
+        print(f"\nUploading to Hub: {repo_id}/{hub_path}")
+        HfApi().upload_file(
+            path_or_fileobj=str(output_path),
+            path_in_repo=hub_path,
+            repo_id=repo_id,
+            repo_type="dataset",
+        )
+        print(f"Successfully uploaded to: https://huggingface.co/datasets/{repo_id}/blob/main/{hub_path}")
+        progress_location = f"hf://datasets/{repo_id}/{hub_path}"
+
+    print("\nTo use in training, add to your config:")
+    print("  use_rabc: true")
+    print(f"  rabc_progress_path: {progress_location}")
+    print("  rabc_head_mode: sparse")
+
+if __name__ == "__main__":
+    main()
--- a/src/lerobot/rewards/topreward/configuration_topreward.py
+++ b/src/lerobot/rewards/topreward/configuration_topreward.py
@@ -0,0 +1,146 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+from lerobot.configs import FeatureType, NormalizationMode, PolicyFeature
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.utils.constants import OBS_IMAGES
+
+# Default prompt scaffolding from the upstream TOPReward paper / reference
+# implementation (``QwenClient.compute_instruction_reward``).
+#
+# The prefix is the text placed next to the video in the user message; the
+# suffix template is formatted with the task instruction and ends with the
+# literal answer ``True``. The reward is read off the final token of the
+# resulting prompt, i.e. the terminal ``True`` in
+# ``f"{instruction} ... True"`` given the video.
+DEFAULT_PROMPT_PREFIX = (
+    "The above video shows a robot manipulation trajectory that completes the following task: "
+)
+# Must keep the ``{instruction}`` placeholder (TOPRewardConfig validates it).
+DEFAULT_PROMPT_SUFFIX_TEMPLATE = (
+    "{instruction} Decide whether the above statement is True or not. The answer is: True"
+)
+
+
+@RewardModelConfig.register_subclass("topreward")
+@dataclass
+class TOPRewardConfig(RewardModelConfig):
+    """Settings for the zero-shot TOPReward reward model.
+
+    TOPReward has no learnable parameters of its own: the "model" is a
+    generic vision-language model (``Qwen/Qwen3-VL-8B-Instruct`` by default)
+    queried with a fixed prompt, and the reward is a token log-probability
+    read from its output. No fine-tuned checkpoint exists, so the model
+    identity is :attr:`vlm_name` (an HF Hub id) rather than
+    ``pretrained_path``.
+
+    Args:
+        vlm_name: Hugging Face Hub id of the underlying VLM. Must be a
+            Qwen3-VL family model (the only client implemented in this
+            LeRobot port).
+        torch_dtype: Name of the torch dtype handed to the VLM loader
+            (``"auto"``, ``"bfloat16"``, ``"float16"``, ...).
+        attn_implementation: ``transformers`` attention implementation
+            (e.g. ``"flash_attention_2"``, ``"sdpa"``). ``None`` leaves the
+            choice to the loader.
+        image_key: Observation key holding the trajectory frames.
+        task_key: Complementary-data key holding the task instruction.
+        default_task: Instruction used when ``task_key`` is absent.
+        max_frames: Upper bound on the number of frames per sample fed to
+            the VLM. ``None`` keeps every frame.
+        fps: Frames-per-second metadata given to the Qwen video processor.
+        prompt_prefix: Text placed right after the video, before the
+            suffix template.
+        prompt_suffix_template: Text appended after ``prompt_prefix``.
+            Must contain ``{instruction}``. The TOPReward encoder step
+            labels only the final token of the formatted prompt, so with
+            the default template the scored token is the trailing ``True``.
+        add_chat_template: When ``True``, the full prompt is wrapped with
+            the tokenizer's chat template before tokenisation (matches
+            upstream ``add_chat_template=True``).
+        success_threshold: Optional log-prob threshold. When finite,
+            :meth:`TOPRewardModel.compute_reward` returns
+            ``(reward > success_threshold).float()`` instead of the raw
+            log-prob.
+        max_input_length: Hard limit on the tokenized input length;
+            longer inputs raise a ``ValueError`` in the encoder step.
+    """
+
+    # Local LeRobot directory or HF repo holding a ``config.json`` snapshot
+    # of this TOPRewardConfig. The VLM weights are always identified by
+    # ``vlm_name``, never by this path.
+    pretrained_path: str | None = None
+
+    # --- Wrapped VLM ---
+    vlm_name: str = "Qwen/Qwen3-VL-8B-Instruct"
+    torch_dtype: str = "auto"
+    attn_implementation: str | None = None
+
+    # --- Inputs ---
+    image_key: str = OBS_IMAGES + ".top"
+    task_key: str = "task"
+    default_task: str | None = None
+    max_frames: int | None = 16
+    fps: float = 2.0
+
+    # --- Prompt ---
+    prompt_prefix: str = DEFAULT_PROMPT_PREFIX
+    prompt_suffix_template: str = DEFAULT_PROMPT_SUFFIX_TEMPLATE
+    add_chat_template: bool = False
+
+    # --- Scoring ---
+    # ``-inf`` is non-finite, which disables thresholding by default.
+    success_threshold: float = float("-inf")
+    max_input_length: int = 32768
+
+    # --- Hub metadata ---
+    license: str | None = "mit"  # matches upstream TOPReward
+    tags: list[str] | None = field(
+        default_factory=lambda: ["reward-model", "vision-language", "qwen3-vl", "zero-shot"]
+    )
+
+    # --- Feature bookkeeping ---
+    input_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    output_features: dict[str, PolicyFeature] = field(default_factory=dict)
+    normalization_mapping: dict[str, NormalizationMode] = field(
+        default_factory=lambda: {
+            "VISUAL": NormalizationMode.IDENTITY,
+            "REWARD": NormalizationMode.IDENTITY,
+        }
+    )
+
+    def __post_init__(self) -> None:
+        """Validate the hyper-parameters and register default features.
+
+        Raises:
+            ValueError: If ``max_frames`` is below 1, ``fps`` or
+                ``max_input_length`` is not positive, or the suffix template
+                lacks the ``{instruction}`` placeholder.
+        """
+        super().__post_init__()
+
+        frame_cap = self.max_frames
+        if frame_cap is not None and frame_cap < 1:
+            raise ValueError(f"max_frames must be >= 1, got {frame_cap}")
+
+        if self.fps <= 0:
+            raise ValueError(f"fps must be > 0, got {self.fps}")
+
+        has_placeholder = "{instruction}" in self.prompt_suffix_template
+        if not has_placeholder:
+            raise ValueError(
+                "prompt_suffix_template must contain `{instruction}` so the model "
+                "scores the log-likelihood of the task suffix."
+            )
+
+        if self.max_input_length <= 0:
+            raise ValueError(f"max_input_length must be > 0, got {self.max_input_length}")
+
+        # Register default features without clobbering user-provided entries.
+        self.input_features.setdefault(
+            self.image_key, PolicyFeature(shape=(3, 224, 224), type=FeatureType.VISUAL)
+        )
+        if "reward" not in self.output_features:
+            self.output_features["reward"] = PolicyFeature(shape=(1,), type=FeatureType.REWARD)
+
+    @property
+    def observation_delta_indices(self) -> list[int] | None:
+        """No observation delta indices are requested; always ``None``."""
+        return None
+
+    @property
+    def action_delta_indices(self) -> None:
+        """No action delta indices are requested; always ``None``."""
+        return None
+
+    @property
+    def reward_delta_indices(self) -> None:
+        """No reward delta indices are requested; always ``None``."""
+        return None
+
+    def validate_features(self) -> None:
+        """Check that the configured image feature is declared.
+
+        Raises:
+            ValueError: If ``image_key`` is missing from ``input_features``.
+        """
+        image_declared = self.image_key in self.input_features
+        if not image_declared:
+            raise ValueError(f"TOPReward requires image input feature {self.image_key!r}")
--- a/src/lerobot/rewards/topreward/modeling_topreward.py
+++ b/src/lerobot/rewards/topreward/modeling_topreward.py
@@ -0,0 +1,238 @@
+# Copyright 2026 Shirui Chen, Cole Harrison, Ying-Chun Lee, Angela Jin Yang,
+# Zhongzheng Ren, Lillian J. Ratliff, Jiafei Duan, Dieter Fox, Ranjay Krishna
+# and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TOPReward: Token Probabilities as Hidden Zero-Shot Rewards for Robotics.
+
+Paper:         https://arxiv.org/abs/2602.19313
+Project:       https://topreward.github.io/webpage/
+Original code: https://github.com/TOPReward/TOPReward
+Backbone:      https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct  (default)
+
+TOPReward is a **zero-shot** reward model: it has no fine-tuned weights of
+its own. Given a video trajectory and a task instruction, it asks an
+off-the-shelf VLM how likely the instruction is, conditioned on the video,
+and returns that log-likelihood as the reward signal.
+
+Inference recipe:
+
+1. The processor builds a chat-style prompt, tokenises it, and emits
+   ``input_ids``, ``attention_mask``, vision tensors, and ``labels``.
+   The processor label-masks everything except the terminal answer token with
+   ``-100``.
+2. Forward the full token sequence through the VLM.
+3. Read the terminal answer token log-probability from the logits as the
+   scalar reward.
+
+With the default ``prompt_suffix_template``, the only unmasked token is the
+literal ``"True"`` at the end — the reward is
+``log P("True" | video + prompt + instruction)``.
+
+This LeRobot port is **inference-only and not trainable** — :meth:`forward`
+is intentionally inherited from :class:`PreTrainedRewardModel` and raises
+``NotImplementedError``, making :attr:`PreTrainedRewardModel.is_trainable`
+return ``False``.
+
+Because the VLM weights live on the Hugging Face Hub under their canonical
+id (``Qwen/Qwen3-VL-8B-Instruct`` etc.) and TOPReward never modifies them,
+:meth:`_save_pretrained` and :meth:`from_pretrained` are overridden so a
+TOPReward LeRobot "checkpoint" is a single ``config.json`` (the VLM is
+re-fetched from the Hub at load time).
+"""
+
+from __future__ import annotations
+
+import builtins
+import logging
+import os
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import TYPE_CHECKING, Any, TypeVar
+
+import numpy as np
+import torch
+from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub.constants import CONFIG_NAME
+from huggingface_hub.errors import HfHubHTTPError
+from torch import Tensor
+from torch.nn.functional import cross_entropy
+
+from lerobot.configs.rewards import RewardModelConfig
+from lerobot.rewards.pretrained import PreTrainedRewardModel
+from lerobot.rewards.topreward.configuration_topreward import TOPRewardConfig
+from lerobot.rewards.topreward.processor_topreward import TOPREWARD_FEATURE_PREFIX, TOPREWARD_INPUT_KEYS
+from lerobot.utils.import_utils import _transformers_available, require_package
+
+if TYPE_CHECKING:
+    from lerobot.configs.train import TrainPipelineConfig
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers import Qwen3VLForConditionalGeneration
+else:
+    Qwen3VLForConditionalGeneration = None  # type: ignore[assignment]
+
+logger = logging.getLogger(__name__)
+
+T = TypeVar("T", bound="TOPRewardModel")
+
+
+def _torch_dtype(name: str) -> torch.dtype | str:
+    """Map a dtype name to a ``torch.dtype``, leaving ``"auto"`` untouched.
+
+    Raises:
+        ValueError: If ``name`` is not ``"auto"`` and ``torch.<name>`` is
+            missing or is not a ``torch.dtype``.
+    """
+    if name != "auto":
+        resolved = getattr(torch, name, None)
+        if not isinstance(resolved, torch.dtype):
+            raise ValueError(f"Unknown torch dtype: {name!r}")
+        return resolved
+    return "auto"
+
+
+class TOPRewardModel(PreTrainedRewardModel):
+    """TOPReward zero-shot reward model.
+
+    Wraps a Qwen3-VL model loaded from ``config.vlm_name`` and scores the
+    final label token of prompts that were already tokenised by the
+    TOPReward encoder step. Saving and pushing only ever write the
+    configuration; this class never writes model weights.
+    """
+
+    name = "topreward"
+    config_class = TOPRewardConfig
+
+    def __init__(self, config: TOPRewardConfig, **kwargs: Any) -> None:  # noqa: ARG002
+        """Load the wrapped VLM described by ``config``.
+
+        Args:
+            config: TOPReward configuration; ``vlm_name``, ``torch_dtype`` and
+                ``attn_implementation`` drive the VLM ``from_pretrained`` call.
+            **kwargs: Accepted and ignored. :meth:`from_pretrained` forwards
+                its extra keyword arguments to this constructor, so rejecting
+                them here would turn any such argument into a ``TypeError``.
+
+        Raises:
+            ValueError: If ``config.torch_dtype`` is not a known dtype name.
+        """
+        require_package("transformers", extra="topreward")
+        super().__init__(config)
+        self.config = config
+
+        torch_dtype = _torch_dtype(config.torch_dtype)
+        # NOTE(review): `trust_remote_code=True` is kept for parity with the
+        # original behaviour, but `vlm_name` may come from a downloaded
+        # config.json — confirm that enabling remote code is really intended.
+        model_kwargs: dict[str, Any] = {"dtype": torch_dtype, "trust_remote_code": True}
+        if config.attn_implementation is not None:
+            model_kwargs["attn_implementation"] = config.attn_implementation
+
+        self.model = Qwen3VLForConditionalGeneration.from_pretrained(config.vlm_name, **model_kwargs)
+
+    def compute_reward(self, batch: dict[str, Any]) -> Tensor:
+        """Return one log-prob reward per sample in the batch.
+
+        Args:
+            batch: Mapping that contains every
+                ``f"{TOPREWARD_FEATURE_PREFIX}{key}"`` entry for ``key`` in
+                ``TOPREWARD_INPUT_KEYS`` (including ``labels``).
+
+        Returns:
+            The negative cross-entropy of the final label token per sample,
+            or a 0/1 float tensor when ``config.success_threshold`` is
+            finite. The result is moved to ``config.device`` (``"cpu"`` when
+            unset).
+
+        Raises:
+            KeyError: If any expected encoder output is missing from ``batch``.
+        """
+        inputs: dict[str, Any] = {}
+        for key in TOPREWARD_INPUT_KEYS:
+            batch_key = f"{TOPREWARD_FEATURE_PREFIX}{key}"
+            if batch_key not in batch:
+                raise KeyError(
+                    f"TOPReward batch missing `{batch_key}`. Make sure the "
+                    "TOPRewardEncoderProcessorStep ran before `compute_reward`."
+                )
+            inputs[key] = batch[batch_key]
+
+        # Run on whatever device the wrapped VLM currently lives on.
+        device = next(self.model.parameters()).device
+        inputs = {key: value.to(device) if hasattr(value, "to") else value for key, value in inputs.items()}
+        # Labels are only used for scoring below; they are not fed to the VLM.
+        labels = inputs.pop("labels")
+        # Only the last two positions are needed: the logits at position -2
+        # are the prediction for the final token (position -1).
+        inputs["logits_to_keep"] = 2
+
+        self.eval()
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+        logits = outputs.logits
+        # -CE(logits, target) == log-softmax(logits)[target], i.e. the
+        # log-probability of the final label token.
+        rewards = -cross_entropy(logits[:, -2, :].float(), labels[:, -1], reduction="none")
+        if np.isfinite(self.config.success_threshold):
+            rewards = (rewards > self.config.success_threshold).float()
+        return rewards.to(self.config.device or "cpu")
+
+    def _save_pretrained(self, save_directory: Path) -> None:
+        """Save ``config.json`` only."""
+        self.config._save_pretrained(save_directory)
+
+    @classmethod
+    def from_pretrained(
+        cls: builtins.type[T],
+        pretrained_name_or_path: str | Path,
+        *,
+        config: RewardModelConfig | None = None,
+        force_download: bool = False,
+        resume_download: bool | None = None,
+        proxies: dict | None = None,
+        token: str | bool | None = None,
+        cache_dir: str | Path | None = None,
+        local_files_only: bool = False,
+        revision: str | None = None,
+        strict: bool = False,  # noqa: ARG003 — accepted for API parity; unused (no safetensors to load)
+        **kwargs: Any,
+    ) -> T:
+        """Load a TOPReward configuration and instantiate the wrapped VLM.
+
+        Args:
+            pretrained_name_or_path: Local directory or Hub repo id that
+                holds the TOPReward ``config.json``.
+            config: Optional pre-built configuration; when ``None`` it is
+                loaded from ``pretrained_name_or_path``.
+            force_download, resume_download, proxies, token, cache_dir,
+            local_files_only, revision: Forwarded to the config loader and to
+                ``hf_hub_download``.
+            strict: Unused; accepted for API parity.
+            **kwargs: Forwarded to the config loader and to the constructor.
+
+        Returns:
+            The instantiated model in eval mode.
+
+        Raises:
+            TypeError: If the resolved config is not a ``TOPRewardConfig``.
+            FileNotFoundError: If the path is not a local directory and
+                ``config.json`` cannot be fetched from the Hub.
+        """
+        if config is None:
+            config = RewardModelConfig.from_pretrained(
+                pretrained_name_or_path=pretrained_name_or_path,
+                force_download=force_download,
+                resume_download=resume_download,
+                proxies=proxies,
+                token=token,
+                cache_dir=cache_dir,
+                local_files_only=local_files_only,
+                revision=revision,
+                **kwargs,
+            )
+        if not isinstance(config, TOPRewardConfig):
+            raise TypeError(
+                f"Expected a TOPRewardConfig, got {type(config).__name__}. Make sure "
+                f"`pretrained_name_or_path={pretrained_name_or_path!r}` points at a "
+                "TOPReward checkpoint."
+            )
+
+        model_id = str(pretrained_name_or_path)
+        if not os.path.isdir(model_id):
+            # Existence check only: the downloaded file itself is not used.
+            try:
+                hf_hub_download(
+                    repo_id=model_id,
+                    filename=CONFIG_NAME,
+                    revision=revision,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    token=token,
+                    local_files_only=local_files_only,
+                )
+            except HfHubHTTPError as e:
+                raise FileNotFoundError(
+                    f"{CONFIG_NAME} not found on the HuggingFace Hub in {model_id}"
+                ) from e
+
+        instance = cls(config, **kwargs)
+        # `config.device` may be unset (the rest of this class falls back to
+        # "cpu" in that case); only move the model when a device is given.
+        if config.device is not None:
+            instance.to(config.device)
+        instance.eval()
+        return instance
+
+    def push_model_to_hub(self, cfg: TrainPipelineConfig):
+        """Push the TOPReward ``config.json`` + model card to the Hub.
+
+        Args:
+            cfg: Training pipeline config; its dataset repo id feeds the
+                model card and the config itself is saved next to it.
+        """
+        api = HfApi()
+        repo_id = api.create_repo(
+            repo_id=self.config.repo_id, private=self.config.private, exist_ok=True
+        ).repo_id
+
+        with TemporaryDirectory(ignore_cleanup_errors=True) as tmp:
+            saved_path = Path(tmp) / repo_id
+            saved_path.mkdir(parents=True, exist_ok=True)
+
+            self.config._save_pretrained(saved_path)
+
+            card = self.generate_model_card(
+                cfg.dataset.repo_id, self.config.type, self.config.license, self.config.tags
+            )
+            card.save(str(saved_path / "README.md"))
+
+            cfg.save_pretrained(saved_path)
+
+            # Upload metadata only; weight files are explicitly excluded.
+            commit_info = api.upload_folder(
+                repo_id=repo_id,
+                repo_type="model",
+                folder_path=saved_path,
+                commit_message="Upload TOPReward config and readme",
+                allow_patterns=["*.json", "*.yaml", "*.md"],
+                ignore_patterns=["*.tmp", "*.log", "*.safetensors"],
+            )
+
+            logger.info(f"Model pushed to {commit_info.repo_url.url}")
--- a/src/lerobot/rewards/topreward/processor_topreward.py
+++ b/src/lerobot/rewards/topreward/processor_topreward.py
@@ -0,0 +1,305 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TOPReward pre/post processing pipeline."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+
+import torch
+from torch import Tensor
+
+from lerobot.configs import PipelineFeatureType, PolicyFeature
+from lerobot.processor import (
+    AddBatchDimensionProcessorStep,
+    DeviceProcessorStep,
+    PolicyAction,
+    PolicyProcessorPipeline,
+    ProcessorStep,
+    ProcessorStepRegistry,
+    policy_action_to_transition,
+)
+from lerobot.rewards.topreward.configuration_topreward import (
+    DEFAULT_PROMPT_PREFIX,
+    DEFAULT_PROMPT_SUFFIX_TEMPLATE,
+    TOPRewardConfig,
+)
+from lerobot.types import EnvTransition, TransitionKey
+from lerobot.utils.constants import (
+    OBS_IMAGES,
+    OBS_PREFIX,
+    POLICY_POSTPROCESSOR_DEFAULT_NAME,
+    POLICY_PREPROCESSOR_DEFAULT_NAME,
+)
+from lerobot.utils.import_utils import _transformers_available, require_package
+
+if TYPE_CHECKING or _transformers_available:
+    from transformers import AutoProcessor
+else:
+    AutoProcessor = None
+
+# Namespace prefix for the encoder outputs written into the observation dict.
+TOPREWARD_FEATURE_PREFIX = f"{OBS_PREFIX}topreward."
+
+# Literal answer appended to (or stripped from) the prompt when the chat
+# template is applied.
+_TRUE_ANSWER = "True"
+
+# Names of the processor outputs that are passed to the VLM forward call.
+TOPREWARD_VLM_INPUT_KEYS = (
+    "input_ids",
+    "attention_mask",
+    "pixel_values_videos",
+    "video_grid_thw",
+    "mm_token_type_ids",
+)
+# VLM inputs plus the ``labels`` tensor used to score the final token.
+TOPREWARD_INPUT_KEYS = TOPREWARD_VLM_INPUT_KEYS + ("labels",)
+
+
+def _prepare_video_batch(video: Tensor, *, max_frames: int | None) -> Tensor:
+    """Return videos as ``(B, T, C, H, W)`` uint8 tensors for Qwen3-VL.
+
+    Args:
+        video: Frames shaped ``(B, C, H, W)`` or ``(B, T, C, H, W)``. A
+            trailing dimension of size 1 or 3 is treated as channel-last and
+            moved to position 2. Floating-point input is scaled by 255
+            (assumes values in ``[0, 1]`` — TODO confirm with callers).
+        max_frames: Keep only the last ``max_frames`` frames; ``None`` keeps
+            every frame.
+
+    Returns:
+        A contiguous uint8 tensor with values clamped to ``[0, 255]``.
+
+    Raises:
+        ValueError: If the tensor is not 4-D/5-D, no channel dimension of
+            size 1 or 3 is found, or ``max_frames`` is below 1.
+    """
+    if video.ndim == 4:
+        # Single frame per sample: add the time dimension.
+        video = video.unsqueeze(1)
+    elif video.ndim != 5:
+        raise ValueError(
+            f"Expected TOPReward frames with shape (B,C,H,W) or (B,T,C,H,W); got {tuple(video.shape)}"
+        )
+
+    if max_frames is not None:
+        # `-0:` is a full slice and negative caps would drop leading frames,
+        # so reject anything below 1 instead of silently misbehaving.
+        if max_frames < 1:
+            raise ValueError(f"max_frames must be >= 1, got {max_frames}")
+        video = video[:, -max_frames:]
+    if video.shape[-1] in (1, 3):
+        # Channel-last layout -> channel-first.
+        video = video.permute(0, 1, 4, 2, 3)
+    elif video.shape[2] not in (1, 3):
+        raise ValueError(f"Expected channel dim of size 1 or 3, got shape {tuple(video.shape)}")
+
+    if video.is_floating_point():
+        # Round before the integer cast: the cast truncates, so a value such
+        # as 127.99999 (float error from x / 255 * 255) would lose one level.
+        video = (video * 255.0).round()
+
+    return video.clamp(0, 255).to(torch.uint8).contiguous()
+
+
+def _expand_tasks(task: Any, *, batch_size: int, default: str | None) -> list[str]:
+    """Normalise a task description into one instruction per batch sample.
+
+    ``None`` falls back to ``default``. A string, or a one-element
+    list/tuple when ``batch_size > 1``, is repeated ``batch_size`` times; a
+    list/tuple of strings must otherwise contain exactly ``batch_size`` items.
+
+    Raises:
+        KeyError: If neither ``task`` nor ``default`` is provided.
+        TypeError: If the task is not a string or a list/tuple of strings.
+        ValueError: If the number of tasks does not match ``batch_size``.
+    """
+    resolved = default if task is None else task
+    if resolved is None:
+        raise KeyError("TOPReward expected a task description in complementary data")
+
+    if isinstance(resolved, str):
+        return [resolved for _ in range(batch_size)]
+
+    candidates = list(resolved) if isinstance(resolved, tuple) else resolved
+    is_string_list = isinstance(candidates, list) and all(isinstance(item, str) for item in candidates)
+    if not is_string_list:
+        raise TypeError(f"TOPReward task must be a string or list of strings, got {type(candidates)}")
+
+    count = len(candidates)
+    if count == 1 and batch_size > 1:
+        # Broadcast a single instruction across the whole batch.
+        return candidates * batch_size
+    if count != batch_size:
+        raise ValueError(f"Expected {batch_size} tasks, got {count}")
+    return candidates
+
+
+@dataclass
+@ProcessorStepRegistry.register(name="topreward_encoder")
+class TOPRewardEncoderProcessorStep(ProcessorStep):
+    """Encode raw frames + task into Qwen-VL tensors for the TOPReward model.
+
+    Loads a :class:`~transformers.AutoProcessor` matching ``vlm_name`` and
+    builds the full chat prompt including the instruction suffix. The
+    resulting ``input_ids``, ``attention_mask``, vision tensors, and
+    ``labels`` are written under the ``observation.topreward.*`` namespace
+    so the model can score without re-tokenising.
+
+    At call time the step reads:
+
+    - ``observation[image_key]``: ``(B, T, C, H, W)`` or ``(B, C, H, W)`` frames.
+    - ``complementary_data[task_key]``: a string or list of strings.
+
+    and writes ``observation[f"{TOPREWARD_FEATURE_PREFIX}<name>"]`` for the
+    Qwen-VL tensors plus ``labels``.
+    """
+
+    vlm_name: str = "Qwen/Qwen3-VL-8B-Instruct"
+    image_key: str = OBS_IMAGES + ".top"
+    task_key: str = "task"
+    default_task: str | None = None
+    max_frames: int | None = 16
+    fps: float = 2.0
+    prompt_prefix: str = DEFAULT_PROMPT_PREFIX
+    prompt_suffix_template: str = DEFAULT_PROMPT_SUFFIX_TEMPLATE
+    add_chat_template: bool = False
+    max_length: int = 32768
+
+    # Loaded in ``__post_init__``; excluded from ``__init__`` and ``repr``.
+    _processor: Any = field(default=None, init=False, repr=False)
+
+    def __post_init__(self) -> None:
+        """Load the Hugging Face processor for ``vlm_name``."""
+        require_package("transformers", extra="topreward")
+        # NOTE(review): `trust_remote_code=True` lets the Hub repo named by
+        # `vlm_name` run its own processor code — confirm this is intended,
+        # since `vlm_name` can originate from a downloaded config.
+        self._processor = AutoProcessor.from_pretrained(self.vlm_name, trust_remote_code=True)
+
+    def __call__(self, transition: EnvTransition) -> EnvTransition:
+        """Return a copy of ``transition`` with the encoder outputs added.
+
+        Raises:
+            KeyError: If the observation is missing or lacks ``image_key``,
+                or if no task description (nor ``default_task``) is available.
+        """
+        observation = transition.get(TransitionKey.OBSERVATION)
+        complementary = transition.get(TransitionKey.COMPLEMENTARY_DATA) or {}
+        # A transition without any observation is reported the same way as a
+        # missing image key rather than failing with an opaque TypeError.
+        if observation is None or self.image_key not in observation:
+            raise KeyError(f"TOPReward expected image key {self.image_key!r} in observation")
+
+        frames = observation[self.image_key]
+        # Preprocessing runs on CPU.
+        videos = frames.detach().cpu() if isinstance(frames, Tensor) else torch.as_tensor(frames)
+        videos = _prepare_video_batch(videos, max_frames=self.max_frames)
+
+        batch_size = videos.shape[0]
+        tasks = _expand_tasks(
+            complementary.get(self.task_key, self.default_task),
+            batch_size=batch_size,
+            default=self.default_task,
+        )
+
+        encoded = self._encode_batch(videos, tasks, batch_size)
+
+        # Copy so the caller's observation dict is left untouched.
+        new_observation = dict(observation)
+        for key, value in encoded.items():
+            new_observation[f"{TOPREWARD_FEATURE_PREFIX}{key}"] = value
+
+        new_transition = transition.copy()
+        new_transition[TransitionKey.OBSERVATION] = new_observation
+        return new_transition
+
+    def _encode_batch(self, videos: Tensor, tasks: list[str], batch_size: int) -> dict[str, Any]:
+        """Tokenise a batch of (frames, task) pairs into Qwen-VL tensors.
+
+        The loop only builds per-sample chat strings. Tokenisation, padding,
+        video preprocessing, and label construction are batched.
+
+        Args:
+            videos: Prepared ``(B, T, C, H, W)`` uint8 frames.
+            tasks: One instruction per sample.
+            batch_size: Number of samples to encode.
+
+        Returns:
+            The processor output with an extra ``labels`` entry in which
+            every position except the last is ``-100``.
+
+        Raises:
+            ValueError: If the padded input length exceeds ``max_length``.
+        """
+
+        texts: list[str] = []
+        num_frames = int(videos.shape[1])
+        # Frames were already selected upstream, so every frame index is kept
+        # (see `do_sample_frames=False` below).
+        video_metadata = [
+            {
+                "total_num_frames": num_frames,
+                "fps": float(self.fps),
+                "frames_indices": list(range(num_frames)),
+            }
+            for _ in range(batch_size)
+        ]
+        eos_token = self._processor.tokenizer.eos_token
+
+        for i in range(batch_size):
+            instruction_suffix = self.prompt_suffix_template.format(instruction=tasks[i])
+            if self.add_chat_template:
+                # Keep the whole question in the user turn and let the answer
+                # token follow the generation prompt.
+                suffix_for_template = instruction_suffix.removesuffix(_TRUE_ANSWER).rstrip()
+                templated_messages = [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "video", "video": videos[i], "fps": self.fps},
+                            {"type": "text", "text": f"{self.prompt_prefix}{suffix_for_template}"},
+                        ],
+                    }
+                ]
+                prompt_chat = self._processor.apply_chat_template(
+                    templated_messages, tokenize=False, add_generation_prompt=True
+                )
+                full_text = f"{prompt_chat}{_TRUE_ANSWER}"
+            else:
+                user_messages = [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "video", "video": videos[i], "fps": self.fps},
+                            {"type": "text", "text": self.prompt_prefix},
+                        ],
+                    }
+                ]
+                prompt_chat = self._processor.apply_chat_template(
+                    user_messages, tokenize=False, add_generation_prompt=False
+                )
+                if eos_token is not None:
+                    # Drop only the final end-of-turn marker. Cutting at the
+                    # first occurrence would discard the user turn (and its
+                    # video placeholder) if the template emits an earlier
+                    # turn, e.g. a default system message.
+                    prompt_chat = prompt_chat.rsplit(eos_token, 1)[0]
+                full_text = f"{prompt_chat}{instruction_suffix}"
+
+            texts.append(full_text)
+
+        # Left padding keeps the scored token at the last position of every row.
+        result = self._processor(
+            text=texts,
+            videos=videos,
+            video_metadata=video_metadata,
+            do_sample_frames=False,
+            padding=True,
+            padding_side="left",
+            return_tensors="pt",
+        )
+        input_ids = result["input_ids"]
+
+        if input_ids.shape[-1] > self.max_length:
+            raise ValueError(
+                f"TOPReward input length {input_ids.shape[-1]} exceeds max_length "
+                f"{self.max_length}; lower `max_frames` or raise `max_length`."
+            )
+
+        # Mask everything except the terminal token.
+        labels = torch.full_like(input_ids, -100)
+        labels[:, -1] = input_ids[:, -1]
+        result["labels"] = labels
+        return result
+
+    def transform_features(
+        self, features: dict[PipelineFeatureType, dict[str, PolicyFeature]]
+    ) -> dict[PipelineFeatureType, dict[str, PolicyFeature]]:
+        """Return ``features`` unchanged."""
+        return features
+
+    def get_config(self) -> dict[str, Any]:
+        """Return the public fields of this step as a plain dict."""
+        return {
+            "vlm_name": self.vlm_name,
+            "image_key": self.image_key,
+            "task_key": self.task_key,
+            "default_task": self.default_task,
+            "max_frames": self.max_frames,
+            "fps": self.fps,
+            "prompt_prefix": self.prompt_prefix,
+            "prompt_suffix_template": self.prompt_suffix_template,
+            "add_chat_template": self.add_chat_template,
+            "max_length": self.max_length,
+        }
+
+
+def make_topreward_pre_post_processors(
+    config: TOPRewardConfig,
+    dataset_stats: dict[str, dict[str, Any]] | None = None,
+) -> tuple[
+    PolicyProcessorPipeline[dict[str, Any], dict[str, Any]],
+    PolicyProcessorPipeline[PolicyAction, PolicyAction],
+]:
+    """Build the TOPReward pre- and post-processing pipelines.
+
+    The preprocessor chains three steps: add a batch dimension, encode
+    frames + task into Qwen-VL tensors (including ``labels``), and move the
+    result to ``config.device`` (``"cpu"`` when unset). The postprocessor is
+    created without any steps.
+
+    Args:
+        config: TOPReward configuration the encoder step is built from.
+        dataset_stats: Not used by this factory.
+
+    Returns:
+        The ``(preprocessor, postprocessor)`` pair.
+    """
+    # Steps are created in pipeline order.
+    batch_step = AddBatchDimensionProcessorStep()
+    encoder_step = TOPRewardEncoderProcessorStep(
+        vlm_name=config.vlm_name,
+        image_key=config.image_key,
+        task_key=config.task_key,
+        default_task=config.default_task,
+        max_frames=config.max_frames,
+        fps=config.fps,
+        prompt_prefix=config.prompt_prefix,
+        prompt_suffix_template=config.prompt_suffix_template,
+        add_chat_template=config.add_chat_template,
+        max_length=config.max_input_length,
+    )
+    device_step = DeviceProcessorStep(device=config.device or "cpu")
+
+    preprocessor = PolicyProcessorPipeline[dict[str, Any], dict[str, Any]](
+        steps=[batch_step, encoder_step, device_step],
+        name=POLICY_PREPROCESSOR_DEFAULT_NAME,
+    )
+    postprocessor = PolicyProcessorPipeline(
+        name=POLICY_POSTPROCESSOR_DEFAULT_NAME,
+        to_transition=policy_action_to_transition,
+    )
+    return preprocessor, postprocessor
--- a/src/lerobot/templates/lerobot_rewardmodel_modelcard_template.md
+++ b/src/lerobot/templates/lerobot_rewardmodel_modelcard_template.md
@@ -13,6 +13,10 @@
 A reward classifier is a lightweight neural network that scores observations or trajectories for task success, providing a learned reward signal or offline evaluation when explicit rewards are unavailable.
 {% elif model_name == "sarm" %}
 A Success-Aware Reward Model (SARM) predicts a dense reward signal from observations, typically used downstream for reinforcement learning or human-in-the-loop fine-tuning when task success is not directly observable.
+{% elif model_name == "robometer" %}
+ROBOMETER is a general-purpose video-language robotic reward model built on a fine-tuned Qwen3-VL-4B backbone with progress, preference, and success heads. Given a trajectory video and a task description, it predicts dense, frame-level task progress in [0, 1] and frame-level success probabilities for downstream robot learning, including offline RL, online RL, data filtering and retrieval, and automated failure detection.
+{% elif model_name == "topreward" %}
+TOPReward is a **zero-shot** reward model that extracts token log-probabilities from an off-the-shelf vision-language model (default Qwen3-VL) as a reward signal. Given a video trajectory and a task instruction, it returns the VLM's log-likelihood of the instruction being true, with no fine-tuning required.
 {% else %}
 _Reward model type not recognized — please update this template._
 {% endif %}
				`@@ -0,0 +1 @@`
				`../../../../docs/source/policy_molmoact2_README.md`