# tests/rewards/test_reward_model_base.py

# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for the reward model base classes and registry."""

import json
from dataclasses import dataclass
from pathlib import Path
from types import SimpleNamespace

import pytest
import torch

from lerobot.configs.rewards import RewardModelConfig
from lerobot.optim.optimizers import AdamWConfig
from lerobot.rewards.pretrained import PreTrainedRewardModel


@RewardModelConfig.register_subclass(name="_dummy_hub_reward")
@dataclass
class _DummyHubRewardConfig(RewardModelConfig):
    """Minimal reward-model config registered under ``_dummy_hub_reward``.

    It adds no fields of its own; it only supplies the optimizer preset so
    the class can be instantiated by the tests in this module.
    """

    def get_optimizer_preset(self):
        """Return an ``AdamWConfig`` with a learning rate of ``1e-4``."""
        preset = AdamWConfig(lr=1e-4)
        return preset


class _DummyHubReward(PreTrainedRewardModel):
    """Tiny reward model holding a single learnable scalar bias.

    It implements ``compute_reward`` only and does not override ``forward``.
    """

    name = "_dummy_hub_reward"
    config_class = _DummyHubRewardConfig

    def __init__(self, config):
        """Initialise the base model and register the one-element ``bias`` parameter."""
        super().__init__(config)
        initial_bias = torch.zeros(1)
        self.bias = torch.nn.Parameter(initial_bias)

    def compute_reward(self, batch):
        """Return the bias as a one-element tensor; ``batch`` is not read."""
        reward = self.bias.expand(1)
        return reward


def test_reward_model_config_registry():
    """Both ``reward_classifier`` and ``sarm`` must appear among the known choices."""
    registered = RewardModelConfig.get_known_choices()
    for expected_name in ("reward_classifier", "sarm"):
        assert expected_name in registered


def test_reward_model_config_lookup():
    """Looking up ``reward_classifier`` by name must yield ``RewardClassifierConfig``."""
    resolved = RewardModelConfig.get_choice_class("reward_classifier")

    # NOTE(review): the import stays *after* the lookup; importing the module
    # first could presumably register the class itself and mask a missing
    # registration — confirm before reordering.
    from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig

    assert resolved is RewardClassifierConfig


def test_factory_get_reward_model_class():
    """``get_reward_model_class("sarm")`` must return the ``SARMRewardModel`` class."""
    from lerobot.rewards.factory import get_reward_model_class

    model_cls = get_reward_model_class("sarm")

    # The expected class is imported only after the factory call, so the
    # factory has to resolve it on its own.
    from lerobot.rewards.sarm.modeling_sarm import SARMRewardModel

    assert model_cls is SARMRewardModel


def test_factory_unknown_raises():
    """An unregistered reward-model name must raise ``ValueError`` ("not available")."""
    from lerobot.rewards.factory import get_reward_model_class

    bogus_name = "nonexistent_reward_model"
    with pytest.raises(ValueError, match="not available"):
        get_reward_model_class(bogus_name)


def test_pretrained_reward_model_requires_config_class():
    """Defining a subclass that has ``name`` but no ``config_class`` must raise ``TypeError``."""
    # The error is expected while the class statement itself executes, which
    # is why the definition lives inside the ``pytest.raises`` block.
    with pytest.raises(TypeError, match="must define 'config_class'"):

        class BadModel(PreTrainedRewardModel):
            name = "bad"

            def compute_reward(self, batch):
                return None


def test_pretrained_reward_model_requires_name():
    """Defining a subclass that has ``config_class`` but no ``name`` must raise ``TypeError``."""
    # The error is expected while the class statement itself executes, which
    # is why the definition lives inside the ``pytest.raises`` block.
    with pytest.raises(TypeError, match="must define 'name'"):

        class BadModel(PreTrainedRewardModel):
            config_class = RewardModelConfig

            def compute_reward(self, batch):
                return None


def test_non_trainable_forward_raises():
    """``forward()`` on a model that only defines ``compute_reward`` must raise.

    A throwaway config/model pair is declared locally; the model does not
    override ``forward``, so calling it is expected to raise
    ``NotImplementedError`` with a message containing "not trainable".
    """

    # ``dataclass`` and ``AdamWConfig`` are already imported at module level,
    # so they are not re-imported inside the test.
    @dataclass
    class DummyConfig(RewardModelConfig):
        def get_optimizer_preset(self):
            return AdamWConfig(lr=1e-4)

    class DummyReward(PreTrainedRewardModel):
        config_class = DummyConfig
        name = "dummy_test"

        def compute_reward(self, batch):
            return torch.zeros(1)

    model = DummyReward(DummyConfig())

    with pytest.raises(NotImplementedError, match="not trainable"):
        model.forward({"x": torch.zeros(1)})


# ---------------------------------------------------------------------------
# Trainable vs zero-shot (general-purpose) reward models.
# The proposal explicitly supports models like TOPReward that wrap a pretrained
# VLM and produce a reward signal without any training step. These tests pin
# the contract that lets such models coexist with trainable ones.
# ---------------------------------------------------------------------------


def test_is_trainable_false_when_forward_not_overridden():
    """A model implementing only ``compute_reward`` must report ``is_trainable is False``."""
    zero_shot_model = _make_dummy_reward_model()[0]
    assert zero_shot_model.is_trainable is False


def test_is_trainable_true_when_forward_overridden():
    """A subclass that overrides ``forward`` must report ``is_trainable is True``."""

    class _TrainableReward(_DummyHubReward):
        name = "_trainable_dummy_reward"

        def forward(self, batch):
            # Returns a (loss, dict) pair; the loss is the squared bias.
            squared_bias = self.bias**2
            return squared_bias.sum(), {}

    # Give the trainable model its own registered config type, then attach it
    # to the model class before instantiating.
    @RewardModelConfig.register_subclass(name="_trainable_dummy_reward")
    @dataclass
    class _TrainableConfig(_DummyHubRewardConfig):
        pass

    _TrainableReward.config_class = _TrainableConfig
    trainable_model = _TrainableReward(_TrainableConfig())
    assert trainable_model.is_trainable is True


# ---------------------------------------------------------------------------
# RewardModelConfig.from_pretrained
# ---------------------------------------------------------------------------


def test_reward_model_config_from_pretrained_raises_when_config_missing(tmp_path):
    """An existing directory without ``config.json`` must yield ``FileNotFoundError``.

    The error is expected to mention "config.json not found" rather than
    surfacing later as a failure inside ``draccus.parse``.
    """
    empty_dir = tmp_path  # created by pytest, intentionally left without config.json
    with pytest.raises(FileNotFoundError, match="config.json not found"):
        RewardModelConfig.from_pretrained(empty_dir)


def test_reward_model_config_from_pretrained_roundtrip(tmp_path):
    """Save a ``RewardClassifierConfig``, reload via the base class, compare fields."""
    from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig

    expected_fields = {
        "num_classes": 3,
        "hidden_dim": 128,
        "latent_dim": 64,
        "num_cameras": 1,
        "learning_rate": 5e-4,
    }
    RewardClassifierConfig(**expected_fields)._save_pretrained(tmp_path)

    reloaded = RewardModelConfig.from_pretrained(tmp_path)

    # Loading through the base class must still give back the concrete type.
    assert isinstance(reloaded, RewardClassifierConfig)
    for field_name, expected_value in expected_fields.items():
        assert getattr(reloaded, field_name) == expected_value


# ---------------------------------------------------------------------------
# TrainPipelineConfig — reward model training path
# ---------------------------------------------------------------------------


def test_train_pipeline_config_path_fields_includes_reward_model():
    """``TrainPipelineConfig`` must list both ``policy`` and ``reward_model`` as path fields.

    This is what makes ``--reward_model.path=local/dir`` usable on the
    command line.
    """
    from lerobot.configs.train import TrainPipelineConfig

    path_fields = TrainPipelineConfig.__get_path_fields__()
    for required in ("policy", "reward_model"):
        assert required in path_fields


def test_train_pipeline_config_trainable_config_returns_reward_model_when_set():
    """With only ``reward_model`` set, ``trainable_config`` must be that reward config.

    The trainer reads attributes such as ``.device`` through
    ``trainable_config``, so it must not fall back to the unset ``policy``.
    """
    from lerobot.configs.default import DatasetConfig
    from lerobot.configs.train import TrainPipelineConfig
    from lerobot.rewards.classifier.configuration_classifier import RewardClassifierConfig

    classifier_cfg = RewardClassifierConfig(device="cpu")
    dataset_cfg = DatasetConfig(repo_id="user/repo")
    pipeline_cfg = TrainPipelineConfig(dataset=dataset_cfg, reward_model=classifier_cfg)

    assert pipeline_cfg.is_reward_model_training is True
    assert pipeline_cfg.trainable_config is classifier_cfg
    # lerobot_train.py uses this value to decide force_cpu; going through
    # ``cfg.policy.device`` would raise AttributeError since policy is None.
    assert pipeline_cfg.trainable_config.device == "cpu"


def test_train_pipeline_config_trainable_config_returns_policy_when_set():
    """With only ``policy`` set, ``trainable_config`` must be that policy config."""
    from lerobot.configs.default import DatasetConfig
    from lerobot.configs.train import TrainPipelineConfig
    from lerobot.policies.diffusion.configuration_diffusion import DiffusionConfig

    diffusion_cfg = DiffusionConfig(device="cpu")
    dataset_cfg = DatasetConfig(repo_id="user/repo")
    pipeline_cfg = TrainPipelineConfig(dataset=dataset_cfg, policy=diffusion_cfg)

    assert pipeline_cfg.is_reward_model_training is False
    assert pipeline_cfg.trainable_config is diffusion_cfg
    assert pipeline_cfg.trainable_config.device == "cpu"


def test_train_pipeline_config_from_pretrained_migrates_legacy_rabc_fields(tmp_path):
    """Top-level legacy RA-BC keys must be folded into ``sample_weighting`` on load."""
    from lerobot.configs.default import DatasetConfig
    from lerobot.configs.train import TRAIN_CONFIG_NAME, TrainPipelineConfig
    from lerobot.policies.diffusion.configuration_diffusion import DiffusionConfig

    progress_path = "hf://datasets/user/repo/sarm_progress.parquet"

    TrainPipelineConfig(
        dataset=DatasetConfig(repo_id="user/repo"),
        policy=DiffusionConfig(device="cpu"),
    )._save_pretrained(tmp_path)

    # Rewrite the saved file into the legacy layout: no ``sample_weighting``
    # entry, RA-BC settings as flat top-level keys.
    config_file = tmp_path / TRAIN_CONFIG_NAME
    legacy_payload = json.loads(config_file.read_text())
    legacy_payload.pop("sample_weighting", None)
    legacy_payload["use_rabc"] = True
    legacy_payload["rabc_progress_path"] = progress_path
    legacy_payload["rabc_kappa"] = 0.05
    legacy_payload["rabc_epsilon"] = 1e-5
    legacy_payload["rabc_head_mode"] = "dense"
    config_file.write_text(json.dumps(legacy_payload))

    weighting = TrainPipelineConfig.from_pretrained(tmp_path).sample_weighting

    assert weighting is not None
    expected = {
        "type": "rabc",
        "progress_path": progress_path,
        "kappa": 0.05,
        "epsilon": 1e-5,
        "head_mode": "dense",
    }
    for attr_name, expected_value in expected.items():
        assert getattr(weighting, attr_name) == expected_value


def test_train_pipeline_config_from_pretrained_strips_legacy_rabc_when_disabled(tmp_path):
    """With ``use_rabc`` false, legacy RA-BC keys must not produce a ``sample_weighting``."""
    from lerobot.configs.default import DatasetConfig
    from lerobot.configs.train import TRAIN_CONFIG_NAME, TrainPipelineConfig
    from lerobot.policies.diffusion.configuration_diffusion import DiffusionConfig

    TrainPipelineConfig(
        dataset=DatasetConfig(repo_id="user/repo"),
        policy=DiffusionConfig(device="cpu"),
    )._save_pretrained(tmp_path)

    # Rewrite the saved file into the legacy layout with RA-BC switched off;
    # the other rabc_* keys are still present and must be ignored.
    config_file = tmp_path / TRAIN_CONFIG_NAME
    legacy_payload = json.loads(config_file.read_text())
    legacy_payload.pop("sample_weighting", None)
    legacy_payload["use_rabc"] = False
    legacy_payload["rabc_progress_path"] = "hf://datasets/user/repo/sarm_progress.parquet"
    legacy_payload["rabc_kappa"] = 0.05
    legacy_payload["rabc_epsilon"] = 1e-5
    legacy_payload["rabc_head_mode"] = "dense"
    config_file.write_text(json.dumps(legacy_payload))

    reloaded = TrainPipelineConfig.from_pretrained(tmp_path)

    assert reloaded.sample_weighting is None


# ---------------------------------------------------------------------------
# PreTrainedRewardModel hub upload: push_model_to_hub + generate_model_card.
# We test the generation side (offline) fully, and the upload side with HfApi
# mocked so nothing actually hits the network.
# ---------------------------------------------------------------------------


def _make_dummy_reward_model(**config_kwargs):
    """Build a ``_DummyHubReward`` from ``config_kwargs``.

    Returns a ``(model, config_class)`` tuple, where ``config_class`` is
    ``_DummyHubRewardConfig``.
    """
    config = _DummyHubRewardConfig(**config_kwargs)
    model = _DummyHubReward(config)
    return model, _DummyHubRewardConfig


@pytest.fixture
def _offline_model_card(monkeypatch):
    """Replace ``ModelCard.validate`` with a no-op for the duration of a test.

    ``ModelCard.validate`` does a live ``POST`` to huggingface.co; bypassing
    it lets the model-card tests run offline.
    """
    from huggingface_hub import ModelCard

    def _skip_validation(self, *args, **kwargs):
        return None

    monkeypatch.setattr(ModelCard, "validate", _skip_validation)


def test_reward_model_generate_model_card_renders_expected_fields(_offline_model_card):
    """``generate_model_card`` must fill in the metadata and use the reward-model template."""
    model, _ = _make_dummy_reward_model(license="mit", tags=["robot", "sim"])
    config = model.config

    card = model.generate_model_card(
        dataset_repo_id="user/my_dataset",
        model_type=config.type,
        license=config.license,
        tags=config.tags,
    )

    # YAML header (ModelCardData fields).
    metadata = card.data
    assert metadata.license == "mit"
    assert metadata.library_name == "lerobot"
    assert metadata.pipeline_tag == "robotics"
    assert "reward-model" in metadata.tags
    assert config.type in metadata.tags
    assert metadata.model_name == config.type
    assert metadata.datasets == "user/my_dataset"

    # Rendered body: these snippets belong to the reward-model template, not
    # the policy one; the last is the reward-model-specific usage block.
    rendered = str(card)
    for snippet in (
        "Reward Model Card",
        "This reward model has been trained",
        "--reward_model.type=",
    ):
        assert snippet in rendered


def test_reward_model_generate_model_card_uses_default_license(_offline_model_card):
    """With no license passed in the config kwargs, the card must carry ``apache-2.0``."""
    model = _make_dummy_reward_model()[0]
    config = model.config

    card = model.generate_model_card(
        dataset_repo_id="user/my_dataset",
        model_type=config.type,
        license=config.license,
        tags=None,
    )

    assert card.data.license == "apache-2.0"


def test_reward_model_push_model_to_hub_uploads_expected_files(monkeypatch, _offline_model_card):
    """``push_model_to_hub`` must create the repo and upload a complete folder.

    Expected sequence:
    1. the repo is created,
    2. a temp folder is assembled with weights + config.json +
       train_config.json + README.md,
    3. ``api.upload_folder`` is called on that folder.
    ``HfApi`` is replaced with a recording fake, so no network call is made.
    """
    from huggingface_hub.constants import CONFIG_NAME

    from lerobot.configs.default import DatasetConfig
    from lerobot.configs.train import TRAIN_CONFIG_NAME, TrainPipelineConfig
    from lerobot.rewards import pretrained as reward_pretrained

    model, _ = _make_dummy_reward_model(repo_id="user/my_reward", license="apache-2.0")
    # The train config ties the reward model to a dummy dataset repo.
    train_cfg = TrainPipelineConfig(
        dataset=DatasetConfig(repo_id="user/my_dataset"),
        reward_model=model.config,
    )

    recorded: dict = {}
    commit_url = SimpleNamespace(url="https://huggingface.co/user/my_reward")
    fake_commit_info = SimpleNamespace(repo_url=commit_url)

    class _FakeHfApi:
        def create_repo(self, repo_id, private=None, exist_ok=False):
            recorded.update(create_repo_id=repo_id, create_private=private)
            return SimpleNamespace(repo_id=repo_id)

        def upload_folder(self, *, repo_id, repo_type, folder_path, commit_message, **_kwargs):
            recorded.update(
                upload_repo_id=repo_id,
                upload_repo_type=repo_type,
                commit_message=commit_message,
            )
            # Snapshot the assembled file names while the folder is available;
            # the file list is the real contract under test.
            recorded["files"] = sorted(entry.name for entry in Path(folder_path).iterdir())
            return fake_commit_info

    def _fake_api_factory(*args, **kwargs):
        return _FakeHfApi()

    monkeypatch.setattr(reward_pretrained, "HfApi", _fake_api_factory)

    model.push_model_to_hub(train_cfg)

    assert recorded["create_repo_id"] == "user/my_reward"
    assert recorded["upload_repo_id"] == "user/my_reward"
    assert recorded["upload_repo_type"] == "model"
    assert recorded["commit_message"] == "Upload reward model weights, train config and readme"

    # Minimum set of files every reward-model upload must contain:
    # config.json, train_config.json, the README and a safetensors weight file.
    uploaded_files = recorded["files"]
    for required_file in (CONFIG_NAME, TRAIN_CONFIG_NAME, "README.md"):
        assert required_file in uploaded_files
    assert any(file_name.endswith(".safetensors") for file_name in uploaded_files)