tests/policies/test_sarm_subtask_annotations.py

#!/usr/bin/env python

# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

pytest.importorskip("transformers")

from lerobot.data_processing.sarm_annotations.subtask_annotation import (
    Subtask,
    SubtaskAnnotation,
    Timestamp,
    compute_temporal_proportions,
)


def make_annotation(subtasks: list[tuple[str, int, int]]) -> SubtaskAnnotation:
    """Helper to create SubtaskAnnotation from list of (name, start_sec, end_sec)."""
    return SubtaskAnnotation(
        subtasks=[
            Subtask(
                name=name,
                timestamps=Timestamp(
                    start=f"{start // 60:02d}:{start % 60:02d}", end=f"{end // 60:02d}:{end % 60:02d}"
                ),
            )
            for name, start, end in subtasks
        ]
    )


class TestComputeTemporalProportions:
    """Tests for compute_temporal_proportions (SARM Paper Formula 1).

    Formula: ᾱ_k = (1/M) × Σ_i (L_{i,k} / T_i)

    Key insight: This averages the PROPORTION of each subtask within each trajectory,
    giving equal weight to all trajectories regardless of absolute length.
    """

    def test_basic_two_trajectories_equal_proportions(self):
        """Test with two trajectories that have equal proportions."""
        # Both trajectories: subtask1 = 50%, subtask2 = 50%
        # Traj 1: T=100s, subtask1=50s, subtask2=50s
        # Traj 2: T=200s, subtask1=100s, subtask2=100s
        annotations = {
            0: make_annotation([("subtask1", 0, 50), ("subtask2", 50, 100)]),
            1: make_annotation([("subtask1", 0, 100), ("subtask2", 100, 200)]),
        }

        result = compute_temporal_proportions(annotations)

        # Both should be 0.5
        assert abs(result["subtask1"] - 0.5) < 1e-6
        assert abs(result["subtask2"] - 0.5) < 1e-6

    def test_paper_example_different_from_avg_durations(self):
        """Test that compute_temporal_proportions differs from naive average duration approach.

        This is the key test showing the difference between:
        - Paper formula: average of (L_i,k / T_i)
        - Naive approach: mean(L_i,k) / sum(mean(L_i,j))
        """
        # Episode 1: T=100s, subtask1=80s, subtask2=20s (proportions: 0.8, 0.2)
        # Episode 2: T=200s, subtask1=40s, subtask2=160s (proportions: 0.2, 0.8)
        annotations = {
            0: make_annotation([("subtask1", 0, 80), ("subtask2", 80, 100)]),
            1: make_annotation([("subtask1", 0, 40), ("subtask2", 40, 200)]),
        }

        result = compute_temporal_proportions(annotations)

        # Paper formula:
        # ᾱ_1 = (1/2) × (80/100 + 40/200) = (1/2) × (0.8 + 0.2) = 0.5
        # ᾱ_2 = (1/2) × (20/100 + 160/200) = (1/2) × (0.2 + 0.8) = 0.5
        assert abs(result["subtask1"] - 0.5) < 1e-6
        assert abs(result["subtask2"] - 0.5) < 1e-6

    def test_single_trajectory(self):
        """Test with a single trajectory."""
        # T=100s, reach=30s, grasp=20s, lift=50s
        annotations = {
            0: make_annotation([("reach", 0, 30), ("grasp", 30, 50), ("lift", 50, 100)]),
        }

        result = compute_temporal_proportions(annotations)

        assert abs(result["reach"] - 0.3) < 1e-6
        assert abs(result["grasp"] - 0.2) < 1e-6
        assert abs(result["lift"] - 0.5) < 1e-6

    def test_sum_to_one(self):
        """Test that proportions always sum to 1."""
        # Three episodes with varying proportions
        annotations = {
            0: make_annotation([("a", 0, 10), ("b", 10, 50), ("c", 50, 100)]),  # 0.1, 0.4, 0.5
            1: make_annotation([("a", 0, 20), ("b", 20, 70), ("c", 70, 100)]),  # 0.2, 0.5, 0.3
            2: make_annotation([("a", 0, 30), ("b", 30, 90), ("c", 90, 100)]),  # 0.3, 0.6, 0.1
        }

        result = compute_temporal_proportions(annotations)

        total = sum(result.values())
        assert abs(total - 1.0) < 1e-6

    def test_empty_annotations_returns_empty(self):
        """Test that empty annotations returns empty dict."""
        result = compute_temporal_proportions({})
        assert result == {}

    def test_uniform_proportions(self):
        """Test with uniform proportions across subtasks."""
        # Each subtask takes 25% of each episode
        annotations = {
            0: make_annotation([("a", 0, 25), ("b", 25, 50), ("c", 50, 75), ("d", 75, 100)]),
            1: make_annotation([("a", 0, 50), ("b", 50, 100), ("c", 100, 150), ("d", 150, 200)]),
        }

        result = compute_temporal_proportions(annotations)

        for name in ["a", "b", "c", "d"]:
            assert abs(result[name] - 0.25) < 1e-6
-												Add sarm (#2639)

* add initial modeling

* make rewind pretrained policy

* add annotation

* small fix

* add sarm

* subtasks

* fix spawn

* fix rewind discrepancies

* Add script to generate embedding for dataset (#2138)

* Add generate and validate script

* fix precommit

* Improve generate embeddings function by using dataset tools (#2206)

---------

Co-authored-by: Michel Aractingi <michel.aractingi@huggingface.co>

* cleanup

* change order train log

* print batch size

* update sarm processor

* add reward output

* change expected features

* add image validation

* change validation

* get state input from dataset stats

* raise if no state key is found

* pass stats

* cleanup and refactor

* add episode inddex to complementary data

* add subtask init and detection

* revert lerobot_train changes

* pass dataset metadata to policy

* change loadig subtasks

* add small logging

* fix progress conversion and adding initial frame

* use large offset for initial frame (ugly)

* Remove rewind, use clip tokenizer

* add tests, implement formula 1,2 correctly and cleanup

* use task from dataset, cleanup visualizer

* simplify

* simplify and cleanup code and move compute_temporal_proportions to utils

* fix normalization in visualization

* Fix visualization and change prompt

* fix formatting

* add visualize subtask annotations

* use qwen thinking

* try different prompt

* format

* update prompt

* higher temp, long output

* different settings

* use instruct

* show full resp

* split message

* Temp: increase tolerance dataset

* Fix RA-BC (#2572)

* Add next observation loading for RA-BC progress deltas

* Compute weights based on temporal progress deltas instead of static rewards

* Add hard-masking for negative progress deltas in weight computation

* Feat/add dual head (#2582)

* Add dual dense sparse head and annotation

* Add docs

* add dual to procesor

* cleanup

* change sampling in visualize and cleanup

* remove validation

* remove compile

* Feat/test uniform (#2587)

* test uniform

* add different string for misaligned

* Fix rewind and add tests

* uncomment text implementation

* run precommit

* Add head mode for ra-bc

* fix visalization of single task

* add

* return per sample loss

* Fix RA_BC (#2602)

* update rabc implementation

* compute rabc beforehand

* fix import

* add only progress calulation

* use precomputed progress

* multi gpu processing

* import

* fix dataset meta data extraction

* add logging

* logging

* log

* progress per episode

* split differently

* move clip to gpu

* pre decode frames for an episode

* fix cuda initalization

* fix import

* multi processing

* rename

* fix import

* fix

* fix rabc

* use last known progress if oob

* use last known progress if oob

* add misalignment loss with random embeddings

* discard previous changes

* add selection of models to docs for ra_bc

* add transformers dep

* extend tolerance

* initial commit with new codebase

* add tests

* fix

* remove temporal sampler

* drop last frame for sampler

* use original ref

* some fixes

* fix visualization

* remove smoothing and fix order subtasks

* add stride rabc computation

* add push to hub

* add explanation

* add kappa expllaination

* better rabc logging

* feedback pr

* remove dataset tolerance

* revert dataset tool

* revert dataset changes

* add credit

* run precommit

* change path for generate ra_bc

* fix type

* include sarm in all in pyproject

* fix precommit

* lazy import matplotlib

* lazy import qwen

* remove rich console

* skip if transformers is not installed?

* run only when we have faker

* place transformer lazy loading

* Dont test if low transformer version

* fix

* increase transformer

* increase as 4.57.0 is yanked

* remove pi from all

* go back

---------

Co-authored-by: Michel Aractingi <michel.aractingi@huggingface.co>
Co-authored-by: s1lent4gnt <kmeftah.khalil@gmail.com>
											
										
										
											2025-12-18 12:50:32 +01:00
+								#!/usr/bin/env python
 								# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
 								#
 								# Licensed under the Apache License, Version 2.0 (the "License");
 								# you may not use this file except in compliance with the License.
 								# You may obtain a copy of the License at
 								#
 								#     http://www.apache.org/licenses/LICENSE-2.0
 								#
 								# Unless required by applicable law or agreed to in writing, software
 								# distributed under the License is distributed on an "AS IS" BASIS,
 								# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								# See the License for the specific language governing permissions and
 								# limitations under the License.
 								import pytest
 								pytest.importorskip("transformers")
 								from lerobot.data_processing.sarm_annotations.subtask_annotation import (
 								    Subtask,
 								    SubtaskAnnotation,
 								    Timestamp,
 								    compute_temporal_proportions,
 								)
 								def make_annotation(subtasks: list[tuple[str, int, int]]) -> SubtaskAnnotation:
 								    """Helper to create SubtaskAnnotation from list of (name, start_sec, end_sec)."""
 								    return SubtaskAnnotation(
 								        subtasks=[
 								            Subtask(
 								                name=name,
 								                timestamps=Timestamp(
 								                    start=f"{start // 60:02d}:{start % 60:02d}", end=f"{end // 60:02d}:{end % 60:02d}"
 								                ),
 								            )
 								            for name, start, end in subtasks
 								        ]
 								    )
 								class TestComputeTemporalProportions:
 								    """Tests for compute_temporal_proportions (SARM Paper Formula 1).
 								    Formula: ᾱ_k = (1/M) × Σ_i (L_{i,k} / T_i)
 								    Key insight: This averages the PROPORTION of each subtask within each trajectory,
 								    giving equal weight to all trajectories regardless of absolute length.
 								    """
 								    def test_basic_two_trajectories_equal_proportions(self):
 								        """Test with two trajectories that have equal proportions."""
 								        # Both trajectories: subtask1 = 50%, subtask2 = 50%
 								        # Traj 1: T=100s, subtask1=50s, subtask2=50s
 								        # Traj 2: T=200s, subtask1=100s, subtask2=100s
 								        annotations = {
 : make_annotation([("subtask1", 0, 50), ("subtask2", 50, 100)]),
 : make_annotation([("subtask1", 0, 100), ("subtask2", 100, 200)]),
 								        }
 								        result = compute_temporal_proportions(annotations)
 								        # Both should be 0.5
 								        assert abs(result["subtask1"] - 0.5) < 1e-6
 								        assert abs(result["subtask2"] - 0.5) < 1e-6
 								    def test_paper_example_different_from_avg_durations(self):
 								        """Test that compute_temporal_proportions differs from naive average duration approach.
 								        This is the key test showing the difference between:
 								        - Paper formula: average of (L_i,k / T_i)
 								        - Naive approach: mean(L_i,k) / sum(mean(L_i,j))
 								        """
 								        # Episode 1: T=100s, subtask1=80s, subtask2=20s (proportions: 0.8, 0.2)
 								        # Episode 2: T=200s, subtask1=40s, subtask2=160s (proportions: 0.2, 0.8)
 								        annotations = {
 : make_annotation([("subtask1", 0, 80), ("subtask2", 80, 100)]),
 : make_annotation([("subtask1", 0, 40), ("subtask2", 40, 200)]),
 								        }
 								        result = compute_temporal_proportions(annotations)
 								        # Paper formula:
 								        # ᾱ_1 = (1/2) × (80/100 + 40/200) = (1/2) × (0.8 + 0.2) = 0.5
 								        # ᾱ_2 = (1/2) × (20/100 + 160/200) = (1/2) × (0.2 + 0.8) = 0.5
 								        assert abs(result["subtask1"] - 0.5) < 1e-6
 								        assert abs(result["subtask2"] - 0.5) < 1e-6
 								    def test_single_trajectory(self):
 								        """Test with a single trajectory."""
 								        # T=100s, reach=30s, grasp=20s, lift=50s
 								        annotations = {
 : make_annotation([("reach", 0, 30), ("grasp", 30, 50), ("lift", 50, 100)]),
 								        }
 								        result = compute_temporal_proportions(annotations)
 								        assert abs(result["reach"] - 0.3) < 1e-6
 								        assert abs(result["grasp"] - 0.2) < 1e-6
 								        assert abs(result["lift"] - 0.5) < 1e-6
 								    def test_sum_to_one(self):
 								        """Test that proportions always sum to 1."""
 								        # Three episodes with varying proportions
 								        annotations = {
 : make_annotation([("a", 0, 10), ("b", 10, 50), ("c", 50, 100)]),  # 0.1, 0.4, 0.5
 : make_annotation([("a", 0, 20), ("b", 20, 70), ("c", 70, 100)]),  # 0.2, 0.5, 0.3
 : make_annotation([("a", 0, 30), ("b", 30, 90), ("c", 90, 100)]),  # 0.3, 0.6, 0.1
 								        }
 								        result = compute_temporal_proportions(annotations)
 								        total = sum(result.values())
 								        assert abs(total - 1.0) < 1e-6
 								    def test_empty_annotations_returns_empty(self):
 								        """Test that empty annotations returns empty dict."""
 								        result = compute_temporal_proportions({})
 								        assert result == {}
 								    def test_uniform_proportions(self):
 								        """Test with uniform proportions across subtasks."""
 								        # Each subtask takes 25% of each episode
 								        annotations = {
 : make_annotation([("a", 0, 25), ("b", 25, 50), ("c", 50, 75), ("d", 75, 100)]),
 : make_annotation([("a", 0, 50), ("b", 50, 100), ("c", 100, 150), ("d", 150, 200)]),
 								        }
 								        result = compute_temporal_proportions(annotations)
 								        for name in ["a", "b", "c", "d"]:
 								            assert abs(result[name] - 0.25) < 1e-6