feat(benchmarks): add matrix runner and leaderboard

2026-05-31 02:41:24 +00:00 · 2026-04-15 21:31:33 +02:00
parent dab511dbb1
commit 2ab59a3099
21 changed files with 2096 additions and 50 deletions
--- a/.github/workflows/benchmark_tests.yml
+++ b/.github/workflows/benchmark_tests.yml
@@ -310,3 +310,181 @@ jobs:
          name: metaworld-metrics
          path: /tmp/metaworld-artifacts/metrics.json
          if-no-files-found: warn
+
+  # ── LIBERO-plus ───────────────────────────────────────────────────────────
+  # Builds the LIBERO-plus benchmark image and runs a short SmolVLA evaluation in
+  # it. Videos and metrics are collected and uploaded even when the eval fails.
+  libero-plus-integration-test:
+    name: LIBERO-plus — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      # The image is only loaded into the local Docker daemon, never pushed.
+      - name: Build LIBERO-plus benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.libero_plus
+          push: false
+          load: true
+          tags: lerobot-benchmark-libero-plus:ci
+          cache-from: type=local,src=/tmp/.buildx-cache-libero-plus
+          cache-to: type=local,dest=/tmp/.buildx-cache-libero-plus,mode=max
+
+      # Skipped when the Hub token secret is not available to this run.
+      - name: Run LIBERO-plus smoke eval (1 episode)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          # `set -e` inside the container makes a failing lerobot-eval fail this
+          # step; without it only the exit status of the last command is reported.
+          docker run --name libero-plus-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            lerobot-benchmark-libero-plus:ci \
+            bash -c "
+              set -e
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=lerobot/smolvla_libero_plus \
+                --env.type=libero_plus \
+                --env.task=libero_spatial \
+                '--env.task_ids=[0,100,260,500,1000,1500,2000,2400]' \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
+                --policy.empty_cameras=1 \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env libero_plus --task libero_spatial \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      # Best effort: the container or its artifacts may not exist if the eval was skipped.
+      - name: Copy LIBERO-plus artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/libero-plus-artifacts
+          docker cp libero-plus-eval:/tmp/eval-artifacts/. /tmp/libero-plus-artifacts/ 2>/dev/null || true
+          docker rm -f libero-plus-eval || true
+
+      - name: Parse LIBERO-plus eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/libero-plus-artifacts \
+            --env libero_plus \
+            --task libero_spatial \
+            --policy lerobot/smolvla_libero_plus
+
+      - name: Upload LIBERO-plus rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: libero-plus-rollout-video
+          path: /tmp/libero-plus-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload LIBERO-plus eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: libero-plus-metrics
+          path: /tmp/libero-plus-artifacts/metrics.json
+          if-no-files-found: warn
+
+  # ── ROBOMME ───────────────────────────────────────────────────────────────
+  # Builds the RoboMME benchmark image and runs a short SmolVLA evaluation in it.
+  # Videos and metrics are collected and uploaded even when the eval fails.
+  robomme-integration-test:
+    name: RoboMME — build image + 1-episode eval
+    runs-on:
+      group: aws-g6-4xlarge-plus
+    env:
+      HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+          lfs: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
+        with:
+          cache-binary: false
+
+      # The image is only loaded into the local Docker daemon, never pushed.
+      - name: Build RoboMME benchmark image
+        uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
+        with:
+          context: .
+          file: docker/Dockerfile.benchmark.robomme
+          push: false
+          load: true
+          tags: lerobot-benchmark-robomme:ci
+
+      # Skipped when the Hub token secret is not available to this run.
+      - name: Run RoboMME smoke eval (1 episode)
+        if: env.HF_USER_TOKEN != ''
+        run: |
+          # `set -e` inside the container makes a failing lerobot-eval fail this
+          # step; without it only the exit status of the last command is reported.
+          docker run --name robomme-eval --gpus all \
+            --shm-size=4g \
+            -e HF_HOME=/tmp/hf \
+            -e HF_USER_TOKEN="${HF_USER_TOKEN}" \
+            -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
+            lerobot-benchmark-robomme:ci \
+            bash -c "
+              set -e
+              hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
+              lerobot-eval \
+                --policy.path=lerobot/smolvla_robomme \
+                --env.type=robomme \
+                --env.task=PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
+                --env.dataset_split=test \
+                --eval.batch_size=1 \
+                --eval.n_episodes=1 \
+                --eval.use_async_envs=false \
+                --policy.device=cuda \
+                '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
+                --policy.empty_cameras=3 \
+                --output_dir=/tmp/eval-artifacts
+              python scripts/ci/extract_task_descriptions.py \
+                --env robomme --task PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
+                --output /tmp/eval-artifacts/task_descriptions.json
+            "
+
+      # Best effort: the container or its artifacts may not exist if the eval was skipped.
+      - name: Copy RoboMME artifacts from container
+        if: always()
+        run: |
+          mkdir -p /tmp/robomme-artifacts
+          docker cp robomme-eval:/tmp/eval-artifacts/. /tmp/robomme-artifacts/ 2>/dev/null || true
+          docker rm -f robomme-eval || true
+
+      # NOTE(review): the eval above runs five tasks but only PickXtimes is passed
+      # here; confirm that parse_eval_metrics.py treats --task as a label only.
+      - name: Parse RoboMME eval metrics
+        if: always()
+        run: |
+          python3 scripts/ci/parse_eval_metrics.py \
+            --artifacts-dir /tmp/robomme-artifacts \
+            --env robomme \
+            --task PickXtimes \
+            --policy lerobot/smolvla_robomme
+
+      - name: Upload RoboMME rollout video
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robomme-rollout-video
+          path: /tmp/robomme-artifacts/videos/
+          if-no-files-found: warn
+
+      - name: Upload RoboMME eval metrics
+        if: always()
+        uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
+        with:
+          name: robomme-metrics
+          path: /tmp/robomme-artifacts/metrics.json
+          if-no-files-found: warn
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -0,0 +1 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
--- a/benchmarks/publish_benchmark_result.py
+++ b/benchmarks/publish_benchmark_result.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Publish benchmark rows and lightweight artifacts to a Hub dataset."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from lerobot.utils.history_repo import UploadTarget, make_hub_file_url, upload_targets, utc_timestamp_slug
+
+
+def load_json_if_exists(path: Path | None) -> dict[str, Any] | None:
+    """Return the parsed JSON content of ``path``, or ``None`` when it is unavailable.
+
+    Args:
+        path: File to read. ``None`` is accepted so that the result of an optional
+            lookup such as ``find_latest_train_config_path`` can be passed directly.
+
+    Returns:
+        The decoded JSON document, or ``None`` if ``path`` is ``None`` or does not exist.
+
+    Raises:
+        json.JSONDecodeError: If the file exists but does not contain valid JSON.
+    """
+    if path is None or not path.exists():
+        return None
+    return json.loads(path.read_text())
+
+
+def find_latest_train_config_path(run_root: Path) -> Path | None:
+    """Locate the ``train_config.json`` of the last checkpoint under ``run_root``.
+
+    Candidates are ``<run_root>/train/checkpoints/*/pretrained_model/train_config.json``.
+    They are ranked by checkpoint directory name using plain string comparison and
+    the one with the greatest name wins.
+
+    Returns:
+        The selected config path, or ``None`` when the checkpoints directory is
+        missing or holds no matching file.
+    """
+    checkpoints_root = run_root / "train" / "checkpoints"
+    if checkpoints_root.exists():
+        configs = list(checkpoints_root.glob("*/pretrained_model/train_config.json"))
+        if configs:
+            # parts[-3] is the checkpoint directory name in <name>/pretrained_model/train_config.json.
+            return max(configs, key=lambda config: config.parts[-3])
+    return None
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command line of the publishing script.
+
+    Every option is required except ``--slurm_job_id``, which defaults to an
+    empty string.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+    # (flag, converter) pairs; a converter of None keeps the raw string.
+    required_options = (
+        ("--benchmark", None),
+        ("--policy", None),
+        ("--run_root", Path),
+        ("--results_repo", None),
+        ("--git_commit", None),
+        ("--num_gpus", int),
+        ("--microbatch_per_gpu", int),
+        ("--gradient_accumulation_steps", int),
+        ("--effective_batch_size", int),
+        ("--train_wall_time_s", float),
+        ("--eval_wall_time_s", float),
+    )
+    for flag, converter in required_options:
+        cli.add_argument(flag, required=True, type=converter)
+    cli.add_argument("--slurm_job_id", default="")
+    cli.add_argument("--docker_image", required=True)
+    return cli.parse_args()
+
+
+def build_row(args: argparse.Namespace) -> tuple[dict[str, Any], list[UploadTarget]]:
+    """Assemble the benchmark result row and the list of files to upload.
+
+    The row is also written to ``<run_root>/benchmark_row.json``. Missing eval
+    results or a missing train config are tolerated: the corresponding sections
+    of the row are left empty and the files are not scheduled for upload.
+
+    Args:
+        args: Parsed command line as produced by ``parse_args``.
+
+    Returns:
+        A ``(row, upload_list)`` pair. ``upload_list`` always contains the row
+        file, plus ``eval_info.json`` and ``train_config.json`` when they exist.
+    """
+    now = datetime.now(UTC)
+    created_at = now.isoformat()
+    timestamp = utc_timestamp_slug(now)
+    run_id = f"{timestamp}__{args.benchmark}__{args.policy}__{args.slurm_job_id or 'manual'}"
+    eval_info_path = args.run_root / "eval" / "eval_info.json"
+    eval_info = load_json_if_exists(eval_info_path) or {}
+    train_config_path = find_latest_train_config_path(args.run_root)
+    # A run without checkpoints has no train config; never hand ``None`` to the loader.
+    train_config: dict[str, Any] = {}
+    if train_config_path is not None:
+        train_config = load_json_if_exists(train_config_path) or {}
+
+    artifact_prefix = f"artifacts/{args.benchmark}/{args.policy}/{run_id}"
+    row_path_in_repo = f"rows/{args.benchmark}/{args.policy}/{run_id}.json"
+
+    row = {
+        "schema_version": 1,
+        "created_at": created_at,
+        "run_id": run_id,
+        "benchmark": args.benchmark,
+        "policy": args.policy,
+        "git_commit": args.git_commit,
+        "slurm_job_id": args.slurm_job_id or None,
+        "docker_image": args.docker_image,
+        "resources": {
+            "num_gpus": args.num_gpus,
+            "microbatch_per_gpu": args.microbatch_per_gpu,
+            "gradient_accumulation_steps": args.gradient_accumulation_steps,
+            "effective_batch_size": args.effective_batch_size,
+        },
+        "timings": {
+            "train_wall_time_s": args.train_wall_time_s,
+            "eval_wall_time_s": args.eval_wall_time_s,
+            "total_wall_time_s": args.train_wall_time_s + args.eval_wall_time_s,
+        },
+        "eval": {
+            "overall": eval_info.get("overall", {}),
+            "per_group": eval_info.get("per_group", {}),
+            "per_task_count": len(eval_info.get("per_task", [])),
+        },
+        "paths": {
+            "run_root": str(args.run_root),
+            "train_dir": str(args.run_root / "train"),
+            "eval_dir": str(args.run_root / "eval"),
+        },
+        "train_config": train_config,
+        "artifact_urls": {
+            "row": make_hub_file_url(args.results_repo, row_path_in_repo),
+        },
+    }
+
+    row_path = args.run_root / "benchmark_row.json"
+    row_path.parent.mkdir(parents=True, exist_ok=True)
+    upload_list = [UploadTarget(local_path=row_path, path_in_repo=row_path_in_repo)]
+
+    if eval_info_path.exists():
+        row["artifact_urls"]["eval_info"] = make_hub_file_url(
+            args.results_repo, f"{artifact_prefix}/eval_info.json"
+        )
+        upload_list.append(
+            UploadTarget(local_path=eval_info_path, path_in_repo=f"{artifact_prefix}/eval_info.json")
+        )
+
+    if train_config_path is not None and train_config_path.exists():
+        row["artifact_urls"]["train_config"] = make_hub_file_url(
+            args.results_repo, f"{artifact_prefix}/train_config.json"
+        )
+        upload_list.append(
+            UploadTarget(local_path=train_config_path, path_in_repo=f"{artifact_prefix}/train_config.json")
+        )
+
+    # Written after the artifact URLs are filled in so the local file matches the row.
+    row_path.write_text(json.dumps(row, indent=2, sort_keys=True))
+    return row, upload_list
+
+
+def main() -> int:
+    """Build the benchmark row, upload it with its artifacts, and print it.
+
+    The uploaded paths are added to the row under ``uploaded_paths`` and the
+    local ``benchmark_row.json`` is rewritten to include them.
+
+    Returns:
+        ``0``, used as the process exit status.
+    """
+    cli_args = parse_args()
+    benchmark_row, targets = build_row(cli_args)
+    benchmark_row["uploaded_paths"] = upload_targets(
+        repo_id=cli_args.results_repo,
+        targets=targets,
+        repo_type="dataset",
+        private=False,
+        commit_message=f"Add benchmark row {benchmark_row['run_id']}",
+    )
+    serialized = json.dumps(benchmark_row, indent=2, sort_keys=True)
+    (cli_args.run_root / "benchmark_row.json").write_text(serialized)
+    print(serialized)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/benchmarks/run_benchmark_matrix.py
+++ b/benchmarks/run_benchmark_matrix.py
@@ -0,0 +1,647 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Generate lightweight SLURM jobs for policy x benchmark benchmarking."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import subprocess
+from dataclasses import asdict, dataclass, field
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from lerobot.utils.history_repo import utc_timestamp_slug
+
+# Bounds applied to each policy's requested GPU count when planning jobs.
+MAX_GPUS = 8
+MIN_GPUS = 1
+# Default number of training steps for a benchmark; also used as the scheduler
+# decay horizon of several policies.
+DEFAULT_STEPS = 20_000
+# Default effective batch size for a benchmark.
+DEFAULT_EFFECTIVE_BATCH_SIZE = 256
+# Default per-GPU batch size for a policy.
+DEFAULT_MICROBATCH_PER_GPU = 32
+# Value used for `eval.batch_size` in the benchmark eval arguments.
+DEFAULT_EVAL_BATCH_SIZE = 1
+# Per-GPU multipliers for the CPU and memory (GB) requests of the sbatch script.
+DEFAULT_CPUS_PER_GPU = 8
+DEFAULT_MEMORY_PER_GPU_GB = 40
+
+
+@dataclass(frozen=True)
+class BenchmarkSpec:
+    """Static description of one benchmark: its dataset, image and eval settings."""
+
+    # Key of the benchmark, used in run paths and script names.
+    name: str
+    # Passed to training as `dataset.repo_id`.
+    dataset_repo_id: str
+    # Image used for every `docker run` of a job on this benchmark.
+    docker_image: str
+    # Passed to evaluation as `env.type`.
+    eval_env_type: str
+    # Passed to evaluation as `env.task`.
+    eval_task: str
+    # Passed to evaluation as `eval.n_episodes`.
+    eval_n_episodes: int
+    # Passed to training as `steps` and `save_freq`.
+    train_steps: int = DEFAULT_STEPS
+    # Target batch size used to derive the gradient accumulation steps.
+    effective_batch_size: int = DEFAULT_EFFECTIVE_BATCH_SIZE
+    # Extra CLI arguments merged into the training / evaluation arguments.
+    train_extra_args: dict[str, Any] = field(default_factory=dict)
+    eval_extra_args: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class PolicySpec:
+    """Static description of one policy: how to train it and on how many GPUs."""
+
+    # Key of the policy, used in run paths and script names.
+    name: str
+    # Passed to training as `policy.type` when `policy_path` is not set.
+    policy_type: str
+    # Requested GPU count; clamped to [MIN_GPUS, MAX_GPUS] when jobs are planned.
+    num_gpus: int
+    # Passed to training as `policy.path` when set.
+    policy_path: str | None = None
+    # Passed to training as `batch_size`.
+    microbatch_per_gpu: int = DEFAULT_MICROBATCH_PER_GPU
+    # Extra CLI arguments merged last into the training / evaluation arguments.
+    extra_train_args: dict[str, Any] = field(default_factory=dict)
+    extra_eval_args: dict[str, Any] = field(default_factory=dict)
+    # When true, a tokenizer training stage is planned with `tokenizer_args`.
+    needs_tokenizer: bool = False
+    tokenizer_args: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class PlannedJob:
+    """One fully resolved benchmark x policy job, ready to be rendered as a script."""
+
+    # Names of the benchmark and policy this job combines.
+    benchmark: str
+    policy: str
+    # Run directory relative to the output directory (`runs/<benchmark>/<policy>/<slug>`).
+    run_rel: str
+    # Resolved resources and batch configuration.
+    num_gpus: int
+    microbatch_per_gpu: int
+    gradient_accumulation_steps: int
+    effective_batch_size: int
+    docker_image: str
+    # CLI arguments for the training and evaluation commands.
+    train_args: dict[str, Any]
+    eval_args: dict[str, Any]
+    # CLI arguments for the tokenizer stage, or None when the policy needs none.
+    tokenizer_args: dict[str, Any] | None
+    # Location of the generated sbatch script.
+    script_path: str
+
+
+# Registry of supported benchmarks, keyed by the name accepted by `--benchmarks`.
+# Both entries remap the dataset camera keys to `camera1` / `camera2` for training.
+BENCHMARKS: dict[str, BenchmarkSpec] = {
+    "libero_plus": BenchmarkSpec(
+        name="libero_plus",
+        dataset_repo_id="lerobot/libero_plus",
+        docker_image="lerobot-benchmark-libero-plus:latest",
+        eval_env_type="libero_plus",
+        eval_task="libero_spatial,libero_object,libero_goal,libero_10",
+        eval_n_episodes=10,
+        train_extra_args={
+            "rename_map": {
+                "observation.images.image": "observation.images.camera1",
+                "observation.images.image2": "observation.images.camera2",
+            },
+        },
+        eval_extra_args={
+            "env.camera_name_mapping": {
+                "agentview_image": "camera1",
+                "robot0_eye_in_hand_image": "camera2",
+            },
+            "env.max_parallel_tasks": 1,
+            "eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
+            "eval.use_async_envs": False,
+            "eval.max_episodes_rendered": 0,
+            "policy.device": "cuda",
+        },
+    ),
+    "robomme": BenchmarkSpec(
+        name="robomme",
+        dataset_repo_id="lerobot/robomme",
+        docker_image="lerobot-benchmark-robomme:latest",
+        eval_env_type="robomme",
+        eval_task=(
+            "BinFill,PickXtimes,SwingXtimes,StopCube,VideoUnmask,VideoUnmaskSwap,"
+            "ButtonUnmask,ButtonUnmaskSwap,PickHighlight,VideoRepick,VideoPlaceButton,"
+            "VideoPlaceOrder,MoveCube,InsertPeg,PatternLock,RouteStick"
+        ),
+        eval_n_episodes=50,
+        train_extra_args={
+            "rename_map": {
+                "observation.images.image": "observation.images.camera1",
+                "observation.images.wrist_image": "observation.images.camera2",
+            },
+        },
+        eval_extra_args={
+            "env.dataset_split": "test",
+            "env.max_parallel_tasks": 1,
+            "rename_map": {
+                "observation.images.image": "observation.images.camera1",
+                "observation.images.wrist_image": "observation.images.camera2",
+            },
+            "eval.batch_size": DEFAULT_EVAL_BATCH_SIZE,
+            "eval.use_async_envs": False,
+            "eval.max_episodes_rendered": 0,
+            "policy.device": "cuda",
+        },
+    ),
+}
+
+
+# Registry of supported policies, keyed by the name accepted by `--policies`.
+# NOTE(review): `policy.scheduler_decay_steps` is tied to DEFAULT_STEPS, not to the
+# benchmark's `train_steps`; confirm this is intended if a benchmark overrides it.
+POLICIES: dict[str, PolicySpec] = {
+    "pi0": PolicySpec(
+        name="pi0",
+        policy_type="pi0",
+        policy_path="lerobot/pi0_base",
+        num_gpus=8,
+        extra_train_args={
+            "policy.n_action_steps": 30,
+            "policy.scheduler_decay_steps": DEFAULT_STEPS,
+            "policy.empty_cameras": 0,
+        },
+    ),
+    # The only policy that plans a tokenizer training stage.
+    "pi0_fast": PolicySpec(
+        name="pi0_fast",
+        policy_type="pi0_fast",
+        policy_path="lerobot/pi0fast-base",
+        num_gpus=8,
+        extra_train_args={
+            "policy.n_action_steps": 30,
+            "policy.scheduler_decay_steps": DEFAULT_STEPS,
+            "policy.empty_cameras": 0,
+        },
+        needs_tokenizer=True,
+        tokenizer_args={
+            "action_horizon": 30,
+            "encoded_dims": "0:7",
+            "normalization_mode": "QUANTILES",
+            "vocab_size": 1024,
+            "scale": 10.0,
+            "push_to_hub": True,
+        },
+    ),
+    "pi05": PolicySpec(
+        name="pi05",
+        policy_type="pi05",
+        policy_path="lerobot/pi05_base",
+        num_gpus=8,
+        extra_train_args={
+            "policy.n_action_steps": 30,
+            "policy.scheduler_decay_steps": DEFAULT_STEPS,
+            "policy.empty_cameras": 0,
+        },
+    ),
+    "groot": PolicySpec(
+        name="groot",
+        policy_type="groot",
+        num_gpus=8,
+        extra_train_args={
+            "policy.n_action_steps": 30,
+            "policy.base_model_path": "nvidia/GR00T-N1.5-3B",
+            "policy.tune_diffusion_model": True,
+            "policy.tune_projector": True,
+            "policy.tune_llm": False,
+            "policy.tune_visual": False,
+            "policy.use_bf16": True,
+        },
+    ),
+    "act": PolicySpec(
+        name="act",
+        policy_type="act",
+        num_gpus=1,
+        extra_train_args={
+            "policy.n_action_steps": 30,
+        },
+    ),
+    "diffusion": PolicySpec(
+        name="diffusion",
+        policy_type="diffusion",
+        num_gpus=1,
+        extra_train_args={
+            "policy.horizon": 32,
+            "policy.n_action_steps": 30,
+            "policy.n_obs_steps": 2,
+        },
+    ),
+    "smolvla": PolicySpec(
+        name="smolvla",
+        policy_type="smolvla",
+        policy_path="lerobot/smolvla_base",
+        num_gpus=8,
+        extra_train_args={
+            "policy.n_action_steps": 30,
+            "policy.load_vlm_weights": True,
+            "policy.freeze_vision_encoder": False,
+            "policy.train_expert_only": False,
+            "policy.scheduler_decay_steps": DEFAULT_STEPS,
+            "policy.empty_cameras": 1,
+        },
+    ),
+    "xvla": PolicySpec(
+        name="xvla",
+        policy_type="xvla",
+        policy_path="lerobot/xvla-widowx",
+        num_gpus=4,
+        extra_train_args={
+            "policy.n_action_steps": 32,
+            "policy.scheduler_decay_steps": DEFAULT_STEPS,
+            "policy.empty_cameras": 1,
+        },
+    ),
+    "multi_task_dit": PolicySpec(
+        name="multi_task_dit",
+        policy_type="multi_task_dit",
+        num_gpus=1,
+        extra_train_args={
+            "policy.horizon": 32,
+            "policy.n_action_steps": 30,
+        },
+    ),
+}
+
+
+def normalize_repo_id(hub_org: str, repo_or_id: str) -> str:
+    """Return a full ``org/name`` repo id.
+
+    A value that already contains ``/`` is returned unchanged; otherwise it is
+    prefixed with ``hub_org``.
+    """
+    if "/" in repo_or_id:
+        return repo_or_id
+    return f"{hub_org}/{repo_or_id}"
+
+
+def get_requested_names(
+    requested: list[str] | None,
+    available: dict[str, Any],
+    *,
+    kind: str,
+) -> list[str]:
+    """Validate a user selection against the registry ``available``.
+
+    Args:
+        requested: Names chosen on the command line; ``None`` or empty selects everything.
+        available: Registry whose keys are the valid names.
+        kind: Label used in the error message (e.g. ``"policies"``).
+
+    Returns:
+        ``requested`` unchanged when it is non-empty and valid, otherwise all
+        registry keys in registry order.
+
+    Raises:
+        ValueError: If any requested name is not a key of ``available``.
+    """
+    if requested:
+        missing = sorted(name for name in set(requested) if name not in available)
+        if missing:
+            raise ValueError(f"Unknown {kind}: {', '.join(missing)}. Available: {', '.join(available)}")
+        return requested
+    return list(available)
+
+
+def compute_gradient_accumulation_steps(
+    *,
+    effective_batch_size: int,
+    num_gpus: int,
+    microbatch_per_gpu: int,
+) -> int:
+    """Return how many micro-steps must be accumulated to reach the effective batch.
+
+    Args:
+        effective_batch_size: Target number of samples per optimizer step.
+        num_gpus: Number of GPUs used for training.
+        microbatch_per_gpu: Samples processed per GPU per micro-step.
+
+    Returns:
+        ``effective_batch_size // (num_gpus * microbatch_per_gpu)``.
+
+    Raises:
+        ValueError: If any argument is not positive, or if the effective batch
+            size is not a multiple of ``num_gpus * microbatch_per_gpu``.
+    """
+    # Reject non-positive sizes up front: zero would otherwise surface as a
+    # ZeroDivisionError or as a meaningless result of 0 accumulation steps.
+    if effective_batch_size <= 0 or num_gpus <= 0 or microbatch_per_gpu <= 0:
+        raise ValueError(
+            f"Batch sizes and GPU count must be positive, got {effective_batch_size=}, "
+            f"{num_gpus=}, {microbatch_per_gpu=}."
+        )
+    per_step_batch = num_gpus * microbatch_per_gpu
+    if effective_batch_size % per_step_batch != 0:
+        raise ValueError(
+            f"Cannot reach effective batch {effective_batch_size} with {num_gpus=} and "
+            f"{microbatch_per_gpu=}."
+        )
+    return effective_batch_size // per_step_batch
+
+
+def make_run_slug() -> str:
+    """Return the slug used as the last component of a run directory.
+
+    Delegates to ``utc_timestamp_slug`` called without arguments.
+    """
+    return utc_timestamp_slug()
+
+
+def shell_value(value: Any) -> str:
+    """Render ``value`` as a double-quoted shell word.
+
+    Booleans become ``true`` / ``false``, dicts and lists are serialized as JSON
+    with sorted keys, and anything else goes through ``str``. Backslashes,
+    double quotes, dollar signs and backticks are backslash-escaped.
+    """
+    if isinstance(value, bool):
+        text = "true" if value else "false"
+    elif isinstance(value, (dict, list)):
+        text = json.dumps(value, sort_keys=True)
+    else:
+        text = str(value)
+    # One pass over the characters is equivalent to escaping the backslash first
+    # and the remaining special characters afterwards.
+    quoted = "".join("\\" + char if char in '\\"$`' else char for char in text)
+    return f'"{quoted}"'
+
+
+def format_cli_args(args: dict[str, Any]) -> str:
+    """Format ``args`` as ``--key="value"`` options, one per line.
+
+    Each option is indented by two spaces and the lines are joined with a shell
+    line continuation, so the result can follow a command name in a script.
+    """
+    return " \\\n".join(f"  --{name}={shell_value(raw)}" for name, raw in args.items())
+
+
+def build_train_args(
+    *,
+    benchmark: BenchmarkSpec,
+    policy: PolicySpec,
+    train_dir: str,
+    gradient_accumulation_steps: int,
+) -> dict[str, Any]:
+    """Build the training CLI arguments for one benchmark x policy pair.
+
+    Precedence, lowest to highest: the base arguments, the policy source
+    (``policy.path`` when the policy has a path, otherwise ``policy.type``),
+    the benchmark's extra train arguments, then the policy's extra train arguments.
+    """
+    base: dict[str, Any] = {
+        "dataset.repo_id": benchmark.dataset_repo_id,
+        "output_dir": train_dir,
+        "steps": benchmark.train_steps,
+        "batch_size": policy.microbatch_per_gpu,
+        "gradient_accumulation_steps": gradient_accumulation_steps,
+        "eval_freq": 0,
+        # A single checkpoint is saved, at the final step.
+        "save_freq": benchmark.train_steps,
+        "save_checkpoint": True,
+        "log_freq": 100,
+        "wandb.enable": False,
+        "policy.push_to_hub": False,
+        "policy.device": "cuda",
+    }
+    policy_source = (
+        {"policy.path": policy.policy_path} if policy.policy_path else {"policy.type": policy.policy_type}
+    )
+    return {**base, **policy_source, **benchmark.train_extra_args, **policy.extra_train_args}
+
+
+def build_eval_args(
+    *,
+    benchmark: BenchmarkSpec,
+    policy: PolicySpec,
+    checkpoint_path: str,
+    eval_dir: str,
+) -> dict[str, Any]:
+    """Build the evaluation CLI arguments for one benchmark x policy pair.
+
+    The benchmark's extra eval arguments override the base arguments, and the
+    policy's extra eval arguments override both.
+    """
+    base: dict[str, Any] = {
+        "policy.path": checkpoint_path,
+        "env.type": benchmark.eval_env_type,
+        "env.task": benchmark.eval_task,
+        "eval.n_episodes": benchmark.eval_n_episodes,
+        "output_dir": eval_dir,
+    }
+    return {**base, **benchmark.eval_extra_args, **policy.extra_eval_args}
+
+
+def plan_jobs(
+    *,
+    output_dir: Path,
+    hub_org: str,
+    results_repo: str,
+    policies: list[str],
+    benchmarks: list[str],
+) -> list[PlannedJob]:
+    """Resolve every requested benchmark x policy pair into a ``PlannedJob``.
+
+    Args:
+        output_dir: Directory that will hold the generated ``slurm`` scripts.
+        hub_org: Hub organization used to name the tokenizer repo of policies
+            that need a tokenizer.
+        results_repo: Accepted for interface compatibility; not used here.
+        policies: Keys of ``POLICIES`` to plan.
+        benchmarks: Keys of ``BENCHMARKS`` to plan.
+
+    Returns:
+        One job per pair, ordered by benchmark and then by policy.
+
+    Raises:
+        ValueError: If a pair cannot reach the benchmark's effective batch size.
+    """
+    _ = results_repo  # intentionally unused, kept in the signature for callers
+    scripts_dir = output_dir / "slurm"
+    jobs: list[PlannedJob] = []
+    for benchmark_name in benchmarks:
+        benchmark = BENCHMARKS[benchmark_name]
+        for policy_name in policies:
+            policy = POLICIES[policy_name]
+            num_gpus = max(MIN_GPUS, min(policy.num_gpus, MAX_GPUS))
+            run_rel = f"runs/{benchmark_name}/{policy_name}/{make_run_slug()}"
+            # Paths are expressed as seen from inside the container.
+            run_root = f"/benchmark-output/{run_rel}"
+            gradient_accumulation_steps = compute_gradient_accumulation_steps(
+                effective_batch_size=benchmark.effective_batch_size,
+                num_gpus=num_gpus,
+                microbatch_per_gpu=policy.microbatch_per_gpu,
+            )
+            train_dir = f"{run_root}/train"
+            # Training saves a single checkpoint at the final step; evaluate that one.
+            checkpoint_path = f"{train_dir}/checkpoints/{benchmark.train_steps:06d}/pretrained_model"
+            eval_dir = f"{run_root}/eval"
+            train_args = build_train_args(
+                benchmark=benchmark,
+                policy=policy,
+                train_dir=train_dir,
+                gradient_accumulation_steps=gradient_accumulation_steps,
+            )
+            eval_args = build_eval_args(
+                benchmark=benchmark,
+                policy=policy,
+                checkpoint_path=checkpoint_path,
+                eval_dir=eval_dir,
+            )
+            tokenizer_args = None
+            if policy.needs_tokenizer:
+                tokenizer_repo_id = f"{hub_org}/{policy_name}-{benchmark_name}-tokenizer"
+                tokenizer_args = {
+                    "repo_id": benchmark.dataset_repo_id,
+                    "output_dir": f"{run_root}/tokenizer",
+                    "hub_repo_id": tokenizer_repo_id,
+                    **policy.tokenizer_args,
+                }
+                # Training picks up the tokenizer produced by the tokenizer stage.
+                train_args["policy.action_tokenizer_name"] = tokenizer_repo_id
+            script_path = str(scripts_dir / f"{benchmark_name}__{policy_name}.sbatch")
+            jobs.append(
+                PlannedJob(
+                    benchmark=benchmark_name,
+                    policy=policy_name,
+                    run_rel=run_rel,
+                    num_gpus=num_gpus,
+                    microbatch_per_gpu=policy.microbatch_per_gpu,
+                    gradient_accumulation_steps=gradient_accumulation_steps,
+                    effective_batch_size=benchmark.effective_batch_size,
+                    docker_image=benchmark.docker_image,
+                    train_args=train_args,
+                    eval_args=eval_args,
+                    tokenizer_args=tokenizer_args,
+                    script_path=script_path,
+                )
+            )
+    return jobs
+
+
+def render_sbatch_script(
+    *,
+    job: PlannedJob,
+    output_dir: Path,
+    results_repo_id: str,
+    git_commit: str,
+) -> str:
+    """Render the sbatch script that runs one planned job end to end.
+
+    The script runs, in order, the optional tokenizer training, the training,
+    the evaluation and the result publication. Each stage is a separate
+    ``docker run`` of ``job.docker_image`` with the resolved ``output_dir``
+    mounted at ``/benchmark-output``. The inner commands are wrapped in single
+    quotes, so the ``${...}`` references they contain are expanded inside the
+    container; every variable they use must therefore be passed with ``-e``.
+
+    Args:
+        job: The planned job to render.
+        output_dir: Host directory holding logs and run outputs.
+        results_repo_id: Hub repo id passed to the publishing script.
+        git_commit: Commit hash recorded with the published row.
+
+    Returns:
+        The full text of the sbatch script.
+    """
+    host_output_dir = output_dir.resolve()
+    run_root = f"/benchmark-output/{job.run_rel}"
+    host_run_root = host_output_dir / job.run_rel
+    cpus_per_task = max(DEFAULT_CPUS_PER_GPU, DEFAULT_CPUS_PER_GPU * job.num_gpus)
+    mem_gb = max(DEFAULT_MEMORY_PER_GPU_GB, DEFAULT_MEMORY_PER_GPU_GB * job.num_gpus)
+    train_cli = format_cli_args(job.train_args)
+    eval_cli = format_cli_args(job.eval_args)
+    tokenizer_command = ""
+    if job.tokenizer_args:
+        tokenizer_cli = format_cli_args(job.tokenizer_args)
+        tokenizer_command = f"""
+docker run --rm --gpus all \\
+  --shm-size=16g \\
+  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
+  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_HOME=/tmp/hf \\
+  -v "{host_output_dir}:/benchmark-output" \\
+  -w /lerobot \\
+  "{job.docker_image}" \\
+  bash -lc '
+    set -euo pipefail
+    if [[ -n "${{HF_TOKEN:-}}" ]]; then
+      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
+    fi
+    lerobot-train-tokenizer \\
+{tokenizer_cli}
+  '
+"""
+    # SLURM_JOB_ID is forwarded to the publish container explicitly: it is read
+    # inside the container, where the host's SLURM environment is not visible.
+    return f"""#!/bin/bash
+#SBATCH --job-name=bench-{job.benchmark}-{job.policy}
+#SBATCH --gres=gpu:{job.num_gpus}
+#SBATCH --cpus-per-task={cpus_per_task}
+#SBATCH --mem={mem_gb}G
+#SBATCH --output={host_output_dir}/logs/{job.benchmark}__{job.policy}__%j.out
+#SBATCH --error={host_output_dir}/logs/{job.benchmark}__{job.policy}__%j.err
+
+set -euo pipefail
+
+HF_TOKEN="${{HF_TOKEN:-${{HF_USER_TOKEN:-}}}}"
+GPU_IDS="$(seq -s, 0 $(({job.num_gpus} - 1)))"
+RUN_ROOT="{run_root}"
+
+mkdir -p "{host_output_dir}/logs"
+mkdir -p "{host_run_root.parent}"
+
+{tokenizer_command}
+
+TRAIN_START="$(date +%s)"
+docker run --rm --gpus all \\
+  --shm-size=16g \\
+  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
+  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_HOME=/tmp/hf \\
+  -v "{host_output_dir}:/benchmark-output" \\
+  -w /lerobot \\
+  "{job.docker_image}" \\
+  bash -lc '
+    set -euo pipefail
+    if [[ -n "${{HF_TOKEN:-}}" ]]; then
+      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
+    fi
+    accelerate launch --num_processes={job.num_gpus} $(which lerobot-train) \\
+{train_cli}
+  '
+TRAIN_END="$(date +%s)"
+
+EVAL_START="$(date +%s)"
+docker run --rm --gpus all \\
+  --shm-size=16g \\
+  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
+  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_HOME=/tmp/hf \\
+  -v "{host_output_dir}:/benchmark-output" \\
+  -w /lerobot \\
+  "{job.docker_image}" \\
+  bash -lc '
+    set -euo pipefail
+    if [[ -n "${{HF_TOKEN:-}}" ]]; then
+      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
+    fi
+    lerobot-eval \\
+{eval_cli}
+  '
+EVAL_END="$(date +%s)"
+TRAIN_WALL_TIME_S="$((TRAIN_END - TRAIN_START))"
+EVAL_WALL_TIME_S="$((EVAL_END - EVAL_START))"
+
+docker run --rm --gpus all \\
+  --shm-size=16g \\
+  -e CUDA_VISIBLE_DEVICES="${{GPU_IDS}}" \\
+  -e HF_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_USER_TOKEN="${{HF_TOKEN:-}}" \\
+  -e HF_HOME=/tmp/hf \\
+  -e RUN_ROOT="${{RUN_ROOT}}" \\
+  -e TRAIN_WALL_TIME_S="${{TRAIN_WALL_TIME_S}}" \\
+  -e EVAL_WALL_TIME_S="${{EVAL_WALL_TIME_S}}" \\
+  -e SLURM_JOB_ID="${{SLURM_JOB_ID:-}}" \\
+  -v "{host_output_dir}:/benchmark-output" \\
+  -w /lerobot \\
+  "{job.docker_image}" \\
+  bash -lc '
+    set -euo pipefail
+    if [[ -n "${{HF_TOKEN:-}}" ]]; then
+      hf auth login --token "${{HF_TOKEN}}" --add-to-git-credential 2>/dev/null || true
+    fi
+    uv run python benchmarks/publish_benchmark_result.py \\
+      --benchmark={job.benchmark} \\
+      --policy={job.policy} \\
+      --run_root="${{RUN_ROOT}}" \\
+      --results_repo={results_repo_id} \\
+      --git_commit={git_commit} \\
+      --num_gpus={job.num_gpus} \\
+      --microbatch_per_gpu={job.microbatch_per_gpu} \\
+      --gradient_accumulation_steps={job.gradient_accumulation_steps} \\
+      --effective_batch_size={job.effective_batch_size} \\
+      --train_wall_time_s="${{TRAIN_WALL_TIME_S}}" \\
+      --eval_wall_time_s="${{EVAL_WALL_TIME_S}}" \\
+      --slurm_job_id="${{SLURM_JOB_ID:-}}" \\
+      --docker_image={job.docker_image}
+  '
+"""
+
+
+def write_manifest(
+    *,
+    output_dir: Path,
+    jobs: list[PlannedJob],
+    git_commit: str,
+    hub_org: str,
+    results_repo: str,
+) -> Path:
+    """Write ``manifest.json`` describing the planned jobs into ``output_dir``.
+
+    The manifest records the generation time (UTC, ISO format), the git commit,
+    the hub organization, the results repo and every job as a plain dict.
+
+    Returns:
+        The path of the written manifest.
+    """
+    manifest_path = output_dir / "manifest.json"
+    payload = dict(
+        generated_at=datetime.now(UTC).isoformat(),
+        git_commit=git_commit,
+        hub_org=hub_org,
+        results_repo=results_repo,
+        jobs=list(map(asdict, jobs)),
+    )
+    manifest_path.write_text(json.dumps(payload, indent=2, sort_keys=True))
+    return manifest_path
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command line of the matrix runner.
+
+    ``--policies`` and ``--benchmarks`` are optional lists that default to
+    ``None``. ``--output_dir``, ``--hub_org`` and ``--results_repo`` are
+    required, and ``--submit`` is a flag.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+    for selector in ("--policies", "--benchmarks"):
+        cli.add_argument(selector, nargs="*", default=None)
+    cli.add_argument("--output_dir", required=True, type=Path)
+    for flag in ("--hub_org", "--results_repo"):
+        cli.add_argument(flag, required=True)
+    cli.add_argument("--submit", action="store_true")
+    return cli.parse_args()
+
+
+def get_git_commit() -> str:
+    """Return the output of ``git rev-parse HEAD``, stripped of surrounding whitespace.
+
+    Raises:
+        subprocess.CalledProcessError: If the git command exits with a non-zero status.
+    """
+    return subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()
+
+
+def main() -> int:
+    """Generate the sbatch scripts and manifest, then optionally submit the jobs.
+
+    All scripts and the manifest are written before anything is submitted, so a
+    failing ``sbatch`` call cannot leave already-submitted jobs without a
+    manifest entry.
+
+    Returns:
+        ``0``, used as the process exit status.
+
+    Raises:
+        ValueError: If an unknown policy or benchmark is requested.
+        subprocess.CalledProcessError: If ``git`` or ``sbatch`` fails.
+    """
+    args = parse_args()
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+    (args.output_dir / "slurm").mkdir(parents=True, exist_ok=True)
+    (args.output_dir / "logs").mkdir(parents=True, exist_ok=True)
+
+    selected_policies = get_requested_names(args.policies, POLICIES, kind="policies")
+    selected_benchmarks = get_requested_names(args.benchmarks, BENCHMARKS, kind="benchmarks")
+    git_commit = get_git_commit()
+    results_repo_id = normalize_repo_id(args.hub_org, args.results_repo)
+
+    jobs = plan_jobs(
+        output_dir=args.output_dir,
+        hub_org=args.hub_org,
+        results_repo=results_repo_id,
+        policies=selected_policies,
+        benchmarks=selected_benchmarks,
+    )
+
+    script_paths: list[Path] = []
+    for job in jobs:
+        script = render_sbatch_script(
+            job=job,
+            output_dir=args.output_dir,
+            results_repo_id=results_repo_id,
+            git_commit=git_commit,
+        )
+        script_path = Path(job.script_path)
+        script_path.write_text(script)
+        script_path.chmod(0o755)
+        script_paths.append(script_path)
+
+    manifest_path = write_manifest(
+        output_dir=args.output_dir,
+        jobs=jobs,
+        git_commit=git_commit,
+        hub_org=args.hub_org,
+        results_repo=results_repo_id,
+    )
+
+    # Submit only once every planned job is recorded on disk.
+    if args.submit:
+        for script_path in script_paths:
+            subprocess.run(["sbatch", str(script_path)], check=True)
+
+    print(f"Wrote {len(jobs)} benchmark jobs to {args.output_dir}")
+    print(f"Manifest: {manifest_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/docker/Dockerfile.benchmark.libero_plus
+++ b/docker/Dockerfile.benchmark.libero_plus
@@ -0,0 +1,48 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM huggingface/lerobot-gpu:latest
+
+# System packages are installed as root; the build then switches back to user_lerobot.
+USER root
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+         unzip libexpat1 libfontconfig1-dev libmagickwand-dev \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+USER user_lerobot
+
+# Python packages installed ahead of the LIBERO-plus checkout (which is installed with --no-deps).
+RUN uv pip install --no-cache \
+        "robosuite==1.4.1" bddl easydict mujoco matplotlib wand scikit-image gym
+
+ENV LIBERO_PLUS_ROOT=/home/user_lerobot/libero-plus/libero/libero
+# Clone LIBERO-plus and install it in editable mode. Only the `hf-libero` uninstall is
+# best-effort: it is grouped with its own `|| true` so that a failing clone or install
+# still fails the build instead of being masked by a trailing `|| true`.
+RUN git clone --depth=1 https://github.com/sylvestf/LIBERO-plus.git /home/user_lerobot/libero-plus \
+    && cd /home/user_lerobot/libero-plus && uv pip install --no-cache --no-deps -e "." \
+    && (uv pip uninstall hf-libero 2>/dev/null || true)
+# Put the checkout first on the module search path.
+ENV PYTHONPATH="/home/user_lerobot/libero-plus:${PYTHONPATH}"
+
+# Download assets.zip from the Sylvest/LIBERO-plus dataset repo, move the extracted
+# `assets` directory under LIBERO_PLUS_ROOT, then delete the download directory.
+RUN python -c "\
+from huggingface_hub import hf_hub_download; \
+hf_hub_download(repo_id='Sylvest/LIBERO-plus', repo_type='dataset', \
+                filename='assets.zip', local_dir='/tmp/libero-plus-dl')" \
+    && unzip -q /tmp/libero-plus-dl/assets.zip -d /tmp/libero-plus-dl/extract \
+    && mv /tmp/libero-plus-dl/extract/inspire/hdd/project/embodied-multimodality/public/syfei/libero_new/release/dataset/LIBERO-plus-0/assets \
+          ${LIBERO_PLUS_ROOT}/assets \
+    && rm -rf /tmp/libero-plus-dl
+
+# Write ~/.libero/config.yaml with the assets, bddl_files, datasets and init_states paths.
+RUN mkdir -p /home/user_lerobot/.libero \
+    && printf "assets: ${LIBERO_PLUS_ROOT}/assets\nbddl_files: ${LIBERO_PLUS_ROOT}/bddl_files\ndatasets: ${LIBERO_PLUS_ROOT}/../datasets\ninit_states: ${LIBERO_PLUS_ROOT}/init_files\n" \
+       > /home/user_lerobot/.libero/config.yaml
+
+# Copy the build context into the working directory, owned by user_lerobot.
+COPY --chown=user_lerobot:user_lerobot . .
+
+CMD ["/bin/bash"]
--- a/docker/Dockerfile.benchmark.robomme
+++ b/docker/Dockerfile.benchmark.robomme
@@ -0,0 +1,39 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM huggingface/lerobot-gpu:latest
+
+# Request all NVIDIA driver capabilities and point the Vulkan loader at the ICD
+# manifest that the RUN step below writes.
+ENV NVIDIA_DRIVER_CAPABILITIES=all \
+    VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json
+
+# Install the Vulkan packages as root and write the NVIDIA ICD manifest, then
+# switch back to user_lerobot.
+USER root
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+         libvulkan1 libvulkan-dev mesa-vulkan-drivers \
+    && mkdir -p /usr/share/vulkan/icd.d \
+    && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
+       > /usr/share/vulkan/icd.d/nvidia_icd.json \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+USER user_lerobot
+
+# Only the packaging metadata is copied before the install; the full source tree is
+# copied at the end (presumably to keep the dependency layer cacheable — confirm).
+COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
+# gymnasium and numpy are pinned through a uv override file for this install.
+# NOTE(review): robomme is installed from the `main` branch, so the revision that
+# gets resolved can differ between builds.
+RUN printf 'gymnasium==0.29.1\nnumpy==1.26.4\n' > /tmp/robomme_override.txt \
+    && uv pip install --no-cache --override /tmp/robomme_override.txt \
+         -e ".[smolvla,av-dep]" \
+         "robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main" \
+    && python -c "import robomme; print('robomme import OK')"
+
+# Copy the build context into the working directory, owned by user_lerobot.
+COPY --chown=user_lerobot:user_lerobot . .
+
+CMD ["/bin/bash"]
--- a/scripts/ci/extract_task_descriptions.py
+++ b/scripts/ci/extract_task_descriptions.py
@@ -31,10 +31,22 @@ from __future__ import annotations

 import argparse
 import json
+import re
 import sys
 from pathlib import Path


+# LIBERO-plus builds task.language by joining the words of the perturbation-variant
+# filename with spaces, so the trailing perturbation metadata has to be removed to
+# get back the base prompt.
+_LIBERO_PERTURBATION_TAIL_RE = re.compile(
+    r"(?:\s(?:view|initstate|noise|add|tb|table|light|level)(?:\s\d+)+)+$"
+)
+
+
+def _strip_libero_perturbation_tail(instruction: str) -> str:
+    """Remove trailing ``<keyword> <digits...>`` groups from ``instruction`` and trim whitespace."""
+    without_tail = re.sub(_LIBERO_PERTURBATION_TAIL_RE, "", instruction)
+    return without_tail.strip()
+
+
 def _libero_descriptions(task_suite: str) -> dict[str, str]:
    from libero.libero import benchmark  # type: ignore[import-untyped]

@@ -47,7 +59,10 @@ def _libero_descriptions(task_suite: str) -> dict[str, str]:
        )
        return {}
    suite = suite_dict[task_suite]()
-    return {f"{task_suite}_{i}": suite.get_task(i).language for i in range(suite.n_tasks)}
+    return {
+        f"{task_suite}_{i}": _strip_libero_perturbation_tail(suite.get_task(i).language)
+        for i in range(suite.n_tasks)
+    }


 def _metaworld_descriptions(task_name: str) -> dict[str, str]:
@@ -57,6 +72,14 @@ def _metaworld_descriptions(task_name: str) -> dict[str, str]:
    return {f"{task_name}_0": label}


+def _robomme_descriptions(task_names: str) -> dict[str, str]:
+    """Map each name of a comma-separated list to a description keyed by ``{name}_0``.
+
+    The description is the name with underscores replaced by spaces; blank list
+    entries are ignored.
+    """
+    descriptions: dict[str, str] = {}
+    for raw_name in task_names.split(","):
+        name = raw_name.strip()
+        if not name:
+            continue
+        descriptions[f"{name}_0"] = name.replace("_", " ").strip()
+    return descriptions
+
+
 def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
@@ -66,10 +89,12 @@ def main() -> int:

    descriptions: dict[str, str] = {}
    try:
-        if args.env == "libero":
+        if args.env in {"libero", "libero_plus"}:
            descriptions = _libero_descriptions(args.task)
        elif args.env == "metaworld":
            descriptions = _metaworld_descriptions(args.task)
+        elif args.env == "robomme":
+            descriptions = _robomme_descriptions(args.task)
        else:
            print(
                f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
--- a/spaces/benchmark-leaderboard/README.md
+++ b/spaces/benchmark-leaderboard/README.md
@@ -0,0 +1,27 @@
+---
+title: LeRobot Benchmark Leaderboard
+emoji: 🤖
+colorFrom: yellow
+colorTo: orange
+sdk: gradio
+sdk_version: 5.29.0
+app_file: app.py
+pinned: false
+license: apache-2.0
+short_description: Benchmark history for LeRobot policy x benchmark runs
+---
+
+# LeRobot Benchmark Leaderboard
+
+This Space reads immutable benchmark rows from a Hugging Face dataset and shows:
+
+- Latest result per policy and benchmark
+- Historical trends over time
+- Direct links to uploaded eval and config artifacts
+
+## Configuration
+
+Set `BENCHMARK_RESULTS_REPO` in the Space settings if you want to point the UI
+at a different public dataset. The default is:
+
+- `lerobot/benchmark-history`
--- a/spaces/benchmark-leaderboard/app.py
+++ b/spaces/benchmark-leaderboard/app.py
@@ -0,0 +1,226 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import json
+import os
+import time
+from pathlib import Path
+from typing import Any
+
+import gradio as gr
+import pandas as pd
+import plotly.express as px
+from huggingface_hub import HfApi, hf_hub_download
+
+# Dataset repo the rows are read from; overridable through the BENCHMARK_RESULTS_REPO
+# environment variable.
+RESULTS_REPO = os.environ.get("BENCHMARK_RESULTS_REPO", "lerobot/benchmark-history")
+# Local directory passed as `cache_dir` when row files are downloaded.
+CACHE_DIR = Path("/tmp/benchmark-leaderboard-cache")
+CACHE_DIR.mkdir(parents=True, exist_ok=True)
+# Maximum age, in seconds, of an in-memory entry before `load_rows` rebuilds it.
+CACHE_TTL_S = 300
+
+# cache key -> (time.monotonic() value at load time, loaded DataFrame)
+_CACHE: dict[str, tuple[float, pd.DataFrame]] = {}
+
+
+def _row_to_record(row: dict[str, Any]) -> dict[str, Any]:
+    """Flatten one benchmark row into the flat record used to build the leaderboard frame.
+
+    Args:
+        row: decoded row JSON. The ``eval.overall``, ``resources``, ``timings`` and
+            ``artifact_urls`` sections are all optional.
+
+    Returns:
+        A dict with a fixed set of keys. A field whose section or value is missing is
+        ``None``. A section that is present but ``null`` (or not a mapping at all) is
+        treated like a missing one instead of raising ``AttributeError``.
+    """
+
+    def _section(container: Any, key: str) -> dict[str, Any]:
+        # `.get(key, {})` alone is not enough: it returns None when the key exists
+        # with a null value, and the next `.get` would then fail.
+        value = container.get(key) if isinstance(container, dict) else None
+        return value if isinstance(value, dict) else {}
+
+    overall = _section(_section(row, "eval"), "overall")
+    resources = _section(row, "resources")
+    timings = _section(row, "timings")
+    artifact_urls = _section(row, "artifact_urls")
+    return {
+        "created_at": row.get("created_at"),
+        "benchmark": row.get("benchmark"),
+        "policy": row.get("policy"),
+        "success_rate": overall.get("pc_success"),
+        "n_episodes": overall.get("n_episodes"),
+        "avg_sum_reward": overall.get("avg_sum_reward"),
+        "train_wall_time_s": timings.get("train_wall_time_s"),
+        "eval_wall_time_s": timings.get("eval_wall_time_s"),
+        "total_wall_time_s": timings.get("total_wall_time_s"),
+        "num_gpus": resources.get("num_gpus"),
+        "microbatch_per_gpu": resources.get("microbatch_per_gpu"),
+        "gradient_accumulation_steps": resources.get("gradient_accumulation_steps"),
+        "effective_batch_size": resources.get("effective_batch_size"),
+        "git_commit": row.get("git_commit"),
+        "row_url": artifact_urls.get("row"),
+        "eval_info_url": artifact_urls.get("eval_info"),
+        "train_config_url": artifact_urls.get("train_config"),
+    }
+
+
+def load_rows(repo_id: str = RESULTS_REPO) -> pd.DataFrame:
+    """Load every ``rows/*`` file of the dataset repo into one frame, newest run first.
+
+    The frame is kept in the module-level cache for ``CACHE_TTL_S`` seconds per repo,
+    and the cached object itself is returned on a cache hit.
+    """
+    cache_key = f"rows::{repo_id}"
+    entry = _CACHE.get(cache_key)
+    if entry is not None:
+        loaded_at, cached_df = entry
+        if (time.monotonic() - loaded_at) < CACHE_TTL_S:
+            return cached_df
+
+    repo_files = HfApi().list_repo_files(repo_id=repo_id, repo_type="dataset")
+    row_files = sorted((name for name in repo_files if name.startswith("rows/")), reverse=True)
+
+    def _load_record(path_in_repo: str) -> dict[str, Any]:
+        # Download one row file into CACHE_DIR and flatten its JSON content.
+        local_path = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=path_in_repo, cache_dir=CACHE_DIR)
+        with open(local_path) as f:
+            return _row_to_record(json.load(f))
+
+    df = pd.DataFrame.from_records([_load_record(path_in_repo) for path_in_repo in row_files])
+    if not df.empty:
+        df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
+        df = df.sort_values("created_at", ascending=False).reset_index(drop=True)
+    _CACHE[cache_key] = (time.monotonic(), df)
+    return df
+
+
+def make_latest_table(df: pd.DataFrame) -> pd.DataFrame:
+    """Return the most recent run of every (benchmark, policy) pair.
+
+    Args:
+        df: frame of flattened rows; an empty frame is returned unchanged.
+
+    Returns:
+        One row per (benchmark, policy) pair, holding the values of the run with the
+        newest ``created_at``. Rows are ordered by benchmark, then by descending
+        success rate (missing rates last), then by policy.
+    """
+    if df.empty:
+        return df
+    group_keys = ["benchmark", "policy"]
+    latest = (
+        # Rows without a benchmark or policy are left out, as the previous groupby did.
+        df.dropna(subset=group_keys)
+        .sort_values("created_at", ascending=False)
+        # Keep the whole newest row of each pair. `groupby(...).first()` would pick
+        # the first non-null value of every column independently, which fills gaps
+        # in the newest run with values taken from older runs.
+        .drop_duplicates(subset=group_keys, keep="first")
+        .sort_values(
+            ["benchmark", "success_rate", "policy"],
+            ascending=[True, False, True],
+            na_position="last",
+        )
+        .reset_index(drop=True)
+    )
+    return latest[
+        [
+            "benchmark",
+            "policy",
+            "success_rate",
+            "n_episodes",
+            "train_wall_time_s",
+            "eval_wall_time_s",
+            "num_gpus",
+            "effective_batch_size",
+            "git_commit",
+            "row_url",
+            "eval_info_url",
+            "train_config_url",
+        ]
+    ]
+
+
+def make_history_figure(df: pd.DataFrame, benchmark: str, policy: str | None) -> Any:
+    """Build the success-rate history line chart for one benchmark.
+
+    Args:
+        df: frame of flattened rows; may be empty.
+        benchmark: value of the ``benchmark`` column to keep.
+        policy: value of the ``policy`` column to keep; ``None``, ``""`` or ``"All"``
+            keeps every policy.
+
+    Returns:
+        A plotly figure. When no row matches, an empty chart titled
+        "No benchmark rows found" is returned.
+    """
+    # A frame built from zero records has no columns at all, so selecting
+    # df["benchmark"] would raise KeyError; handle the empty case first.
+    if df.empty:
+        return px.line(title="No benchmark rows found")
+    filtered = df[df["benchmark"] == benchmark]
+    if policy and policy != "All":
+        filtered = filtered[filtered["policy"] == policy]
+    if filtered.empty:
+        return px.line(title="No benchmark rows found")
+    fig = px.line(
+        filtered.sort_values("created_at"),
+        x="created_at",
+        y="success_rate",
+        color="policy",
+        markers=True,
+        hover_data=["git_commit", "num_gpus", "train_wall_time_s", "eval_wall_time_s"],
+        title=f"{benchmark} success rate history",
+    )
+    fig.update_layout(yaxis_title="Success rate (%)", xaxis_title="Run time")
+    return fig
+
+
+def make_run_markdown(df: pd.DataFrame, benchmark: str, policy: str | None) -> str:
+    """Describe the newest matching run as Markdown.
+
+    Args:
+        df: frame of flattened rows; may be empty.
+        benchmark: value of the ``benchmark`` column to keep.
+        policy: value of the ``policy`` column to keep; ``None``, ``""`` or ``"All"``
+            keeps every policy.
+
+    Returns:
+        Paragraphs separated by blank lines, or "No matching runs yet." when no row
+        matches. Artifact links are only listed when the corresponding URL is set.
+    """
+    # A frame built from zero records has no columns at all, so selecting
+    # df["benchmark"] would raise KeyError; handle the empty case first.
+    if df.empty:
+        return "No matching runs yet."
+    filtered = df[df["benchmark"] == benchmark]
+    if policy and policy != "All":
+        filtered = filtered[filtered["policy"] == policy]
+    if filtered.empty:
+        return "No matching runs yet."
+    latest = filtered.sort_values("created_at", ascending=False).iloc[0]
+    lines = [
+        f"Latest run: `{latest['policy']}` on `{latest['benchmark']}`",
+        f"Success rate: `{latest['success_rate']}`",
+        f"GPUs: `{latest['num_gpus']}`",
+        f"Effective batch size: `{latest['effective_batch_size']}`",
+        f"Commit: `{latest['git_commit']}`",
+    ]
+    link_columns = (
+        ("Row JSON", "row_url"),
+        ("Eval Info", "eval_info_url"),
+        ("Train Config", "train_config_url"),
+    )
+    for label, column in link_columns:
+        link = latest[column] if pd.notna(latest[column]) else None
+        if link:
+            lines.append(f"{label}: [open]({link})")
+    return "\n\n".join(lines)
+
+
+def refresh_view(benchmark: str, policy: str) -> tuple[pd.DataFrame, dict[str, Any], Any, str]:
+    """Recompute every output of the page for the selected benchmark and policy.
+
+    A benchmark that is not among the loaded ones falls back to the first available
+    benchmark, and a policy that is not offered for that benchmark falls back to "All".
+
+    Returns:
+        ``(latest table, policy dropdown update, history figure, summary markdown)``.
+    """
+    df = load_rows()
+    has_rows = not df.empty
+
+    benchmark_names = sorted(df["benchmark"].dropna().unique().tolist()) if has_rows else []
+    if benchmark_names and benchmark not in benchmark_names:
+        benchmark = benchmark_names[0]
+
+    policy_choices = ["All"]
+    if has_rows and benchmark:
+        on_benchmark = df[df["benchmark"] == benchmark]
+        policy_choices += sorted(on_benchmark["policy"].dropna().unique().tolist())
+    if policy not in policy_choices:
+        policy = "All"
+
+    return (
+        make_latest_table(df),
+        gr.update(choices=policy_choices, value=policy),
+        make_history_figure(df, benchmark, policy),
+        make_run_markdown(df, benchmark, policy),
+    )
+
+
+with gr.Blocks(title="LeRobot Benchmark Leaderboard") as demo:
+    # Page header with a link to the dataset the rows are read from.
+    gr.Markdown(
+        f"""
+# LeRobot Benchmark Leaderboard
+
+Results dataset: [`{RESULTS_REPO}`](https://huggingface.co/datasets/{RESULTS_REPO})
+"""
+    )
+
+    # Controls.
+    with gr.Row():
+        benchmark_dropdown = gr.Dropdown(label="Benchmark", choices=[])
+        policy_dropdown = gr.Dropdown(label="Policy", choices=["All"], value="All")
+        refresh_button = gr.Button("Refresh")
+
+    # Outputs.
+    latest_table = gr.Dataframe(label="Latest Results", interactive=False)
+    history_plot = gr.Plot(label="History")
+    latest_summary = gr.Markdown()
+
+    def _initial_state():
+        """Fill the benchmark dropdown and render the view for the first benchmark."""
+        rows = load_rows()
+        if rows.empty:
+            available = []
+        else:
+            available = sorted(rows["benchmark"].dropna().unique().tolist())
+        first_benchmark = available[0] if available else ""
+        table, policy_update, figure, markdown = refresh_view(first_benchmark, "All")
+        benchmark_update = gr.update(choices=available, value=first_benchmark)
+        return benchmark_update, policy_update, table, figure, markdown
+
+    demo.load(
+        _initial_state,
+        outputs=[benchmark_dropdown, policy_dropdown, latest_table, history_plot, latest_summary],
+    )
+
+    # The refresh button and both dropdowns trigger the same recomputation.
+    refresh_inputs = [benchmark_dropdown, policy_dropdown]
+    refresh_outputs = [latest_table, policy_dropdown, history_plot, latest_summary]
+    for register in (refresh_button.click, benchmark_dropdown.change, policy_dropdown.change):
+        register(refresh_view, inputs=refresh_inputs, outputs=refresh_outputs)
+
+
+if __name__ == "__main__":
+    demo.launch()
--- a/spaces/benchmark-leaderboard/requirements.txt
+++ b/spaces/benchmark-leaderboard/requirements.txt
@@ -0,0 +1,4 @@
+gradio>=5.0.0,<6.0.0
+plotly>=5.18.0
+pandas>=2.0.0
+huggingface-hub>=1.0.0,<2.0.0
--- a/src/lerobot/configs/default.py
+++ b/src/lerobot/configs/default.py
@@ -67,11 +67,17 @@ class EvalConfig:
    # `batch_size` specifies the number of environments to use in a gym.vector.VectorEnv.
    # Set to 0 for auto-tuning based on available CPU cores and n_episodes.
    batch_size: int = 0
+    # Number of rollout videos to save per evaluated task. Set to 0 to disable videos.
+    max_episodes_rendered: int = 10
    # `use_async_envs` specifies whether to use asynchronous environments (multiprocessing).
    # Defaults to True; automatically downgraded to SyncVectorEnv when batch_size=1.
    use_async_envs: bool = True

    def __post_init__(self) -> None:
+        if self.max_episodes_rendered < 0:
+            raise ValueError(
+                f"`max_episodes_rendered` must be non-negative, got {self.max_episodes_rendered}."
+            )
        if self.batch_size == 0:
            self.batch_size = self._auto_batch_size()
        if self.batch_size > self.n_episodes:
--- a/src/lerobot/configs/train.py
+++ b/src/lerobot/configs/train.py
@@ -56,6 +56,7 @@ class TrainPipelineConfig(HubMixin):
    # Number of workers for the dataloader.
    num_workers: int = 4
    batch_size: int = 8
+    gradient_accumulation_steps: int = 1
    steps: int = 100_000
    eval_freq: int = 20_000
    log_freq: int = 200
@@ -132,6 +133,11 @@ class TrainPipelineConfig(HubMixin):
        if isinstance(self.dataset.repo_id, list):
            raise NotImplementedError("LeRobotMultiDataset is not currently implemented.")

+        if self.gradient_accumulation_steps <= 0:
+            raise ValueError(
+                f"`gradient_accumulation_steps` must be strictly positive, got {self.gradient_accumulation_steps}."
+            )
+
        if not self.use_policy_training_preset and (self.optimizer is None or self.scheduler is None):
            raise ValueError("Optimizer and Scheduler must be set when the policy presets are not used.")
        elif self.use_policy_training_preset and not self.resume:
--- a/src/lerobot/envs/init.py
+++ b/src/lerobot/envs/init.py
@@ -18,7 +18,15 @@
 # from lerobot.utils.import_utils import require_package
 # require_package("gymnasium", extra="<update_extra>", import_name="gymnasium")

-from .configs import AlohaEnv, EnvConfig, HILSerlRobotEnvConfig, HubEnvConfig, PushtEnv
+from .configs import (
+    AlohaEnv,
+    EnvConfig,
+    HILSerlRobotEnvConfig,
+    HubEnvConfig,
+    LiberoPlusEnv,
+    PushtEnv,
+    RoboMMEEnv,
+)
 from .factory import make_env, make_env_config, make_env_pre_post_processors
 from .utils import check_env_attributes_and_types, close_envs, env_to_policy_features, preprocess_observation

@@ -27,7 +35,9 @@ __all__ = [
    "EnvConfig",
    "HILSerlRobotEnvConfig",
    "HubEnvConfig",
+    "LiberoPlusEnv",
    "PushtEnv",
+    "RoboMMEEnv",
    "check_env_attributes_and_types",
    "close_envs",
    "env_to_policy_features",
--- a/src/lerobot/envs/configs.py
+++ b/src/lerobot/envs/configs.py
@@ -574,3 +574,58 @@ class IsaaclabArenaEnv(HubEnvConfig):
            ),
            PolicyProcessorPipeline(steps=[]),
        )
+
+
+@EnvConfig.register_subclass("libero_plus")
+@dataclass
+class LiberoPlusEnv(LiberoEnv):
+    """Config for LIBERO-plus robustness benchmark evaluation.
+
+    Registered under the ``libero_plus`` name. It subclasses ``LiberoEnv`` and the only
+    thing declared here is the ``task`` default.
+    """
+
+    # Task used when the caller does not set one.
+    task: str = "libero_spatial"
+
+
+def _robomme_default_features() -> dict[str, PolicyFeature]:
+    """Default feature spec: 8-dim action and state plus two 256x256x3 camera images."""
+    camera_shape = (256, 256, 3)
+    return {
+        ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(8,)),
+        "image": PolicyFeature(type=FeatureType.VISUAL, shape=camera_shape),
+        "wrist_image": PolicyFeature(type=FeatureType.VISUAL, shape=camera_shape),
+        OBS_STATE: PolicyFeature(type=FeatureType.STATE, shape=(8,)),
+    }
+
+
+def _robomme_default_features_map() -> dict[str, str]:
+    """Default mapping from env feature names to policy feature names."""
+    return {
+        ACTION: ACTION,
+        "image": f"{OBS_IMAGES}.image",
+        "wrist_image": f"{OBS_IMAGES}.wrist_image",
+        OBS_STATE: OBS_STATE,
+    }
+
+
+@EnvConfig.register_subclass("robomme")
+@dataclass
+class RoboMMEEnv(EnvConfig):
+    """RoboMME memory-augmented manipulation benchmark.
+
+    NOTE(review): the default ACTION feature is 8-dimensional whatever ``action_space``
+    is set to — confirm this is intended for action spaces other than "joint_angle".
+    """
+
+    # Comma-separated task name(s), forwarded as `task` to `create_robomme_envs`.
+    task: str = "PickXtimes"
+    fps: int = 10
+    # Forwarded as `episode_length` to `create_robomme_envs`.
+    episode_length: int = 300
+    # Forwarded as `action_space_type` to `create_robomme_envs`.
+    action_space: str = "joint_angle"
+    # Forwarded as `dataset` to `create_robomme_envs`.
+    dataset_split: str = "test"
+    # Forwarded as `task_ids` to `create_robomme_envs`.
+    task_ids: list[int] | None = None
+    features: dict[str, PolicyFeature] = field(default_factory=_robomme_default_features)
+    features_map: dict[str, str] = field(default_factory=_robomme_default_features_map)
+
+    @property
+    def gym_kwargs(self) -> dict:
+        """No extra gym keyword arguments are used for RoboMME."""
+        return {}
+
+    def create_envs(self, n_envs: int, use_async_envs: bool = True):
+        """Create the vectorized RoboMME envs described by this config."""
+        # Imported here rather than at module import time.
+        from .robomme import create_robomme_envs
+
+        robomme_kwargs = {
+            "task": self.task,
+            "n_envs": n_envs,
+            "action_space_type": self.action_space,
+            "dataset": self.dataset_split,
+            "episode_length": self.episode_length,
+            "task_ids": self.task_ids,
+            "env_cls": _make_vec_env_cls(use_async_envs, n_envs),
+        }
+        return create_robomme_envs(**robomme_kwargs)
--- a/src/lerobot/envs/libero.py
+++ b/src/lerobot/envs/libero.py
@@ -16,6 +16,7 @@
 from __future__ import annotations

 import os
+import re
 from collections import defaultdict
 from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import partial
@@ -69,14 +70,28 @@ def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> list[i
    return ids


+# LIBERO-plus perturbation variants encode the perturbation in the filename
+# but on disk only the base `.pruned_init` exists — strip the suffix to match
+# LIBERO-plus's own suite.get_task_init_states() (we reimplement it here so we
+# can pass weights_only=False for PyTorch 2.6+ numpy pickles).
+_LIBERO_PERTURBATION_SUFFIX_RE = re.compile(r"_(?:language|view|light)_[^.]*|_(?:table|tb)_\d+")
+
+
 def get_task_init_states(task_suite: Any, i: int) -> np.ndarray:
-    init_states_path = (
-        Path(get_libero_path("init_states"))
-        / task_suite.tasks[i].problem_folder
-        / task_suite.tasks[i].init_states_file
-    )
-    init_states = torch.load(init_states_path, weights_only=False)  # nosec B614
-    return init_states
+    task = task_suite.tasks[i]
+    filename = Path(task.init_states_file)
+    root = Path(get_libero_path("init_states"))
+
+    # `_add_` / `_level` variants store extra-object layouts under libero_newobj/
+    # as a flat array that must be reshaped to (1, -1).
+    if "_add_" in filename.name or "_level" in filename.name:
+        init_states_path = root / "libero_newobj" / task.problem_folder / filename.name
+        init_states = torch.load(init_states_path, weights_only=False)  # nosec B614
+        return init_states.reshape(1, -1)
+
+    stripped = _LIBERO_PERTURBATION_SUFFIX_RE.sub("", filename.stem) + filename.suffix
+    init_states_path = root / task.problem_folder / stripped
+    return torch.load(init_states_path, weights_only=False)  # nosec B614


 def get_libero_dummy_action():
--- a/src/lerobot/envs/robomme.py
+++ b/src/lerobot/envs/robomme.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""RoboMME environment wrapper for LeRobot evaluation."""
+
+from __future__ import annotations
+
+from collections.abc import Callable, Sequence
+from functools import partial
+from typing import Any
+
+import gymnasium as gym
+import numpy as np
+from gymnasium import spaces
+
+# RoboMME task names. Presumably the values accepted as `env_id` by the robomme
+# builder — verify against the robomme package. Not referenced by the code below.
+ROBOMME_TASKS = [
+    "BinFill",
+    "PickXtimes",
+    "SwingXtimes",
+    "StopCube",
+    "VideoUnmask",
+    "VideoUnmaskSwap",
+    "ButtonUnmask",
+    "ButtonUnmaskSwap",
+    "PickHighlight",
+    "VideoRepick",
+    "VideoPlaceButton",
+    "VideoPlaceOrder",
+    "MoveCube",
+    "InsertPeg",
+    "PatternLock",
+    "RouteStick",
+]
+
+
+class RoboMMEGymEnv(gym.Env):
+    """Thin Gymnasium wrapper around a single RoboMME episode env.
+
+    A fresh episode env is requested from the ``BenchmarkEnvBuilder`` on every
+    ``reset()``. The env of the previous episode is closed first (when it exposes a
+    ``close`` method) and ``close()`` releases the current one, so episode envs do not
+    accumulate across resets.
+    """
+
+    metadata = {"render_modes": ["rgb_array"], "render_fps": 10}
+
+    def __init__(
+        self,
+        task: str = "PickXtimes",
+        action_space_type: str = "joint_angle",
+        dataset: str = "test",
+        episode_idx: int = 0,
+        max_steps: int = 300,
+    ):
+        """Create the episode-env builder and declare the action and observation spaces.
+
+        Args:
+            task: passed to the builder as ``env_id``.
+            action_space_type: passed to the builder as ``action_space``. "joint_angle"
+                gives an 8-dim action box, any other value a 7-dim one.
+            dataset: passed to the builder as ``dataset``.
+            episode_idx: episode index requested from the builder on every reset.
+            max_steps: passed to the builder and to each episode env as ``max_steps``.
+        """
+        super().__init__()
+        # Imported here so that importing this module does not require `robomme`.
+        from robomme.env_record_wrapper import BenchmarkEnvBuilder
+
+        self._builder = BenchmarkEnvBuilder(
+            env_id=task,
+            dataset=dataset,
+            action_space=action_space_type,
+            gui_render=False,
+            max_steps=max_steps,
+        )
+        # NOTE(review): `_max_episode_steps` mirrors `_max_steps` and is not read in
+        # this class; presumably it is read by external evaluation code — confirm.
+        self._max_episode_steps = max_steps
+        self._episode_idx = episode_idx
+        self._max_steps = max_steps
+        self._env = None
+        self._last_raw_obs: dict | None = None
+
+        action_dim = 8 if action_space_type == "joint_angle" else 7
+        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_dim,), dtype=np.float32)
+        self.observation_space = spaces.Dict(
+            {
+                "image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
+                "wrist_image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
+                "state": spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32),
+            }
+        )
+
+    @staticmethod
+    def _latest(value: Any) -> Any:
+        """Return the last element when ``value`` is a list, otherwise ``value`` itself."""
+        return value[-1] if isinstance(value, list) else value
+
+    def _close_episode_env(self) -> None:
+        """Close and forget the current episode env, if there is one."""
+        env, self._env = self._env, None
+        if env is None:
+            return
+        # The episode env comes from robomme; only call `close` when it provides one.
+        close_fn = getattr(env, "close", None)
+        if callable(close_fn):
+            close_fn()
+
+    def reset(self, *, seed=None, options=None):
+        """Start a new episode and return ``(observation, info)``.
+
+        ``seed`` is forwarded to ``gym.Env.reset`` only and ``options`` is unused.
+        """
+        super().reset(seed=seed)
+        # Release the env of the previous episode before building the next one.
+        self._close_episode_env()
+        self._env = self._builder.make_env_for_episode(
+            episode_idx=self._episode_idx,
+            max_steps=self._max_steps,
+        )
+        obs, info = self._env.reset()
+        self._last_raw_obs = obs
+        return self._convert_obs(obs), self._convert_info(info)
+
+    def step(self, action):
+        """Step the episode env and return the standard Gymnasium 5-tuple.
+
+        ``info["is_success"]`` is True when the episode env reports the status "success".
+        """
+        obs, reward, terminated, truncated, info = self._env.step(action)
+        self._last_raw_obs = obs
+
+        # The flags may arrive as objects exposing `.item()`; convert them to plain bools.
+        terminated_bool = bool(terminated.item()) if hasattr(terminated, "item") else bool(terminated)
+        truncated_bool = bool(truncated.item()) if hasattr(truncated, "item") else bool(truncated)
+
+        status = info.get("status", "ongoing")
+        conv_info = self._convert_info(info)
+        conv_info["is_success"] = status == "success"
+
+        return self._convert_obs(obs), float(reward), terminated_bool, truncated_bool, conv_info
+
+    def render(self) -> np.ndarray | None:
+        """Return the latest front-camera frame, or a black 256x256 frame when none is available."""
+        if self._last_raw_obs is None:
+            return np.zeros((256, 256, 3), dtype=np.uint8)
+        front = self._last_raw_obs.get("front_rgb_list")
+        if front is None:
+            return np.zeros((256, 256, 3), dtype=np.uint8)
+        return np.asarray(self._latest(front), dtype=np.uint8)
+
+    def close(self):
+        """Close the current episode env (if any), then defer to ``gym.Env.close``."""
+        self._close_episode_env()
+        super().close()
+
+    def _convert_obs(self, obs: dict) -> dict:
+        """Convert a raw observation into the ``image`` / ``wrist_image`` / ``state`` dict."""
+        front_rgb = self._latest(obs["front_rgb_list"])
+        wrist_rgb = self._latest(obs["wrist_rgb_list"])
+        joint_state = self._latest(obs["joint_state_list"])
+        gripper_state = self._latest(obs["gripper_state_list"])
+
+        # State = first 7 joint values followed by the first gripper value.
+        joint = np.asarray(joint_state, dtype=np.float32).flatten()[:7]
+        gripper = np.asarray(gripper_state, dtype=np.float32).flatten()[:1]
+        state = np.concatenate([joint, gripper])
+
+        return {
+            "image": np.asarray(front_rgb, dtype=np.uint8),
+            "wrist_image": np.asarray(wrist_rgb, dtype=np.uint8),
+            "state": state,
+        }
+
+    def _convert_info(self, info: dict) -> dict:
+        """Keep only ``status`` and ``task_goal`` from the raw info, with defaults."""
+        return {
+            "status": info.get("status", "ongoing"),
+            "task_goal": info.get("task_goal", ""),
+        }
+
+
+def _make_env_fns(
+    *,
+    task: str,
+    n_envs: int,
+    action_space_type: str,
+    dataset: str,
+    episode_length: int,
+    task_id: int,
+) -> list[Callable[[], RoboMMEGymEnv]]:
+    """Build ``n_envs`` zero-argument env factories for episodes ``task_id`` .. ``task_id + n_envs - 1``."""
+    shared_kwargs = {
+        "task": task,
+        "action_space_type": action_space_type,
+        "dataset": dataset,
+        "max_steps": episode_length,
+    }
+
+    def _build(episode_index: int) -> RoboMMEGymEnv:
+        # The env is only constructed when the factory is called.
+        return RoboMMEGymEnv(episode_idx=episode_index, **shared_kwargs)
+
+    return [partial(_build, task_id + offset) for offset in range(n_envs)]
+
+
+def create_robomme_envs(
+    task: str,
+    n_envs: int = 1,
+    action_space_type: str = "joint_angle",
+    dataset: str = "test",
+    episode_length: int = 300,
+    task_ids: list[int] | None = None,
+    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
+) -> dict[str, dict[int, gym.vector.VectorEnv]]:
+    """Create vectorized RoboMME environments for evaluation.
+
+    ``task`` is a comma-separated list of task names. For every name and every entry
+    of ``task_ids`` (``[0]`` when omitted), ``env_cls`` is called with ``n_envs`` env
+    factories. The result maps task name -> task id -> whatever ``env_cls`` returned.
+
+    Raises:
+        ValueError: if ``env_cls`` is not callable or ``n_envs`` is not a positive int.
+    """
+    if not callable(env_cls):
+        raise ValueError("env_cls must be a callable that wraps a list of env factory callables.")
+    if not (isinstance(n_envs, int) and n_envs > 0):
+        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
+
+    selected_ids = [0] if task_ids is None else task_ids
+
+    out: dict[str, dict[int, gym.vector.VectorEnv]] = {}
+    for task_name in (part.strip() for part in task.split(",")):
+        if not task_name:
+            continue
+        out[task_name] = {
+            task_id: env_cls(
+                _make_env_fns(
+                    task=task_name,
+                    n_envs=n_envs,
+                    action_space_type=action_space_type,
+                    dataset=dataset,
+                    episode_length=episode_length,
+                    task_id=task_id,
+                )
+            )
+            for task_id in selected_ids
+        }
+    return out
--- a/src/lerobot/scripts/lerobot_eval.py
+++ b/src/lerobot/scripts/lerobot_eval.py
@@ -572,7 +572,7 @@ def eval_main(cfg: EvalPipelineConfig):
            preprocessor=preprocessor,
            postprocessor=postprocessor,
            n_episodes=cfg.eval.n_episodes,
-            max_episodes_rendered=10,
+            max_episodes_rendered=cfg.eval.max_episodes_rendered,
            videos_dir=Path(cfg.output_dir) / "videos",
            start_seed=cfg.seed,
            max_parallel_tasks=cfg.env.max_parallel_tasks,
--- a/src/lerobot/scripts/lerobot_train.py
+++ b/src/lerobot/scripts/lerobot_train.py
@@ -71,6 +71,9 @@ def update_policy(
    lr_scheduler=None,
    lock=None,
    rabc_weights_provider=None,
+    *,
+    do_optimizer_step: bool = True,
+    loss_divisor: int = 1,
 ) -> tuple[MetricsTracker, dict]:
    """
    Performs a single training step to update the policy's weights.
@@ -122,34 +125,38 @@ def update_policy(
            loss, output_dict = policy.forward(batch)

        # TODO(rcadene): policy.unnormalize_outputs(out_dict)
+        logged_loss = loss.detach()
+        if loss_divisor > 1:
+            loss = loss / loss_divisor

    # Use accelerator's backward method
    accelerator.backward(loss)

-    # Clip gradients if specified
-    if grad_clip_norm > 0:
-        grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)
-    else:
-        grad_norm = torch.nn.utils.clip_grad_norm_(
-            policy.parameters(), float("inf"), error_if_nonfinite=False
-        )
+    grad_norm_value = 0.0
+    if do_optimizer_step:
+        if grad_clip_norm > 0:
+            grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)
+        else:
+            grad_norm = torch.nn.utils.clip_grad_norm_(
+                policy.parameters(), float("inf"), error_if_nonfinite=False
+            )
+        grad_norm_value = grad_norm.item()

-    # Optimizer step
-    with lock if lock is not None else nullcontext():
-        optimizer.step()
+        with lock if lock is not None else nullcontext():
+            optimizer.step()

-    optimizer.zero_grad()
+        optimizer.zero_grad()

-    # Step through pytorch scheduler at every batch instead of epoch
-    if lr_scheduler is not None:
-        lr_scheduler.step()
+        # Step through pytorch scheduler at every optimizer step instead of epoch
+        if lr_scheduler is not None:
+            lr_scheduler.step()

-    # Update internal buffers if policy has update method
-    if has_method(accelerator.unwrap_model(policy, keep_fp32_wrapper=True), "update"):
-        accelerator.unwrap_model(policy, keep_fp32_wrapper=True).update()
+        # Update internal buffers if policy has update method
+        if has_method(accelerator.unwrap_model(policy, keep_fp32_wrapper=True), "update"):
+            accelerator.unwrap_model(policy, keep_fp32_wrapper=True).update()

-    train_metrics.loss = loss.item()
-    train_metrics.grad_norm = grad_norm.item()
+    train_metrics.loss = logged_loss.item()
+    train_metrics.grad_norm = grad_norm_value
    train_metrics.lr = optimizer.param_groups[0]["lr"]
    train_metrics.update_s = time.perf_counter() - start_time
    return train_metrics, output_dict
@@ -359,8 +366,16 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
        logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})")
        logging.info(f"{dataset.num_episodes=}")
        num_processes = accelerator.num_processes
-        effective_bs = cfg.batch_size * num_processes
-        logging.info(f"Effective batch size: {cfg.batch_size} x {num_processes} = {effective_bs}")
+        micro_batch = cfg.batch_size
+        logical_batch = cfg.batch_size * cfg.gradient_accumulation_steps
+        effective_bs = logical_batch * num_processes
+        logging.info(
+            "Effective batch size: %s x %s x %s = %s",
+            micro_batch,
+            cfg.gradient_accumulation_steps,
+            num_processes,
+            effective_bs,
+        )
        logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})")
        logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")

@@ -407,9 +422,10 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
    }

    # Keep global batch size for logging; MetricsTracker handles world size internally.
-    effective_batch_size = cfg.batch_size * accelerator.num_processes
+    logical_batch_size = cfg.batch_size * cfg.gradient_accumulation_steps
+    effective_batch_size = logical_batch_size * accelerator.num_processes
    train_tracker = MetricsTracker(
-        cfg.batch_size,
+        logical_batch_size,
        dataset.num_frames,
        dataset.num_episodes,
        train_metrics,
@@ -431,21 +447,62 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
        )

    for _ in range(step, cfg.steps):
-        start_time = time.perf_counter()
-        batch = next(dl_iter)
-        batch = preprocessor(batch)
-        train_tracker.dataloading_s = time.perf_counter() - start_time
+        step_dataloading_s = 0.0
+        step_update_s = 0.0
+        step_losses = []
+        step_grad_norm = 0.0
+        step_lr = optimizer.param_groups[0]["lr"]
+        output_dict = {}
+        optimizer.zero_grad()
+        for accumulation_idx in range(cfg.gradient_accumulation_steps):
+            start_time = time.perf_counter()
+            batch = next(dl_iter)
+            batch = preprocessor(batch)
+            step_dataloading_s += time.perf_counter() - start_time

-        train_tracker, output_dict = update_policy(
-            train_tracker,
-            policy,
-            batch,
-            optimizer,
-            cfg.optimizer.grad_clip_norm,
-            accelerator=accelerator,
-            lr_scheduler=lr_scheduler,
-            rabc_weights_provider=rabc_weights,
-        )
+            is_last_microbatch = accumulation_idx == cfg.gradient_accumulation_steps - 1
+            micro_metrics = MetricsTracker(
+                cfg.batch_size,
+                dataset.num_frames,
+                dataset.num_episodes,
+                {
+                    "loss": AverageMeter("loss", ":.3f"),
+                    "grad_norm": AverageMeter("grdn", ":.3f"),
+                    "lr": AverageMeter("lr", ":0.1e"),
+                    "update_s": AverageMeter("updt_s", ":.3f"),
+                },
+                accelerator=accelerator,
+            )
+            sync_context = (
+                nullcontext()
+                if is_last_microbatch or accelerator.num_processes == 1
+                else accelerator.no_sync(policy)
+            )
+            with sync_context:
+                micro_metrics, micro_output_dict = update_policy(
+                    micro_metrics,
+                    policy,
+                    batch,
+                    optimizer,
+                    cfg.optimizer.grad_clip_norm,
+                    accelerator=accelerator,
+                    lr_scheduler=lr_scheduler if is_last_microbatch else None,
+                    rabc_weights_provider=rabc_weights,
+                    do_optimizer_step=is_last_microbatch,
+                    loss_divisor=cfg.gradient_accumulation_steps,
+                )
+            step_update_s += micro_metrics.update_s.val
+            step_losses.append(micro_metrics.loss.val)
+            if is_last_microbatch:
+                step_grad_norm = micro_metrics.grad_norm.val
+                step_lr = micro_metrics.lr.val
+                output_dict = micro_output_dict
+
+        train_tracker.loss = sum(step_losses) / len(step_losses)
+        train_tracker.grad_norm = step_grad_norm
+        train_tracker.lr = step_lr
+        train_tracker.update_s = step_update_s
+        train_tracker.dataloading_s = step_dataloading_s

        # Note: eval and checkpoint happens *after* the `step`th training update has completed, so we
        # increment `step` here.
@@ -510,7 +567,7 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
                        postprocessor=postprocessor,
                        n_episodes=cfg.eval.n_episodes,
                        videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",
-                        max_episodes_rendered=4,
+                        max_episodes_rendered=cfg.eval.max_episodes_rendered,
                        start_seed=cfg.seed,
                        max_parallel_tasks=cfg.env.max_parallel_tasks,
                    )
@@ -541,7 +598,9 @@ def train(cfg: TrainPipelineConfig, accelerator: "Accelerator | None" = None):
                if wandb_logger:
                    wandb_log_dict = {**eval_tracker.to_dict(), **eval_info}
                    wandb_logger.log_dict(wandb_log_dict, step, mode="eval")
-                    wandb_logger.log_video(eval_info["overall"]["video_paths"][0], step, mode="eval")
+                    video_paths = eval_info["overall"].get("video_paths", [])
+                    if video_paths:
+                        wandb_logger.log_video(video_paths[0], step, mode="eval")

            accelerator.wait_for_everyone()

--- a/src/lerobot/utils/history_repo.py
+++ b/src/lerobot/utils/history_repo.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helpers for writing JSON payloads and uploading files to a Hugging Face Hub repo."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote
+
+from huggingface_hub import HfApi
+
+# URL path prefixes the Hub uses per repo type; model repos have no prefix.
+_REPO_TYPE_URL_PREFIXES = {"dataset": "datasets/", "space": "spaces/"}
+
+
+def utc_timestamp_slug(now: datetime | None = None) -> str:
+    """Return a compact UTC timestamp slug such as ``20260415T213133Z``.
+
+    Args:
+        now: Moment to format; defaults to the current UTC time. Timezone-aware
+            values are converted to UTC first. Naive values are formatted as-is,
+            i.e. they are taken to already be expressed in UTC.
+
+    Returns:
+        The moment formatted as ``%Y%m%dT%H%M%SZ``.
+    """
+    current = datetime.now(UTC) if now is None else now
+    if current.utcoffset() is not None:
+        # The trailing "Z" promises UTC, so normalize aware datetimes from other zones.
+        current = current.astimezone(UTC)
+    return current.strftime("%Y%m%dT%H%M%SZ")
+
+
+def make_hub_file_url(repo_id: str, path_in_repo: str, repo_type: str = "dataset") -> str:
+    """Build the ``resolve/main`` download URL of a file stored in a Hub repo.
+
+    Args:
+        repo_id: Repository identifier, e.g. ``"org/name"``.
+        path_in_repo: File path relative to the repo root; it is percent-encoded
+            (``/`` is kept) so spaces or ``#`` cannot break the URL.
+        repo_type: ``"dataset"`` or ``"space"`` select the matching URL prefix;
+            any other value (e.g. ``"model"``) yields an unprefixed URL.
+
+    Returns:
+        The absolute ``https://huggingface.co/...`` URL on the ``main`` revision.
+    """
+    prefix = _REPO_TYPE_URL_PREFIXES.get(repo_type, "")
+    return f"https://huggingface.co/{prefix}{repo_id}/resolve/main/{quote(path_in_repo)}"
+
+
+def write_json(path: Path, payload: dict[str, Any]) -> None:
+    """Serialize ``payload`` to ``path`` as indented, key-sorted JSON.
+
+    Missing parent directories are created; an existing file is overwritten.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    # Pin the encoding so the file does not depend on the platform locale.
+    path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
+
+
+@dataclass(frozen=True)
+class UploadTarget:
+    """Pairs a local file with its destination path inside a Hub repo."""
+
+    # File on disk to upload.
+    local_path: Path
+    # Destination path relative to the repo root.
+    path_in_repo: str
+
+
+def upload_targets(
+    repo_id: str,
+    targets: list[UploadTarget],
+    *,
+    repo_type: str = "dataset",
+    token: str | None = None,
+    private: bool | None = None,
+    commit_message: str | None = None,
+) -> dict[str, str]:
+    """Upload files to a Hub repo, creating the repo first if needed.
+
+    Each target is uploaded with its own ``upload_file`` call.
+
+    Args:
+        repo_id: Destination repository identifier.
+        targets: Files to upload and where to place them in the repo.
+        repo_type: Hub repo type forwarded to ``create_repo``/``upload_file``.
+        token: Token handed to ``HfApi``.
+        private: Visibility forwarded to ``create_repo``.
+        commit_message: Message used for every upload; defaults to
+            ``"Upload <path_in_repo>"`` per file.
+
+    Returns:
+        Mapping from each ``path_in_repo`` to its ``resolve/main`` URL.
+    """
+    api = HfApi(token=token)
+    api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private, exist_ok=True)
+    uploaded: dict[str, str] = {}
+    for target in targets:
+        api.upload_file(
+            path_or_fileobj=str(target.local_path),
+            path_in_repo=target.path_in_repo,
+            repo_id=repo_id,
+            repo_type=repo_type,
+            commit_message=commit_message or f"Upload {target.path_in_repo}",
+        )
+        uploaded[target.path_in_repo] = make_hub_file_url(repo_id, target.path_in_repo, repo_type=repo_type)
+    return uploaded
--- a/tests/benchmarks/test_benchmark_matrix.py
+++ b/tests/benchmarks/test_benchmark_matrix.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for the helpers exposed by ``benchmarks.run_benchmark_matrix``."""
+
+import json
+
+from benchmarks.run_benchmark_matrix import (
+    PlannedJob,
+    compute_gradient_accumulation_steps,
+    plan_jobs,
+    render_sbatch_script,
+    write_manifest,
+)
+
+# Hub settings shared by every call in this module.
+HUB_ORG = "lerobot"
+RESULTS_REPO = "lerobot/benchmark-history"
+
+
+def _plan(tmp_path, policies: list[str], benchmarks: list[str]) -> list[PlannedJob]:
+    """Call ``plan_jobs`` with the shared hub settings for the given selection."""
+    return plan_jobs(
+        output_dir=tmp_path,
+        hub_org=HUB_ORG,
+        results_repo=RESULTS_REPO,
+        policies=policies,
+        benchmarks=benchmarks,
+    )
+
+
+def _one_job(job_list: list[PlannedJob]) -> PlannedJob:
+    """Return the sole element of ``job_list``, asserting it holds exactly one job."""
+    job_count = len(job_list)
+    assert job_count == 1
+    (only_job,) = job_list
+    return only_job
+
+
+def test_compute_gradient_accumulation_steps_for_fixed_effective_batch():
+    """An effective batch of 256 at 32 samples per GPU needs more accumulation on fewer GPUs."""
+    expected_steps_by_gpu_count = {8: 1, 4: 2, 1: 8}
+    for gpu_count, expected_steps in expected_steps_by_gpu_count.items():
+        steps = compute_gradient_accumulation_steps(
+            effective_batch_size=256,
+            num_gpus=gpu_count,
+            microbatch_per_gpu=32,
+        )
+        assert steps == expected_steps
+
+
+def test_plan_jobs_filters_libero_plus_only(tmp_path):
+    """Requesting one benchmark yields one job per policy, in the requested policy order."""
+    planned = _plan(tmp_path, policies=["pi0", "act"], benchmarks=["libero_plus"])
+
+    benchmark_names = [job.benchmark for job in planned]
+    assert benchmark_names == ["libero_plus", "libero_plus"]
+    policy_names = [job.policy for job in planned]
+    assert policy_names == ["pi0", "act"]
+
+
+def test_plan_jobs_includes_libero_plus_and_robomme(tmp_path):
+    """Both benchmarks are planned for the policy and share a 256 effective batch size."""
+    planned = _plan(tmp_path, policies=["pi0"], benchmarks=["libero_plus", "robomme"])
+
+    assert [job.benchmark for job in planned] == ["libero_plus", "robomme"]
+    libero_plus_job, robomme_job = planned
+    assert libero_plus_job.effective_batch_size == 256
+    assert robomme_job.effective_batch_size == 256
+
+
+def test_plan_jobs_sets_expected_gpu_and_accumulation(tmp_path):
+    """Each policy gets its expected GPU count and matching accumulation steps."""
+    planned = _plan(tmp_path, policies=["pi0", "xvla", "act"], benchmarks=["robomme"])
+    job_for = {job.policy: job for job in planned}
+
+    # policy -> (num_gpus, gradient_accumulation_steps)
+    expected = {"pi0": (8, 1), "xvla": (4, 2), "act": (1, 8)}
+    for policy, (gpu_count, accumulation_steps) in expected.items():
+        assert job_for[policy].num_gpus == gpu_count
+        assert job_for[policy].gradient_accumulation_steps == accumulation_steps
+
+
+def test_render_sbatch_script_contains_train_eval_and_publish(tmp_path):
+    """The rendered sbatch script contains the expected image, flags and entry points."""
+    job = _one_job(_plan(tmp_path, policies=["pi0_fast"], benchmarks=["robomme"]))
+
+    script = render_sbatch_script(
+        job=job,
+        output_dir=tmp_path,
+        results_repo_id=RESULTS_REPO,
+        git_commit="deadbeef",
+    )
+
+    assert "docker/Dockerfile" not in script
+    required_snippets = (
+        "lerobot-benchmark-robomme:latest",
+        '--dataset.repo_id="lerobot/robomme"',
+        '--env.type="robomme"',
+        "--gradient_accumulation_steps=1",
+        "lerobot-train-tokenizer",
+        "benchmarks/publish_benchmark_result.py",
+    )
+    for snippet in required_snippets:
+        assert snippet in script
+
+
+def test_write_manifest_records_job_metadata(tmp_path):
+    """The manifest written to disk records the commit, results repo and planned jobs."""
+    planned = _plan(tmp_path, policies=["pi0"], benchmarks=["libero_plus", "robomme"])
+    manifest_path = write_manifest(
+        output_dir=tmp_path,
+        jobs=planned,
+        git_commit="deadbeef",
+        hub_org=HUB_ORG,
+        results_repo=RESULTS_REPO,
+    )
+
+    manifest = json.loads(manifest_path.read_text())
+    assert manifest["git_commit"] == "deadbeef"
+    assert manifest["results_repo"] == "lerobot/benchmark-history"
+    assert [entry["benchmark"] for entry in manifest["jobs"]] == ["libero_plus", "robomme"]
--- a/tests/envs/test_robomme_env.py
+++ b/tests/envs/test_robomme_env.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for the RoboMME env config and gym wrapper, run against a stubbed ``robomme`` package."""
+
+from __future__ import annotations
+
+import sys
+from contextlib import contextmanager
+from types import ModuleType
+from unittest.mock import MagicMock
+
+import numpy as np
+
+# Names the stub occupies in ``sys.modules``.
+_STUB_MODULE_NAMES = ("robomme", "robomme.env_record_wrapper")
+# Marks "no module was registered under this name" (distinct from a ``None`` entry).
+_ABSENT = object()
+# ``sys.modules`` entries displaced by the stub, restored on uninstall.
+_displaced_modules: dict[str, object] = {}
+
+
+def _install_robomme_stub():
+    """Register fake ``robomme`` and ``robomme.env_record_wrapper`` modules.
+
+    Whatever was registered under those names beforehand is remembered so that
+    ``_uninstall_robomme_stub`` can put it back instead of just deleting it.
+    """
+    stub = ModuleType("robomme")
+    wrapper_stub = ModuleType("robomme.env_record_wrapper")
+
+    class FakeBuilder:
+        """Stand-in for ``BenchmarkEnvBuilder`` that hands out mocked envs."""
+
+        def __init__(self, **kwargs):
+            pass
+
+        def make_env_for_episode(self, episode_idx: int, max_steps: int):
+            env = MagicMock()
+            obs = {
+                "front_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
+                "wrist_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
+                "joint_state_list": [np.zeros(7, dtype=np.float32)],
+                "gripper_state_list": [np.zeros(2, dtype=np.float32)],
+            }
+            env.reset.return_value = (obs, {"status": "ongoing", "task_goal": "pick the cube"})
+            env.step.return_value = (obs, 0.0, False, False, {"status": "ongoing", "task_goal": ""})
+            return env
+
+    wrapper_stub.BenchmarkEnvBuilder = FakeBuilder
+    stub.env_record_wrapper = wrapper_stub
+    for name in _STUB_MODULE_NAMES:
+        # setdefault: a repeated install must not record the stub itself as "previous".
+        _displaced_modules.setdefault(name, sys.modules.get(name, _ABSENT))
+    sys.modules["robomme"] = stub
+    sys.modules["robomme.env_record_wrapper"] = wrapper_stub
+
+
+def _uninstall_robomme_stub():
+    """Drop the fake modules and restore any modules they displaced."""
+    for name in _STUB_MODULE_NAMES:
+        previous = _displaced_modules.pop(name, _ABSENT)
+        if previous is _ABSENT:
+            sys.modules.pop(name, None)
+        else:
+            sys.modules[name] = previous
+
+
+@contextmanager
+def _robomme_stub():
+    """Keep the ``robomme`` stub installed for the duration of a ``with`` block."""
+    _install_robomme_stub()
+    try:
+        yield
+    finally:
+        _uninstall_robomme_stub()
+
+
+def test_robomme_env_config_defaults():
+    """``RoboMMEEnv`` exposes the expected default field values."""
+    from lerobot.envs.configs import RoboMMEEnv
+
+    cfg = RoboMMEEnv()
+    assert cfg.task == "PickXtimes"
+    assert cfg.fps == 10
+    assert cfg.episode_length == 300
+    assert cfg.action_space == "joint_angle"
+    assert cfg.dataset_split == "test"
+    assert cfg.task_ids is None
+
+
+def test_robomme_features_map():
+    """``features_map`` maps the action, both cameras and the state to the expected keys."""
+    from lerobot.envs.configs import RoboMMEEnv
+    from lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE
+
+    cfg = RoboMMEEnv()
+    assert cfg.features_map[ACTION] == ACTION
+    assert cfg.features_map["image"] == f"{OBS_IMAGES}.image"
+    assert cfg.features_map["wrist_image"] == f"{OBS_IMAGES}.wrist_image"
+    assert cfg.features_map[OBS_STATE] == OBS_STATE
+
+
+def test_convert_obs_list_format():
+    """``_convert_obs`` picks the last entry of each observation list and packs an 8-dim state."""
+    with _robomme_stub():
+        from lerobot.envs.robomme import RoboMMEGymEnv
+
+        # Bypass __init__: only the conversion logic is under test.
+        env = RoboMMEGymEnv.__new__(RoboMMEGymEnv)
+
+        front = np.full((256, 256, 3), 42, dtype=np.uint8)
+        wrist = np.full((256, 256, 3), 7, dtype=np.uint8)
+        joints = np.arange(7, dtype=np.float32)
+        gripper = np.array([0.5, 0.5], dtype=np.float32)
+
+        obs_raw = {
+            "front_rgb_list": [np.zeros_like(front), front],
+            "wrist_rgb_list": [np.zeros_like(wrist), wrist],
+            "joint_state_list": [np.zeros(7, dtype=np.float32), joints],
+            "gripper_state_list": [np.zeros(2, dtype=np.float32), gripper],
+        }
+
+        result = env._convert_obs(obs_raw)
+        np.testing.assert_array_equal(result["image"], front)
+        np.testing.assert_array_equal(result["wrist_image"], wrist)
+        assert result["state"].shape == (8,)
+        np.testing.assert_array_almost_equal(result["state"][:7], joints)
+        assert result["state"][7] == gripper[0]
+
+
+def test_create_robomme_envs_multi_task():
+    """A comma-separated task string yields one entry per task, each wrapping ``env_cls`` output."""
+    with _robomme_stub():
+        from lerobot.envs.robomme import create_robomme_envs
+
+        env_cls = MagicMock(return_value=MagicMock())
+        result = create_robomme_envs(
+            task="PickXtimes,BinFill,StopCube",
+            n_envs=1,
+            env_cls=env_cls,
+        )
+
+        assert set(result.keys()) == {"PickXtimes", "BinFill", "StopCube"}
+        # Every task must hold at least one env, each produced by exactly one env_cls call.
+        all_envs = [env for envs_by_id in result.values() for env in envs_by_id.values()]
+        assert all(result.values())
+        assert all(env is env_cls.return_value for env in all_envs)
+        assert env_cls.call_count == len(all_envs)
				`@@ -0,0 +1 @@`
				`# Copyright 2026 The HuggingFace Inc. team. All rights reserved.`