diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml index 00806f990..b65883f1a 100644 --- a/.github/workflows/benchmark_tests.yml +++ b/.github/workflows/benchmark_tests.yml @@ -525,3 +525,110 @@ jobs: name: robocasa-metrics path: /tmp/robocasa-artifacts/metrics.json if-no-files-found: warn + + # ── ROBOCEREBRA ─────────────────────────────────────────────────────────── + # Reuses the LIBERO simulator (libero_10 suite) with RoboCerebra camera + # defaults (image/wrist_image). The image is layered on + # huggingface/lerobot-gpu, which already ships [libero] as part of [all]. + robocerebra-integration-test: + name: RoboCerebra — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + - name: Build RoboCerebra benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . + file: docker/Dockerfile.benchmark.robocerebra + push: false + load: true + tags: lerobot-benchmark-robocerebra:ci + cache-from: type=local,src=/tmp/.buildx-cache-robocerebra + cache-to: type=local,dest=/tmp/.buildx-cache-robocerebra,mode=max + + - name: Run RoboCerebra smoke eval (1 episode) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name robocerebra-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + -e LIBERO_DATA_FOLDER=/tmp/libero_data \ + lerobot-benchmark-robocerebra:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=lerobot/smolvla_robocerebra \ + --env.type=libero \ + --env.task=libero_10 \ + --env.fps=20 \ + --env.obs_type=pixels_agent_pos \ + --env.observation_height=256 \ + --env.observation_width=256 \ + '--env.camera_name_mapping={\"agentview_image\": \"image\", \"robot0_eye_in_hand_image\": \"wrist_image\"}' \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \ + --policy.empty_cameras=1 \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env libero --task libero_10 \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy RoboCerebra artifacts from container + if: always() + run: | + mkdir -p /tmp/robocerebra-artifacts + docker cp robocerebra-eval:/tmp/eval-artifacts/. /tmp/robocerebra-artifacts/ 2>/dev/null || true + docker rm -f robocerebra-eval || true + + - name: Parse RoboCerebra eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/robocerebra-artifacts \ + --env robocerebra \ + --task libero_10 \ + --policy lerobot/smolvla_robocerebra + + - name: Upload RoboCerebra rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robocerebra-rollout-video + path: /tmp/robocerebra-artifacts/videos/ + if-no-files-found: warn + + - name: Upload RoboCerebra eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robocerebra-metrics + path: /tmp/robocerebra-artifacts/metrics.json + if-no-files-found: warn diff --git a/docker/Dockerfile.benchmark.robocerebra b/docker/Dockerfile.benchmark.robocerebra new file mode 100644 index 000000000..9378bd66a --- /dev/null +++ b/docker/Dockerfile.benchmark.robocerebra @@ -0,0 +1,43 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Benchmark image for RoboCerebra integration tests. +# RoboCerebra reuses LIBERO's simulator (libero_10 suite) with a different +# rename_map, so this image is identical to the LIBERO benchmark image — +# extends the nightly GPU base with LIBERO assets + the PR's source code. +# +# Build: docker build -f docker/Dockerfile.benchmark.robocerebra -t lerobot-benchmark-robocerebra . +# Run: docker run --gpus all --rm lerobot-benchmark-robocerebra lerobot-eval ... + +FROM huggingface/lerobot-gpu:latest + +# Pre-download lerobot/libero-assets from HF Hub so nothing is fetched at +# runtime (which times out on CI). Point the libero config at the cached path. +# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing, +# so we write the config before any libero import can happen. +RUN LIBERO_DIR=$(python -c \ + "import importlib.util, os; s=importlib.util.find_spec('libero'); \ + print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \ + mkdir -p /home/user_lerobot/.libero && \ + python -c "\ +from huggingface_hub import snapshot_download; \ +snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \ + local_dir='/home/user_lerobot/.libero/assets')" && \ + printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \ + > /home/user_lerobot/.libero/config.yaml + +# Overlay the PR's source code on top of the nightly image. +COPY --chown=user_lerobot:user_lerobot . . + +CMD ["/bin/bash"] diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index bb0dad1bf..ff3c08e96 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -83,6 +83,8 @@ title: RoboTwin 2.0 - local: robocasa title: RoboCasa365 + - local: robocerebra + title: RoboCerebra - local: envhub_isaaclab_arena title: NVIDIA IsaacLab Arena Environments title: "Benchmarks" diff --git a/docs/source/robocerebra.mdx b/docs/source/robocerebra.mdx new file mode 100644 index 000000000..9776bd40f --- /dev/null +++ b/docs/source/robocerebra.mdx @@ -0,0 +1,99 @@ +# RoboCerebra + +[RoboCerebra](https://robocerebra-project.github.io/) is a long-horizon manipulation benchmark that evaluates **high-level reasoning, planning, and memory** in VLAs. Episodes chain multiple sub-goals with language-grounded intermediate instructions, built on top of LIBERO's simulator stack (MuJoCo + robosuite, Franka Panda 7-DOF). + +- Paper: [RoboCerebra: A Large-scale Benchmark for Long-horizon Robotic Manipulation Evaluation](https://arxiv.org/abs/2506.06677) +- Project website: [robocerebra-project.github.io](https://robocerebra-project.github.io/) +- Dataset: [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) — LeRobot v3.0, 6,660 episodes / 571,116 frames at 20 fps, 1,728 language-grounded sub-tasks. +- Pretrained policy: [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra) + +## Available tasks + +RoboCerebra reuses LIBERO's simulator, so evaluation runs against the LIBERO `libero_10` long-horizon suite: + +| Suite | CLI name | Tasks | Description | +| --------- | ----------- | ----- | ------------------------------------------------------------- | +| LIBERO-10 | `libero_10` | 10 | Long-horizon kitchen/living room tasks chaining 3–6 sub-goals | + +Each RoboCerebra episode in the dataset is segmented into multiple sub-tasks with natural-language instructions, which the unified dataset exposes as independent supervision signals. + +## Installation + +RoboCerebra piggybacks on LIBERO, so the `libero` extra is all you need: + +```bash +pip install -e ".[libero]" +``` + + +RoboCerebra requires Linux (MuJoCo / robosuite). Set the rendering backend before training or evaluation: + +```bash +export MUJOCO_GL=egl # for headless servers (HPC, cloud) +``` + + + +## Evaluation + +RoboCerebra eval runs against LIBERO's `libero_10` suite with RoboCerebra's camera naming (`image` + `wrist_image`) and an extra empty-camera slot so a three-view-trained policy receives the expected input layout: + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_robocerebra \ + --env.type=libero \ + --env.task=libero_10 \ + --env.fps=20 \ + --env.obs_type=pixels_agent_pos \ + --env.observation_height=256 \ + --env.observation_width=256 \ + '--env.camera_name_mapping={"agentview_image": "image", "robot0_eye_in_hand_image": "wrist_image"}' \ + --eval.batch_size=1 \ + --eval.n_episodes=10 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.wrist_image": "observation.images.camera2"}' \ + --policy.empty_cameras=1 +``` + +### Recommended evaluation episodes + +**10 episodes per task** across the `libero_10` suite (100 total) for reproducible benchmarking. Matches the protocol used in the RoboCerebra paper. + +## Policy inputs and outputs + +**Observations:** + +- `observation.state` — 8-dim proprioceptive state (7 joint positions + gripper) +- `observation.images.image` — third-person view, 256×256 HWC uint8 +- `observation.images.wrist_image` — wrist-mounted camera view, 256×256 HWC uint8 + +**Actions:** + +- Continuous control in `Box(-1, 1, shape=(7,))` — end-effector delta (6D) + gripper (1D) + +## Training + +The unified dataset at [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) exposes two RGB streams and language-grounded sub-task annotations: + +| Feature | Shape | Description | +| -------------------------------- | ------------- | -------------------- | +| `observation.images.image` | (256, 256, 3) | Third-person view | +| `observation.images.wrist_image` | (256, 256, 3) | Wrist-mounted camera | +| `observation.state` | (8,) | Joint pos + gripper | +| `action` | (7,) | EEF delta + gripper | + +Fine-tune a SmolVLA base on it: + +```bash +lerobot-train \ + --policy.path=lerobot/smolvla_base \ + --dataset.repo_id=lerobot/robocerebra_unified \ + --env.type=libero \ + --env.task=libero_10 \ + --output_dir=outputs/smolvla_robocerebra +``` + +## Reproducing published results + +The released checkpoint [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra) was trained on `lerobot/robocerebra_unified` and evaluated with the command in the [Evaluation](#evaluation) section. CI runs the same command with `--eval.n_episodes=1` as a smoke test on every PR touching the benchmark.