Merge branch 'main' into codex/model-profiling

This commit is contained in:
Pepijn
2026-04-21 11:23:26 +02:00
committed by GitHub
23 changed files with 3856 additions and 34 deletions

View File

@@ -83,10 +83,13 @@ jobs:
cache-binary: false
- name: Login to Docker Hub
if: ${{ env.DOCKERHUB_USERNAME != '' }}
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
with:
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
env:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
# Build the benchmark-specific image. The Dockerfile separates dep-install
# from source-copy, so code-only changes skip the slow uv-sync layer
@@ -115,7 +118,7 @@ jobs:
bash -c "
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
lerobot-eval \
--policy.path=pepijn223/smolvla_libero \
--policy.path=lerobot/smolvla_libero \
--env.type=libero \
--env.task=libero_spatial \
--eval.batch_size=1 \
@@ -144,7 +147,7 @@ jobs:
--artifacts-dir /tmp/libero-artifacts \
--env libero \
--task libero_spatial \
--policy pepijn223/smolvla_libero
--policy lerobot/smolvla_libero
- name: Upload Libero rollout video
if: always()
@@ -238,10 +241,13 @@ jobs:
cache-binary: false
- name: Login to Docker Hub
if: ${{ env.DOCKERHUB_USERNAME != '' }}
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
with:
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
env:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
- name: Build MetaWorld benchmark image
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
@@ -264,7 +270,7 @@ jobs:
bash -c "
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
lerobot-eval \
--policy.path=pepijn223/smolvla_metaworld \
--policy.path=lerobot/smolvla_metaworld \
--env.type=metaworld \
--env.task=metaworld-push-v3 \
--eval.batch_size=1 \
@@ -293,7 +299,7 @@ jobs:
--artifacts-dir /tmp/metaworld-artifacts \
--env metaworld \
--task metaworld-push-v3 \
--policy pepijn223/smolvla_metaworld
--policy lerobot/smolvla_metaworld
- name: Upload MetaWorld rollout video
if: always()
@@ -310,3 +316,530 @@ jobs:
name: metaworld-metrics
path: /tmp/metaworld-artifacts/metrics.json
if-no-files-found: warn
# ── ROBOTWIN 2.0 ──────────────────────────────────────────────────────────
# Isolated image: full RoboTwin 2.0 stack — SAPIEN, mplib, CuRobo,
# pytorch3d, + simulation assets (~4 GB).
# Build takes ~20 min on first run; subsequent runs hit the layer cache.
# Requires an NVIDIA GPU runner with CUDA 12.1 drivers.
robotwin-integration-test:
name: RoboTwin 2.0 — build image + 1-episode eval
runs-on:
group: aws-g6-4xlarge-plus
env:
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
ROBOTWIN_POLICY: lerobot/smolvla_robotwin
ROBOTWIN_TASKS: beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
lfs: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
with:
cache-binary: false
- name: Login to Docker Hub
if: ${{ env.DOCKERHUB_USERNAME != '' }}
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
with:
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
env:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
# Build the full-install image: SAPIEN, mplib, CuRobo, pytorch3d +
# simulation assets (~4 GB). Layer cache lives in the runner's local
# Docker daemon — reused across re-runs on the same machine.
- name: Build RoboTwin 2.0 benchmark image
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
with:
context: .
file: docker/Dockerfile.benchmark.robotwin
push: false
load: true
tags: lerobot-benchmark-robotwin:ci
cache-from: type=local,src=/tmp/.buildx-cache-robotwin
cache-to: type=local,dest=/tmp/.buildx-cache-robotwin,mode=max
- name: Run RoboTwin 2.0 smoke eval (10 tasks, 1 episode each)
if: env.HF_USER_TOKEN != ''
run: |
# Named container (no --rm) so we can docker cp artifacts out.
docker run --name robotwin-eval --gpus all \
--shm-size=4g \
-e HF_HOME=/tmp/hf \
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
-e ROBOTWIN_POLICY="${ROBOTWIN_POLICY}" \
-e ROBOTWIN_TASKS="${ROBOTWIN_TASKS}" \
lerobot-benchmark-robotwin:ci \
bash -c "
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
cd /opt/robotwin && lerobot-eval \
--policy.path=\"\$ROBOTWIN_POLICY\" \
--env.type=robotwin \
--env.task=\"\$ROBOTWIN_TASKS\" \
--eval.batch_size=1 \
--eval.n_episodes=1 \
--eval.use_async_envs=false \
--policy.device=cuda \
'--rename_map={\"observation.images.head_camera\": \"observation.images.camera1\", \"observation.images.left_camera\": \"observation.images.camera2\", \"observation.images.right_camera\": \"observation.images.camera3\"}' \
--output_dir=/tmp/eval-artifacts
python /lerobot/scripts/ci/extract_task_descriptions.py \
--env robotwin \
--task \"\$ROBOTWIN_TASKS\" \
--output /tmp/eval-artifacts/task_descriptions.json
"
- name: Copy RoboTwin artifacts from container
if: always()
run: |
mkdir -p /tmp/robotwin-artifacts
docker cp robotwin-eval:/tmp/eval-artifacts/. /tmp/robotwin-artifacts/ 2>/dev/null || true
docker rm -f robotwin-eval || true
- name: Parse RoboTwin eval metrics
if: always()
run: |
python3 scripts/ci/parse_eval_metrics.py \
--artifacts-dir /tmp/robotwin-artifacts \
--env robotwin \
--task "${ROBOTWIN_TASKS}" \
--policy "${ROBOTWIN_POLICY}"
- name: Upload RoboTwin rollout video
if: always()
uses: actions/upload-artifact@v4
with:
name: robotwin-rollout-video
path: /tmp/robotwin-artifacts/videos/
if-no-files-found: warn
- name: Upload RoboTwin eval metrics
if: always()
uses: actions/upload-artifact@v4
with:
name: robotwin-metrics
path: /tmp/robotwin-artifacts/metrics.json
if-no-files-found: warn
# ── ROBOCASA365 ──────────────────────────────────────────────────────────
# Isolated image: robocasa + robosuite installed manually as editable
# clones (no `lerobot[robocasa]` extra — robocasa's setup.py pins
# `lerobot==0.3.3`, which would shadow this repo's lerobot).
robocasa-integration-test:
name: RoboCasa365 — build image + 1-episode eval
runs-on:
group: aws-g6-4xlarge-plus
env:
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
lfs: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
with:
cache-binary: false
- name: Login to Docker Hub
if: ${{ env.DOCKERHUB_USERNAME != '' }}
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
with:
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
env:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
- name: Build RoboCasa365 benchmark image
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
with:
context: .
file: docker/Dockerfile.benchmark.robocasa
push: false
load: true
tags: lerobot-benchmark-robocasa:ci
- name: Run RoboCasa365 smoke eval (10 atomic tasks, 1 episode each)
if: env.HF_USER_TOKEN != ''
run: |
docker run --name robocasa-eval --gpus all \
--shm-size=4g \
-e HF_HOME=/tmp/hf \
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
-e MUJOCO_GL=egl \
lerobot-benchmark-robocasa:ci \
bash -c "
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
lerobot-eval \
--policy.path=lerobot/smolvla_robocasa \
--env.type=robocasa \
--env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
--eval.batch_size=1 \
--eval.n_episodes=1 \
--eval.use_async_envs=false \
--policy.device=cuda \
'--rename_map={\"observation.images.robot0_agentview_left\": \"observation.images.camera1\", \"observation.images.robot0_eye_in_hand\": \"observation.images.camera2\", \"observation.images.robot0_agentview_right\": \"observation.images.camera3\"}' \
--output_dir=/tmp/eval-artifacts
python scripts/ci/extract_task_descriptions.py \
--env robocasa \
--task CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
--output /tmp/eval-artifacts/task_descriptions.json
"
- name: Copy RoboCasa365 artifacts from container
if: always()
run: |
mkdir -p /tmp/robocasa-artifacts
docker cp robocasa-eval:/tmp/eval-artifacts/. /tmp/robocasa-artifacts/ 2>/dev/null || true
docker rm -f robocasa-eval || true
- name: Parse RoboCasa365 eval metrics
if: always()
run: |
python3 scripts/ci/parse_eval_metrics.py \
--artifacts-dir /tmp/robocasa-artifacts \
--env robocasa \
--task atomic_smoke_10 \
--policy lerobot/smolvla_robocasa
- name: Upload RoboCasa365 rollout video
if: always()
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
with:
name: robocasa-rollout-video
path: /tmp/robocasa-artifacts/videos/
if-no-files-found: warn
- name: Upload RoboCasa365 eval metrics
if: always()
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
with:
name: robocasa-metrics
path: /tmp/robocasa-artifacts/metrics.json
if-no-files-found: warn
# ── ROBOCEREBRA ───────────────────────────────────────────────────────────
# Reuses the LIBERO simulator (libero_10 suite) with RoboCerebra camera
# defaults (image/wrist_image). The image is layered on
# huggingface/lerobot-gpu, which already ships [libero] as part of [all].
robocerebra-integration-test:
name: RoboCerebra — build image + 1-episode eval
runs-on:
group: aws-g6-4xlarge-plus
env:
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
lfs: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
with:
cache-binary: false
- name: Login to Docker Hub
if: ${{ env.DOCKERHUB_USERNAME != '' }}
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
with:
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
env:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
- name: Build RoboCerebra benchmark image
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
with:
context: .
file: docker/Dockerfile.benchmark.robocerebra
push: false
load: true
tags: lerobot-benchmark-robocerebra:ci
cache-from: type=local,src=/tmp/.buildx-cache-robocerebra
cache-to: type=local,dest=/tmp/.buildx-cache-robocerebra,mode=max
- name: Run RoboCerebra smoke eval (1 episode)
if: env.HF_USER_TOKEN != ''
run: |
docker run --name robocerebra-eval --gpus all \
--shm-size=4g \
-e HF_HOME=/tmp/hf \
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
-e LIBERO_DATA_FOLDER=/tmp/libero_data \
lerobot-benchmark-robocerebra:ci \
bash -c "
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
lerobot-eval \
--policy.path=lerobot/smolvla_robocerebra \
--env.type=libero \
--env.task=libero_10 \
--env.fps=20 \
--env.obs_type=pixels_agent_pos \
--env.observation_height=256 \
--env.observation_width=256 \
'--env.camera_name_mapping={\"agentview_image\": \"image\", \"robot0_eye_in_hand_image\": \"wrist_image\"}' \
--eval.batch_size=1 \
--eval.n_episodes=1 \
--eval.use_async_envs=false \
--policy.device=cuda \
'--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
--policy.empty_cameras=1 \
--output_dir=/tmp/eval-artifacts
python scripts/ci/extract_task_descriptions.py \
--env libero --task libero_10 \
--output /tmp/eval-artifacts/task_descriptions.json
"
- name: Copy RoboCerebra artifacts from container
if: always()
run: |
mkdir -p /tmp/robocerebra-artifacts
docker cp robocerebra-eval:/tmp/eval-artifacts/. /tmp/robocerebra-artifacts/ 2>/dev/null || true
docker rm -f robocerebra-eval || true
- name: Parse RoboCerebra eval metrics
if: always()
run: |
python3 scripts/ci/parse_eval_metrics.py \
--artifacts-dir /tmp/robocerebra-artifacts \
--env robocerebra \
--task libero_10 \
--policy lerobot/smolvla_robocerebra
- name: Upload RoboCerebra rollout video
if: always()
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
with:
name: robocerebra-rollout-video
path: /tmp/robocerebra-artifacts/videos/
if-no-files-found: warn
- name: Upload RoboCerebra eval metrics
if: always()
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
with:
name: robocerebra-metrics
path: /tmp/robocerebra-artifacts/metrics.json
if-no-files-found: warn
# ── ROBOMME ───────────────────────────────────────────────────────────────
# Isolated image: mani-skill/SAPIEN/Vulkan chain with gymnasium and numpy
# overrides (robomme can't be a pyproject extra due to numpy<2 pin).
robomme-integration-test:
name: RoboMME — build image + 1-episode eval
runs-on:
group: aws-g6-4xlarge-plus
env:
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
ROBOMME_POLICY: lerobot/smolvla_robomme
ROBOMME_TASKS: PickXtimes,BinFill,StopCube,MoveCube,InsertPeg,SwingXtimes,VideoUnmask,ButtonUnmask,PickHighlight,PatternLock
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
lfs: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
with:
cache-binary: false
- name: Login to Docker Hub
if: ${{ env.DOCKERHUB_USERNAME != '' }}
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
with:
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
env:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
- name: Build RoboMME benchmark image
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
with:
context: .
file: docker/Dockerfile.benchmark.robomme
push: false
load: true
tags: lerobot-benchmark-robomme:ci
- name: Run RoboMME smoke eval (10 tasks, 1 episode each)
if: env.HF_USER_TOKEN != ''
run: |
docker run --name robomme-eval --gpus all \
--shm-size=4g \
-e HF_HOME=/tmp/hf \
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
-e ROBOMME_POLICY="${ROBOMME_POLICY}" \
-e ROBOMME_TASKS="${ROBOMME_TASKS}" \
lerobot-benchmark-robomme:ci \
bash -c "
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
lerobot-eval \
--policy.path=\"\$ROBOMME_POLICY\" \
--env.type=robomme \
--env.task=\"\$ROBOMME_TASKS\" \
--env.dataset_split=test \
--env.task_ids=[0] \
--eval.batch_size=1 \
--eval.n_episodes=1 \
--eval.use_async_envs=false \
--policy.device=cuda \
'--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
--policy.empty_cameras=3 \
--output_dir=/tmp/eval-artifacts
python scripts/ci/extract_task_descriptions.py \
--env robomme --task \"\$ROBOMME_TASKS\" \
--output /tmp/eval-artifacts/task_descriptions.json
"
- name: Copy RoboMME artifacts from container
if: always()
run: |
mkdir -p /tmp/robomme-artifacts
docker cp robomme-eval:/tmp/eval-artifacts/. /tmp/robomme-artifacts/ 2>/dev/null || true
docker rm -f robomme-eval || true
- name: Parse RoboMME eval metrics
if: always()
run: |
python3 scripts/ci/parse_eval_metrics.py \
--artifacts-dir /tmp/robomme-artifacts \
--env robomme \
--task "${ROBOMME_TASKS}" \
--policy "${ROBOMME_POLICY}"
- name: Upload RoboMME rollout video
if: always()
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
with:
name: robomme-rollout-video
path: /tmp/robomme-artifacts/videos/
if-no-files-found: warn
- name: Upload RoboMME eval metrics
if: always()
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
with:
name: robomme-metrics
path: /tmp/robomme-artifacts/metrics.json
if-no-files-found: warn
# ── LIBERO-plus ───────────────────────────────────────────────────────────
# Isolated image: LIBERO-plus fork cloned into /home/user_lerobot on top of
# huggingface/lerobot-gpu (see docker/Dockerfile.benchmark.libero_plus).
libero-plus-integration-test:
name: LIBERO-plus — build image + 1-episode eval
runs-on:
group: aws-g6-4xlarge-plus
env:
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
LIBERO_PLUS_SUITE: libero_spatial
LIBERO_PLUS_POLICY: lerobot/smolvla_libero_plus
LIBERO_PLUS_TASK_IDS: "[0,100,260,500,1000,1500,2000,2400]"
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false
lfs: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
with:
cache-binary: false
- name: Login to Docker Hub
if: ${{ env.DOCKERHUB_USERNAME != '' }}
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
with:
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
env:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
- name: Build LIBERO-plus benchmark image
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
with:
context: .
file: docker/Dockerfile.benchmark.libero_plus
push: false
load: true
tags: lerobot-benchmark-libero-plus:ci
cache-from: type=local,src=/tmp/.buildx-cache-libero-plus
cache-to: type=local,dest=/tmp/.buildx-cache-libero-plus,mode=max
- name: Run LIBERO-plus smoke eval (1 episode)
if: env.HF_USER_TOKEN != ''
run: |
docker run --name libero-plus-eval --gpus all \
--shm-size=4g \
-e HF_HOME=/tmp/hf \
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
-e LIBERO_PLUS_SUITE="${LIBERO_PLUS_SUITE}" \
-e LIBERO_PLUS_POLICY="${LIBERO_PLUS_POLICY}" \
-e LIBERO_PLUS_TASK_IDS="${LIBERO_PLUS_TASK_IDS}" \
lerobot-benchmark-libero-plus:ci \
bash -c "
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
lerobot-eval \
--policy.path=\"\$LIBERO_PLUS_POLICY\" \
--env.type=libero_plus \
--env.task=\"\$LIBERO_PLUS_SUITE\" \
--env.task_ids=\"\$LIBERO_PLUS_TASK_IDS\" \
--eval.batch_size=1 \
--eval.n_episodes=1 \
--eval.use_async_envs=false \
--policy.device=cuda \
'--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
--policy.empty_cameras=1 \
--output_dir=/tmp/eval-artifacts
python scripts/ci/extract_task_descriptions.py \
--env libero_plus --task \"\$LIBERO_PLUS_SUITE\" \
--output /tmp/eval-artifacts/task_descriptions.json
"
- name: Copy LIBERO-plus artifacts from container
if: always()
run: |
mkdir -p /tmp/libero-plus-artifacts
docker cp libero-plus-eval:/tmp/eval-artifacts/. /tmp/libero-plus-artifacts/ 2>/dev/null || true
docker rm -f libero-plus-eval || true
- name: Parse LIBERO-plus eval metrics
if: always()
run: |
python3 scripts/ci/parse_eval_metrics.py \
--artifacts-dir /tmp/libero-plus-artifacts \
--env libero_plus \
--task "${LIBERO_PLUS_SUITE}" \
--policy "${LIBERO_PLUS_POLICY}"
- name: Upload LIBERO-plus rollout video
if: always()
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
with:
name: libero-plus-rollout-video
path: /tmp/libero-plus-artifacts/videos/
if-no-files-found: warn
- name: Upload LIBERO-plus eval metrics
if: always()
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
with:
name: libero-plus-metrics
path: /tmp/libero-plus-artifacts/metrics.json
if-no-files-found: warn

View File

@@ -0,0 +1,84 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Benchmark image for LIBERO-plus integration tests.
# Extends the nightly GPU image (which has lerobot[all]) with the LIBERO-plus
# fork source + its 6.4 GB perturbation assets.
#
# Build: docker build -f docker/Dockerfile.benchmark.libero_plus -t lerobot-benchmark-libero-plus .
# Run: docker run --gpus all --rm lerobot-benchmark-libero-plus lerobot-eval ...
FROM huggingface/lerobot-gpu:latest
ENV MUJOCO_GL=egl
# unzip for the 6.4 GB assets.zip; the rest are LIBERO-plus build-time extras
# (wand / ImageMagick / fontconfig) not in the nightly base.
USER root
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
unzip libexpat1 libfontconfig1-dev libmagickwand-dev \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
USER user_lerobot
# robosuite==1.4.1 is mandatory (the fork uses `single_arm_env` removed in
# v1.5+). The rest are LIBERO-plus runtime deps pulled from its setup.py.
# We install these explicitly instead of via the [libero_plus] extra because
# the extra's `libero @ git+...` dep installs as a namespace package and then
# clone and PYTHONPATH-override it below.
RUN uv pip install --no-cache \
"robosuite==1.4.1" \
"bddl==1.0.1" \
"easydict==1.13" \
"mujoco==3.7.0" \
"matplotlib==3.10.8" \
"Wand==0.6.13" \
"scikit-image==0.25.2" \
"gym==0.26.2"
# Clone LIBERO-plus and make it importable as `libero`. The nightly base has
# hf-libero (10 tasks) preinstalled via lerobot[libero]; uninstall it so
# Python resolves `import libero` to the 2402-task LIBERO-plus module instead.
# Pinned to the current upstream main SHA so benchmark builds stay reproducible.
ARG LIBERO_PLUS_SHA=4976dc3
ENV LIBERO_PLUS_ROOT=/home/user_lerobot/libero-plus/libero/libero
RUN git clone https://github.com/sylvestf/LIBERO-plus.git /home/user_lerobot/libero-plus \
&& git -C /home/user_lerobot/libero-plus checkout ${LIBERO_PLUS_SHA} \
&& cd /home/user_lerobot/libero-plus && uv pip install --no-cache --no-deps -e "." \
&& (uv pip uninstall hf-libero 2>/dev/null || true)
ENV PYTHONPATH="/home/user_lerobot/libero-plus:${PYTHONPATH}"
# Perturbation textures/scenes: bddl_base_domain.py resolves XMLs via
# DIR_PATH/../assets (package-relative, ignoring ~/.libero/config.yaml). All
# 2402 tasks reference files that ship only in Sylvest/LIBERO-plus's
# assets.zip (6.4 GB) under a deep author-internal prefix — extract and
# flatten it under ${LIBERO_PLUS_ROOT}/assets.
RUN python -c "\
from huggingface_hub import hf_hub_download; \
hf_hub_download(repo_id='Sylvest/LIBERO-plus', repo_type='dataset', \
filename='assets.zip', local_dir='/tmp/libero-plus-dl')" \
&& unzip -q /tmp/libero-plus-dl/assets.zip -d /tmp/libero-plus-dl/extract \
&& ASSETS_DIR=$(find /tmp/libero-plus-dl/extract -type d -name assets | head -1) \
&& mv "${ASSETS_DIR}" ${LIBERO_PLUS_ROOT}/assets \
&& rm -rf /tmp/libero-plus-dl
# Point ~/.libero/config.yaml at the clone so LIBERO-plus's imports are
# non-interactive (it calls input() when the config is missing).
RUN mkdir -p /home/user_lerobot/.libero \
&& printf "assets: ${LIBERO_PLUS_ROOT}/assets\nbddl_files: ${LIBERO_PLUS_ROOT}/bddl_files\ndatasets: ${LIBERO_PLUS_ROOT}/../datasets\ninit_states: ${LIBERO_PLUS_ROOT}/init_files\n" \
> /home/user_lerobot/.libero/config.yaml
# Overlay the PR's source code on top of the nightly image.
COPY --chown=user_lerobot:user_lerobot . .
CMD ["/bin/bash"]

View File

@@ -0,0 +1,71 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Benchmark image for RoboCasa365 integration tests.
# Extends the nightly GPU image (which already has all extras installed)
# with the PR's source code and RoboCasa-specific asset setup.
#
# Build: docker build -f docker/Dockerfile.benchmark.robocasa -t lerobot-benchmark-robocasa .
# Run: docker run --gpus all --rm lerobot-benchmark-robocasa lerobot-eval ...
FROM huggingface/lerobot-gpu:latest
# Install robocasa + robosuite as editable clones. pip-installing from git
# omits data files like robocasa/models/assets/box_links/box_links_assets.json
# (not declared in package_data), which download_kitchen_assets needs at import.
#
# `--no-deps` on robocasa is deliberate: its setup.py pins `lerobot==0.3.3`
# in install_requires, which would shadow the editable lerobot baked into
# this image. We install robocasa's actual runtime deps explicitly instead.
# Pinned SHAs for reproducible benchmark runs. Bump when you need an
# upstream fix; don't rely on `main`/`master` drift.
ARG ROBOCASA_SHA=56e355ccc64389dfc1b8a61a33b9127b975ba681
ARG ROBOSUITE_SHA=aaa8b9b214ce8e77e82926d677b4d61d55e577ab
RUN git clone https://github.com/robocasa/robocasa.git ~/robocasa && \
git -C ~/robocasa checkout ${ROBOCASA_SHA} && \
git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite && \
git -C ~/robosuite checkout ${ROBOSUITE_SHA} && \
uv pip install --no-cache -e ~/robocasa --no-deps && \
uv pip install --no-cache -e ~/robosuite && \
uv pip install --no-cache \
"numpy==2.2.5" "numba==0.61.2" "scipy==1.15.3" "mujoco==3.3.1" \
"pygame==2.6.1" "Pillow==12.2.0" "opencv-python==4.13.0.92" \
"pyyaml==6.0.3" "pynput==1.8.1" "tqdm==4.67.3" "termcolor==3.3.0" \
"imageio==2.37.3" "h5py==3.16.0" "lxml==6.0.4" "hidapi==0.14.0.post4" \
"tianshou==0.4.10" "gymnasium==1.2.3"
# Set up robocasa macros and download kitchen assets. We need:
# - tex : base environment textures
# - tex_generative : AI-generated textures; kitchen fixture XMLs embed
# refs to generative_textures/wall/tex*.png
# unconditionally, so MjModel.from_xml_string fails
# at reset time without them (even if the env is
# constructed with generative_textures=None).
# - fixtures_lw : lightwheel kitchen fixtures (fridge, counters...)
# - objs_lw : lightwheel object meshes (stools, misc props)
# We skip the objaverse/aigen object packs (~30GB combined) by pairing
# this with --env.obj_registries=["lightwheel"] on the lerobot side.
# The download script prompts interactively, so pipe 'y' to auto-accept.
RUN python -m robocasa.scripts.setup_macros && \
yes y | python -m robocasa.scripts.download_kitchen_assets \
--type tex tex_generative fixtures_lw objs_lw
# Overlay the PR's source code on top of the nightly image.
COPY --chown=user_lerobot:user_lerobot . .
# Re-install lerobot editably so the new source (with RoboCasaEnv registration)
# replaces the stale package baked into the nightly image.
RUN uv pip install --no-cache --no-deps -e .
CMD ["/bin/bash"]

View File

@@ -0,0 +1,43 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Benchmark image for RoboCerebra integration tests.
# RoboCerebra reuses LIBERO's simulator (libero_10 suite) with a different
# rename_map, so this image is identical to the LIBERO benchmark image —
# extends the nightly GPU base with LIBERO assets + the PR's source code.
#
# Build: docker build -f docker/Dockerfile.benchmark.robocerebra -t lerobot-benchmark-robocerebra .
# Run: docker run --gpus all --rm lerobot-benchmark-robocerebra lerobot-eval ...
FROM huggingface/lerobot-gpu:latest
# Pre-download lerobot/libero-assets from HF Hub so nothing is fetched at
# runtime (which times out on CI). Point the libero config at the cached path.
# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing,
# so we write the config before any libero import can happen.
RUN LIBERO_DIR=$(python -c \
"import importlib.util, os; s=importlib.util.find_spec('libero'); \
print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \
mkdir -p /home/user_lerobot/.libero && \
python -c "\
from huggingface_hub import snapshot_download; \
snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \
local_dir='/home/user_lerobot/.libero/assets')" && \
printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \
> /home/user_lerobot/.libero/config.yaml
# Overlay the PR's source code on top of the nightly image.
COPY --chown=user_lerobot:user_lerobot . .
CMD ["/bin/bash"]

View File

@@ -0,0 +1,56 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Benchmark image for RoboMME integration tests.
# Extends the nightly GPU image (which has lerobot[all]) with Vulkan system
# libs for ManiSkill/SAPIEN and the robomme extra. robomme isn't in [all]
# because mani-skill hard-pins gymnasium==0.29.1 and numpy<2.0.0 which
# conflict with lerobot's defaults; both are safe at runtime:
# - gymnasium 0.29.x has the same 5-tuple step() API as 1.x (since 0.26)
# - numpy 1.26.4 is API-compatible with lerobot's actual usage.
#
# Build: docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-benchmark-robomme .
# Run: docker run --gpus all --rm lerobot-benchmark-robomme lerobot-eval ...
FROM huggingface/lerobot-gpu:latest
# NVIDIA Container Toolkit: expose Vulkan driver capability for headless rendering.
ENV NVIDIA_DRIVER_CAPABILITIES=all \
VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json
# ManiSkill/SAPIEN's renderer needs Vulkan, which isn't in the base image.
USER root
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
libvulkan1 libvulkan-dev mesa-vulkan-drivers \
&& mkdir -p /usr/share/vulkan/icd.d \
&& echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
> /usr/share/vulkan/icd.d/nvidia_icd.json \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
USER user_lerobot
# Install smolvla + av-dep via the PR's pyproject, then layer robomme on top
# with gymnasium/numpy overrides. robomme isn't a pyproject extra because its
# mani-skill pin conflicts with lerobot's base numpy>=2 (see pyproject.toml).
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
RUN printf 'gymnasium==0.29.1\nnumpy==1.26.4\n' > /tmp/robomme_override.txt \
&& uv pip install --no-cache --override /tmp/robomme_override.txt \
-e ".[smolvla,av-dep]" \
"robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main" \
&& python -c "import robomme; print('robomme import OK')"
# Overlay the PR's source code on top of the nightly image.
COPY --chown=user_lerobot:user_lerobot . .
CMD ["/bin/bash"]

View File

@@ -0,0 +1,122 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Benchmark image for RoboTwin 2.0 integration tests.
# Extends the nightly GPU image with the RoboTwin simulator stack:
# sapien/mplib/pytorch3d + NVlabs CuRobo + embodiments.zip + objects.zip
# (~3.96 GB of assets; background_texture.zip ~11 GB skipped for smoke eval).
#
# Build: docker build -f docker/Dockerfile.benchmark.robotwin -t lerobot-benchmark-robotwin .
# Run: docker run --gpus all --rm lerobot-benchmark-robotwin \
# lerobot-eval --env.type=robotwin --env.task=beat_block_hammer ...
FROM huggingface/lerobot-gpu:latest
ENV NVIDIA_DRIVER_CAPABILITIES=all \
VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json \
ROBOTWIN_ROOT=/opt/robotwin
# The nightly base is CUDA -base (no compiler, no Vulkan loader). CuRobo's
# `pip install -e .` runs nvcc, and SAPIEN renders via Vulkan — add both.
USER root
# Pinned upstream SHA for reproducible benchmark runs. Bump when we need
# an upstream fix; don't rely on `main` drift.
ARG ROBOTWIN_SHA=0aeea2d669c0f8516f4d5785f0aa33ba812c14b4
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
cuda-nvcc-12-4 cuda-cudart-dev-12-4 \
libvulkan1 vulkan-tools \
&& mkdir -p /usr/share/vulkan/icd.d \
&& echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
> /usr/share/vulkan/icd.d/nvidia_icd.json \
&& git clone https://github.com/RoboTwin-Platform/RoboTwin.git ${ROBOTWIN_ROOT} \
&& git -C ${ROBOTWIN_ROOT} checkout ${ROBOTWIN_SHA} \
&& chown -R user_lerobot:user_lerobot ${ROBOTWIN_ROOT} \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
USER user_lerobot
# RoboTwin runtime deps (av is already in the base via [av-dep]).
RUN uv pip install --no-cache \
"sapien==3.0.0b1" "mplib==0.2.1" "transforms3d==0.4.2" "trimesh==4.4.3" \
"open3d==0.19.0" "imageio==2.34.2" termcolor zarr pydantic h5py
# pytorch3d has no universal wheel; must be built from source (~10 min, cached).
RUN uv pip install --no-cache --no-build-isolation \
"git+https://github.com/facebookresearch/pytorch3d.git@stable"
# CuRobo — NVlabs motion generator; TORCH_CUDA_ARCH_LIST must be set or the
# build aborts on an empty arch list. Pinned SHA for reproducibility.
ARG CUROBO_SHA=ca941586c33b8482ed9c0e74d60f23efd64b516a
RUN cd ${ROBOTWIN_ROOT}/envs \
&& git clone https://github.com/NVlabs/curobo.git \
&& git -C curobo checkout ${CUROBO_SHA} \
&& cd curobo \
&& TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" \
uv pip install -e . --no-build-isolation --no-cache
# Upstream patches (mirror RoboTwin's script/_install.sh).
# These patches target the exact versions pinned above; re-check when upgrading.
# mplib==0.2.1: drop a broken `or collide` clause in planner.py.
# Safe to remove once mplib > 0.2.1 ships with the fix upstream.
# sapien==3.0.0b1: fix URDF loader encoding + .srdf extension check.
# Safe to remove once sapien > 3.0.0b1 ships with the fix upstream.
RUN python - <<'EOF'
import pathlib, re, site
for d in site.getsitepackages():
p = pathlib.Path(d) / "mplib" / "planner.py"
if p.exists():
p.write_text(re.sub(r"\bor collide\b", "", p.read_text(), count=1))
print(f"mplib patch applied: {p}")
p = pathlib.Path(d) / "sapien" / "wrapper" / "urdf_loader.py"
if p.exists():
src = p.read_text().replace(
"with open(srdf_path) as f:", 'with open(srdf_path, encoding="utf-8") as f:'
).replace('"srdf"', '".srdf"')
p.write_text(src)
print(f"sapien patch applied: {p}")
EOF
# Simulation assets from TianxingChen/RoboTwin2.0: embodiments (~220 MB) +
# objects (~3.74 GB). background_texture (~11 GB) is intentionally skipped.
# The dataset is public — no auth token needed.
RUN python - <<'EOF'
import os, pathlib, zipfile
from huggingface_hub import hf_hub_download
assets_dir = pathlib.Path(os.environ["ROBOTWIN_ROOT"]) / "assets"
assets_dir.mkdir(parents=True, exist_ok=True)
for fname in ("embodiments.zip", "objects.zip"):
local = hf_hub_download(
repo_id="TianxingChen/RoboTwin2.0",
repo_type="dataset",
filename=fname,
local_dir=str(assets_dir),
)
with zipfile.ZipFile(local, "r") as z:
z.extractall(str(assets_dir))
pathlib.Path(local).unlink()
EOF
WORKDIR ${ROBOTWIN_ROOT}
RUN python script/update_embodiment_config_path.py
ENV PYTHONPATH="${ROBOTWIN_ROOT}:${PYTHONPATH}"
# Return to the lerobot source directory (set by base image) before overlaying.
WORKDIR /lerobot
# Overlay the PR's source code on top of the nightly image.
COPY --chown=user_lerobot:user_lerobot . .
CMD ["/bin/bash"]

View File

@@ -77,8 +77,18 @@
title: Adding a New Benchmark
- local: libero
title: LIBERO
- local: libero_plus
title: LIBERO-plus
- local: metaworld
title: Meta-World
- local: robotwin
title: RoboTwin 2.0
- local: robocasa
title: RoboCasa365
- local: robocerebra
title: RoboCerebra
- local: robomme
title: RoboMME
- local: envhub_isaaclab_arena
title: NVIDIA IsaacLab Arena Environments
title: "Benchmarks"

188
docs/source/libero_plus.mdx Normal file
View File

@@ -0,0 +1,188 @@
# LIBERO-plus
LIBERO-plus is a **robustness benchmark** for Vision-Language-Action (VLA) models built on top of [LIBERO](./libero). It systematically stress-tests policies by applying **seven independent perturbation dimensions** to the original LIBERO task set, exposing failure modes that standard benchmarks miss.
- Paper: [In-depth Robustness Analysis of Vision-Language-Action Models](https://arxiv.org/abs/2510.13626)
- GitHub: [sylvestf/LIBERO-plus](https://github.com/sylvestf/LIBERO-plus)
- Dataset: [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus)
![An overview of the LIBERO-plus benchmark perturbation dimensions](https://github.com/sylvestf/LIBERO-plus/raw/main/static/images/libero-plus.jpg)
## Perturbation dimensions
LIBERO-plus creates ~10 000 task variants by perturbing each original LIBERO task along these axes:
| Dimension | What changes |
| --------------------- | ----------------------------------------------------- |
| Objects layout | Target position, presence of confounding objects |
| Camera viewpoints | Camera position, orientation, field-of-view |
| Robot initial states | Manipulator start pose |
| Language instructions | LLM-rewritten task description (paraphrase / synonym) |
| Light conditions | Intensity, direction, color, shadow |
| Background textures | Scene surface and object appearance |
| Sensor noise | Photometric distortions and image degradation |
## Available task suites
LIBERO-plus covers the same five suites as LIBERO:
| Suite | CLI name | Tasks | Max steps | Description |
| -------------- | ---------------- | ----- | --------- | -------------------------------------------------- |
| LIBERO-Spatial | `libero_spatial` | 10 | 280 | Tasks requiring reasoning about spatial relations |
| LIBERO-Object | `libero_object` | 10 | 280 | Tasks centered on manipulating different objects |
| LIBERO-Goal | `libero_goal` | 10 | 300 | Goal-conditioned tasks with changing targets |
| LIBERO-90 | `libero_90` | 90 | 400 | Short-horizon tasks from the LIBERO-100 collection |
| LIBERO-Long | `libero_10` | 10 | 520 | Long-horizon tasks from the LIBERO-100 collection |
<Tip warning={true}>
Installing LIBERO-plus **replaces** vanilla LIBERO — it uninstalls `hf-libero`
so that `import libero` resolves to the LIBERO-plus fork. You cannot have both
installed at the same time. To switch back to vanilla LIBERO, uninstall the
fork and reinstall with `pip install -e ".[libero]"`.
</Tip>
## Installation
### System dependencies (Linux only)
```bash
sudo apt install libexpat1 libfontconfig1-dev libmagickwand-dev
```
### Python package
```bash
pip install -e ".[libero]" "robosuite==1.4.1" bddl easydict mujoco wand scikit-image gym
git clone https://github.com/sylvestf/LIBERO-plus.git
cd LIBERO-plus && pip install --no-deps -e .
pip uninstall -y hf-libero # so `import libero` resolves to the fork
```
LIBERO-plus is installed from its GitHub fork rather than a pyproject extra — the fork ships as a namespace package that pip can't handle, so it must be cloned and added to `PYTHONPATH`. See `docker/Dockerfile.benchmark.libero_plus` for the canonical install. MuJoCo is required, so only Linux is supported.
<Tip>
Set the MuJoCo rendering backend before running evaluation:
```bash
export MUJOCO_GL=egl # headless / HPC / cloud
```
</Tip>
### Download LIBERO-plus assets
LIBERO-plus ships its extended asset pack separately. Download `assets.zip` from the [Hugging Face dataset](https://huggingface.co/datasets/Sylvest/LIBERO-plus/tree/main) and extract it into the LIBERO-plus package directory:
```bash
# After installing the package, find where it was installed:
python -c "import libero; print(libero.__file__)"
# Then extract assets.zip into <package_root>/libero/assets/
```
## Evaluation
### Default evaluation (recommended)
Evaluate across the four standard suites (10 episodes per task):
```bash
lerobot-eval \
--policy.path="your-policy-id" \
--env.type=libero_plus \
--env.task=libero_spatial,libero_object,libero_goal,libero_10 \
--eval.batch_size=1 \
--eval.n_episodes=10 \
--env.max_parallel_tasks=1
```
### Single-suite evaluation
Evaluate on one LIBERO-plus suite:
```bash
lerobot-eval \
--policy.path="your-policy-id" \
--env.type=libero_plus \
--env.task=libero_spatial \
--eval.batch_size=1 \
--eval.n_episodes=10
```
- `--env.task` picks the suite (`libero_spatial`, `libero_object`, etc.).
- `--env.task_ids` restricts to specific task indices (`[0]`, `[1,2,3]`, etc.). Omit to run all tasks in the suite.
- `--eval.batch_size` controls how many environments run in parallel.
- `--eval.n_episodes` sets how many episodes to run per task.
### Multi-suite evaluation
Benchmark a policy across multiple suites at once by passing a comma-separated list:
```bash
lerobot-eval \
--policy.path="your-policy-id" \
--env.type=libero_plus \
--env.task=libero_spatial,libero_object \
--eval.batch_size=1 \
--eval.n_episodes=10
```
### Control mode
LIBERO-plus supports two control modes — `relative` (default) and `absolute`. Different VLA checkpoints are trained with different action parameterizations, so make sure the mode matches your policy:
```bash
--env.control_mode=relative # or "absolute"
```
### Policy inputs and outputs
**Observations:**
- `observation.state` — 8-dim proprioceptive features (eef position, axis-angle orientation, gripper qpos)
- `observation.images.image` — main camera view (`agentview_image`), HWC uint8
- `observation.images.image2` — wrist camera view (`robot0_eye_in_hand_image`), HWC uint8
**Actions:**
- Continuous control in `Box(-1, 1, shape=(7,))` — 6D end-effector delta + 1D gripper
### Recommended evaluation episodes
For reproducible benchmarking, use **10 episodes per task** across all four standard suites (Spatial, Object, Goal, Long). This gives 400 total episodes and matches the protocol used for published results.
## Training
### Dataset
A LeRobot-format training dataset for LIBERO-plus is available at:
- [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus)
### Example training command
```bash
lerobot-train \
--policy.type=smolvla \
--policy.repo_id=${HF_USER}/smolvla_libero_plus \
--policy.load_vlm_weights=true \
--dataset.repo_id=lerobot/libero_plus \
--env.type=libero_plus \
--env.task=libero_spatial \
--output_dir=./outputs/ \
--steps=100000 \
--batch_size=4 \
--eval.batch_size=1 \
--eval.n_episodes=1 \
--eval_freq=1000
```
## Relationship to LIBERO
LIBERO-plus is a drop-in extension of LIBERO:
- Same Python gym interface (`LiberoEnv`, `LiberoProcessorStep`)
- Same camera names and observation/action format
- Same task suite names
- Installs under the same `libero` Python package name (different GitHub repo)
To use the original LIBERO benchmark, see [LIBERO](./libero) and use `--env.type=libero`.

188
docs/source/robocasa.mdx Normal file
View File

@@ -0,0 +1,188 @@
# RoboCasa365
[RoboCasa365](https://robocasa.ai) is a large-scale simulation framework for training and benchmarking **generalist robots** in everyday kitchen tasks. It ships 365 diverse manipulation tasks across 2,500 kitchen environments, 3,200+ object assets and 600+ hours of human demonstration data, on a PandaOmron 12-DOF mobile manipulator (Franka arm on a holonomic base).
- Paper: [RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots](https://arxiv.org/abs/2406.02523)
- GitHub: [robocasa/robocasa](https://github.com/robocasa/robocasa)
- Project website: [robocasa.ai](https://robocasa.ai)
- Pretrained policy: [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa)
- Single-task dataset (CloseFridge): [`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge)
<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/robocasa-banner.webp"
alt="RoboCasa365 benchmark overview"
width="85%"
/>
## Available tasks
RoboCasa365 organizes its 365 tasks into two families and three upstream benchmark groups that LeRobot exposes as first-class `--env.task` shortcuts:
| Family | Tasks | Description |
| --------- | ----- | ------------------------------------------------------------------------------- |
| Atomic | ~65 | Single-skill tasks: pick-and-place, door/drawer manipulation, appliance control |
| Composite | ~300 | Multi-step tasks across 60+ categories: cooking, cleaning, organizing, etc. |
**Atomic task examples:** `CloseFridge`, `OpenDrawer`, `OpenCabinet`, `TurnOnMicrowave`, `TurnOffStove`, `NavigateKitchen`, `PickPlaceCounterToStove`.
**Composite task categories:** baking, boiling, brewing, chopping, clearing table, defrosting food, loading dishwasher, making tea, microwaving food, washing dishes, and more.
`--env.task` accepts three forms:
- a single task name (`CloseFridge`)
- a comma-separated list (`CloseFridge,OpenBlenderLid,PickPlaceCoffee`)
- a benchmark-group shortcut — `atomic_seen`, `composite_seen`, `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`, `pretrain300` — which auto-expands to the upstream task list and auto-sets the dataset `split` (`target` or `pretrain`).
## Installation
RoboCasa and its dependency `robosuite` are not published on PyPI, and RoboCasa's own `setup.py` hardcodes `lerobot==0.3.3`, which conflicts with this repo's `lerobot`. LeRobot therefore does **not** expose a `robocasa` extra — install the two packages manually as editable clones (using `--no-deps` on `robocasa` to skip its shadowed `lerobot` pin):
```bash
# After following the standard LeRobot installation instructions.
git clone https://github.com/robocasa/robocasa.git ~/robocasa
git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite
pip install -e ~/robocasa --no-deps
pip install -e ~/robosuite
# Robocasa's runtime deps (the ones its setup.py would have pulled, minus
# the bad lerobot pin).
pip install numpy numba scipy mujoco pygame Pillow opencv-python \
pyyaml pynput tqdm termcolor imageio h5py lxml hidapi \
tianshou gymnasium
python -m robocasa.scripts.setup_macros
# Lightweight assets (lightwheel object meshes + textures). Enough for
# the default env out of the box.
python -m robocasa.scripts.download_kitchen_assets \
--type tex tex_generative fixtures_lw objs_lw
# Optional: full objaverse/aigen registries (~30GB) for richer object
# variety. Enable at eval time via --env.obj_registries (see below).
# python -m robocasa.scripts.download_kitchen_assets --type objs_objaverse
```
<Tip>
RoboCasa requires MuJoCo. Set the rendering backend before training or evaluation:
```bash
export MUJOCO_GL=egl # for headless servers (HPC, cloud)
```
</Tip>
### Object registries
By default the env samples objects only from the `lightwheel` registry (what `--type objs_lw` ships), which avoids a `Probabilities contain NaN` crash when the objaverse / aigen packs aren't on disk. If you've downloaded the full asset set, enable the full registry at runtime:
```bash
--env.obj_registries='[objaverse,lightwheel]'
```
## Evaluation
All eval snippets below mirror the CI command (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps RoboCasa's native camera keys (`robot0_agentview_left` / `robot0_eye_in_hand` / `robot0_agentview_right`) onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_robocasa` policy was trained on.
### Single-task evaluation (recommended for quick iteration)
```bash
lerobot-eval \
--policy.path=lerobot/smolvla_robocasa \
--env.type=robocasa \
--env.task=CloseFridge \
--eval.batch_size=1 \
--eval.n_episodes=20 \
--eval.use_async_envs=false \
--policy.device=cuda \
'--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
```
### Multi-task evaluation
Pass a comma-separated list of tasks:
```bash
lerobot-eval \
--policy.path=lerobot/smolvla_robocasa \
--env.type=robocasa \
--env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove \
--eval.batch_size=1 \
--eval.n_episodes=20 \
--eval.use_async_envs=false \
--policy.device=cuda \
'--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
```
### Benchmark-group evaluation
Run an entire upstream group (e.g. all 18 `atomic_seen` tasks with `split=target`):
```bash
lerobot-eval \
--policy.path=lerobot/smolvla_robocasa \
--env.type=robocasa \
--env.task=atomic_seen \
--eval.batch_size=1 \
--eval.n_episodes=20 \
--eval.use_async_envs=false \
--policy.device=cuda \
'--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
```
### Recommended evaluation episodes
**20 episodes per task** for reproducible benchmarking. Matches the protocol used in published results.
## Policy inputs and outputs
**Observations** (raw RoboCasa camera names are preserved verbatim):
- `observation.state` — 16-dim proprioceptive state (base position, base quaternion, relative end-effector position, relative end-effector quaternion, gripper qpos)
- `observation.images.robot0_agentview_left` — left agent view, 256×256 HWC uint8
- `observation.images.robot0_eye_in_hand` — wrist camera view, 256×256 HWC uint8
- `observation.images.robot0_agentview_right` — right agent view, 256×256 HWC uint8
**Actions:**
- Continuous control in `Box(-1, 1, shape=(12,))` — base motion (4D) + control mode (1D) + end-effector position (3D) + end-effector rotation (3D) + gripper (1D).
## Training
### Single-task example
A ready-to-use single-task dataset is on the Hub:
[`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge).
Fine-tune a SmolVLA base on `CloseFridge`:
```bash
lerobot-train \
--policy.type=smolvla \
--policy.repo_id=${HF_USER}/smolvla_robocasa_CloseFridge \
--policy.load_vlm_weights=true \
--policy.push_to_hub=true \
--dataset.repo_id=pepijn223/robocasa_CloseFridge \
--env.type=robocasa \
--env.task=CloseFridge \
--output_dir=./outputs/smolvla_robocasa_CloseFridge \
--steps=100000 \
--batch_size=4 \
--eval_freq=5000 \
--eval.batch_size=1 \
--eval.n_episodes=5 \
--save_freq=10000
```
Evaluate the resulting checkpoint:
```bash
lerobot-eval \
--policy.path=${HF_USER}/smolvla_robocasa_CloseFridge \
--env.type=robocasa \
--env.task=CloseFridge \
--eval.batch_size=1 \
--eval.n_episodes=20
```
## Reproducing published results
The released checkpoint [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa) is evaluated with the commands in the [Evaluation](#evaluation) section. CI runs a 10-atomic-task smoke eval (one episode each) on every PR touching the benchmark, picking fixture-centric tasks that don't require the objaverse asset pack.

View File

@@ -0,0 +1,99 @@
# RoboCerebra
[RoboCerebra](https://robocerebra-project.github.io/) is a long-horizon manipulation benchmark that evaluates **high-level reasoning, planning, and memory** in VLAs. Episodes chain multiple sub-goals with language-grounded intermediate instructions, built on top of LIBERO's simulator stack (MuJoCo + robosuite, Franka Panda 7-DOF).
- Paper: [RoboCerebra: A Large-scale Benchmark for Long-horizon Robotic Manipulation Evaluation](https://arxiv.org/abs/2506.06677)
- Project website: [robocerebra-project.github.io](https://robocerebra-project.github.io/)
- Dataset: [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) — LeRobot v3.0, 6,660 episodes / 571,116 frames at 20 fps, 1,728 language-grounded sub-tasks.
- Pretrained policy: [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra)
## Available tasks
RoboCerebra reuses LIBERO's simulator, so evaluation runs against the LIBERO `libero_10` long-horizon suite:
| Suite | CLI name | Tasks | Description |
| --------- | ----------- | ----- | ------------------------------------------------------------- |
| LIBERO-10 | `libero_10` | 10 | Long-horizon kitchen/living room tasks chaining 36 sub-goals |
Each RoboCerebra episode in the dataset is segmented into multiple sub-tasks with natural-language instructions, which the unified dataset exposes as independent supervision signals.
## Installation
RoboCerebra piggybacks on LIBERO, so the `libero` extra is all you need:
```bash
pip install -e ".[libero]"
```
<Tip>
RoboCerebra requires Linux (MuJoCo / robosuite). Set the rendering backend before training or evaluation:
```bash
export MUJOCO_GL=egl # for headless servers (HPC, cloud)
```
</Tip>
## Evaluation
RoboCerebra eval runs against LIBERO's `libero_10` suite with RoboCerebra's camera naming (`image` + `wrist_image`) and an extra empty-camera slot so a three-view-trained policy receives the expected input layout:
```bash
lerobot-eval \
--policy.path=lerobot/smolvla_robocerebra \
--env.type=libero \
--env.task=libero_10 \
--env.fps=20 \
--env.obs_type=pixels_agent_pos \
--env.observation_height=256 \
--env.observation_width=256 \
'--env.camera_name_mapping={"agentview_image": "image", "robot0_eye_in_hand_image": "wrist_image"}' \
--eval.batch_size=1 \
--eval.n_episodes=10 \
--eval.use_async_envs=false \
--policy.device=cuda \
'--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.wrist_image": "observation.images.camera2"}' \
--policy.empty_cameras=1
```
### Recommended evaluation episodes
**10 episodes per task** across the `libero_10` suite (100 total) for reproducible benchmarking. Matches the protocol used in the RoboCerebra paper.
## Policy inputs and outputs
**Observations:**
- `observation.state` — 8-dim proprioceptive state (7 joint positions + gripper)
- `observation.images.image` — third-person view, 256×256 HWC uint8
- `observation.images.wrist_image` — wrist-mounted camera view, 256×256 HWC uint8
**Actions:**
- Continuous control in `Box(-1, 1, shape=(7,))` — end-effector delta (6D) + gripper (1D)
## Training
The unified dataset at [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) exposes two RGB streams and language-grounded sub-task annotations:
| Feature | Shape | Description |
| -------------------------------- | ------------- | -------------------- |
| `observation.images.image` | (256, 256, 3) | Third-person view |
| `observation.images.wrist_image` | (256, 256, 3) | Wrist-mounted camera |
| `observation.state` | (8,) | Joint pos + gripper |
| `action` | (7,) | EEF delta + gripper |
Fine-tune a SmolVLA base on it:
```bash
lerobot-train \
--policy.path=lerobot/smolvla_base \
--dataset.repo_id=lerobot/robocerebra_unified \
--env.type=libero \
--env.task=libero_10 \
--output_dir=outputs/smolvla_robocerebra
```
## Reproducing published results
The released checkpoint [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra) was trained on `lerobot/robocerebra_unified` and evaluated with the command in the [Evaluation](#evaluation) section. CI runs the same command with `--eval.n_episodes=1` as a smoke test on every PR touching the benchmark.

130
docs/source/robomme.mdx Normal file
View File

@@ -0,0 +1,130 @@
# RoboMME
[RoboMME](https://robomme.github.io) is a memory-augmented manipulation benchmark built on ManiSkill (SAPIEN). It evaluates a robot's ability to retain and use information across an episode — counting, object permanence, reference, and imitation.
- **16 tasks** across 4 memory-skill suites
- **1,600 training demos** (100 per task, 50 val, 50 test)
- **Dataset**: [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) — LeRobot v3.0, 768K frames at 10 fps
- **Simulator**: ManiSkill / SAPIEN, Panda arm, Linux only
![RoboMME benchmark tasks overview](https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2603.04639/gradient.png)
## Tasks
| Suite | Tasks |
| --------------------------------- | ------------------------------------------------------------- |
| **Counting** (temporal memory) | BinFill, PickXtimes, SwingXtimes, StopCube |
| **Permanence** (spatial memory) | VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap |
| **Reference** (object memory) | PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder |
| **Imitation** (procedural memory) | MoveCube, InsertPeg, PatternLock, RouteStick |
## Installation
> RoboMME requires **Linux** (ManiSkill/SAPIEN uses Vulkan rendering). Docker is recommended to isolate dependency conflicts.
### Native (Linux)
```bash
pip install --override <(printf 'gymnasium==0.29.1\nnumpy==1.26.4\n') \
-e '.[smolvla,av-dep]' \
'robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main'
```
> **Dependency note**: `mani-skill` (pulled by `robomme`) pins `gymnasium==0.29.1` and `numpy<2.0.0`, which conflict with lerobot's base `numpy>=2.0.0`. That's why `robomme` is not a pyproject extra — use the override install above, or the Docker approach below to avoid conflicts entirely.
### Docker (recommended)
```bash
# Build base image first (from repo root)
docker build -f docker/Dockerfile.eval-base -t lerobot-eval-base .
# Build RoboMME eval image (applies gymnasium + numpy pin overrides)
docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-robomme .
```
The `docker/Dockerfile.benchmark.robomme` image overrides `gymnasium==0.29.1` and `numpy==1.26.4` after lerobot's install. Both versions are runtime-safe for lerobot's actual API usage.
## Running Evaluation
### Default (single task, single episode)
```bash
lerobot-eval \
--policy.path=<your_policy_repo> \
--env.type=robomme \
--env.task=PickXtimes \
--env.dataset_split=test \
--env.task_ids=[0] \
--eval.batch_size=1 \
--eval.n_episodes=1
```
### Multi-task evaluation
Evaluate multiple tasks in one run by comma-separating task names. Use `task_ids` to control which episodes are evaluated per task. Recommended: 50 episodes per task for the test split.
```bash
lerobot-eval \
--policy.path=<your_policy_repo> \
--env.type=robomme \
--env.task=PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
--env.dataset_split=test \
--env.task_ids=[0,1,2,3,4,5,6,7,8,9] \
--eval.batch_size=1 \
--eval.n_episodes=50
```
### Key CLI options for `env.type=robomme`
| Option | Default | Description |
| -------------------- | ------------- | -------------------------------------------------- |
| `env.task` | `PickXtimes` | Any of the 16 task names above (comma-separated) |
| `env.dataset_split` | `test` | `train`, `val`, or `test` |
| `env.action_space` | `joint_angle` | `joint_angle` (8-D) or `ee_pose` (7-D) |
| `env.episode_length` | `300` | Max steps per episode |
| `env.task_ids` | `null` | List of episode indices to evaluate (null = `[0]`) |
## Dataset
The dataset [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) is in **LeRobot v3.0 format** and can be loaded directly:
```python
from lerobot.datasets.lerobot_dataset import LeRobotDataset
dataset = LeRobotDataset("lerobot/robomme")
```
### Dataset features
| Feature | Shape | Description |
| ------------------ | ------------- | ------------------------------- |
| `image` | (256, 256, 3) | Front camera RGB |
| `wrist_image` | (256, 256, 3) | Wrist camera RGB |
| `actions` | (8,) | Joint angles + gripper |
| `state` | (8,) | Joint positions + gripper state |
| `simple_subgoal` | str | High-level language annotation |
| `grounded_subgoal` | str | Grounded language annotation |
| `episode_index` | int | Episode ID |
| `frame_index` | int | Frame within episode |
### Feature key alignment (training)
The env wrapper exposes `pixels/image` and `pixels/wrist_image` as observation keys. The `features_map` in `RoboMMEEnv` maps these to `observation.images.image` and `observation.images.wrist_image` for the policy. State is exposed as `agent_pos` and maps to `observation.state`.
The dataset's `image` and `wrist_image` columns already align with the policy input keys, so no renaming is needed when fine-tuning.
## Action Spaces
| Type | Dim | Description |
| ------------- | --- | --------------------------------------------------------- |
| `joint_angle` | 8 | 7 joint angles + 1 gripper (1 closed, +1 open, absolute) |
| `ee_pose` | 7 | xyz + roll/pitch/yaw + gripper |
Set via `--env.action_space=joint_angle` (default) or `--env.action_space=ee_pose`.
## Platform Notes
- **Linux only**: ManiSkill requires SAPIEN/Vulkan. macOS and Windows are not supported.
- **GPU recommended**: Rendering is CPU-capable but slow; CUDA + Vulkan gives full speed.
- **gymnasium / numpy conflict**: See installation note above. Docker image handles this automatically.
- **ManiSkill fork**: `robomme` depends on a specific ManiSkill fork (`YinpeiDai/ManiSkill`), pulled in automatically via the `robomme` package.

223
docs/source/robotwin.mdx Normal file
View File

@@ -0,0 +1,223 @@
# RoboTwin 2.0
RoboTwin 2.0 is a **large-scale dual-arm manipulation benchmark** built on the SAPIEN physics engine. It provides a standardized evaluation protocol for bimanual robotic policies across 50 tasks (as of upstream `main`) with strong domain randomization (clutter, lighting, background, tabletop height, and language instructions).
- Paper: [RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation](https://arxiv.org/abs/2506.18088)
- GitHub: [RoboTwin-Platform/RoboTwin](https://github.com/RoboTwin-Platform/RoboTwin)
- Leaderboard: [robotwin-platform.github.io/leaderboard](https://robotwin-platform.github.io/leaderboard)
- Dataset: [lerobot/robotwin_unified](https://huggingface.co/datasets/lerobot/robotwin_unified)
![RoboTwin 2.0 benchmark overview](https://www.aitntnews.com/pictures/2025/7/8/9a7f79cb-5ba9-11f0-8581-fa163e47d677.png)
## Overview
| Property | Value |
| ------------- | -------------------------------------------------------- |
| Tasks | 50 dual-arm manipulation tasks |
| Robot | Aloha-AgileX bimanual (14 DOF, 7 per arm) |
| Action space | 14-dim joint-space, continuous in `[-1, 1]` |
| Cameras | `head_camera`, `left_camera`, `right_camera` |
| Simulator | SAPIEN (not MuJoCo) |
| Eval protocol | 100 episodes/task, 50 demo_clean demonstrations |
| Eval settings | **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) |
## Available tasks
RoboTwin 2.0 ships 50 dual-arm manipulation tasks in its upstream `envs/` directory. The canonical list is the `ROBOTWIN_TASKS` tuple in `src/lerobot/envs/robotwin.py`, mirrored verbatim from the upstream repo. Example tasks:
| Task | CLI name | Category |
| ------------------------ | ------------------------ | ----------------- |
| Beat block with hammer | `beat_block_hammer` | Tool use |
| Click bell / alarm clock | `click_bell` | Precision press |
| Stack blocks (2 / 3) | `stack_blocks_two/three` | Stacking |
| Stack bowls (2 / 3) | `stack_bowls_two/three` | Stacking |
| Handover block / mic | `handover_block` | Bimanual coord. |
| Lift pot | `lift_pot` | Bimanual lift |
| Shake bottle | `shake_bottle` | Continuous motion |
| Turn switch | `turn_switch` | Articulated obj |
| Stamp seal | `stamp_seal` | Precision place |
| Scan object | `scan_object` | Mobile manip. |
Pass a comma-separated list to `--env.task` to run multiple tasks in a single eval sweep.
<Tip warning={true}>
`open_laptop` is currently broken upstream (its `check_success()` uses
`self.arm_tag`, which is only set inside the scripted-expert `play_once()`
path and therefore unavailable during normal policy eval). Avoid it until the
upstream bug is fixed, or patch the task to default `self.arm_tag = "left"` in
`load_actors()`.
</Tip>
## Dataset
The RoboTwin 2.0 dataset is available in **LeRobot v3.0 format** on the Hugging Face Hub:
```
lerobot/robotwin_unified
```
It contains over 100,000 pre-collected trajectories across all 50 tasks (79.6 GB, Apache 2.0 license). No format conversion is needed — it is already in the correct LeRobot v3.0 schema with video observations and action labels.
You can load it directly with the HF Datasets library:
```python
from datasets import load_dataset
ds = load_dataset("lerobot/robotwin_unified", split="train")
```
## Installation
RoboTwin 2.0 requires **Linux** with an NVIDIA GPU (CUDA 12.1 recommended). Installation takes approximately 20 minutes.
### 1. Create a conda environment
```bash
conda create -n robotwin python=3.10 -y
conda activate robotwin
```
### 2. Install LeRobot
```bash
git clone https://github.com/huggingface/lerobot.git
cd lerobot
pip install -e "."
```
### 3. Install RoboTwin 2.0
```bash
git clone https://github.com/RoboTwin-Platform/RoboTwin.git
cd RoboTwin
bash script/_install.sh
bash script/_download_assets.sh
```
The install script handles all Python dependencies including SAPIEN, CuRobo, mplib, and pytorch3d.
<Tip warning={true}>
If the automated install fails, install manually:
```bash
pip install -r requirements.txt
pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable"
cd envs && git clone https://github.com/NVlabs/curobo.git && cd curobo
pip install -e . --no-build-isolation
```
Then apply the required mplib fix: in `mplib/planner.py` line 807, remove `or collide` from the conditional.
</Tip>
### 4. Add RoboTwin to PYTHONPATH
The RoboTwin task modules must be importable by LeRobot. From within the `RoboTwin/` directory:
```bash
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
```
Add this to your shell profile to make it permanent.
## Evaluation
### Standard evaluation (recommended)
Evaluate a policy on a single task with the official protocol (100 episodes):
```bash
lerobot-eval \
--policy.path="your-hf-policy-id" \
--env.type=robotwin \
--env.task=beat_block_hammer \
--eval.batch_size=1 \
--eval.n_episodes=100
```
### Single-task quick check
```bash
lerobot-eval \
--policy.path="your-hf-policy-id" \
--env.type=robotwin \
--env.task=beat_block_hammer \
--eval.batch_size=1 \
--eval.n_episodes=5
```
### Multi-task sweep
Evaluate on several tasks in one run:
```bash
lerobot-eval \
--policy.path="your-hf-policy-id" \
--env.type=robotwin \
--env.task=beat_block_hammer,click_bell,handover_block,stack_blocks_two \
--eval.batch_size=1 \
--eval.n_episodes=100
```
### Full benchmark (all 50 tasks)
```bash
lerobot-eval \
--policy.path="your-hf-policy-id" \
--env.type=robotwin \
--env.task=adjust_bottle,beat_block_hammer,blocks_ranking_rgb,blocks_ranking_size,click_alarmclock,click_bell,dump_bin_bigbin,grab_roller,handover_block,handover_mic,hanging_mug,lift_pot,move_can_pot,move_pillbottle_pad,move_playingcard_away,move_stapler_pad,open_microwave,pick_diverse_bottles,pick_dual_bottles,place_a2b_left,place_a2b_right,place_bread_basket,place_bread_skillet,place_burger_fries,place_can_basket,place_cans_plasticbox,place_container_plate,place_dual_shoes,place_empty_cup,place_fan,place_mouse_pad,place_object_basket,place_object_scale,place_object_stand,place_phone_stand,place_shoe,press_stapler,put_bottles_dustbin,put_object_cabinet,rotate_qrcode,scan_object,shake_bottle,shake_bottle_horizontally,stack_blocks_three,stack_blocks_two,stack_bowls_three,stack_bowls_two,stamp_seal,turn_switch \
--eval.batch_size=1 \
--eval.n_episodes=100
```
<Tip>
`open_laptop` is intentionally omitted above because of the upstream
`self.arm_tag` bug (see the **Available tasks** section). Re-add it once the
upstream fix lands.
</Tip>
## Camera configuration
By default, all three cameras are included:
| Camera key | Description |
| -------------- | ------------------------------ |
| `head_camera` | Torso-mounted overhead view |
| `left_camera` | Left arm wrist-mounted camera |
| `right_camera` | Right arm wrist-mounted camera |
To use a subset of cameras, override `--env.camera_names`:
```bash
lerobot-eval \
--policy.path="your-hf-policy-id" \
--env.type=robotwin \
--env.task=beat_block_hammer \
--env.camera_names="head_camera,left_camera" \
--eval.batch_size=1 \
--eval.n_episodes=10
```
## Environment config reference
Key parameters for `RoboTwinEnvConfig`:
| Parameter | Default | Description |
| -------------------- | ---------------------------------------- | ---------------------------------- |
| `task` | `"beat_block_hammer"` | Comma-separated task name(s) |
| `fps` | `25` | Simulation FPS |
| `episode_length` | `300` | Max steps per episode |
| `obs_type` | `"pixels_agent_pos"` | `"pixels"` or `"pixels_agent_pos"` |
| `camera_names` | `"head_camera,left_camera,right_camera"` | Comma-separated active cameras |
| `observation_height` | `240` | Camera pixel height |
| `observation_width` | `320` | Camera pixel width |
## Leaderboard submission
Results can be submitted to the [RoboTwin 2.0 leaderboard](https://robotwin-platform.github.io/leaderboard). The official protocol requires:
- Training on 50 `demo_clean` demonstrations per task
- Evaluating 100 episodes per task
- Reporting success rate separately for **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) settings
For submission instructions, refer to the [RoboTwin 2.0 documentation](https://robotwin-platform.github.io/doc/).

View File

@@ -212,6 +212,15 @@ aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"]
pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead
libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"]
# NOTE: robomme is NOT a pyproject extra — mani-skill hard-pins numpy<2
# which conflicts with lerobot's numpy>=2 base pin, so the two trees can't
# resolve into a single env. Install it only in the RoboMME Docker image
# via `uv pip install --override` (see docker/Dockerfile.benchmark.robomme).
# NOTE: robocasa is NOT exposed as a `lerobot` extra. Its setup.py pins
# `lerobot==0.3.3` in install_requires, which cyclically shadows our own
# workspace `lerobot` and makes the graph unsolvable under any resolver
# (uv, pip). Install it manually alongside robosuite — see
# docs/source/robocasa.mdx for the recipe.
# All
all = [

View File

@@ -31,9 +31,23 @@ from __future__ import annotations
import argparse
import json
import re
import sys
from pathlib import Path
# LIBERO-plus derives task.language by space-joining the perturbation-variant
# filename (grab_language_from_filename in libero/libero/benchmark/__init__.py),
# so non-_language_ variants inherit a trailing metadata blob like
# "view 0 0 100 0 0 initstate 0 noise 45" or "add 16". Strip those tokens so
# the description matches the base instruction used in the training dataset.
_LIBERO_PERTURBATION_TAIL_RE = re.compile(
r"(?:\s(?:view|initstate|noise|add|tb|table|light|level)(?:\s\d+)+)+$"
)
def _strip_libero_perturbation_tail(instruction: str) -> str:
return _LIBERO_PERTURBATION_TAIL_RE.sub("", instruction).strip()
def _libero_descriptions(task_suite: str) -> dict[str, str]:
from libero.libero import benchmark # type: ignore[import-untyped]
@@ -47,7 +61,10 @@ def _libero_descriptions(task_suite: str) -> dict[str, str]:
)
return {}
suite = suite_dict[task_suite]()
return {f"{task_suite}_{i}": suite.get_task(i).language for i in range(suite.n_tasks)}
return {
f"{task_suite}_{i}": _strip_libero_perturbation_tail(suite.get_task(i).language)
for i in range(suite.n_tasks)
}
def _metaworld_descriptions(task_name: str) -> dict[str, str]:
@@ -57,19 +74,103 @@ def _metaworld_descriptions(task_name: str) -> dict[str, str]:
return {f"{task_name}_0": label}
def _robotwin_descriptions(task_names: str) -> dict[str, str]:
"""Return descriptions for each requested RoboTwin task. Reads
`description/task_instruction/<task>.json` from the RoboTwin clone
(cwd is /opt/robotwin in CI). Falls back to the task name if missing."""
out: dict[str, str] = {}
root = Path("description/task_instruction")
for name in (t.strip() for t in task_names.split(",") if t.strip()):
desc_file = root / f"{name}.json"
desc = name.replace("_", " ")
if desc_file.is_file():
data = json.loads(desc_file.read_text())
full = data.get("full_description") or desc
# Strip the schema placeholders ({A}, {a}) — keep the sentence readable.
desc = full.replace("<", "").replace(">", "")
out[f"{name}_0"] = desc
return out
def _robocasa_descriptions(task_spec: str) -> dict[str, str]:
"""For each task in the comma-separated list, emit a cleaned-name label.
RoboCasa episodes carry their language instruction in the env's
`ep_meta['lang']`, populated per reset. Pulling it requires spinning
up the full kitchen env per task (~seconds each); we use the task
name as the key here and let the eval's episode info carry the
actual instruction.
"""
out: dict[str, str] = {}
for task in (t.strip() for t in task_spec.split(",") if t.strip()):
# Split CamelCase into words: "CloseFridge" → "close fridge".
label = "".join(f" {c.lower()}" if c.isupper() else c for c in task).strip()
out[f"{task}_0"] = label or task
return out
_ROBOMME_DESCRIPTIONS = {
"BinFill": "Fill the target bin with the correct number of cubes",
"PickXtimes": "Pick the indicated cube the specified number of times",
"SwingXtimes": "Swing the object the specified number of times",
"StopCube": "Grasp and stop the moving cube",
"VideoUnmask": "Pick the cube shown in the reference video",
"VideoUnmaskSwap": "Pick the cube matching the reference video after a swap",
"ButtonUnmask": "Press the button indicated by the reference",
"ButtonUnmaskSwap": "Press the correct button after objects are swapped",
"PickHighlight": "Pick the highlighted cube",
"VideoRepick": "Repick the cube shown in the reference video",
"VideoPlaceButton": "Place the cube on the button shown in the video",
"VideoPlaceOrder": "Place cubes in the order shown in the video",
"MoveCube": "Move the cube to the target location",
"InsertPeg": "Insert the peg into the target hole",
"PatternLock": "Unlock the pattern by pressing buttons in sequence",
"RouteStick": "Route the stick through the required waypoints",
}
def _robomme_descriptions(task_names: str, task_ids: list[int] | None = None) -> dict[str, str]:
"""Return descriptions for each requested RoboMME task. Keys match the
video filename pattern `<task>_<task_id>` used by the eval script."""
if task_ids is None:
task_ids = [0]
out: dict[str, str] = {}
for name in (t.strip() for t in task_names.split(",") if t.strip()):
desc = _ROBOMME_DESCRIPTIONS.get(name, name)
for tid in task_ids:
out[f"{name}_{tid}"] = desc
return out
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
parser.add_argument("--task", required=True, help="Task/suite name (e.g. libero_spatial)")
parser.add_argument(
"--task-ids",
type=str,
default=None,
help="Comma-separated task IDs (e.g. '0,1,2'). Default: [0]",
)
parser.add_argument("--output", required=True, help="Path to write task_descriptions.json")
args = parser.parse_args()
task_ids: list[int] | None = None
if args.task_ids:
task_ids = [int(x.strip()) for x in args.task_ids.split(",")]
descriptions: dict[str, str] = {}
try:
if args.env == "libero":
if args.env == ("libero", "libero_plus"):
descriptions = _libero_descriptions(args.task)
elif args.env == "metaworld":
descriptions = _metaworld_descriptions(args.task)
elif args.env == "robotwin":
descriptions = _robotwin_descriptions(args.task)
elif args.env == "robocasa":
descriptions = _robocasa_descriptions(args.task)
elif args.env == "robomme":
descriptions = _robomme_descriptions(args.task, task_ids=task_ids)
else:
print(
f"[extract_task_descriptions] No description extractor for env '{args.env}'.",

View File

@@ -331,6 +331,7 @@ class LiberoEnv(EnvConfig):
camera_name_mapping: dict[str, str] | None = None
observation_height: int = 360
observation_width: int = 360
is_libero_plus: bool = False
features: dict[str, PolicyFeature] = field(
default_factory=lambda: {
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)),
@@ -432,6 +433,7 @@ class LiberoEnv(EnvConfig):
control_mode=self.control_mode,
episode_length=self.episode_length,
camera_name_mapping=self.camera_name_mapping,
is_libero_plus=self.is_libero_plus,
)
def get_env_processors(self):
@@ -496,6 +498,81 @@ class MetaworldEnv(EnvConfig):
)
@EnvConfig.register_subclass("robocasa")
@dataclass
class RoboCasaEnv(EnvConfig):
task: str = "CloseFridge"
fps: int = 20
episode_length: int = 1000
obs_type: str = "pixels_agent_pos"
render_mode: str = "rgb_array"
camera_name: str = "robot0_agentview_left,robot0_eye_in_hand,robot0_agentview_right"
observation_height: int = 256
observation_width: int = 256
visualization_height: int = 512
visualization_width: int = 512
split: str | None = None
# Object-mesh registries to sample from. Upstream default is
# ("objaverse", "lightwheel"), but objaverse is ~30GB and the CI image
# only ships the lightwheel pack. Override to include objaverse once
# you've run `python -m robocasa.scripts.download_kitchen_assets
# --type objaverse` locally.
obj_registries: list[str] = field(default_factory=lambda: ["lightwheel"])
features: dict[str, PolicyFeature] = field(
default_factory=lambda: {ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(12,))}
)
features_map: dict[str, str] = field(default_factory=lambda: {ACTION: ACTION, "agent_pos": OBS_STATE})
def __post_init__(self):
if self.obs_type not in ("pixels", "pixels_agent_pos"):
raise ValueError(f"Unsupported obs_type: {self.obs_type}")
# Preserve raw RoboCasa camera names end-to-end (e.g.
# `observation.images.robot0_agentview_left`). This matches the
# naming convention used by the RoboCasa datasets on the Hub, so
# trained policies don't need a `--rename_map` at eval time.
cams = [c.strip() for c in self.camera_name.split(",") if c.strip()]
for cam in cams:
self.features[f"pixels/{cam}"] = PolicyFeature(
type=FeatureType.VISUAL,
shape=(self.observation_height, self.observation_width, 3),
)
self.features_map[f"pixels/{cam}"] = f"{OBS_IMAGES}.{cam}"
if self.obs_type == "pixels_agent_pos":
self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(16,))
@property
def gym_kwargs(self) -> dict:
kwargs: dict[str, Any] = {
"obs_type": self.obs_type,
"render_mode": self.render_mode,
"observation_height": self.observation_height,
"observation_width": self.observation_width,
"visualization_height": self.visualization_height,
"visualization_width": self.visualization_width,
}
if self.split is not None:
kwargs["split"] = self.split
return kwargs
def create_envs(self, n_envs: int, use_async_envs: bool = False):
from .robocasa import create_robocasa_envs
if self.task is None:
raise ValueError("RoboCasaEnv requires a task to be specified")
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
return create_robocasa_envs(
task=self.task,
n_envs=n_envs,
camera_name=self.camera_name,
gym_kwargs=self.gym_kwargs,
env_cls=env_cls,
episode_length=self.episode_length,
obj_registries=tuple(self.obj_registries),
)
@EnvConfig.register_subclass("isaaclab_arena")
@dataclass
class IsaaclabArenaEnv(HubEnvConfig):
@@ -574,3 +651,171 @@ class IsaaclabArenaEnv(HubEnvConfig):
),
PolicyProcessorPipeline(steps=[]),
)
@EnvConfig.register_subclass("libero_plus")
@dataclass
class LiberoPlusEnv(LiberoEnv):
"""Config for LIBERO-plus robustness benchmark evaluation.
LIBERO-plus extends LIBERO with 7 perturbation dimensions (camera viewpoints,
object layouts, robot initial states, language instructions, lighting, background
textures, sensor noise) producing ~10k task variants.
The gym interface is identical to LIBERO so this class reuses ``LiberoEnv``
entirely — only the registered name and default task suite differ.
Install: see docker/Dockerfile.benchmark.libero_plus — LIBERO-plus ships
as a namespace package from a git fork and must be cloned + PYTHONPATH'd
rather than installed as a pyproject extra.
See Also:
https://github.com/sylvestf/LIBERO-plus
"""
task: str = "libero_spatial"
is_libero_plus: bool = True
@EnvConfig.register_subclass("robotwin")
@dataclass
class RoboTwinEnvConfig(EnvConfig):
"""Configuration for RoboTwin 2.0 benchmark environments.
RoboTwin 2.0 is a dual-arm manipulation benchmark with 50 tasks built on the
SAPIEN simulator. The robot is an Aloha-AgileX bimanual platform with 14 DOF
(7 per arm). All three cameras are enabled by default.
See: https://robotwin-platform.github.io
Dataset: https://huggingface.co/datasets/lerobot/robotwin_unified
"""
task: str = "beat_block_hammer" # single task or comma-separated list
fps: int = 25
episode_length: int = 300
obs_type: str = "pixels_agent_pos"
render_mode: str = "rgb_array"
# Available cameras from RoboTwin's aloha-agilex embodiment: head_camera
# (torso-mounted) + left_camera / right_camera (wrists).
camera_names: str = "head_camera,left_camera,right_camera"
# Match the D435 dims in task_config/demo_clean.yml (_camera_config.yml).
# Gym's vector-env concatenate pre-allocates buffers of this shape, so it
# must equal what SAPIEN actually renders.
observation_height: int = 240
observation_width: int = 320
features: dict[str, PolicyFeature] = field(
default_factory=lambda: {
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(14,)),
}
)
features_map: dict[str, str] = field(
default_factory=lambda: {
ACTION: ACTION,
"pixels/head_camera": f"{OBS_IMAGES}.head_camera",
"pixels/left_camera": f"{OBS_IMAGES}.left_camera",
"pixels/right_camera": f"{OBS_IMAGES}.right_camera",
"agent_pos": OBS_STATE,
}
)
def __post_init__(self):
cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()]
for cam in cam_list:
self.features[f"pixels/{cam}"] = PolicyFeature(
type=FeatureType.VISUAL,
shape=(self.observation_height, self.observation_width, 3),
)
# Keep features_map entry if already set (default_factory); add if missing.
key = f"pixels/{cam}"
if key not in self.features_map:
self.features_map[key] = f"{OBS_IMAGES}.{cam}"
if self.obs_type == "pixels_agent_pos":
self.features["agent_pos"] = PolicyFeature(
type=FeatureType.STATE,
shape=(14,), # 14 DOF: 7 per arm
)
elif self.obs_type != "pixels":
raise ValueError(
f"Unsupported obs_type '{self.obs_type}'. "
"RoboTwinEnvConfig supports 'pixels' and 'pixels_agent_pos'."
)
@property
def gym_kwargs(self) -> dict:
return {}
def create_envs(self, n_envs: int, use_async_envs: bool = True):
from lerobot.envs.robotwin import create_robotwin_envs
if not self.task:
raise ValueError("RoboTwinEnvConfig requires `task` to be specified.")
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()]
return create_robotwin_envs(
task=self.task,
n_envs=n_envs,
env_cls=env_cls,
camera_names=cam_list,
observation_height=self.observation_height,
observation_width=self.observation_width,
episode_length=self.episode_length,
)
@EnvConfig.register_subclass("robomme")
@dataclass
class RoboMMEEnv(EnvConfig):
"""RoboMME memory-augmented manipulation benchmark (ManiSkill/SAPIEN).
16 tasks across 4 suites: Counting, Permanence, Reference, Imitation.
Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes).
Benchmark: https://github.com/RoboMME/robomme_benchmark
Requires the `robomme` git package installed separately (Linux only);
see docker/Dockerfile.benchmark.robomme for the canonical install.
"""
task: str = "PickXtimes"
fps: int = 10
episode_length: int = 300
action_space: str = "joint_angle" # or "ee_pose" (7-D)
dataset_split: str = "test" # "train" | "val" | "test"
task_ids: list[int] | None = None
features: dict[str, PolicyFeature] = field(default_factory=dict)
features_map: dict[str, str] = field(
default_factory=lambda: {
ACTION: ACTION,
"pixels/image": f"{OBS_IMAGES}.image",
"pixels/wrist_image": f"{OBS_IMAGES}.wrist_image",
"agent_pos": OBS_STATE,
}
)
def __post_init__(self):
action_dim = 8 if self.action_space == "joint_angle" else 7
self.features = {
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(action_dim,)),
"pixels/image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
"pixels/wrist_image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
"agent_pos": PolicyFeature(type=FeatureType.STATE, shape=(8,)),
}
@property
def gym_kwargs(self) -> dict:
return {}
def create_envs(self, n_envs: int, use_async_envs: bool = True):
from lerobot.envs.robomme import create_robomme_envs
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
return create_robomme_envs(
task=self.task,
n_envs=n_envs,
action_space_type=self.action_space,
dataset=self.dataset_split,
episode_length=self.episode_length,
task_ids=self.task_ids,
env_cls=env_cls,
)

View File

@@ -16,6 +16,7 @@
from __future__ import annotations
import os
import re
from collections import defaultdict
from collections.abc import Callable, Iterable, Mapping, Sequence
from functools import partial
@@ -31,20 +32,7 @@ from libero.libero.envs import OffScreenRenderEnv
from lerobot.types import RobotObservation
from .utils import _LazyAsyncVectorEnv
def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
"""Normalize camera_name into a non-empty list of strings."""
if isinstance(camera_name, str):
cams = [c.strip() for c in camera_name.split(",") if c.strip()]
elif isinstance(camera_name, (list | tuple)):
cams = [str(c).strip() for c in camera_name if str(c).strip()]
else:
raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
if not cams:
raise ValueError("camera_name resolved to an empty list.")
return cams
from .utils import _LazyAsyncVectorEnv, parse_camera_names
def _get_suite(name: str) -> benchmark.Benchmark:
@@ -69,14 +57,34 @@ def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> list[i
return ids
def get_task_init_states(task_suite: Any, i: int) -> np.ndarray:
init_states_path = (
Path(get_libero_path("init_states"))
/ task_suite.tasks[i].problem_folder
/ task_suite.tasks[i].init_states_file
)
init_states = torch.load(init_states_path, weights_only=False) # nosec B614
return init_states
# LIBERO-plus perturbation variants encode the perturbation in the filename
# but on disk only the base `.pruned_init` exists — strip the suffix to match
# LIBERO-plus's own suite.get_task_init_states() (we reimplement it here so we
# can pass weights_only=False for PyTorch 2.6+ numpy pickles).
_LIBERO_PERTURBATION_SUFFIX_RE = re.compile(r"_(?:language|view|light)_[^.]*|_(?:table|tb)_\d+")
def get_task_init_states(task_suite: Any, i: int, is_libero_plus: bool = False) -> np.ndarray:
task = task_suite.tasks[i]
filename = Path(task.init_states_file)
root = Path(get_libero_path("init_states"))
if not is_libero_plus:
init_states_path = root / task.problem_folder / filename.name
return torch.load(init_states_path, weights_only=False) # nosec B614
# LIBERO-plus: `_add_` / `_level` variants store extra-object layouts under
# libero_newobj/ as a flat array that must be reshaped to (1, -1).
if "_add_" in filename.name or "_level" in filename.name:
init_states_path = root / "libero_newobj" / task.problem_folder / filename.name
init_states = torch.load(init_states_path, weights_only=False) # nosec B614
return init_states.reshape(1, -1)
# LIBERO-plus perturbation variants encode the perturbation in the filename
# but on disk only the base `.pruned_init` exists — strip the suffix to match.
stripped = _LIBERO_PERTURBATION_SUFFIX_RE.sub("", filename.stem) + filename.suffix
init_states_path = root / task.problem_folder / stripped
return torch.load(init_states_path, weights_only=False) # nosec B614
def get_libero_dummy_action():
@@ -118,9 +126,11 @@ class LiberoEnv(gym.Env):
camera_name_mapping: dict[str, str] | None = None,
num_steps_wait: int = 10,
control_mode: str = "relative",
is_libero_plus: bool = False,
):
super().__init__()
self.task_id = task_id
self.is_libero_plus = is_libero_plus
self.obs_type = obs_type
self.render_mode = render_mode
self.observation_width = observation_width
@@ -128,7 +138,7 @@ class LiberoEnv(gym.Env):
self.visualization_width = visualization_width
self.visualization_height = visualization_height
self.init_states = init_states
self.camera_name = _parse_camera_names(
self.camera_name = parse_camera_names(
camera_name
) # agentview_image (main) or robot0_eye_in_hand_image (wrist)
@@ -147,7 +157,11 @@ class LiberoEnv(gym.Env):
self.episode_index = episode_index
self.episode_length = episode_length
# Load once and keep
self._init_states = get_task_init_states(task_suite, self.task_id) if self.init_states else None
self._init_states = (
get_task_init_states(task_suite, self.task_id, is_libero_plus=self.is_libero_plus)
if self.init_states
else None
)
self._reset_stride = n_envs # when performing a reset, append `_reset_stride` to `init_state_id`.
self.init_state_id = self.episode_index # tie each sub-env to a fixed init state
@@ -380,6 +394,7 @@ def _make_env_fns(
gym_kwargs: Mapping[str, Any],
control_mode: str,
camera_name_mapping: dict[str, str] | None = None,
is_libero_plus: bool = False,
) -> list[Callable[[], LiberoEnv]]:
"""Build n_envs factory callables for a single (suite, task_id)."""
@@ -396,6 +411,7 @@ def _make_env_fns(
n_envs=n_envs,
control_mode=control_mode,
camera_name_mapping=camera_name_mapping,
is_libero_plus=is_libero_plus,
**local_kwargs,
)
@@ -418,6 +434,7 @@ def create_libero_envs(
control_mode: str = "relative",
episode_length: int | None = None,
camera_name_mapping: dict[str, str] | None = None,
is_libero_plus: bool = False,
) -> dict[str, dict[int, Any]]:
"""
Create vectorized LIBERO environments with a consistent return shape.
@@ -437,7 +454,7 @@ def create_libero_envs(
gym_kwargs = dict(gym_kwargs or {})
task_ids_filter = gym_kwargs.pop("task_ids", None) # optional: limit to specific tasks
camera_names = _parse_camera_names(camera_name)
camera_names = parse_camera_names(camera_name)
suite_names = [s.strip() for s in str(task).split(",") if s.strip()]
if not suite_names:
raise ValueError("`task` must contain at least one LIBERO suite name.")
@@ -462,6 +479,7 @@ def create_libero_envs(
# Probe once and reuse to avoid creating a temp env per task.
cached_obs_space: spaces.Space | None = None
cached_act_space: spaces.Space | None = None
cached_metadata: dict[str, Any] | None = None
for tid in selected:
fns = _make_env_fns(
@@ -475,12 +493,14 @@ def create_libero_envs(
gym_kwargs=gym_kwargs,
control_mode=control_mode,
camera_name_mapping=camera_name_mapping,
is_libero_plus=is_libero_plus,
)
if is_async:
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
if cached_obs_space is None:
cached_obs_space = lazy.observation_space
cached_act_space = lazy.action_space
cached_metadata = lazy.metadata
out[suite_name][tid] = lazy
else:
out[suite_name][tid] = env_cls(fns)

View File

@@ -311,6 +311,7 @@ def create_metaworld_envs(
is_async = env_cls is gym.vector.AsyncVectorEnv
cached_obs_space = None
cached_act_space = None
cached_metadata = None
out: dict[str, dict[int, Any]] = defaultdict(dict)
for group in task_groups:
@@ -324,10 +325,11 @@ def create_metaworld_envs(
fns = [(lambda tn=task_name: MetaworldEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)]
if is_async:
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
if cached_obs_space is None:
cached_obs_space = lazy.observation_space
cached_act_space = lazy.action_space
cached_metadata = lazy.metadata
out[group][tid] = lazy
else:
out[group][tid] = env_cls(fns)

View File

@@ -0,0 +1,425 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import logging
from collections import defaultdict
from collections.abc import Callable, Sequence
from functools import partial
from typing import Any
import gymnasium as gym
import numpy as np
from gymnasium import spaces
from lerobot.types import RobotObservation
from .utils import _LazyAsyncVectorEnv, parse_camera_names
logger = logging.getLogger(__name__)
# Dimensions for the flat action/state vectors used by the LeRobot wrapper.
# These correspond to the PandaOmron robot in RoboCasa365.
OBS_STATE_DIM = 16 # base_pos(3) + base_quat(4) + ee_pos_rel(3) + ee_quat_rel(4) + gripper_qpos(2)
ACTION_DIM = 12 # base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
ACTION_LOW = -1.0
ACTION_HIGH = 1.0
# Default PandaOmron cameras. We surface these raw names directly as
# `observation.images.<name>` so the LeRobot dataset/policy keys match
# RoboCasa's native convention (no implicit renaming).
DEFAULT_CAMERAS = [
"robot0_agentview_left",
"robot0_eye_in_hand",
"robot0_agentview_right",
]
# Object-mesh registries to sample from. RoboCasa's upstream default is
# ("objaverse", "lightwheel"), but the objaverse pack is huge (~30GB) and
# most users — including our CI image — only download the lightwheel pack
# (`--type objs_lw` in `download_kitchen_assets`). When a sampled object
# category has zero candidates in every registry, robocasa crashes with
# `ValueError: Probabilities contain NaN` (0/0 divide in the probability
# normalization). Restricting to registries that are actually on disk
# avoids the NaN and matches what the asset download provides.
DEFAULT_OBJ_REGISTRIES: tuple[str, ...] = ("lightwheel",)
# Task-group shortcuts accepted as `--env.task`. When the user passes one of
# these names, we expand it to the upstream RoboCasa task list and auto-set
# the dataset split. Individual task names (optionally comma-separated) still
# take precedence; this only triggers on an exact group-name match.
_TASK_GROUP_SPLITS = {
"atomic_seen": "target",
"composite_seen": "target",
"composite_unseen": "target",
"pretrain50": "pretrain",
"pretrain100": "pretrain",
"pretrain200": "pretrain",
"pretrain300": "pretrain",
}
def _resolve_tasks(task: str) -> tuple[list[str], str | None]:
"""Resolve a `--env.task` value to (task_names, split_override).
If `task` is a known task-group name (e.g. `atomic_seen`, `pretrain100`),
expand it via `robocasa.utils.dataset_registry.{TARGET,PRETRAINING}_TASKS`
and return the matching split. Otherwise treat `task` as a single task or
comma-separated list and leave the split untouched (None).
"""
key = task.strip()
if key in _TASK_GROUP_SPLITS:
from robocasa.utils.dataset_registry import PRETRAINING_TASKS, TARGET_TASKS
combined = {**TARGET_TASKS, **PRETRAINING_TASKS}
if key not in combined:
raise ValueError(
f"Task group '{key}' is not available in this version of robocasa. "
f"Known groups: {sorted(combined.keys())}."
)
return list(combined[key]), _TASK_GROUP_SPLITS[key]
names = [t.strip() for t in task.split(",") if t.strip()]
if not names:
raise ValueError("`task` must contain at least one RoboCasa task name.")
return names, None
def convert_action(flat_action: np.ndarray) -> dict[str, Any]:
"""Split a flat (12,) action vector into a RoboCasa action dict.
Layout: base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
"""
return {
"action.base_motion": flat_action[0:4],
"action.control_mode": flat_action[4:5],
"action.end_effector_position": flat_action[5:8],
"action.end_effector_rotation": flat_action[8:11],
"action.gripper_close": flat_action[11:12],
}
class RoboCasaEnv(gym.Env):
"""LeRobot gym.Env wrapper for RoboCasa365 kitchen environments.
Wraps RoboCasaGymEnv from the robocasa package and converts its
dict-based observations and actions into the flat arrays LeRobot expects.
Raw RoboCasa camera names are preserved verbatim under `pixels/<cam>`.
"""
metadata = {"render_modes": ["rgb_array"], "render_fps": 20}
def __init__(
self,
task: str,
camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
obs_type: str = "pixels_agent_pos",
render_mode: str = "rgb_array",
observation_width: int = 256,
observation_height: int = 256,
visualization_width: int = 512,
visualization_height: int = 512,
split: str | None = None,
episode_length: int | None = None,
obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
episode_index: int = 0,
):
super().__init__()
self.task = task
self.obs_type = obs_type
self.render_mode = render_mode
self.observation_width = observation_width
self.observation_height = observation_height
self.visualization_width = visualization_width
self.visualization_height = visualization_height
self.split = split
self.obj_registries = tuple(obj_registries)
# Per-worker index (0..n_envs-1) used to spread the user-provided
# seed across factories so each sub-env explores a distinct layout
# even when the same seed is passed to `reset()`.
self.episode_index = int(episode_index)
self.camera_name = parse_camera_names(camera_name)
self._max_episode_steps = episode_length if episode_length is not None else 1000
# Deferred — created on first reset() inside the worker subprocess
# to avoid inheriting stale GPU/EGL contexts across fork().
self._env: Any = None
self.task_description = ""
images = {
cam: spaces.Box(
low=0,
high=255,
shape=(self.observation_height, self.observation_width, 3),
dtype=np.uint8,
)
for cam in self.camera_name
}
if self.obs_type == "pixels":
self.observation_space = spaces.Dict({"pixels": spaces.Dict(images)})
elif self.obs_type == "pixels_agent_pos":
self.observation_space = spaces.Dict(
{
"pixels": spaces.Dict(images),
"agent_pos": spaces.Box(
low=-np.inf,
high=np.inf,
shape=(OBS_STATE_DIM,),
dtype=np.float32,
),
}
)
else:
raise ValueError(f"Unsupported obs_type '{self.obs_type}'. Use 'pixels' or 'pixels_agent_pos'.")
self.action_space = spaces.Box(
low=ACTION_LOW,
high=ACTION_HIGH,
shape=(ACTION_DIM,),
dtype=np.float32,
)
def _ensure_env(self) -> None:
"""Create the underlying RoboCasaGymEnv on first use.
Called inside the worker subprocess after fork(), so each worker gets
its own clean rendering context rather than inheriting a stale one from
the parent process (which causes crashes with AsyncVectorEnv).
"""
if self._env is not None:
return
from robocasa.wrappers.gym_wrapper import RoboCasaGymEnv
# RoboCasaGymEnv defaults split="test", which create_env rejects
# (only None/"all"/"pretrain"/"target" are valid). Always pass a
# valid value so we don't hit that default. Extra kwargs are
# forwarded to the underlying kitchen env via create_env/robosuite.make.
self._env = RoboCasaGymEnv(
env_name=self.task,
camera_widths=self.observation_width,
camera_heights=self.observation_height,
split=self.split if self.split is not None else "all",
obj_registries=self.obj_registries,
)
ep_meta = self._env.env.get_ep_meta()
self.task_description = ep_meta.get("lang", self.task)
def _format_raw_obs(self, raw_obs: dict) -> RobotObservation:
"""Convert RoboCasaGymEnv observation dict to LeRobot format."""
# RoboCasaGymEnv emits camera frames under "video.<cam>".
images = {cam: raw_obs[f"video.{cam}"] for cam in self.camera_name if f"video.{cam}" in raw_obs}
if self.obs_type == "pixels":
return {"pixels": images}
# `state.*` keys come from PandaOmronKeyConverter inside the wrapper.
agent_pos = np.concatenate(
[
raw_obs.get("state.base_position", np.zeros(3)),
raw_obs.get("state.base_rotation", np.zeros(4)),
raw_obs.get("state.end_effector_position_relative", np.zeros(3)),
raw_obs.get("state.end_effector_rotation_relative", np.zeros(4)),
raw_obs.get("state.gripper_qpos", np.zeros(2)),
],
axis=-1,
).astype(np.float32)
return {"pixels": images, "agent_pos": agent_pos}
def render(self) -> np.ndarray:
self._ensure_env()
assert self._env is not None
return self._env.render()
def reset(self, seed=None, **kwargs):
self._ensure_env()
assert self._env is not None
super().reset(seed=seed)
# Spread the seed across workers so n_envs factories don't all
# roll the same scene. With an explicit user seed we shift it by
# episode_index; with no seed we fall back to episode_index so
# each worker is still distinct rather than inheriting the same
# global RNG state.
worker_seed = seed + self.episode_index if seed is not None else self.episode_index
raw_obs, info = self._env.reset(seed=worker_seed)
ep_meta = self._env.env.get_ep_meta()
self.task_description = ep_meta.get("lang", self.task)
observation = self._format_raw_obs(raw_obs)
info = {"is_success": False}
return observation, info
def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
self._ensure_env()
assert self._env is not None
if action.ndim != 1:
raise ValueError(
f"Expected action to be 1-D (shape (action_dim,)), "
f"but got shape {action.shape} with ndim={action.ndim}"
)
action_dict = convert_action(action)
raw_obs, reward, done, truncated, info = self._env.step(action_dict)
is_success = bool(info.get("success", False))
terminated = done or is_success
info.update({"task": self.task, "done": done, "is_success": is_success})
observation = self._format_raw_obs(raw_obs)
if terminated:
info["final_info"] = {
"task": self.task,
"done": bool(done),
"is_success": bool(is_success),
}
self.reset()
return observation, reward, terminated, truncated, info
def close(self):
if self._env is not None:
self._env.close()
def _make_env_fns(
*,
task: str,
n_envs: int,
camera_names: list[str],
obs_type: str,
render_mode: str,
observation_width: int,
observation_height: int,
visualization_width: int,
visualization_height: int,
split: str | None,
episode_length: int | None,
obj_registries: Sequence[str],
) -> list[Callable[[], RoboCasaEnv]]:
"""Build n_envs factory callables for a single task.
Each factory carries a distinct ``episode_index`` (``0..n_envs-1``) so
``RoboCasaEnv.reset()`` can derive a per-worker seed series from the
user-provided seed.
"""
def _make_env(episode_index: int) -> RoboCasaEnv:
return RoboCasaEnv(
task=task,
camera_name=camera_names,
obs_type=obs_type,
render_mode=render_mode,
observation_width=observation_width,
observation_height=observation_height,
visualization_width=visualization_width,
visualization_height=visualization_height,
split=split,
episode_length=episode_length,
obj_registries=obj_registries,
episode_index=episode_index,
)
return [partial(_make_env, i) for i in range(n_envs)]
def create_robocasa_envs(
task: str,
n_envs: int,
gym_kwargs: dict[str, Any] | None = None,
camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
episode_length: int | None = None,
obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
) -> dict[str, dict[int, Any]]:
"""Create vectorized RoboCasa365 environments with a consistent return shape.
Returns:
dict[task_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories)
`task` can be:
- a single task name (e.g. `CloseFridge`)
- a comma-separated list of task names (e.g. `CloseFridge,PickPlaceCoffee`)
- a benchmark-group shortcut (`atomic_seen`, `composite_seen`,
`composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`,
`pretrain300`), which auto-expands to the upstream task list and
auto-sets the dataset `split` ("target" or "pretrain").
"""
if env_cls is None or not callable(env_cls):
raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
if not isinstance(n_envs, int) or n_envs <= 0:
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
gym_kwargs = dict(gym_kwargs or {})
obs_type = gym_kwargs.pop("obs_type", "pixels_agent_pos")
render_mode = gym_kwargs.pop("render_mode", "rgb_array")
observation_width = gym_kwargs.pop("observation_width", 256)
observation_height = gym_kwargs.pop("observation_height", 256)
visualization_width = gym_kwargs.pop("visualization_width", 512)
visualization_height = gym_kwargs.pop("visualization_height", 512)
split = gym_kwargs.pop("split", None)
camera_names = parse_camera_names(camera_name)
task_names, group_split = _resolve_tasks(str(task))
if group_split is not None and split is None:
split = group_split
logger.info(
"Creating RoboCasa envs | tasks=%s | split=%s | n_envs(per task)=%d",
task_names,
split,
n_envs,
)
is_async = env_cls is gym.vector.AsyncVectorEnv
cached_obs_space: spaces.Space | None = None
cached_act_space: spaces.Space | None = None
cached_metadata: dict[str, Any] | None = None
out: dict[str, dict[int, Any]] = defaultdict(dict)
for task_name in task_names:
fns = _make_env_fns(
task=task_name,
n_envs=n_envs,
camera_names=camera_names,
obs_type=obs_type,
render_mode=render_mode,
observation_width=observation_width,
observation_height=observation_height,
visualization_width=visualization_width,
visualization_height=visualization_height,
split=split,
episode_length=episode_length,
obj_registries=obj_registries,
)
if is_async:
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
if cached_obs_space is None:
cached_obs_space = lazy.observation_space
cached_act_space = lazy.action_space
cached_metadata = lazy.metadata
out[task_name][0] = lazy
else:
out[task_name][0] = env_cls(fns)
logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs)
return {name: dict(task_map) for name, task_map in out.items()}

245
src/lerobot/envs/robomme.py Normal file
View File

@@ -0,0 +1,245 @@
"""RoboMME environment wrapper for LeRobot evaluation.
Wraps the RoboMME ``BenchmarkEnvBuilder`` into a Gymnasium-compatible
``VectorEnv`` suitable for ``lerobot_eval``.
RoboMME tasks:
Counting: BinFill, PickXtimes, SwingXtimes, StopCube
Permanence: VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap
Reference: PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder
Imitation: MoveCube, InsertPeg, PatternLock, RouteStick
Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes)
Install: see docker/Dockerfile.benchmark.robomme (Linux only — mani-skill vs numpy pin conflict)
Benchmark: https://github.com/RoboMME/robomme_benchmark
"""
from __future__ import annotations
from collections.abc import Callable, Sequence
from functools import partial
from typing import Any
import gymnasium as gym
import numpy as np
from gymnasium import spaces
from .utils import _LazyAsyncVectorEnv
ROBOMME_TASKS = [
"BinFill",
"PickXtimes",
"SwingXtimes",
"StopCube",
"VideoUnmask",
"VideoUnmaskSwap",
"ButtonUnmask",
"ButtonUnmaskSwap",
"PickHighlight",
"VideoRepick",
"VideoPlaceButton",
"VideoPlaceOrder",
"MoveCube",
"InsertPeg",
"PatternLock",
"RouteStick",
]
class RoboMMEGymEnv(gym.Env):
"""Thin Gymnasium wrapper around a single RoboMME episode env."""
metadata = {"render_modes": ["rgb_array"], "render_fps": 10}
def __init__(
self,
task: str = "PickXtimes",
action_space_type: str = "joint_angle",
dataset: str = "test",
episode_idx: int = 0,
max_steps: int = 300,
):
super().__init__()
from robomme.env_record_wrapper import BenchmarkEnvBuilder
self._task = task
self._action_space_type = action_space_type
self._dataset = dataset
self._episode_idx = episode_idx
self._max_steps = max_steps
self._max_episode_steps = max_steps
self._builder = BenchmarkEnvBuilder(
env_id=task,
dataset=dataset,
action_space=action_space_type,
gui_render=False,
max_steps=max_steps,
)
self._env = None
self._last_raw_obs: dict | None = None
action_dim = 8 if action_space_type == "joint_angle" else 7
self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_dim,), dtype=np.float32)
# `pixels` must be a nested Dict so `preprocess_observation()` in
# envs/utils.py picks it up and maps each camera to
# `observation.images.<cam>`. A flat layout (`pixels/image`,
# `pixels/wrist_image`) silently drops every image from the batch.
self.observation_space = spaces.Dict(
{
"pixels": spaces.Dict(
{
"image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
"wrist_image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
}
),
"agent_pos": spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32),
}
)
def reset(self, *, seed=None, options=None):
super().reset(seed=seed)
self._env = self._builder.make_env_for_episode(
episode_idx=self._episode_idx,
max_steps=self._max_steps,
)
obs, info = self._env.reset()
self._last_raw_obs = obs
return self._convert_obs(obs), self._convert_info(info)
def step(self, action):
obs, reward, terminated, truncated, info = self._env.step(action)
self._last_raw_obs = obs
terminated_bool = bool(terminated.item()) if hasattr(terminated, "item") else bool(terminated)
truncated_bool = bool(truncated.item()) if hasattr(truncated, "item") else bool(truncated)
status = info.get("status", "ongoing")
is_success = status == "success"
conv_info = self._convert_info(info)
conv_info["is_success"] = is_success
return self._convert_obs(obs), float(reward), terminated_bool, truncated_bool, conv_info
def render(self) -> np.ndarray | None:
"""Return the front camera image from the last observation for video recording."""
if self._last_raw_obs is None:
return np.zeros((256, 256, 3), dtype=np.uint8)
front = self._last_raw_obs.get("front_rgb_list")
if front is None:
return np.zeros((256, 256, 3), dtype=np.uint8)
frame = front[-1] if isinstance(front, list) else front
return np.asarray(frame, dtype=np.uint8)
def _convert_obs(self, obs: dict) -> dict:
front_rgb = (
obs["front_rgb_list"][-1] if isinstance(obs["front_rgb_list"], list) else obs["front_rgb_list"]
)
wrist_rgb = (
obs["wrist_rgb_list"][-1] if isinstance(obs["wrist_rgb_list"], list) else obs["wrist_rgb_list"]
)
joint_state = (
obs["joint_state_list"][-1]
if isinstance(obs["joint_state_list"], list)
else obs["joint_state_list"]
)
gripper_state = (
obs["gripper_state_list"][-1]
if isinstance(obs["gripper_state_list"], list)
else obs["gripper_state_list"]
)
front_rgb = np.asarray(front_rgb, dtype=np.uint8)
wrist_rgb = np.asarray(wrist_rgb, dtype=np.uint8)
joint = np.asarray(joint_state, dtype=np.float32).flatten()[:7]
gripper = np.asarray(gripper_state, dtype=np.float32).flatten()[:1]
state = np.concatenate([joint, gripper])
return {
"pixels": {"image": front_rgb, "wrist_image": wrist_rgb},
"agent_pos": state,
}
def _convert_info(self, info: dict) -> dict:
return {
"status": info.get("status", "ongoing"),
"task_goal": info.get("task_goal", ""),
}
def _make_env_fns(
*,
task: str,
n_envs: int,
action_space_type: str,
dataset: str,
episode_length: int,
task_id: int,
) -> list[Callable[[], RoboMMEGymEnv]]:
"""Build n_envs factory callables for one RoboMME task id."""
def _make_one(episode_index: int) -> RoboMMEGymEnv:
return RoboMMEGymEnv(
task=task,
action_space_type=action_space_type,
dataset=dataset,
episode_idx=episode_index,
max_steps=episode_length,
)
return [partial(_make_one, task_id + i) for i in range(n_envs)]
def create_robomme_envs(
task: str,
n_envs: int = 1,
action_space_type: str = "joint_angle",
dataset: str = "test",
episode_length: int = 300,
task_ids: list[int] | None = None,
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
) -> dict[str, dict[int, gym.vector.VectorEnv]]:
"""Create vectorized RoboMME environments for evaluation.
`task` may be a single RoboMME task name (e.g. "PickXtimes") or a
comma-separated list (e.g. "PickXtimes,BinFill,StopCube"). Each task
becomes its own suite in the returned mapping.
Returns {suite_name: {task_id: VectorEnv}} matching lerobot's expected format.
"""
if env_cls is None or not callable(env_cls):
raise ValueError("env_cls must be a callable that wraps a list of env factory callables.")
if not isinstance(n_envs, int) or n_envs <= 0:
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
if task_ids is None:
task_ids = [0]
task_names = [t.strip() for t in task.split(",") if t.strip()]
is_async = env_cls is gym.vector.AsyncVectorEnv
cached_obs_space: spaces.Space | None = None
cached_act_space: spaces.Space | None = None
cached_metadata: dict[str, Any] | None = None
out: dict[str, dict[int, gym.vector.VectorEnv]] = {}
for task_name in task_names:
envs_by_task: dict[int, gym.vector.VectorEnv] = {}
for task_id in task_ids:
fns = _make_env_fns(
task=task_name,
n_envs=n_envs,
action_space_type=action_space_type,
dataset=dataset,
episode_length=episode_length,
task_id=task_id,
)
if is_async:
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
if cached_obs_space is None:
cached_obs_space = lazy.observation_space
cached_act_space = lazy.action_space
cached_metadata = lazy.metadata
envs_by_task[task_id] = lazy
else:
envs_by_task[task_id] = env_cls(fns)
out[task_name] = envs_by_task
return out

View File

@@ -0,0 +1,488 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import importlib
import logging
from collections import defaultdict
from collections.abc import Callable, Sequence
from functools import partial
from typing import Any
import gymnasium as gym
import numpy as np
import torch
from gymnasium import spaces
from lerobot.types import RobotObservation
from .utils import _LazyAsyncVectorEnv
logger = logging.getLogger(__name__)
# Camera names as used by RoboTwin 2.0. The wrapper appends "_rgb" when looking
# up keys in get_obs() output (e.g. "head_camera" → "head_camera_rgb").
ROBOTWIN_CAMERA_NAMES: tuple[str, ...] = (
"head_camera",
"left_camera",
"right_camera",
)
ACTION_DIM = 14 # 7 DOF × 2 arms
ACTION_LOW = -1.0
ACTION_HIGH = 1.0
DEFAULT_EPISODE_LENGTH = 300
# D435 dims from task_config/_camera_config.yml (what demo_clean.yml selects).
DEFAULT_CAMERA_H = 240
DEFAULT_CAMERA_W = 320
# Task list from RoboTwin 2.0's `envs/` directory — mirrors upstream exactly
# (50 tasks as of main; earlier revisions had 60 with a different split).
# Keep this in sync with:
# gh api /repos/RoboTwin-Platform/RoboTwin/contents/envs --paginate \
# | jq -r '.[].name' | grep -E '\.py$' | grep -v '^_' | sed 's/\.py$//'
ROBOTWIN_TASKS: tuple[str, ...] = (
"adjust_bottle",
"beat_block_hammer",
"blocks_ranking_rgb",
"blocks_ranking_size",
"click_alarmclock",
"click_bell",
"dump_bin_bigbin",
"grab_roller",
"handover_block",
"handover_mic",
"hanging_mug",
"lift_pot",
"move_can_pot",
"move_pillbottle_pad",
"move_playingcard_away",
"move_stapler_pad",
"open_laptop",
"open_microwave",
"pick_diverse_bottles",
"pick_dual_bottles",
"place_a2b_left",
"place_a2b_right",
"place_bread_basket",
"place_bread_skillet",
"place_burger_fries",
"place_can_basket",
"place_cans_plasticbox",
"place_container_plate",
"place_dual_shoes",
"place_empty_cup",
"place_fan",
"place_mouse_pad",
"place_object_basket",
"place_object_scale",
"place_object_stand",
"place_phone_stand",
"place_shoe",
"press_stapler",
"put_bottles_dustbin",
"put_object_cabinet",
"rotate_qrcode",
"scan_object",
"shake_bottle",
"shake_bottle_horizontally",
"stack_blocks_three",
"stack_blocks_two",
"stack_bowls_three",
"stack_bowls_two",
"stamp_seal",
"turn_switch",
)
_ROBOTWIN_SETUP_CACHE: dict[str, dict[str, Any]] = {}
def _load_robotwin_setup_kwargs(task_name: str) -> dict[str, Any]:
"""Build the kwargs dict RoboTwin's setup_demo expects.
Mirrors the config loading done by RoboTwin's ``script/eval_policy.py``:
reads ``task_config/demo_clean.yml``, resolves the embodiment file from
``_embodiment_config.yml``, loads the robot's own ``config.yml``, and
reads camera dimensions from ``_camera_config.yml``.
Uses ``aloha-agilex`` single-robot dual-arm by default (the only embodiment
used by beat_block_hammer and most smoke-test tasks).
"""
if task_name in _ROBOTWIN_SETUP_CACHE:
return dict(_ROBOTWIN_SETUP_CACHE[task_name])
import os
import yaml # type: ignore[import-untyped]
from envs import CONFIGS_PATH # type: ignore[import-not-found]
task_config = "demo_clean"
with open(os.path.join(CONFIGS_PATH, f"{task_config}.yml"), encoding="utf-8") as f:
args = yaml.safe_load(f)
# Resolve embodiment — demo_clean.yml uses [aloha-agilex] (dual-arm single robot)
with open(os.path.join(CONFIGS_PATH, "_embodiment_config.yml"), encoding="utf-8") as f:
embodiment_types = yaml.safe_load(f)
embodiment = args.get("embodiment", ["aloha-agilex"])
if len(embodiment) == 1:
robot_file = embodiment_types[embodiment[0]]["file_path"]
args["left_robot_file"] = robot_file
args["right_robot_file"] = robot_file
args["dual_arm_embodied"] = True
elif len(embodiment) == 3:
args["left_robot_file"] = embodiment_types[embodiment[0]]["file_path"]
args["right_robot_file"] = embodiment_types[embodiment[1]]["file_path"]
args["embodiment_dis"] = embodiment[2]
args["dual_arm_embodied"] = False
else:
raise ValueError(f"embodiment must have 1 or 3 items, got {len(embodiment)}")
with open(os.path.join(args["left_robot_file"], "config.yml"), encoding="utf-8") as f:
args["left_embodiment_config"] = yaml.safe_load(f)
with open(os.path.join(args["right_robot_file"], "config.yml"), encoding="utf-8") as f:
args["right_embodiment_config"] = yaml.safe_load(f)
# Camera dimensions
with open(os.path.join(CONFIGS_PATH, "_camera_config.yml"), encoding="utf-8") as f:
camera_config = yaml.safe_load(f)
head_cam = args["camera"]["head_camera_type"]
args["head_camera_h"] = camera_config[head_cam]["h"]
args["head_camera_w"] = camera_config[head_cam]["w"]
# Headless overrides
args["render_freq"] = 0
args["task_name"] = task_name
args["task_config"] = task_config
_ROBOTWIN_SETUP_CACHE[task_name] = args
return dict(args)
def _load_robotwin_task(task_name: str) -> type:
"""Dynamically import and return a RoboTwin 2.0 task class.
RoboTwin tasks live in ``envs/<task_name>.py`` relative to the repository
root and are expected to be on ``sys.path`` after installation.
"""
try:
module = importlib.import_module(f"envs.{task_name}")
except ModuleNotFoundError as e:
raise ModuleNotFoundError(
f"Could not import RoboTwin task '{task_name}'. "
"Ensure RoboTwin 2.0 is installed and its 'envs/' directory is on PYTHONPATH. "
"See the RoboTwin installation guide: https://robotwin-platform.github.io/doc/usage/robotwin-install.html"
) from e
task_cls = getattr(module, task_name, None)
if task_cls is None:
raise AttributeError(f"Task class '{task_name}' not found in envs/{task_name}.py")
return task_cls
class RoboTwinEnv(gym.Env):
"""Gymnasium wrapper around a single RoboTwin 2.0 task.
RoboTwin uses a custom SAPIEN-based API (``setup_demo`` / ``get_obs`` /
``take_action`` / ``check_success``) rather than the standard gym interface.
This class bridges that API to Gymnasium so that ``lerobot-eval`` can drive
RoboTwin exactly like LIBERO or Meta-World.
The underlying SAPIEN environment is created lazily on the first ``reset()``
call *inside the worker process*. This is required for
``gym.vector.AsyncVectorEnv`` compatibility: SAPIEN allocates EGL/GPU
contexts that must not be forked from the parent process.
Observations
------------
The ``pixels`` dict uses the raw RoboTwin camera names as keys (e.g.
``"head_camera"``, ``"left_camera"``). ``preprocess_observation`` in
``envs/utils.py`` then converts these to ``observation.images.<cam>``.
Actions
-------
14-dim float32 array in ``[-1, 1]`` (joint-space, 7 DOF per arm).
Autograd
--------
``setup_demo`` and ``take_action`` drive CuRobo's Newton trajectory
optimizer, which calls ``cost.backward()`` internally. lerobot_eval wraps
the rollout in ``torch.no_grad()``, so both call sites re-enable grad.
"""
metadata = {"render_modes": ["rgb_array"], "render_fps": 25}
def __init__(
self,
task_name: str,
episode_index: int = 0,
n_envs: int = 1,
camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
observation_height: int | None = None,
observation_width: int | None = None,
episode_length: int = DEFAULT_EPISODE_LENGTH,
render_mode: str = "rgb_array",
):
super().__init__()
self.task_name = task_name
self.task = task_name # used by add_envs_task() in utils.py
self.task_description = task_name.replace("_", " ")
self.episode_index = episode_index
self._reset_stride = n_envs
self.camera_names = list(camera_names)
# Default to D435 dims (the camera type baked into task_config/demo_clean.yml).
# The YAML-driven lookup is deferred to reset() so construction doesn't
# import RoboTwin's `envs` module — fast-tests run without RoboTwin installed.
self.observation_height = observation_height or DEFAULT_CAMERA_H
self.observation_width = observation_width or DEFAULT_CAMERA_W
self.episode_length = episode_length
self._max_episode_steps = episode_length # lerobot_eval.rollout reads this
self.render_mode = render_mode
self._env: Any | None = None # deferred — created on first reset() inside worker
self._step_count: int = 0
self._black_frame = np.zeros((self.observation_height, self.observation_width, 3), dtype=np.uint8)
image_spaces = {
cam: spaces.Box(
low=0,
high=255,
shape=(self.observation_height, self.observation_width, 3),
dtype=np.uint8,
)
for cam in self.camera_names
}
self.observation_space = spaces.Dict(
{
"pixels": spaces.Dict(image_spaces),
"agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(ACTION_DIM,), dtype=np.float32),
}
)
self.action_space = spaces.Box(
low=ACTION_LOW, high=ACTION_HIGH, shape=(ACTION_DIM,), dtype=np.float32
)
def _ensure_env(self) -> None:
"""Create the SAPIEN environment on first use.
Called inside the worker subprocess after fork(), so each worker gets
its own EGL/GPU context rather than inheriting a stale one from the
parent process (which causes crashes with AsyncVectorEnv).
"""
if self._env is not None:
return
task_cls = _load_robotwin_task(self.task_name)
self._env = task_cls()
def _get_obs(self) -> RobotObservation:
assert self._env is not None, "_get_obs called before _ensure_env()"
raw = self._env.get_obs()
cameras_raw = raw.get("observation", {})
images: dict[str, np.ndarray] = {}
for cam in self.camera_names:
cam_data = cameras_raw.get(cam)
img = cam_data.get("rgb") if cam_data else None
if img is None:
images[cam] = self._black_frame
continue
img = np.asarray(img, dtype=np.uint8)
if img.ndim == 2:
img = np.stack([img, img, img], axis=-1)
elif img.shape[-1] != 3:
img = img[..., :3]
images[cam] = img
ja = raw.get("joint_action") or {}
vec = ja.get("vector")
if vec is not None:
arr = np.asarray(vec, dtype=np.float32).ravel()
joint_state = (
arr[:ACTION_DIM] if arr.size >= ACTION_DIM else np.zeros(ACTION_DIM, dtype=np.float32)
)
else:
joint_state = np.zeros(ACTION_DIM, dtype=np.float32)
return {"pixels": images, "agent_pos": joint_state}
def reset(self, seed: int | None = None, **kwargs) -> tuple[RobotObservation, dict]:
self._ensure_env()
super().reset(seed=seed)
assert self._env is not None # set by _ensure_env() above
actual_seed = self.episode_index if seed is None else seed
setup_kwargs = _load_robotwin_setup_kwargs(self.task_name)
setup_kwargs.update(seed=actual_seed, is_test=True)
with torch.enable_grad():
self._env.setup_demo(**setup_kwargs)
self.episode_index += self._reset_stride
self._step_count = 0
obs = self._get_obs()
return obs, {"is_success": False, "task": self.task_name}
def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
assert self._env is not None, "step() called before reset()"
if action.ndim != 1 or action.shape[0] != ACTION_DIM:
raise ValueError(f"Expected 1-D action of shape ({ACTION_DIM},), got {action.shape}")
with torch.enable_grad():
if hasattr(self._env, "take_action"):
self._env.take_action(action)
else:
self._env.step(action)
self._step_count += 1
is_success = bool(getattr(self._env, "eval_success", False))
if not is_success and hasattr(self._env, "check_success"):
is_success = bool(self._env.check_success())
obs = self._get_obs()
reward = float(is_success)
terminated = is_success
truncated = self._step_count >= self.episode_length
info: dict[str, Any] = {
"task": self.task_name,
"is_success": is_success,
"step": self._step_count,
}
if terminated or truncated:
info["final_info"] = {
"task": self.task_name,
"is_success": is_success,
}
self.reset()
return obs, reward, terminated, truncated, info
def render(self) -> np.ndarray:
self._ensure_env()
obs = self._get_obs()
# Prefer head camera for rendering; fall back to first available.
if "head_camera" in obs["pixels"]:
return obs["pixels"]["head_camera"]
return next(iter(obs["pixels"].values()))
def close(self) -> None:
if self._env is not None:
if hasattr(self._env, "close_env"):
import contextlib
with contextlib.suppress(TypeError):
self._env.close_env()
self._env = None
# ---- Multi-task factory --------------------------------------------------------
def _make_env_fns(
*,
task_name: str,
n_envs: int,
camera_names: list[str],
observation_height: int,
observation_width: int,
episode_length: int,
) -> list[Callable[[], RoboTwinEnv]]:
"""Return n_envs factory callables for a single task."""
def _make_one(episode_index: int) -> RoboTwinEnv:
return RoboTwinEnv(
task_name=task_name,
episode_index=episode_index,
n_envs=n_envs,
camera_names=camera_names,
observation_height=observation_height,
observation_width=observation_width,
episode_length=episode_length,
)
return [partial(_make_one, i) for i in range(n_envs)]
def create_robotwin_envs(
task: str,
n_envs: int,
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
observation_height: int = DEFAULT_CAMERA_H,
observation_width: int = DEFAULT_CAMERA_W,
episode_length: int = DEFAULT_EPISODE_LENGTH,
) -> dict[str, dict[int, Any]]:
"""Create vectorized RoboTwin 2.0 environments.
Returns:
``dict[task_name][0] -> VectorEnv`` — one entry per task, each wrapping
``n_envs`` parallel rollouts.
Args:
task: Comma-separated list of task names (e.g. ``"beat_block_hammer"``
or ``"beat_block_hammer,click_bell"``).
n_envs: Number of parallel rollouts per task.
env_cls: Vector env constructor (e.g. ``gym.vector.AsyncVectorEnv``).
camera_names: Cameras to include in observations.
observation_height: Pixel height for all cameras.
observation_width: Pixel width for all cameras.
episode_length: Max steps before truncation.
"""
if env_cls is None or not callable(env_cls):
raise ValueError("env_cls must be callable (e.g. gym.vector.AsyncVectorEnv).")
if not isinstance(n_envs, int) or n_envs <= 0:
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
task_names = [t.strip() for t in str(task).split(",") if t.strip()]
if not task_names:
raise ValueError("`task` must contain at least one RoboTwin task name.")
unknown = [t for t in task_names if t not in ROBOTWIN_TASKS]
if unknown:
raise ValueError(f"Unknown RoboTwin tasks: {unknown}. Available tasks: {sorted(ROBOTWIN_TASKS)}")
logger.info(
"Creating RoboTwin envs | tasks=%s | n_envs(per task)=%d",
task_names,
n_envs,
)
is_async = env_cls is gym.vector.AsyncVectorEnv
cached_obs_space: spaces.Space | None = None
cached_act_space: spaces.Space | None = None
cached_metadata: dict[str, Any] | None = None
out: dict[str, dict[int, Any]] = defaultdict(dict)
for task_name in task_names:
fns = _make_env_fns(
task_name=task_name,
n_envs=n_envs,
camera_names=list(camera_names),
observation_height=observation_height,
observation_width=observation_width,
episode_length=episode_length,
)
if is_async:
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
if cached_obs_space is None:
cached_obs_space = lazy.observation_space
cached_act_space = lazy.action_space
cached_metadata = lazy.metadata
out[task_name][0] = lazy
else:
out[task_name][0] = env_cls(fns)
logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs)
return {k: dict(v) for k, v in out.items()}

View File

@@ -34,6 +34,25 @@ from lerobot.utils.utils import get_channel_first_image_shape
from .configs import EnvConfig
def parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
"""Normalize ``camera_name`` into a non-empty list of strings.
Accepts a comma-separated string (``"cam_a,cam_b"``) or a sequence of
strings (tuples/lists). Whitespace is stripped; empty entries are
dropped. Raises ``TypeError`` for unsupported input types and
``ValueError`` when the normalized list is empty.
"""
if isinstance(camera_name, str):
cams = [c.strip() for c in camera_name.split(",") if c.strip()]
elif isinstance(camera_name, (list | tuple)):
cams = [str(c).strip() for c in camera_name if str(c).strip()]
else:
raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
if not cams:
raise ValueError("camera_name resolved to an empty list.")
return cams
def _convert_nested_dict(d):
result = {}
for k, v in d.items():
@@ -153,17 +172,20 @@ class _LazyAsyncVectorEnv:
env_fns: list[Callable],
observation_space=None,
action_space=None,
metadata=None,
):
self._env_fns = env_fns
self._env: gym.vector.AsyncVectorEnv | None = None
self.num_envs = len(env_fns)
if observation_space is not None and action_space is not None:
if observation_space is not None and action_space is not None and metadata is not None:
self.observation_space = observation_space
self.action_space = action_space
self.metadata = metadata
else:
tmp = env_fns[0]()
self.observation_space = tmp.observation_space
self.action_space = tmp.action_space
self.metadata = tmp.metadata
tmp.close()
self.single_observation_space = self.observation_space
self.single_action_space = self.action_space
@@ -172,6 +194,10 @@ class _LazyAsyncVectorEnv:
if self._env is None:
self._env = gym.vector.AsyncVectorEnv(self._env_fns, context="forkserver", shared_memory=True)
@property
def unwrapped(self):
return self
def reset(self, **kwargs):
self._ensure()
return self._env.reset(**kwargs)

282
tests/envs/test_robotwin.py Normal file
View File

@@ -0,0 +1,282 @@
#!/usr/bin/env python
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Unit tests for the RoboTwin 2.0 Gymnasium wrapper.
These tests mock out the SAPIEN-based RoboTwin runtime (task modules +
YAML config loader) so they run without the full RoboTwin installation
(SAPIEN, CuRobo, mplib, asset downloads, etc.).
"""
from __future__ import annotations
from contextlib import contextmanager
from unittest.mock import MagicMock, patch
import gymnasium as gym
import numpy as np
import pytest
from lerobot.envs.robotwin import (
ACTION_DIM,
ROBOTWIN_CAMERA_NAMES,
ROBOTWIN_TASKS,
RoboTwinEnv,
create_robotwin_envs,
)
# ---------------------------------------------------------------------------
# Fixtures / helpers
# ---------------------------------------------------------------------------
def _make_mock_task_env(
height: int = 240,
width: int = 320,
cameras: tuple[str, ...] = ROBOTWIN_CAMERA_NAMES,
) -> MagicMock:
"""Return a mock that mimics the RoboTwin task class API.
RoboTwin's real get_obs returns
{"observation": {cam: {"rgb": img}}, "joint_action": {"vector": np.ndarray}, ...}
so the mock follows the same nested shape.
"""
obs_dict = {
"observation": {cam: {"rgb": np.zeros((height, width, 3), dtype=np.uint8)} for cam in cameras},
"joint_action": {"vector": np.zeros(ACTION_DIM, dtype=np.float32)},
"endpose": {},
}
mock = MagicMock()
mock.get_obs.return_value = obs_dict
mock.setup_demo.return_value = None
mock.take_action.return_value = None
mock.eval_success = False
mock.check_success.return_value = False
mock.close_env.return_value = None
return mock
@contextmanager
def _patch_runtime(mock_task_instance: MagicMock):
"""Patch both the task-class loader and the YAML config loader so the
env can construct + reset without a real RoboTwin install."""
task_cls = MagicMock(return_value=mock_task_instance)
fake_setup = {
"head_camera_h": 240,
"head_camera_w": 320,
"left_embodiment_config": {},
"right_embodiment_config": {},
"left_robot_file": "",
"right_robot_file": "",
"dual_arm_embodied": True,
"render_freq": 0,
"task_name": "beat_block_hammer",
"task_config": "demo_clean",
}
with (
patch("lerobot.envs.robotwin._load_robotwin_task", return_value=task_cls),
patch("lerobot.envs.robotwin._load_robotwin_setup_kwargs", return_value=fake_setup),
):
yield
# ---------------------------------------------------------------------------
# RoboTwinEnv unit tests
# ---------------------------------------------------------------------------
class TestRoboTwinEnv:
def test_observation_space_shape(self):
"""observation_space should have the configured h×w×3 for every camera."""
h, w = 240, 320
env = RoboTwinEnv(
task_name="beat_block_hammer",
observation_height=h,
observation_width=w,
camera_names=["head_camera", "left_camera"],
)
pixels_space = env.observation_space["pixels"]
assert pixels_space["head_camera"].shape == (h, w, 3)
assert pixels_space["left_camera"].shape == (h, w, 3)
assert "right_camera" not in pixels_space
def test_action_space(self):
env = RoboTwinEnv(task_name="beat_block_hammer")
assert env.action_space.shape == (ACTION_DIM,)
assert env.action_space.dtype == np.float32
def test_reset_returns_correct_obs_keys(self):
mock_task = _make_mock_task_env()
env = RoboTwinEnv(task_name="beat_block_hammer")
with _patch_runtime(mock_task):
obs, info = env.reset()
assert "pixels" in obs
for cam in ROBOTWIN_CAMERA_NAMES:
assert cam in obs["pixels"], f"Missing camera '{cam}' in obs"
assert "agent_pos" in obs
assert obs["agent_pos"].shape == (ACTION_DIM,)
assert info["is_success"] is False
def test_reset_calls_setup_demo(self):
mock_task = _make_mock_task_env()
env = RoboTwinEnv(task_name="beat_block_hammer")
with _patch_runtime(mock_task):
env.reset(seed=42)
# setup_demo receives the full YAML-derived kwargs plus seed + is_test;
# we only assert the caller-provided bits.
assert mock_task.setup_demo.call_count == 1
call_kwargs = mock_task.setup_demo.call_args.kwargs
assert call_kwargs["seed"] == 42
assert call_kwargs["is_test"] is True
def test_step_returns_correct_types(self):
mock_task = _make_mock_task_env()
env = RoboTwinEnv(task_name="beat_block_hammer")
action = np.zeros(ACTION_DIM, dtype=np.float32)
with _patch_runtime(mock_task):
env.reset()
obs, reward, terminated, truncated, info = env.step(action)
assert isinstance(obs, dict)
assert isinstance(reward, float)
assert isinstance(terminated, bool)
assert isinstance(truncated, bool)
assert isinstance(info, dict)
def test_step_wrong_action_shape_raises(self):
mock_task = _make_mock_task_env()
env = RoboTwinEnv(task_name="beat_block_hammer")
bad_action = np.zeros(7, dtype=np.float32) # wrong dim
with _patch_runtime(mock_task):
env.reset()
with pytest.raises(ValueError, match="Expected 1-D action"):
env.step(bad_action)
def test_success_terminates_episode(self):
mock_task = _make_mock_task_env()
mock_task.check_success.return_value = True
env = RoboTwinEnv(task_name="beat_block_hammer")
action = np.zeros(ACTION_DIM, dtype=np.float32)
with _patch_runtime(mock_task):
env.reset()
_, _, terminated, _, info = env.step(action)
assert terminated is True
assert info["is_success"] is True
def test_truncation_after_episode_length(self):
mock_task = _make_mock_task_env()
env = RoboTwinEnv(task_name="beat_block_hammer", episode_length=2)
action = np.zeros(ACTION_DIM, dtype=np.float32)
with _patch_runtime(mock_task):
env.reset()
env.step(action) # step 1
_, _, _, truncated, _ = env.step(action) # step 2 → truncated
assert truncated is True
def test_close_calls_close_env(self):
mock_task = _make_mock_task_env()
env = RoboTwinEnv(task_name="beat_block_hammer")
with _patch_runtime(mock_task):
env.reset()
env.close()
mock_task.close_env.assert_called_once()
def test_black_frame_for_missing_camera(self):
"""If a camera key is absent from get_obs(), a black frame is returned."""
# Mock exposes only head_camera; we ask for both head_camera + left_camera.
mock_task = _make_mock_task_env(height=10, width=10, cameras=("head_camera",))
env = RoboTwinEnv(
task_name="beat_block_hammer",
camera_names=["head_camera", "left_camera"],
observation_height=10,
observation_width=10,
)
with _patch_runtime(mock_task):
obs, _ = env.reset()
assert obs["pixels"]["left_camera"].shape == (10, 10, 3)
assert obs["pixels"]["left_camera"].sum() == 0
def test_task_and_task_description_attributes(self):
env = RoboTwinEnv(task_name="beat_block_hammer")
assert env.task == "beat_block_hammer"
assert isinstance(env.task_description, str)
def test_deferred_init_env_is_none_before_reset(self):
env = RoboTwinEnv(task_name="beat_block_hammer")
assert env._env is None # noqa: SLF001 (testing internal state)
# ---------------------------------------------------------------------------
# create_robotwin_envs tests
# ---------------------------------------------------------------------------
class TestCreateRoboTwinEnvs:
def test_returns_correct_structure(self):
mock_task = _make_mock_task_env()
with _patch_runtime(mock_task):
envs = create_robotwin_envs(
task="beat_block_hammer",
n_envs=1,
env_cls=gym.vector.SyncVectorEnv,
)
assert "beat_block_hammer" in envs
assert 0 in envs["beat_block_hammer"]
assert isinstance(envs["beat_block_hammer"][0], gym.vector.SyncVectorEnv)
def test_multi_task(self):
mock_task = _make_mock_task_env()
with _patch_runtime(mock_task):
envs = create_robotwin_envs(
task="beat_block_hammer,click_bell",
n_envs=1,
env_cls=gym.vector.SyncVectorEnv,
)
assert set(envs.keys()) == {"beat_block_hammer", "click_bell"}
def test_unknown_task_raises(self):
with pytest.raises(ValueError, match="Unknown RoboTwin tasks"):
create_robotwin_envs(
task="not_a_real_task",
n_envs=1,
env_cls=gym.vector.SyncVectorEnv,
)
def test_invalid_n_envs_raises(self):
with pytest.raises(ValueError, match="n_envs must be a positive int"):
create_robotwin_envs(
task="beat_block_hammer",
n_envs=0,
env_cls=gym.vector.SyncVectorEnv,
)
# ---------------------------------------------------------------------------
# ROBOTWIN_TASKS list
# ---------------------------------------------------------------------------
def test_task_list_not_empty():
assert len(ROBOTWIN_TASKS) >= 50
def test_all_tasks_are_strings():
assert all(isinstance(t, str) and t for t in ROBOTWIN_TASKS)
def test_no_duplicate_tasks():
assert len(ROBOTWIN_TASKS) == len(set(ROBOTWIN_TASKS))

232
tests/test_robomme_env.py Normal file
View File

@@ -0,0 +1,232 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Unit tests for the RoboMME env wrapper and config.
RoboMME requires Linux + ManiSkill (Vulkan/SAPIEN), so tests that touch the
env wrapper mock the ``robomme`` package. Tests that only exercise the
dataclass config run without any mocking.
"""
from __future__ import annotations
import sys
from types import ModuleType
from unittest.mock import MagicMock
import numpy as np
def _install_robomme_stub():
"""Register a minimal stub for the ``robomme`` package on sys.modules."""
stub = ModuleType("robomme")
wrapper_stub = ModuleType("robomme.env_record_wrapper")
class FakeBuilder:
def __init__(self, **kwargs):
pass
def make_env_for_episode(self, episode_idx: int, max_steps: int):
env = MagicMock()
obs = {
"front_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
"wrist_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
"joint_state_list": [np.zeros(7, dtype=np.float32)],
"gripper_state_list": [np.zeros(2, dtype=np.float32)],
}
env.reset.return_value = (obs, {"status": "ongoing", "task_goal": "pick the cube"})
env.step.return_value = (obs, 0.0, False, False, {"status": "ongoing", "task_goal": ""})
return env
wrapper_stub.BenchmarkEnvBuilder = FakeBuilder
stub.env_record_wrapper = wrapper_stub
sys.modules["robomme"] = stub
sys.modules["robomme.env_record_wrapper"] = wrapper_stub
def _uninstall_robomme_stub():
sys.modules.pop("robomme", None)
sys.modules.pop("robomme.env_record_wrapper", None)
# ---------------------------------------------------------------------------
# Config tests (no sim required)
# ---------------------------------------------------------------------------
def test_robomme_env_config_defaults():
from lerobot.envs.configs import RoboMMEEnv
cfg = RoboMMEEnv()
assert cfg.task == "PickXtimes"
assert cfg.fps == 10
assert cfg.episode_length == 300
assert cfg.action_space == "joint_angle"
assert cfg.dataset_split == "test"
assert cfg.task_ids is None
def test_robomme_env_config_type():
from lerobot.envs.configs import RoboMMEEnv
cfg = RoboMMEEnv()
assert cfg.type == "robomme"
def test_robomme_features_map():
from lerobot.envs.configs import RoboMMEEnv
from lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE
cfg = RoboMMEEnv()
assert cfg.features_map[ACTION] == ACTION
assert cfg.features_map["pixels/image"] == f"{OBS_IMAGES}.image"
assert cfg.features_map["pixels/wrist_image"] == f"{OBS_IMAGES}.wrist_image"
assert cfg.features_map["agent_pos"] == OBS_STATE
def test_robomme_features_action_dim_joint_angle():
from lerobot.envs.configs import RoboMMEEnv
from lerobot.utils.constants import ACTION
cfg = RoboMMEEnv(action_space="joint_angle")
assert cfg.features[ACTION].shape == (8,)
def test_robomme_features_action_dim_ee_pose():
"""`ee_pose` uses a 7-D action; __post_init__ sets the correct shape."""
from lerobot.envs.configs import RoboMMEEnv
from lerobot.utils.constants import ACTION
cfg = RoboMMEEnv(action_space="ee_pose")
assert cfg.features[ACTION].shape == (7,)
# ---------------------------------------------------------------------------
# Obs conversion (pure Python, no sim)
# ---------------------------------------------------------------------------
def test_convert_obs_list_format():
"""_convert_obs takes the last element from list-format obs fields and
emits a nested ``pixels`` dict (image, wrist_image) plus ``agent_pos``.
The nested layout is required so ``preprocess_observation()`` in
``envs/utils.py`` maps each camera to ``observation.images.<cam>``.
"""
_install_robomme_stub()
try:
from lerobot.envs.robomme import RoboMMEGymEnv
env = RoboMMEGymEnv.__new__(RoboMMEGymEnv)
front = np.full((256, 256, 3), 42, dtype=np.uint8)
wrist = np.full((256, 256, 3), 7, dtype=np.uint8)
joints = np.arange(7, dtype=np.float32)
gripper = np.array([0.5, 0.5], dtype=np.float32)
obs_raw = {
"front_rgb_list": [np.zeros_like(front), front],
"wrist_rgb_list": [np.zeros_like(wrist), wrist],
"joint_state_list": [np.zeros(7, dtype=np.float32), joints],
"gripper_state_list": [np.zeros(2, dtype=np.float32), gripper],
}
result = env._convert_obs(obs_raw)
np.testing.assert_array_equal(result["pixels"]["image"], front)
np.testing.assert_array_equal(result["pixels"]["wrist_image"], wrist)
assert result["agent_pos"].shape == (8,)
np.testing.assert_array_almost_equal(result["agent_pos"][:7], joints)
assert result["agent_pos"][7] == gripper[0]
finally:
_uninstall_robomme_stub()
def test_convert_obs_array_format():
"""_convert_obs also handles non-list (direct array) obs."""
_install_robomme_stub()
try:
from lerobot.envs.robomme import RoboMMEGymEnv
env = RoboMMEGymEnv.__new__(RoboMMEGymEnv)
front = np.zeros((256, 256, 3), dtype=np.uint8)
obs_raw = {
"front_rgb_list": front,
"wrist_rgb_list": front,
"joint_state_list": np.zeros(7, dtype=np.float32),
"gripper_state_list": np.zeros(2, dtype=np.float32),
}
result = env._convert_obs(obs_raw)
assert result["pixels"]["image"].shape == (256, 256, 3)
assert result["pixels"]["wrist_image"].shape == (256, 256, 3)
assert result["agent_pos"].shape == (8,)
finally:
_uninstall_robomme_stub()
# ---------------------------------------------------------------------------
# create_robomme_envs (mocked sim)
# ---------------------------------------------------------------------------
def test_create_robomme_envs_returns_correct_structure():
"""Single task -> {task_name: {task_id: VectorEnv}} with one entry per task_id."""
_install_robomme_stub()
try:
from lerobot.envs.robomme import create_robomme_envs
env_cls = MagicMock(return_value=MagicMock())
result = create_robomme_envs(
task="PickXtimes",
n_envs=1,
task_ids=[0, 1],
env_cls=env_cls,
)
assert "PickXtimes" in result
assert 0 in result["PickXtimes"]
assert 1 in result["PickXtimes"]
assert env_cls.call_count == 2
finally:
_uninstall_robomme_stub()
def test_create_robomme_envs_multi_task():
"""Comma-separated task list produces one suite per task."""
_install_robomme_stub()
try:
from lerobot.envs.robomme import create_robomme_envs
env_cls = MagicMock(return_value=MagicMock())
result = create_robomme_envs(
task="PickXtimes,BinFill,StopCube",
n_envs=1,
env_cls=env_cls,
)
assert set(result.keys()) == {"PickXtimes", "BinFill", "StopCube"}
finally:
_uninstall_robomme_stub()
def test_create_robomme_envs_raises_on_invalid_env_cls():
_install_robomme_stub()
try:
import pytest
from lerobot.envs.robomme import create_robomme_envs
with pytest.raises(ValueError, match="env_cls must be a callable"):
create_robomme_envs(task="PickXtimes", n_envs=1, env_cls=None)
finally:
_uninstall_robomme_stub()