diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml index 79d5614b2..3a5d92c44 100644 --- a/.github/workflows/benchmark_tests.yml +++ b/.github/workflows/benchmark_tests.yml @@ -83,10 +83,13 @@ jobs: cache-binary: false - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] with: username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} # Build the benchmark-specific image. The Dockerfile separates dep-install # from source-copy, so code-only changes skip the slow uv-sync layer @@ -115,7 +118,7 @@ jobs: bash -c " hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true lerobot-eval \ - --policy.path=pepijn223/smolvla_libero \ + --policy.path=lerobot/smolvla_libero \ --env.type=libero \ --env.task=libero_spatial \ --eval.batch_size=1 \ @@ -144,7 +147,7 @@ jobs: --artifacts-dir /tmp/libero-artifacts \ --env libero \ --task libero_spatial \ - --policy pepijn223/smolvla_libero + --policy lerobot/smolvla_libero - name: Upload Libero rollout video if: always() @@ -238,10 +241,13 @@ jobs: cache-binary: false - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] with: username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} - name: Build MetaWorld benchmark image uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] @@ -264,7 +270,7 @@ jobs: bash -c " hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true lerobot-eval \ - --policy.path=pepijn223/smolvla_metaworld \ + --policy.path=lerobot/smolvla_metaworld \ --env.type=metaworld \ --env.task=metaworld-push-v3 \ --eval.batch_size=1 \ 
@@ -293,7 +299,7 @@ jobs: --artifacts-dir /tmp/metaworld-artifacts \ --env metaworld \ --task metaworld-push-v3 \ - --policy pepijn223/smolvla_metaworld + --policy lerobot/smolvla_metaworld - name: Upload MetaWorld rollout video if: always() @@ -310,3 +316,530 @@ jobs: name: metaworld-metrics path: /tmp/metaworld-artifacts/metrics.json if-no-files-found: warn + + # ── ROBOTWIN 2.0 ────────────────────────────────────────────────────────── + # Isolated image: full RoboTwin 2.0 stack — SAPIEN, mplib, CuRobo, + # pytorch3d, + simulation assets (~4 GB). + # Build takes ~20 min on first run; subsequent runs hit the layer cache. + # Requires an NVIDIA GPU runner with CUDA 12.1 drivers. + robotwin-integration-test: + name: RoboTwin 2.0 — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + ROBOTWIN_POLICY: lerobot/smolvla_robotwin + ROBOTWIN_TASKS: beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + # Build the full-install image: SAPIEN, mplib, CuRobo, pytorch3d + + # simulation assets (~4 GB). Layer cache lives in the runner's local + # Docker daemon — reused across re-runs on the same machine. + - name: Build RoboTwin 2.0 benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . 
+ file: docker/Dockerfile.benchmark.robotwin + push: false + load: true + tags: lerobot-benchmark-robotwin:ci + cache-from: type=local,src=/tmp/.buildx-cache-robotwin + cache-to: type=local,dest=/tmp/.buildx-cache-robotwin,mode=max + + - name: Run RoboTwin 2.0 smoke eval (10 tasks, 1 episode each) + if: env.HF_USER_TOKEN != '' + run: | + # Named container (no --rm) so we can docker cp artifacts out. + docker run --name robotwin-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e ROBOTWIN_POLICY="${ROBOTWIN_POLICY}" \ + -e ROBOTWIN_TASKS="${ROBOTWIN_TASKS}" \ + lerobot-benchmark-robotwin:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + cd /opt/robotwin && lerobot-eval \ + --policy.path=\"\$ROBOTWIN_POLICY\" \ + --env.type=robotwin \ + --env.task=\"\$ROBOTWIN_TASKS\" \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={\"observation.images.head_camera\": \"observation.images.camera1\", \"observation.images.left_camera\": \"observation.images.camera2\", \"observation.images.right_camera\": \"observation.images.camera3\"}' \ + --output_dir=/tmp/eval-artifacts + python /lerobot/scripts/ci/extract_task_descriptions.py \ + --env robotwin \ + --task \"\$ROBOTWIN_TASKS\" \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy RoboTwin artifacts from container + if: always() + run: | + mkdir -p /tmp/robotwin-artifacts + docker cp robotwin-eval:/tmp/eval-artifacts/. 
/tmp/robotwin-artifacts/ 2>/dev/null || true + docker rm -f robotwin-eval || true + + - name: Parse RoboTwin eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/robotwin-artifacts \ + --env robotwin \ + --task "${ROBOTWIN_TASKS}" \ + --policy "${ROBOTWIN_POLICY}" + + - name: Upload RoboTwin rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robotwin-rollout-video + path: /tmp/robotwin-artifacts/videos/ + if-no-files-found: warn + + - name: Upload RoboTwin eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robotwin-metrics + path: /tmp/robotwin-artifacts/metrics.json + if-no-files-found: warn + + # ── ROBOCASA365 ────────────────────────────────────────────────────────── + # Isolated image: robocasa + robosuite installed manually as editable + # clones (no `lerobot[robocasa]` extra — robocasa's setup.py pins + # `lerobot==0.3.3`, which would shadow this repo's lerobot). + robocasa-integration-test: + name: RoboCasa365 — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + - name: Build RoboCasa365 benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: .
+ file: docker/Dockerfile.benchmark.robocasa + push: false + load: true + tags: lerobot-benchmark-robocasa:ci + + - name: Run RoboCasa365 smoke eval (10 atomic tasks, 1 episode each) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name robocasa-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + -e MUJOCO_GL=egl \ + lerobot-benchmark-robocasa:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=lerobot/smolvla_robocasa \ + --env.type=robocasa \ + --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={\"observation.images.robot0_agentview_left\": \"observation.images.camera1\", \"observation.images.robot0_eye_in_hand\": \"observation.images.camera2\", \"observation.images.robot0_agentview_right\": \"observation.images.camera3\"}' \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env robocasa \ + --task CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy RoboCasa365 artifacts from container + if: always() + run: | + mkdir -p /tmp/robocasa-artifacts + docker cp robocasa-eval:/tmp/eval-artifacts/. 
/tmp/robocasa-artifacts/ 2>/dev/null || true + docker rm -f robocasa-eval || true + + - name: Parse RoboCasa365 eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/robocasa-artifacts \ + --env robocasa \ + --task atomic_smoke_10 \ + --policy lerobot/smolvla_robocasa + + - name: Upload RoboCasa365 rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robocasa-rollout-video + path: /tmp/robocasa-artifacts/videos/ + if-no-files-found: warn + + - name: Upload RoboCasa365 eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robocasa-metrics + path: /tmp/robocasa-artifacts/metrics.json + if-no-files-found: warn + + # ── ROBOCEREBRA ─────────────────────────────────────────────────────────── + # Reuses the LIBERO simulator (libero_10 suite) with RoboCerebra camera + # defaults (image/wrist_image). The image is layered on + # huggingface/lerobot-gpu, which already ships [libero] as part of [all]. + robocerebra-integration-test: + name: RoboCerebra — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + - name: Build RoboCerebra benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . 
+ file: docker/Dockerfile.benchmark.robocerebra + push: false + load: true + tags: lerobot-benchmark-robocerebra:ci + cache-from: type=local,src=/tmp/.buildx-cache-robocerebra + cache-to: type=local,dest=/tmp/.buildx-cache-robocerebra,mode=max + + - name: Run RoboCerebra smoke eval (1 episode) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name robocerebra-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + -e LIBERO_DATA_FOLDER=/tmp/libero_data \ + lerobot-benchmark-robocerebra:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=lerobot/smolvla_robocerebra \ + --env.type=libero \ + --env.task=libero_10 \ + --env.fps=20 \ + --env.obs_type=pixels_agent_pos \ + --env.observation_height=256 \ + --env.observation_width=256 \ + '--env.camera_name_mapping={\"agentview_image\": \"image\", \"robot0_eye_in_hand_image\": \"wrist_image\"}' \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \ + --policy.empty_cameras=1 \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env libero --task libero_10 \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy RoboCerebra artifacts from container + if: always() + run: | + mkdir -p /tmp/robocerebra-artifacts + docker cp robocerebra-eval:/tmp/eval-artifacts/. 
/tmp/robocerebra-artifacts/ 2>/dev/null || true + docker rm -f robocerebra-eval || true + + - name: Parse RoboCerebra eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/robocerebra-artifacts \ + --env robocerebra \ + --task libero_10 \ + --policy lerobot/smolvla_robocerebra + + - name: Upload RoboCerebra rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robocerebra-rollout-video + path: /tmp/robocerebra-artifacts/videos/ + if-no-files-found: warn + + - name: Upload RoboCerebra eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robocerebra-metrics + path: /tmp/robocerebra-artifacts/metrics.json + if-no-files-found: warn + + # ── ROBOMME ─────────────────────────────────────────────────────────────── + # Isolated image: mani-skill/SAPIEN/Vulkan chain with gymnasium and numpy + # overrides (robomme can't be a pyproject extra due to numpy<2 pin). 
+ robomme-integration-test: + name: RoboMME — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + ROBOMME_POLICY: lerobot/smolvla_robomme + ROBOMME_TASKS: PickXtimes,BinFill,StopCube,MoveCube,InsertPeg,SwingXtimes,VideoUnmask,ButtonUnmask,PickHighlight,PatternLock + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + - name: Build RoboMME benchmark image + uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses] + with: + context: . 
+ file: docker/Dockerfile.benchmark.robomme + push: false + load: true + tags: lerobot-benchmark-robomme:ci + + - name: Run RoboMME smoke eval (10 tasks, 1 episode each) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name robomme-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + -e ROBOMME_POLICY="${ROBOMME_POLICY}" \ + -e ROBOMME_TASKS="${ROBOMME_TASKS}" \ + lerobot-benchmark-robomme:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=\"\$ROBOMME_POLICY\" \ + --env.type=robomme \ + --env.task=\"\$ROBOMME_TASKS\" \ + --env.dataset_split=test \ + '--env.task_ids=[0]' \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \ + --policy.empty_cameras=3 \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env robomme --task \"\$ROBOMME_TASKS\" \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy RoboMME artifacts from container + if: always() + run: | + mkdir -p /tmp/robomme-artifacts + docker cp robomme-eval:/tmp/eval-artifacts/.
/tmp/robomme-artifacts/ 2>/dev/null || true + docker rm -f robomme-eval || true + + - name: Parse RoboMME eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/robomme-artifacts \ + --env robomme \ + --task "${ROBOMME_TASKS}" \ + --policy "${ROBOMME_POLICY}" + + - name: Upload RoboMME rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robomme-rollout-video + path: /tmp/robomme-artifacts/videos/ + if-no-files-found: warn + + - name: Upload RoboMME eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: robomme-metrics + path: /tmp/robomme-artifacts/metrics.json + if-no-files-found: warn + + # ── LIBERO-plus ─────────────────────────────────────────────────────────── + # Isolated image: LIBERO-plus fork cloned into /home/user_lerobot on top of + # huggingface/lerobot-gpu (see docker/Dockerfile.benchmark.libero_plus). + libero-plus-integration-test: + name: LIBERO-plus — build image + 1-episode eval + runs-on: + group: aws-g6-4xlarge-plus + env: + HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }} + LIBERO_PLUS_SUITE: libero_spatial + LIBERO_PLUS_POLICY: lerobot/smolvla_libero_plus + LIBERO_PLUS_TASK_IDS: "[0,100,260,500,1000,1500,2000,2400]" + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + lfs: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses] + with: + cache-binary: false + + - name: Login to Docker Hub + if: ${{ env.DOCKERHUB_USERNAME != '' }} + uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses] + with: + username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }} + + - name: Build LIBERO-plus benchmark image + uses: docker/build-push-action@v6 # 
zizmor: ignore[unpinned-uses] + with: + context: . + file: docker/Dockerfile.benchmark.libero_plus + push: false + load: true + tags: lerobot-benchmark-libero-plus:ci + cache-from: type=local,src=/tmp/.buildx-cache-libero-plus + cache-to: type=local,dest=/tmp/.buildx-cache-libero-plus,mode=max + + - name: Run LIBERO-plus smoke eval (1 episode) + if: env.HF_USER_TOKEN != '' + run: | + docker run --name libero-plus-eval --gpus all \ + --shm-size=4g \ + -e HF_HOME=/tmp/hf \ + -e HF_USER_TOKEN="${HF_USER_TOKEN}" \ + -e HF_HUB_DOWNLOAD_TIMEOUT=300 \ + -e LIBERO_PLUS_SUITE="${LIBERO_PLUS_SUITE}" \ + -e LIBERO_PLUS_POLICY="${LIBERO_PLUS_POLICY}" \ + -e LIBERO_PLUS_TASK_IDS="${LIBERO_PLUS_TASK_IDS}" \ + lerobot-benchmark-libero-plus:ci \ + bash -c " + hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true + lerobot-eval \ + --policy.path=\"\$LIBERO_PLUS_POLICY\" \ + --env.type=libero_plus \ + --env.task=\"\$LIBERO_PLUS_SUITE\" \ + --env.task_ids=\"\$LIBERO_PLUS_TASK_IDS\" \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \ + --policy.empty_cameras=1 \ + --output_dir=/tmp/eval-artifacts + python scripts/ci/extract_task_descriptions.py \ + --env libero_plus --task \"\$LIBERO_PLUS_SUITE\" \ + --output /tmp/eval-artifacts/task_descriptions.json + " + + - name: Copy LIBERO-plus artifacts from container + if: always() + run: | + mkdir -p /tmp/libero-plus-artifacts + docker cp libero-plus-eval:/tmp/eval-artifacts/. 
/tmp/libero-plus-artifacts/ 2>/dev/null || true + docker rm -f libero-plus-eval || true + + - name: Parse LIBERO-plus eval metrics + if: always() + run: | + python3 scripts/ci/parse_eval_metrics.py \ + --artifacts-dir /tmp/libero-plus-artifacts \ + --env libero_plus \ + --task "${LIBERO_PLUS_SUITE}" \ + --policy "${LIBERO_PLUS_POLICY}" + + - name: Upload LIBERO-plus rollout video + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: libero-plus-rollout-video + path: /tmp/libero-plus-artifacts/videos/ + if-no-files-found: warn + + - name: Upload LIBERO-plus eval metrics + if: always() + uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses] + with: + name: libero-plus-metrics + path: /tmp/libero-plus-artifacts/metrics.json + if-no-files-found: warn diff --git a/docker/Dockerfile.benchmark.libero_plus b/docker/Dockerfile.benchmark.libero_plus new file mode 100644 index 000000000..5911329a4 --- /dev/null +++ b/docker/Dockerfile.benchmark.libero_plus @@ -0,0 +1,84 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Benchmark image for LIBERO-plus integration tests. +# Extends the nightly GPU image (which has lerobot[all]) with the LIBERO-plus +# fork source + its 6.4 GB perturbation assets. +# +# Build: docker build -f docker/Dockerfile.benchmark.libero_plus -t lerobot-benchmark-libero-plus . 
+# Run: docker run --gpus all --rm lerobot-benchmark-libero-plus lerobot-eval ... + +FROM huggingface/lerobot-gpu:latest +ENV MUJOCO_GL=egl + +# unzip for the 6.4 GB assets.zip; the rest are LIBERO-plus build-time extras +# (wand / ImageMagick / fontconfig) not in the nightly base. +USER root +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + unzip libexpat1 libfontconfig1-dev libmagickwand-dev \ + && apt-get clean && rm -rf /var/lib/apt/lists/* +USER user_lerobot + +# robosuite==1.4.1 is mandatory (the fork uses `single_arm_env` removed in +# v1.5+). The rest are LIBERO-plus runtime deps pulled from its setup.py. +# We install these explicitly instead of via the [libero_plus] extra because +# the extra's `libero @ git+...` dep installs as a namespace package; we then +# clone the fork and PYTHONPATH-override it below. +RUN uv pip install --no-cache \ + "robosuite==1.4.1" \ + "bddl==1.0.1" \ + "easydict==1.13" \ + "mujoco==3.7.0" \ + "matplotlib==3.10.8" \ + "Wand==0.6.13" \ + "scikit-image==0.25.2" \ + "gym==0.26.2" + +# Clone LIBERO-plus and make it importable as `libero`. The nightly base has +# hf-libero (10 tasks) preinstalled via lerobot[libero]; uninstall it so +# Python resolves `import libero` to the 2402-task LIBERO-plus module instead. +# Pinned to the current upstream main SHA so benchmark builds stay reproducible. +ARG LIBERO_PLUS_SHA=4976dc3 +ENV LIBERO_PLUS_ROOT=/home/user_lerobot/libero-plus/libero/libero +RUN git clone https://github.com/sylvestf/LIBERO-plus.git /home/user_lerobot/libero-plus \ + && git -C /home/user_lerobot/libero-plus checkout ${LIBERO_PLUS_SHA} \ + && cd /home/user_lerobot/libero-plus && uv pip install --no-cache --no-deps -e "." \ + && (uv pip uninstall hf-libero 2>/dev/null || true) +ENV PYTHONPATH="/home/user_lerobot/libero-plus:${PYTHONPATH}" + +# Perturbation textures/scenes: bddl_base_domain.py resolves XMLs via +# DIR_PATH/../assets (package-relative, ignoring ~/.libero/config.yaml).
All +# 2402 tasks reference files that ship only in Sylvest/LIBERO-plus's +# assets.zip (6.4 GB) under a deep author-internal prefix — extract and +# flatten it under ${LIBERO_PLUS_ROOT}/assets. +RUN python -c "\ +from huggingface_hub import hf_hub_download; \ +hf_hub_download(repo_id='Sylvest/LIBERO-plus', repo_type='dataset', \ + filename='assets.zip', local_dir='/tmp/libero-plus-dl')" \ + && unzip -q /tmp/libero-plus-dl/assets.zip -d /tmp/libero-plus-dl/extract \ + && ASSETS_DIR=$(find /tmp/libero-plus-dl/extract -type d -name assets | head -1) \ + && mv "${ASSETS_DIR}" ${LIBERO_PLUS_ROOT}/assets \ + && rm -rf /tmp/libero-plus-dl + +# Point ~/.libero/config.yaml at the clone so LIBERO-plus's imports are +# non-interactive (it calls input() when the config is missing). +RUN mkdir -p /home/user_lerobot/.libero \ + && printf "assets: ${LIBERO_PLUS_ROOT}/assets\nbddl_files: ${LIBERO_PLUS_ROOT}/bddl_files\ndatasets: ${LIBERO_PLUS_ROOT}/../datasets\ninit_states: ${LIBERO_PLUS_ROOT}/init_files\n" \ + > /home/user_lerobot/.libero/config.yaml + +# Overlay the PR's source code on top of the nightly image. +COPY --chown=user_lerobot:user_lerobot . . + +CMD ["/bin/bash"] diff --git a/docker/Dockerfile.benchmark.robocasa b/docker/Dockerfile.benchmark.robocasa new file mode 100644 index 000000000..9de1612cb --- /dev/null +++ b/docker/Dockerfile.benchmark.robocasa @@ -0,0 +1,71 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Benchmark image for RoboCasa365 integration tests. +# Extends the nightly GPU image (which already has all extras installed) +# with the PR's source code and RoboCasa-specific asset setup. +# +# Build: docker build -f docker/Dockerfile.benchmark.robocasa -t lerobot-benchmark-robocasa . +# Run: docker run --gpus all --rm lerobot-benchmark-robocasa lerobot-eval ... + +FROM huggingface/lerobot-gpu:latest + +# Install robocasa + robosuite as editable clones. pip-installing from git +# omits data files like robocasa/models/assets/box_links/box_links_assets.json +# (not declared in package_data), which download_kitchen_assets needs at import. +# +# `--no-deps` on robocasa is deliberate: its setup.py pins `lerobot==0.3.3` +# in install_requires, which would shadow the editable lerobot baked into +# this image. We install robocasa's actual runtime deps explicitly instead. +# Pinned SHAs for reproducible benchmark runs. Bump when you need an +# upstream fix; don't rely on `main`/`master` drift. +ARG ROBOCASA_SHA=56e355ccc64389dfc1b8a61a33b9127b975ba681 +ARG ROBOSUITE_SHA=aaa8b9b214ce8e77e82926d677b4d61d55e577ab +RUN git clone https://github.com/robocasa/robocasa.git ~/robocasa && \ + git -C ~/robocasa checkout ${ROBOCASA_SHA} && \ + git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite && \ + git -C ~/robosuite checkout ${ROBOSUITE_SHA} && \ + uv pip install --no-cache -e ~/robocasa --no-deps && \ + uv pip install --no-cache -e ~/robosuite && \ + uv pip install --no-cache \ + "numpy==2.2.5" "numba==0.61.2" "scipy==1.15.3" "mujoco==3.3.1" \ + "pygame==2.6.1" "Pillow==12.2.0" "opencv-python==4.13.0.92" \ + "pyyaml==6.0.3" "pynput==1.8.1" "tqdm==4.67.3" "termcolor==3.3.0" \ + "imageio==2.37.3" "h5py==3.16.0" "lxml==6.0.4" "hidapi==0.14.0.post4" \ + "tianshou==0.4.10" "gymnasium==1.2.3" + +# Set up robocasa macros and download kitchen assets. 
We need: +# - tex : base environment textures +# - tex_generative : AI-generated textures; kitchen fixture XMLs embed +# refs to generative_textures/wall/tex*.png +# unconditionally, so MjModel.from_xml_string fails +# at reset time without them (even if the env is +# constructed with generative_textures=None). +# - fixtures_lw : lightwheel kitchen fixtures (fridge, counters...) +# - objs_lw : lightwheel object meshes (stools, misc props) +# We skip the objaverse/aigen object packs (~30GB combined) by pairing +# this with --env.obj_registries=["lightwheel"] on the lerobot side. +# The download script prompts interactively, so pipe 'y' to auto-accept. +RUN python -m robocasa.scripts.setup_macros && \ + yes y | python -m robocasa.scripts.download_kitchen_assets \ + --type tex tex_generative fixtures_lw objs_lw + +# Overlay the PR's source code on top of the nightly image. +COPY --chown=user_lerobot:user_lerobot . . + +# Re-install lerobot editably so the new source (with RoboCasaEnv registration) +# replaces the stale package baked into the nightly image. +RUN uv pip install --no-cache --no-deps -e . + +CMD ["/bin/bash"] diff --git a/docker/Dockerfile.benchmark.robocerebra b/docker/Dockerfile.benchmark.robocerebra new file mode 100644 index 000000000..9378bd66a --- /dev/null +++ b/docker/Dockerfile.benchmark.robocerebra @@ -0,0 +1,43 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Benchmark image for RoboCerebra integration tests. +# RoboCerebra reuses LIBERO's simulator (libero_10 suite) with a different +# rename_map, so this image is identical to the LIBERO benchmark image — +# extends the nightly GPU base with LIBERO assets + the PR's source code. +# +# Build: docker build -f docker/Dockerfile.benchmark.robocerebra -t lerobot-benchmark-robocerebra . +# Run: docker run --gpus all --rm lerobot-benchmark-robocerebra lerobot-eval ... + +FROM huggingface/lerobot-gpu:latest + +# Pre-download lerobot/libero-assets from HF Hub so nothing is fetched at +# runtime (which times out on CI). Point the libero config at the cached path. +# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing, +# so we write the config before any libero import can happen. +RUN LIBERO_DIR=$(python -c \ + "import importlib.util, os; s=importlib.util.find_spec('libero'); \ + print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \ + mkdir -p /home/user_lerobot/.libero && \ + python -c "\ +from huggingface_hub import snapshot_download; \ +snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \ + local_dir='/home/user_lerobot/.libero/assets')" && \ + printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \ + > /home/user_lerobot/.libero/config.yaml + +# Overlay the PR's source code on top of the nightly image. +COPY --chown=user_lerobot:user_lerobot . . + +CMD ["/bin/bash"] diff --git a/docker/Dockerfile.benchmark.robomme b/docker/Dockerfile.benchmark.robomme new file mode 100644 index 000000000..2bfc83b4f --- /dev/null +++ b/docker/Dockerfile.benchmark.robomme @@ -0,0 +1,56 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Benchmark image for RoboMME integration tests. +# Extends the nightly GPU image (which has lerobot[all]) with Vulkan system +# libs for ManiSkill/SAPIEN and the robomme package. robomme isn't in [all] +# because mani-skill hard-pins gymnasium==0.29.1 and numpy<2.0.0 which +# conflict with lerobot's defaults; both are safe at runtime: +# - gymnasium 0.29.x has the same 5-tuple step() API as 1.x (since 0.26) +# - numpy 1.26.4 is API-compatible with lerobot's actual usage. +# +# Build: docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-benchmark-robomme . +# Run: docker run --gpus all --rm lerobot-benchmark-robomme lerobot-eval ... + +FROM huggingface/lerobot-gpu:latest + +# NVIDIA Container Toolkit: expose Vulkan driver capability for headless rendering. +ENV NVIDIA_DRIVER_CAPABILITIES=all \ + VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json + +# ManiSkill/SAPIEN's renderer needs Vulkan, which isn't in the base image. +USER root +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libvulkan1 libvulkan-dev mesa-vulkan-drivers \ + && mkdir -p /usr/share/vulkan/icd.d \ + && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \ + > /usr/share/vulkan/icd.d/nvidia_icd.json \ + && apt-get clean && rm -rf /var/lib/apt/lists/* +USER user_lerobot + +# Install smolvla + av-dep via the PR's pyproject, then layer robomme on top +# with gymnasium/numpy overrides.
robomme isn't a pyproject extra because its +# mani-skill pin conflicts with lerobot's base numpy>=2 (see pyproject.toml). +COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./ +RUN printf 'gymnasium==0.29.1\nnumpy==1.26.4\n' > /tmp/robomme_override.txt \ + && uv pip install --no-cache --override /tmp/robomme_override.txt \ + -e ".[smolvla,av-dep]" \ + "robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main" \ + && python -c "import robomme; print('robomme import OK')" + +# Overlay the PR's source code on top of the nightly image. +COPY --chown=user_lerobot:user_lerobot . . + +CMD ["/bin/bash"] diff --git a/docker/Dockerfile.benchmark.robotwin b/docker/Dockerfile.benchmark.robotwin new file mode 100644 index 000000000..423854c31 --- /dev/null +++ b/docker/Dockerfile.benchmark.robotwin @@ -0,0 +1,122 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Benchmark image for RoboTwin 2.0 integration tests. +# Extends the nightly GPU image with the RoboTwin simulator stack: +# sapien/mplib/pytorch3d + NVlabs CuRobo + embodiments.zip + objects.zip +# (~3.96 GB of assets; background_texture.zip ~11 GB skipped for smoke eval). +# +# Build: docker build -f docker/Dockerfile.benchmark.robotwin -t lerobot-benchmark-robotwin . +# Run: docker run --gpus all --rm lerobot-benchmark-robotwin \ +# lerobot-eval --env.type=robotwin --env.task=beat_block_hammer ... 
+
+FROM huggingface/lerobot-gpu:latest
+
+ENV NVIDIA_DRIVER_CAPABILITIES=all \
+    VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json \
+    ROBOTWIN_ROOT=/opt/robotwin
+
+# The nightly base is CUDA -base (no compiler, no Vulkan loader). CuRobo's
+# `pip install -e .` runs nvcc, and SAPIEN renders via Vulkan — add both.
+USER root
+# Pinned upstream SHA for reproducible benchmark runs. Bump when we need
+# an upstream fix; don't rely on `main` drift.
+ARG ROBOTWIN_SHA=0aeea2d669c0f8516f4d5785f0aa33ba812c14b4
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        cuda-nvcc-12-4 cuda-cudart-dev-12-4 \
+        libvulkan1 vulkan-tools \
+    && mkdir -p /usr/share/vulkan/icd.d \
+    && echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
+        > /usr/share/vulkan/icd.d/nvidia_icd.json \
+    && git clone https://github.com/RoboTwin-Platform/RoboTwin.git ${ROBOTWIN_ROOT} \
+    && git -C ${ROBOTWIN_ROOT} checkout ${ROBOTWIN_SHA} \
+    && chown -R user_lerobot:user_lerobot ${ROBOTWIN_ROOT} \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+USER user_lerobot
+
+# RoboTwin runtime deps (av is already in the base via [av-dep]).
+RUN uv pip install --no-cache \
+    "sapien==3.0.0b1" "mplib==0.2.1" "transforms3d==0.4.2" "trimesh==4.4.3" \
+    "open3d==0.19.0" "imageio==2.34.2" termcolor zarr pydantic h5py
+
+# pytorch3d has no universal wheel; must be built from source (~10 min, cached).
+RUN uv pip install --no-cache --no-build-isolation \
+    "git+https://github.com/facebookresearch/pytorch3d.git@stable"
+
+# CuRobo — NVlabs motion generator; TORCH_CUDA_ARCH_LIST must be set or the
+# build aborts on an empty arch list. Pinned SHA for reproducibility.
+ARG CUROBO_SHA=ca941586c33b8482ed9c0e74d60f23efd64b516a
+RUN cd ${ROBOTWIN_ROOT}/envs \
+    && git clone https://github.com/NVlabs/curobo.git \
+    && git -C curobo checkout ${CUROBO_SHA} \
+    && cd curobo \
+    && TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" \
+        uv pip install -e . --no-build-isolation --no-cache
+
+# Upstream patches (mirror RoboTwin's script/_install.sh).
+# These patches target the exact versions pinned above; re-check when upgrading.
+# mplib==0.2.1: drop a broken `or collide` clause in planner.py.
+#   Safe to remove once mplib > 0.2.1 ships with the fix upstream.
+# sapien==3.0.0b1: fix URDF loader encoding + .srdf extension check.
+#   Safe to remove once sapien > 3.0.0b1 ships with the fix upstream.
+RUN python - <<'EOF'
+import pathlib, re, site
+EDITS = [  # (site-packages-relative file, regex, replacement, max substitutions; 0 = all), applied in order
+    ("mplib/planner.py", r"\bor collide\b", "", 1),
+    ("sapien/wrapper/urdf_loader.py", r"with open\(srdf_path\) as f:", 'with open(srdf_path, encoding="utf-8") as f:', 0),
+    ("sapien/wrapper/urdf_loader.py", r'"srdf"', '".srdf"', 0),
+]
+for d in site.getsitepackages():
+    for rel, pattern, repl, limit in EDITS:
+        p = pathlib.Path(d) / rel
+        if p.exists():
+            text, hits = re.subn(pattern, repl, p.read_text(), count=limit)
+            p.write_text(text)
+            print(f"{p}: {pattern!r}: {hits} substitution(s)" + ("" if hits else " -- WARNING: pattern not found"))
+EOF
+
+# Simulation assets from TianxingChen/RoboTwin2.0: embodiments (~220 MB) +
+# objects (~3.74 GB). background_texture (~11 GB) is intentionally skipped.
+# The dataset is public — no auth token needed.
+RUN python - <<'EOF'
+import os, pathlib, zipfile
+from huggingface_hub import hf_hub_download
+
+assets_dir = pathlib.Path(os.environ["ROBOTWIN_ROOT"]) / "assets"
+assets_dir.mkdir(parents=True, exist_ok=True)
+for fname in ("embodiments.zip", "objects.zip"):
+    local = hf_hub_download(
+        repo_id="TianxingChen/RoboTwin2.0",
+        repo_type="dataset",
+        filename=fname,
+        local_dir=str(assets_dir),
+    )
+    with zipfile.ZipFile(local, "r") as z:
+        z.extractall(str(assets_dir))
+    pathlib.Path(local).unlink()
+EOF
+
+WORKDIR ${ROBOTWIN_ROOT}
+RUN python script/update_embodiment_config_path.py
+# Append the inherited PYTHONPATH only when it is set: a trailing ':' would put the cwd on sys.path.
+ENV PYTHONPATH="${ROBOTWIN_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"
+
+# Return to the lerobot source directory (set by base image) before overlaying.
+WORKDIR /lerobot
+
+# Overlay the PR's source code on top of the nightly image.
+COPY --chown=user_lerobot:user_lerobot . . + +CMD ["/bin/bash"] diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 3dcba5993..d29f4c545 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -77,8 +77,18 @@ title: Adding a New Benchmark - local: libero title: LIBERO + - local: libero_plus + title: LIBERO-plus - local: metaworld title: Meta-World + - local: robotwin + title: RoboTwin 2.0 + - local: robocasa + title: RoboCasa365 + - local: robocerebra + title: RoboCerebra + - local: robomme + title: RoboMME - local: envhub_isaaclab_arena title: NVIDIA IsaacLab Arena Environments title: "Benchmarks" diff --git a/docs/source/libero_plus.mdx b/docs/source/libero_plus.mdx new file mode 100644 index 000000000..4249bf49e --- /dev/null +++ b/docs/source/libero_plus.mdx @@ -0,0 +1,188 @@ +# LIBERO-plus + +LIBERO-plus is a **robustness benchmark** for Vision-Language-Action (VLA) models built on top of [LIBERO](./libero). It systematically stress-tests policies by applying **seven independent perturbation dimensions** to the original LIBERO task set, exposing failure modes that standard benchmarks miss. 
+ +- Paper: [In-depth Robustness Analysis of Vision-Language-Action Models](https://arxiv.org/abs/2510.13626) +- GitHub: [sylvestf/LIBERO-plus](https://github.com/sylvestf/LIBERO-plus) +- Dataset: [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus) + +![An overview of the LIBERO-plus benchmark perturbation dimensions](https://github.com/sylvestf/LIBERO-plus/raw/main/static/images/libero-plus.jpg) + +## Perturbation dimensions + +LIBERO-plus creates ~10 000 task variants by perturbing each original LIBERO task along these axes: + +| Dimension | What changes | +| --------------------- | ----------------------------------------------------- | +| Objects layout | Target position, presence of confounding objects | +| Camera viewpoints | Camera position, orientation, field-of-view | +| Robot initial states | Manipulator start pose | +| Language instructions | LLM-rewritten task description (paraphrase / synonym) | +| Light conditions | Intensity, direction, color, shadow | +| Background textures | Scene surface and object appearance | +| Sensor noise | Photometric distortions and image degradation | + +## Available task suites + +LIBERO-plus covers the same five suites as LIBERO: + +| Suite | CLI name | Tasks | Max steps | Description | +| -------------- | ---------------- | ----- | --------- | -------------------------------------------------- | +| LIBERO-Spatial | `libero_spatial` | 10 | 280 | Tasks requiring reasoning about spatial relations | +| LIBERO-Object | `libero_object` | 10 | 280 | Tasks centered on manipulating different objects | +| LIBERO-Goal | `libero_goal` | 10 | 300 | Goal-conditioned tasks with changing targets | +| LIBERO-90 | `libero_90` | 90 | 400 | Short-horizon tasks from the LIBERO-100 collection | +| LIBERO-Long | `libero_10` | 10 | 520 | Long-horizon tasks from the LIBERO-100 collection | + + + Installing LIBERO-plus **replaces** vanilla LIBERO — it uninstalls `hf-libero` + so that `import libero` resolves to the 
LIBERO-plus fork. You cannot have both + installed at the same time. To switch back to vanilla LIBERO, uninstall the + fork and reinstall with `pip install -e ".[libero]"`. + + +## Installation + +### System dependencies (Linux only) + +```bash +sudo apt install libexpat1 libfontconfig1-dev libmagickwand-dev +``` + +### Python package + +```bash +pip install -e ".[libero]" "robosuite==1.4.1" bddl easydict mujoco wand scikit-image gym +git clone https://github.com/sylvestf/LIBERO-plus.git +cd LIBERO-plus && pip install --no-deps -e . +pip uninstall -y hf-libero # so `import libero` resolves to the fork +``` + +LIBERO-plus is installed from its GitHub fork rather than a pyproject extra — the fork ships as a namespace package that pip can't handle, so it must be cloned and added to `PYTHONPATH`. See `docker/Dockerfile.benchmark.libero_plus` for the canonical install. MuJoCo is required, so only Linux is supported. + + +Set the MuJoCo rendering backend before running evaluation: + +```bash +export MUJOCO_GL=egl # headless / HPC / cloud +``` + + + +### Download LIBERO-plus assets + +LIBERO-plus ships its extended asset pack separately. 
Download `assets.zip` from the [Hugging Face dataset](https://huggingface.co/datasets/Sylvest/LIBERO-plus/tree/main) and extract it into the LIBERO-plus package directory: + +```bash +# After installing the package, find where it was installed: +python -c "import libero; print(libero.__file__)" +# Then extract assets.zip into libero/assets/ inside the directory containing that file +``` + +## Evaluation + +### Default evaluation (recommended) + +Evaluate across the four standard suites (10 episodes per task): + +```bash +lerobot-eval \ + --policy.path="your-policy-id" \ + --env.type=libero_plus \ + --env.task=libero_spatial,libero_object,libero_goal,libero_10 \ + --eval.batch_size=1 \ + --eval.n_episodes=10 \ + --env.max_parallel_tasks=1 +``` + +### Single-suite evaluation + +Evaluate on one LIBERO-plus suite: + +```bash +lerobot-eval \ + --policy.path="your-policy-id" \ + --env.type=libero_plus \ + --env.task=libero_spatial \ + --eval.batch_size=1 \ + --eval.n_episodes=10 +``` + +- `--env.task` picks the suite (`libero_spatial`, `libero_object`, etc.). +- `--env.task_ids` restricts to specific task indices (`[0]`, `[1,2,3]`, etc.). Omit to run all tasks in the suite. +- `--eval.batch_size` controls how many environments run in parallel. +- `--eval.n_episodes` sets how many episodes to run per task. + +### Multi-suite evaluation + +Benchmark a policy across multiple suites at once by passing a comma-separated list: + +```bash +lerobot-eval \ + --policy.path="your-policy-id" \ + --env.type=libero_plus \ + --env.task=libero_spatial,libero_object \ + --eval.batch_size=1 \ + --eval.n_episodes=10 +``` + +### Control mode + +LIBERO-plus supports two control modes — `relative` (default) and `absolute`.
Different VLA checkpoints are trained with different action parameterizations, so make sure the mode matches your policy: + +```bash +--env.control_mode=relative # or "absolute" +``` + +### Policy inputs and outputs + +**Observations:** + +- `observation.state` — 8-dim proprioceptive features (eef position, axis-angle orientation, gripper qpos) +- `observation.images.image` — main camera view (`agentview_image`), HWC uint8 +- `observation.images.image2` — wrist camera view (`robot0_eye_in_hand_image`), HWC uint8 + +**Actions:** + +- Continuous control in `Box(-1, 1, shape=(7,))` — 6D end-effector delta + 1D gripper + +### Recommended evaluation episodes + +For reproducible benchmarking, use **10 episodes per task** across all four standard suites (Spatial, Object, Goal, Long). This gives 400 total episodes and matches the protocol used for published results. + +## Training + +### Dataset + +A LeRobot-format training dataset for LIBERO-plus is available at: + +- [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus) + +### Example training command + +```bash +lerobot-train \ + --policy.type=smolvla \ + --policy.repo_id=${HF_USER}/smolvla_libero_plus \ + --policy.load_vlm_weights=true \ + --dataset.repo_id=lerobot/libero_plus \ + --env.type=libero_plus \ + --env.task=libero_spatial \ + --output_dir=./outputs/ \ + --steps=100000 \ + --batch_size=4 \ + --eval.batch_size=1 \ + --eval.n_episodes=1 \ + --eval_freq=1000 +``` + +## Relationship to LIBERO + +LIBERO-plus is a drop-in extension of LIBERO: + +- Same Python gym interface (`LiberoEnv`, `LiberoProcessorStep`) +- Same camera names and observation/action format +- Same task suite names +- Installs under the same `libero` Python package name (different GitHub repo) + +To use the original LIBERO benchmark, see [LIBERO](./libero) and use `--env.type=libero`. 
diff --git a/docs/source/robocasa.mdx b/docs/source/robocasa.mdx new file mode 100644 index 000000000..f6a784e72 --- /dev/null +++ b/docs/source/robocasa.mdx @@ -0,0 +1,188 @@ +# RoboCasa365 + +[RoboCasa365](https://robocasa.ai) is a large-scale simulation framework for training and benchmarking **generalist robots** in everyday kitchen tasks. It ships 365 diverse manipulation tasks across 2,500 kitchen environments, 3,200+ object assets and 600+ hours of human demonstration data, on a PandaOmron 12-DOF mobile manipulator (Franka arm on a holonomic base). + +- Paper: [RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots](https://arxiv.org/abs/2406.02523) +- GitHub: [robocasa/robocasa](https://github.com/robocasa/robocasa) +- Project website: [robocasa.ai](https://robocasa.ai) +- Pretrained policy: [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa) +- Single-task dataset (CloseFridge): [`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge) + +RoboCasa365 benchmark overview + +## Available tasks + +RoboCasa365 organizes its 365 tasks into two families and three upstream benchmark groups that LeRobot exposes as first-class `--env.task` shortcuts: + +| Family | Tasks | Description | +| --------- | ----- | ------------------------------------------------------------------------------- | +| Atomic | ~65 | Single-skill tasks: pick-and-place, door/drawer manipulation, appliance control | +| Composite | ~300 | Multi-step tasks across 60+ categories: cooking, cleaning, organizing, etc. | + +**Atomic task examples:** `CloseFridge`, `OpenDrawer`, `OpenCabinet`, `TurnOnMicrowave`, `TurnOffStove`, `NavigateKitchen`, `PickPlaceCounterToStove`. + +**Composite task categories:** baking, boiling, brewing, chopping, clearing table, defrosting food, loading dishwasher, making tea, microwaving food, washing dishes, and more. 
+ +`--env.task` accepts three forms: + +- a single task name (`CloseFridge`) +- a comma-separated list (`CloseFridge,OpenBlenderLid,PickPlaceCoffee`) +- a benchmark-group shortcut — `atomic_seen`, `composite_seen`, `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`, `pretrain300` — which auto-expands to the upstream task list and auto-sets the dataset `split` (`target` or `pretrain`). + +## Installation + +RoboCasa and its dependency `robosuite` are not published on PyPI, and RoboCasa's own `setup.py` hardcodes `lerobot==0.3.3`, which conflicts with this repo's `lerobot`. LeRobot therefore does **not** expose a `robocasa` extra — install the two packages manually as editable clones (using `--no-deps` on `robocasa` to skip its shadowed `lerobot` pin): + +```bash +# After following the standard LeRobot installation instructions. + +git clone https://github.com/robocasa/robocasa.git ~/robocasa +git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite +pip install -e ~/robocasa --no-deps +pip install -e ~/robosuite + +# Robocasa's runtime deps (the ones its setup.py would have pulled, minus +# the bad lerobot pin). +pip install numpy numba scipy mujoco pygame Pillow opencv-python \ + pyyaml pynput tqdm termcolor imageio h5py lxml hidapi \ + tianshou gymnasium + +python -m robocasa.scripts.setup_macros +# Lightweight assets (lightwheel object meshes + textures). Enough for +# the default env out of the box. +python -m robocasa.scripts.download_kitchen_assets \ + --type tex tex_generative fixtures_lw objs_lw +# Optional: full objaverse/aigen registries (~30GB) for richer object +# variety. Enable at eval time via --env.obj_registries (see below). +# python -m robocasa.scripts.download_kitchen_assets --type objs_objaverse +``` + + +RoboCasa requires MuJoCo. 
Set the rendering backend before training or evaluation: + +```bash +export MUJOCO_GL=egl # for headless servers (HPC, cloud) +``` + + + +### Object registries + +By default the env samples objects only from the `lightwheel` registry (what `--type objs_lw` ships), which avoids a `Probabilities contain NaN` crash when the objaverse / aigen packs aren't on disk. If you've downloaded the full asset set, enable the full registry at runtime: + +```bash +--env.obj_registries='[objaverse,lightwheel]' +``` + +## Evaluation + +All eval snippets below mirror the CI command (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps RoboCasa's native camera keys (`robot0_agentview_left` / `robot0_eye_in_hand` / `robot0_agentview_right`) onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_robocasa` policy was trained on. + +### Single-task evaluation (recommended for quick iteration) + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_robocasa \ + --env.type=robocasa \ + --env.task=CloseFridge \ + --eval.batch_size=1 \ + --eval.n_episodes=20 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}' +``` + +### Multi-task evaluation + +Pass a comma-separated list of tasks: + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_robocasa \ + --env.type=robocasa \ + --env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove \ + --eval.batch_size=1 \ + --eval.n_episodes=20 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": 
"observation.images.camera3"}' +``` + +### Benchmark-group evaluation + +Run an entire upstream group (e.g. all 18 `atomic_seen` tasks with `split=target`): + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_robocasa \ + --env.type=robocasa \ + --env.task=atomic_seen \ + --eval.batch_size=1 \ + --eval.n_episodes=20 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}' +``` + +### Recommended evaluation episodes + +**20 episodes per task** for reproducible benchmarking. Matches the protocol used in published results. + +## Policy inputs and outputs + +**Observations** (raw RoboCasa camera names are preserved verbatim): + +- `observation.state` — 16-dim proprioceptive state (base position, base quaternion, relative end-effector position, relative end-effector quaternion, gripper qpos) +- `observation.images.robot0_agentview_left` — left agent view, 256×256 HWC uint8 +- `observation.images.robot0_eye_in_hand` — wrist camera view, 256×256 HWC uint8 +- `observation.images.robot0_agentview_right` — right agent view, 256×256 HWC uint8 + +**Actions:** + +- Continuous control in `Box(-1, 1, shape=(12,))` — base motion (4D) + control mode (1D) + end-effector position (3D) + end-effector rotation (3D) + gripper (1D). + +## Training + +### Single-task example + +A ready-to-use single-task dataset is on the Hub: +[`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge). 
+ +Fine-tune a SmolVLA base on `CloseFridge`: + +```bash +lerobot-train \ + --policy.type=smolvla \ + --policy.repo_id=${HF_USER}/smolvla_robocasa_CloseFridge \ + --policy.load_vlm_weights=true \ + --policy.push_to_hub=true \ + --dataset.repo_id=pepijn223/robocasa_CloseFridge \ + --env.type=robocasa \ + --env.task=CloseFridge \ + --output_dir=./outputs/smolvla_robocasa_CloseFridge \ + --steps=100000 \ + --batch_size=4 \ + --eval_freq=5000 \ + --eval.batch_size=1 \ + --eval.n_episodes=5 \ + --save_freq=10000 +``` + +Evaluate the resulting checkpoint: + +```bash +lerobot-eval \ + --policy.path=${HF_USER}/smolvla_robocasa_CloseFridge \ + --env.type=robocasa \ + --env.task=CloseFridge \ + --eval.batch_size=1 \ + --eval.n_episodes=20 +``` + +## Reproducing published results + +The released checkpoint [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa) is evaluated with the commands in the [Evaluation](#evaluation) section. CI runs a 10-atomic-task smoke eval (one episode each) on every PR touching the benchmark, picking fixture-centric tasks that don't require the objaverse asset pack. diff --git a/docs/source/robocerebra.mdx b/docs/source/robocerebra.mdx new file mode 100644 index 000000000..9776bd40f --- /dev/null +++ b/docs/source/robocerebra.mdx @@ -0,0 +1,99 @@ +# RoboCerebra + +[RoboCerebra](https://robocerebra-project.github.io/) is a long-horizon manipulation benchmark that evaluates **high-level reasoning, planning, and memory** in VLAs. Episodes chain multiple sub-goals with language-grounded intermediate instructions, built on top of LIBERO's simulator stack (MuJoCo + robosuite, Franka Panda 7-DOF). 
+ +- Paper: [RoboCerebra: A Large-scale Benchmark for Long-horizon Robotic Manipulation Evaluation](https://arxiv.org/abs/2506.06677) +- Project website: [robocerebra-project.github.io](https://robocerebra-project.github.io/) +- Dataset: [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) — LeRobot v3.0, 6,660 episodes / 571,116 frames at 20 fps, 1,728 language-grounded sub-tasks. +- Pretrained policy: [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra) + +## Available tasks + +RoboCerebra reuses LIBERO's simulator, so evaluation runs against the LIBERO `libero_10` long-horizon suite: + +| Suite | CLI name | Tasks | Description | +| --------- | ----------- | ----- | ------------------------------------------------------------- | +| LIBERO-10 | `libero_10` | 10 | Long-horizon kitchen/living room tasks chaining 3–6 sub-goals | + +Each RoboCerebra episode in the dataset is segmented into multiple sub-tasks with natural-language instructions, which the unified dataset exposes as independent supervision signals. + +## Installation + +RoboCerebra piggybacks on LIBERO, so the `libero` extra is all you need: + +```bash +pip install -e ".[libero]" +``` + + +RoboCerebra requires Linux (MuJoCo / robosuite). 
Set the rendering backend before training or evaluation: + +```bash +export MUJOCO_GL=egl # for headless servers (HPC, cloud) +``` + + + +## Evaluation + +RoboCerebra eval runs against LIBERO's `libero_10` suite with RoboCerebra's camera naming (`image` + `wrist_image`) and an extra empty-camera slot so a three-view-trained policy receives the expected input layout: + +```bash +lerobot-eval \ + --policy.path=lerobot/smolvla_robocerebra \ + --env.type=libero \ + --env.task=libero_10 \ + --env.fps=20 \ + --env.obs_type=pixels_agent_pos \ + --env.observation_height=256 \ + --env.observation_width=256 \ + '--env.camera_name_mapping={"agentview_image": "image", "robot0_eye_in_hand_image": "wrist_image"}' \ + --eval.batch_size=1 \ + --eval.n_episodes=10 \ + --eval.use_async_envs=false \ + --policy.device=cuda \ + '--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.wrist_image": "observation.images.camera2"}' \ + --policy.empty_cameras=1 +``` + +### Recommended evaluation episodes + +**10 episodes per task** across the `libero_10` suite (100 total) for reproducible benchmarking. Matches the protocol used in the RoboCerebra paper. 
+ +## Policy inputs and outputs + +**Observations:** + +- `observation.state` — 8-dim proprioceptive state (7 joint positions + gripper) +- `observation.images.image` — third-person view, 256×256 HWC uint8 +- `observation.images.wrist_image` — wrist-mounted camera view, 256×256 HWC uint8 + +**Actions:** + +- Continuous control in `Box(-1, 1, shape=(7,))` — end-effector delta (6D) + gripper (1D) + +## Training + +The unified dataset at [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) exposes two RGB streams and language-grounded sub-task annotations: + +| Feature | Shape | Description | +| -------------------------------- | ------------- | -------------------- | +| `observation.images.image` | (256, 256, 3) | Third-person view | +| `observation.images.wrist_image` | (256, 256, 3) | Wrist-mounted camera | +| `observation.state` | (8,) | Joint pos + gripper | +| `action` | (7,) | EEF delta + gripper | + +Fine-tune a SmolVLA base on it: + +```bash +lerobot-train \ + --policy.path=lerobot/smolvla_base \ + --dataset.repo_id=lerobot/robocerebra_unified \ + --env.type=libero \ + --env.task=libero_10 \ + --output_dir=outputs/smolvla_robocerebra +``` + +## Reproducing published results + +The released checkpoint [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra) was trained on `lerobot/robocerebra_unified` and evaluated with the command in the [Evaluation](#evaluation) section. CI runs the same command with `--eval.n_episodes=1` as a smoke test on every PR touching the benchmark. diff --git a/docs/source/robomme.mdx b/docs/source/robomme.mdx new file mode 100644 index 000000000..6613a3923 --- /dev/null +++ b/docs/source/robomme.mdx @@ -0,0 +1,130 @@ +# RoboMME + +[RoboMME](https://robomme.github.io) is a memory-augmented manipulation benchmark built on ManiSkill (SAPIEN). 
It evaluates a robot's ability to retain and use information across an episode — counting, object permanence, reference, and imitation. + +- **16 tasks** across 4 memory-skill suites +- **1,600 training demos** (100 per task, 50 val, 50 test) +- **Dataset**: [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) — LeRobot v3.0, 768K frames at 10 fps +- **Simulator**: ManiSkill / SAPIEN, Panda arm, Linux only + +![RoboMME benchmark tasks overview](https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2603.04639/gradient.png) + +## Tasks + +| Suite | Tasks | +| --------------------------------- | ------------------------------------------------------------- | +| **Counting** (temporal memory) | BinFill, PickXtimes, SwingXtimes, StopCube | +| **Permanence** (spatial memory) | VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap | +| **Reference** (object memory) | PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder | +| **Imitation** (procedural memory) | MoveCube, InsertPeg, PatternLock, RouteStick | + +## Installation + +> RoboMME requires **Linux** (ManiSkill/SAPIEN uses Vulkan rendering). Docker is recommended to isolate dependency conflicts. + +### Native (Linux) + +```bash +pip install --override <(printf 'gymnasium==0.29.1\nnumpy==1.26.4\n') \ + -e '.[smolvla,av-dep]' \ + 'robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main' +``` + +> **Dependency note**: `mani-skill` (pulled by `robomme`) pins `gymnasium==0.29.1` and `numpy<2.0.0`, which conflict with lerobot's base `numpy>=2.0.0`. That's why `robomme` is not a pyproject extra — use the override install above, or the Docker approach below to avoid conflicts entirely. + +### Docker (recommended) + +```bash +# Build base image first (from repo root) +docker build -f docker/Dockerfile.eval-base -t lerobot-eval-base . 
+ +# Build RoboMME eval image (applies gymnasium + numpy pin overrides) +docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-robomme . +``` + +The `docker/Dockerfile.benchmark.robomme` image extends `huggingface/lerobot-gpu:latest` and overrides `gymnasium==0.29.1` and `numpy==1.26.4` after lerobot's install. Both versions are runtime-safe for lerobot's actual API usage. + +## Running Evaluation + +### Default (single task, single episode) + +```bash +lerobot-eval \ + --policy.path="your-policy-id" \ + --env.type=robomme \ + --env.task=PickXtimes \ + --env.dataset_split=test \ + --env.task_ids=[0] \ + --eval.batch_size=1 \ + --eval.n_episodes=1 +``` + +### Multi-task evaluation + +Evaluate multiple tasks in one run by comma-separating task names. Use `task_ids` to control which episodes are evaluated per task. Recommended: 50 episodes per task for the test split. + +```bash +lerobot-eval \ + --policy.path="your-policy-id" \ + --env.type=robomme \ + --env.task=PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \ + --env.dataset_split=test \ + --env.task_ids=[0,1,2,3,4,5,6,7,8,9] \ + --eval.batch_size=1 \ + --eval.n_episodes=50 +``` + +### Key CLI options for `env.type=robomme` + +| Option | Default | Description | +| -------------------- | ------------- | -------------------------------------------------- | +| `env.task` | `PickXtimes` | Any of the 16 task names above (comma-separated) | +| `env.dataset_split` | `test` | `train`, `val`, or `test` | +| `env.action_space` | `joint_angle` | `joint_angle` (8-D) or `ee_pose` (7-D) | +| `env.episode_length` | `300` | Max steps per episode | +| `env.task_ids` | `null` | List of episode indices to evaluate (null = `[0]`) | + +## Dataset + +The dataset [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) is in **LeRobot v3.0 format** and can be loaded directly: + +```python +from lerobot.datasets.lerobot_dataset import LeRobotDataset + +dataset = LeRobotDataset("lerobot/robomme") +``` + +### Dataset features + +| Feature | Shape | Description | +| ------------------ |
------------- | ------------------------------- | +| `image` | (256, 256, 3) | Front camera RGB | +| `wrist_image` | (256, 256, 3) | Wrist camera RGB | +| `actions` | (8,) | Joint angles + gripper | +| `state` | (8,) | Joint positions + gripper state | +| `simple_subgoal` | str | High-level language annotation | +| `grounded_subgoal` | str | Grounded language annotation | +| `episode_index` | int | Episode ID | +| `frame_index` | int | Frame within episode | + +### Feature key alignment (training) + +The env wrapper exposes `pixels/image` and `pixels/wrist_image` as observation keys. The `features_map` in `RoboMMEEnv` maps these to `observation.images.image` and `observation.images.wrist_image` for the policy. State is exposed as `agent_pos` and maps to `observation.state`. + +The dataset's `image` and `wrist_image` columns already align with the policy input keys, so no renaming is needed when fine-tuning. + +## Action Spaces + +| Type | Dim | Description | +| ------------- | --- | --------------------------------------------------------- | +| `joint_angle` | 8 | 7 joint angles + 1 gripper (−1 closed, +1 open, absolute) | +| `ee_pose` | 7 | xyz + roll/pitch/yaw + gripper | + +Set via `--env.action_space=joint_angle` (default) or `--env.action_space=ee_pose`. + +## Platform Notes + +- **Linux only**: ManiSkill requires SAPIEN/Vulkan. macOS and Windows are not supported. +- **GPU recommended**: Rendering is CPU-capable but slow; CUDA + Vulkan gives full speed. +- **gymnasium / numpy conflict**: See installation note above. Docker image handles this automatically. +- **ManiSkill fork**: `robomme` depends on a specific ManiSkill fork (`YinpeiDai/ManiSkill`), pulled in automatically via the `robomme` package. 
diff --git a/docs/source/robotwin.mdx b/docs/source/robotwin.mdx new file mode 100644 index 000000000..ad1db766f --- /dev/null +++ b/docs/source/robotwin.mdx @@ -0,0 +1,223 @@ +# RoboTwin 2.0 + +RoboTwin 2.0 is a **large-scale dual-arm manipulation benchmark** built on the SAPIEN physics engine. It provides a standardized evaluation protocol for bimanual robotic policies across 50 tasks (as of upstream `main`) with strong domain randomization (clutter, lighting, background, tabletop height, and language instructions). + +- Paper: [RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation](https://arxiv.org/abs/2506.18088) +- GitHub: [RoboTwin-Platform/RoboTwin](https://github.com/RoboTwin-Platform/RoboTwin) +- Leaderboard: [robotwin-platform.github.io/leaderboard](https://robotwin-platform.github.io/leaderboard) +- Dataset: [lerobot/robotwin_unified](https://huggingface.co/datasets/lerobot/robotwin_unified) + +![RoboTwin 2.0 benchmark overview](https://www.aitntnews.com/pictures/2025/7/8/9a7f79cb-5ba9-11f0-8581-fa163e47d677.png) + +## Overview + +| Property | Value | +| ------------- | -------------------------------------------------------- | +| Tasks | 50 dual-arm manipulation tasks | +| Robot | Aloha-AgileX bimanual (14 DOF, 7 per arm) | +| Action space | 14-dim joint-space, continuous in `[-1, 1]` | +| Cameras | `head_camera`, `left_camera`, `right_camera` | +| Simulator | SAPIEN (not MuJoCo) | +| Eval protocol | 100 episodes/task, 50 demo_clean demonstrations | +| Eval settings | **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) | + +## Available tasks + +RoboTwin 2.0 ships 50 dual-arm manipulation tasks in its upstream `envs/` directory. The canonical list is the `ROBOTWIN_TASKS` tuple in `src/lerobot/envs/robotwin.py`, mirrored verbatim from the upstream repo. 
Example tasks: + +| Task | CLI name | Category | +| ------------------------ | ------------------------ | ----------------- | +| Beat block with hammer | `beat_block_hammer` | Tool use | +| Click bell / alarm clock | `click_bell` | Precision press | +| Stack blocks (2 / 3) | `stack_blocks_two/three` | Stacking | +| Stack bowls (2 / 3) | `stack_bowls_two/three` | Stacking | +| Handover block / mic | `handover_block` | Bimanual coord. | +| Lift pot | `lift_pot` | Bimanual lift | +| Shake bottle | `shake_bottle` | Continuous motion | +| Turn switch | `turn_switch` | Articulated obj | +| Stamp seal | `stamp_seal` | Precision place | +| Scan object | `scan_object` | Mobile manip. | + +Pass a comma-separated list to `--env.task` to run multiple tasks in a single eval sweep. + + + `open_laptop` is currently broken upstream (its `check_success()` uses + `self.arm_tag`, which is only set inside the scripted-expert `play_once()` + path and therefore unavailable during normal policy eval). Avoid it until the + upstream bug is fixed, or patch the task to default `self.arm_tag = "left"` in + `load_actors()`. + + +## Dataset + +The RoboTwin 2.0 dataset is available in **LeRobot v3.0 format** on the Hugging Face Hub: + +``` +lerobot/robotwin_unified +``` + +It contains over 100,000 pre-collected trajectories across all 50 tasks (79.6 GB, Apache 2.0 license). No format conversion is needed — it is already in the correct LeRobot v3.0 schema with video observations and action labels. + +You can load it directly with the HF Datasets library: + +```python +from datasets import load_dataset + +ds = load_dataset("lerobot/robotwin_unified", split="train") +``` + +## Installation + +RoboTwin 2.0 requires **Linux** with an NVIDIA GPU (CUDA 12.1 recommended). Installation takes approximately 20 minutes. + +### 1. Create a conda environment + +```bash +conda create -n robotwin python=3.10 -y +conda activate robotwin +``` + +### 2. 
Install LeRobot + +```bash +git clone https://github.com/huggingface/lerobot.git +cd lerobot +pip install -e "." +``` + +### 3. Install RoboTwin 2.0 + +```bash +git clone https://github.com/RoboTwin-Platform/RoboTwin.git +cd RoboTwin +bash script/_install.sh +bash script/_download_assets.sh +``` + +The install script handles all Python dependencies including SAPIEN, CuRobo, mplib, and pytorch3d. + + +If the automated install fails, install manually: + +```bash +pip install -r requirements.txt +pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" +cd envs && git clone https://github.com/NVlabs/curobo.git && cd curobo +pip install -e . --no-build-isolation +``` + +Then apply the required mplib fix: in `mplib/planner.py` line 807, remove `or collide` from the conditional. + + + +### 4. Add RoboTwin to PYTHONPATH + +The RoboTwin task modules must be importable by LeRobot. From within the `RoboTwin/` directory: + +```bash +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +``` + +Add this to your shell profile to make it permanent. 
+ +## Evaluation + +### Standard evaluation (recommended) + +Evaluate a policy on a single task with the official protocol (100 episodes): + +```bash +lerobot-eval \ + --policy.path="your-hf-policy-id" \ + --env.type=robotwin \ + --env.task=beat_block_hammer \ + --eval.batch_size=1 \ + --eval.n_episodes=100 +``` + +### Single-task quick check + +```bash +lerobot-eval \ + --policy.path="your-hf-policy-id" \ + --env.type=robotwin \ + --env.task=beat_block_hammer \ + --eval.batch_size=1 \ + --eval.n_episodes=5 +``` + +### Multi-task sweep + +Evaluate on several tasks in one run: + +```bash +lerobot-eval \ + --policy.path="your-hf-policy-id" \ + --env.type=robotwin \ + --env.task=beat_block_hammer,click_bell,handover_block,stack_blocks_two \ + --eval.batch_size=1 \ + --eval.n_episodes=100 +``` + +### Full benchmark (all 50 tasks except `open_laptop`) + +```bash +lerobot-eval \ + --policy.path="your-hf-policy-id" \ + --env.type=robotwin \ + --env.task=adjust_bottle,beat_block_hammer,blocks_ranking_rgb,blocks_ranking_size,click_alarmclock,click_bell,dump_bin_bigbin,grab_roller,handover_block,handover_mic,hanging_mug,lift_pot,move_can_pot,move_pillbottle_pad,move_playingcard_away,move_stapler_pad,open_microwave,pick_diverse_bottles,pick_dual_bottles,place_a2b_left,place_a2b_right,place_bread_basket,place_bread_skillet,place_burger_fries,place_can_basket,place_cans_plasticbox,place_container_plate,place_dual_shoes,place_empty_cup,place_fan,place_mouse_pad,place_object_basket,place_object_scale,place_object_stand,place_phone_stand,place_shoe,press_stapler,put_bottles_dustbin,put_object_cabinet,rotate_qrcode,scan_object,shake_bottle,shake_bottle_horizontally,stack_blocks_three,stack_blocks_two,stack_bowls_three,stack_bowls_two,stamp_seal,turn_switch \ + --eval.batch_size=1 \ + --eval.n_episodes=100 +``` + + + `open_laptop` is intentionally omitted above because of the upstream + `self.arm_tag` bug (see the **Available tasks** section). Re-add it once the + upstream fix lands. 
+ + +## Camera configuration + +By default, all three cameras are included: + +| Camera key | Description | +| -------------- | ------------------------------ | +| `head_camera` | Torso-mounted overhead view | +| `left_camera` | Left arm wrist-mounted camera | +| `right_camera` | Right arm wrist-mounted camera | + +To use a subset of cameras, override `--env.camera_names`: + +```bash +lerobot-eval \ + --policy.path="your-hf-policy-id" \ + --env.type=robotwin \ + --env.task=beat_block_hammer \ + --env.camera_names="head_camera,left_camera" \ + --eval.batch_size=1 \ + --eval.n_episodes=10 +``` + +## Environment config reference + +Key parameters for `RoboTwinEnvConfig`: + +| Parameter | Default | Description | +| -------------------- | ---------------------------------------- | ---------------------------------- | +| `task` | `"beat_block_hammer"` | Comma-separated task name(s) | +| `fps` | `25` | Simulation FPS | +| `episode_length` | `300` | Max steps per episode | +| `obs_type` | `"pixels_agent_pos"` | `"pixels"` or `"pixels_agent_pos"` | +| `camera_names` | `"head_camera,left_camera,right_camera"` | Comma-separated active cameras | +| `observation_height` | `240` | Camera pixel height | +| `observation_width` | `320` | Camera pixel width | + +## Leaderboard submission + +Results can be submitted to the [RoboTwin 2.0 leaderboard](https://robotwin-platform.github.io/leaderboard). The official protocol requires: + +- Training on 50 `demo_clean` demonstrations per task +- Evaluating 100 episodes per task +- Reporting success rate separately for **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) settings + +For submission instructions, refer to the [RoboTwin 2.0 documentation](https://robotwin-platform.github.io/doc/). 
diff --git a/pyproject.toml b/pyproject.toml index 6e4993c85..dbc866a49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -212,6 +212,15 @@ aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"] pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"] metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"] +# NOTE: robomme is NOT a pyproject extra — mani-skill hard-pins numpy<2 +# which conflicts with lerobot's numpy>=2 base pin, so the two trees can't +# resolve into a single env. Install it only in the RoboMME Docker image +# via `uv pip install --override` (see docker/Dockerfile.benchmark.robomme). +# NOTE: robocasa is NOT exposed as a `lerobot` extra. Its setup.py pins +# `lerobot==0.3.3` in install_requires, which cyclically shadows our own +# workspace `lerobot` and makes the graph unsolvable under any resolver +# (uv, pip). Install it manually alongside robosuite — see +# docs/source/robocasa.mdx for the recipe. # All all = [ diff --git a/scripts/ci/extract_task_descriptions.py b/scripts/ci/extract_task_descriptions.py index 5fbc1c35a..3bdc9035f 100644 --- a/scripts/ci/extract_task_descriptions.py +++ b/scripts/ci/extract_task_descriptions.py @@ -31,9 +31,23 @@ from __future__ import annotations import argparse import json +import re import sys from pathlib import Path +# LIBERO-plus derives task.language by space-joining the perturbation-variant +# filename (grab_language_from_filename in libero/libero/benchmark/__init__.py), +# so non-_language_ variants inherit a trailing metadata blob like +# "view 0 0 100 0 0 initstate 0 noise 45" or "add 16". Strip those tokens so +# the description matches the base instruction used in the training dataset. 
+_LIBERO_PERTURBATION_TAIL_RE = re.compile( + r"(?:\s(?:view|initstate|noise|add|tb|table|light|level)(?:\s\d+)+)+$" +) + + +def _strip_libero_perturbation_tail(instruction: str) -> str: + return _LIBERO_PERTURBATION_TAIL_RE.sub("", instruction).strip() + def _libero_descriptions(task_suite: str) -> dict[str, str]: from libero.libero import benchmark # type: ignore[import-untyped] @@ -47,7 +61,10 @@ def _libero_descriptions(task_suite: str) -> dict[str, str]: ) return {} suite = suite_dict[task_suite]() - return {f"{task_suite}_{i}": suite.get_task(i).language for i in range(suite.n_tasks)} + return { + f"{task_suite}_{i}": _strip_libero_perturbation_tail(suite.get_task(i).language) + for i in range(suite.n_tasks) + } def _metaworld_descriptions(task_name: str) -> dict[str, str]: @@ -57,19 +74,103 @@ def _metaworld_descriptions(task_name: str) -> dict[str, str]: return {f"{task_name}_0": label} +def _robotwin_descriptions(task_names: str) -> dict[str, str]: + """Return descriptions for each requested RoboTwin task. Reads + `description/task_instruction/{task}.json` from the RoboTwin clone + (cwd is /opt/robotwin in CI). Falls back to the task name if missing.""" + out: dict[str, str] = {} + root = Path("description/task_instruction") + for name in (t.strip() for t in task_names.split(",") if t.strip()): + desc_file = root / f"{name}.json" + desc = name.replace("_", " ") + if desc_file.is_file(): + data = json.loads(desc_file.read_text()) + full = data.get("full_description") or desc + # Drop "<" / ">" characters (presumably wrapping the schema placeholders {A}, {a}) — keep the sentence readable. + desc = full.replace("<", "").replace(">", "") + out[f"{name}_0"] = desc + return out + + +def _robocasa_descriptions(task_spec: str) -> dict[str, str]: + """For each task in the comma-separated list, emit a cleaned-name label. + + RoboCasa episodes carry their language instruction in the env's + `ep_meta['lang']`, populated per reset. 
Pulling it requires spinning + up the full kitchen env per task (~seconds each); we use the task + name as the key here and let the eval's episode info carry the + actual instruction. + """ + out: dict[str, str] = {} + for task in (t.strip() for t in task_spec.split(",") if t.strip()): + # Split CamelCase into words: "CloseFridge" → "close fridge". + label = "".join(f" {c.lower()}" if c.isupper() else c for c in task).strip() + out[f"{task}_0"] = label or task + return out + + +_ROBOMME_DESCRIPTIONS = { + "BinFill": "Fill the target bin with the correct number of cubes", + "PickXtimes": "Pick the indicated cube the specified number of times", + "SwingXtimes": "Swing the object the specified number of times", + "StopCube": "Grasp and stop the moving cube", + "VideoUnmask": "Pick the cube shown in the reference video", + "VideoUnmaskSwap": "Pick the cube matching the reference video after a swap", + "ButtonUnmask": "Press the button indicated by the reference", + "ButtonUnmaskSwap": "Press the correct button after objects are swapped", + "PickHighlight": "Pick the highlighted cube", + "VideoRepick": "Repick the cube shown in the reference video", + "VideoPlaceButton": "Place the cube on the button shown in the video", + "VideoPlaceOrder": "Place cubes in the order shown in the video", + "MoveCube": "Move the cube to the target location", + "InsertPeg": "Insert the peg into the target hole", + "PatternLock": "Unlock the pattern by pressing buttons in sequence", + "RouteStick": "Route the stick through the required waypoints", +} + + +def _robomme_descriptions(task_names: str, task_ids: list[int] | None = None) -> dict[str, str]: + """Return descriptions for each requested RoboMME task. 
Keys match the + video filename pattern `{task}_{task_id}` used by the eval script.""" + if task_ids is None: + task_ids = [0] + out: dict[str, str] = {} + for name in (t.strip() for t in task_names.split(",") if t.strip()): + desc = _ROBOMME_DESCRIPTIONS.get(name, name) + for tid in task_ids: + out[f"{name}_{tid}"] = desc + return out + + def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)") parser.add_argument("--task", required=True, help="Task/suite name (e.g. libero_spatial)") + parser.add_argument( + "--task-ids", + type=str, + default=None, + help="Comma-separated task IDs (e.g. '0,1,2'). Default: [0]", + ) parser.add_argument("--output", required=True, help="Path to write task_descriptions.json") args = parser.parse_args() + task_ids: list[int] | None = None + if args.task_ids: + task_ids = [int(x.strip()) for x in args.task_ids.split(",")] + descriptions: dict[str, str] = {} try: - if args.env == "libero": + if args.env in ("libero", "libero_plus"): descriptions = _libero_descriptions(args.task) elif args.env == "metaworld": descriptions = _metaworld_descriptions(args.task) + elif args.env == "robotwin": + descriptions = _robotwin_descriptions(args.task) + elif args.env == "robocasa": + descriptions = _robocasa_descriptions(args.task) + elif args.env == "robomme": + descriptions = _robomme_descriptions(args.task, task_ids=task_ids) else: print( f"[extract_task_descriptions] No description extractor for env '{args.env}'.", diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py index 2a7c52d45..5ce06b475 100644 --- a/src/lerobot/envs/configs.py +++ b/src/lerobot/envs/configs.py @@ -331,6 +331,7 @@ class LiberoEnv(EnvConfig): camera_name_mapping: dict[str, str] | None = None observation_height: int = 360 observation_width: int = 360 + is_libero_plus: bool = False features: dict[str, PolicyFeature] = field( default_factory=lambda: { ACTION:
PolicyFeature(type=FeatureType.ACTION, shape=(7,)), @@ -432,6 +433,7 @@ class LiberoEnv(EnvConfig): control_mode=self.control_mode, episode_length=self.episode_length, camera_name_mapping=self.camera_name_mapping, + is_libero_plus=self.is_libero_plus, ) def get_env_processors(self): @@ -496,6 +498,81 @@ class MetaworldEnv(EnvConfig): ) +@EnvConfig.register_subclass("robocasa") +@dataclass +class RoboCasaEnv(EnvConfig): + task: str = "CloseFridge" + fps: int = 20 + episode_length: int = 1000 + obs_type: str = "pixels_agent_pos" + render_mode: str = "rgb_array" + camera_name: str = "robot0_agentview_left,robot0_eye_in_hand,robot0_agentview_right" + observation_height: int = 256 + observation_width: int = 256 + visualization_height: int = 512 + visualization_width: int = 512 + split: str | None = None + # Object-mesh registries to sample from. Upstream default is + # ("objaverse", "lightwheel"), but objaverse is ~30GB and the CI image + # only ships the lightwheel pack. Override to include objaverse once + # you've run `python -m robocasa.scripts.download_kitchen_assets + # --type objaverse` locally. + obj_registries: list[str] = field(default_factory=lambda: ["lightwheel"]) + features: dict[str, PolicyFeature] = field( + default_factory=lambda: {ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(12,))} + ) + features_map: dict[str, str] = field(default_factory=lambda: {ACTION: ACTION, "agent_pos": OBS_STATE}) + + def __post_init__(self): + if self.obs_type not in ("pixels", "pixels_agent_pos"): + raise ValueError(f"Unsupported obs_type: {self.obs_type}") + + # Preserve raw RoboCasa camera names end-to-end (e.g. + # `observation.images.robot0_agentview_left`). This matches the + # naming convention used by the RoboCasa datasets on the Hub, so + # trained policies don't need a `--rename_map` at eval time. 
+ cams = [c.strip() for c in self.camera_name.split(",") if c.strip()] + for cam in cams: + self.features[f"pixels/{cam}"] = PolicyFeature( + type=FeatureType.VISUAL, + shape=(self.observation_height, self.observation_width, 3), + ) + self.features_map[f"pixels/{cam}"] = f"{OBS_IMAGES}.{cam}" + + if self.obs_type == "pixels_agent_pos": + self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(16,)) + + @property + def gym_kwargs(self) -> dict: + kwargs: dict[str, Any] = { + "obs_type": self.obs_type, + "render_mode": self.render_mode, + "observation_height": self.observation_height, + "observation_width": self.observation_width, + "visualization_height": self.visualization_height, + "visualization_width": self.visualization_width, + } + if self.split is not None: + kwargs["split"] = self.split + return kwargs + + def create_envs(self, n_envs: int, use_async_envs: bool = False): + from .robocasa import create_robocasa_envs + + if self.task is None: + raise ValueError("RoboCasaEnv requires a task to be specified") + env_cls = _make_vec_env_cls(use_async_envs, n_envs) + return create_robocasa_envs( + task=self.task, + n_envs=n_envs, + camera_name=self.camera_name, + gym_kwargs=self.gym_kwargs, + env_cls=env_cls, + episode_length=self.episode_length, + obj_registries=tuple(self.obj_registries), + ) + + @EnvConfig.register_subclass("isaaclab_arena") @dataclass class IsaaclabArenaEnv(HubEnvConfig): @@ -574,3 +651,171 @@ class IsaaclabArenaEnv(HubEnvConfig): ), PolicyProcessorPipeline(steps=[]), ) + + +@EnvConfig.register_subclass("libero_plus") +@dataclass +class LiberoPlusEnv(LiberoEnv): + """Config for LIBERO-plus robustness benchmark evaluation. + + LIBERO-plus extends LIBERO with 7 perturbation dimensions (camera viewpoints, + object layouts, robot initial states, language instructions, lighting, background + textures, sensor noise) producing ~10k task variants. 
+ + The gym interface is identical to LIBERO so this class reuses ``LiberoEnv`` + entirely — only the registered name and default task suite differ. + + Install: see docker/Dockerfile.benchmark.libero_plus — LIBERO-plus ships + as a namespace package from a git fork and must be cloned + PYTHONPATH'd + rather than installed as a pyproject extra. + + See Also: + https://github.com/sylvestf/LIBERO-plus + """ + + task: str = "libero_spatial" + is_libero_plus: bool = True + + +@EnvConfig.register_subclass("robotwin") +@dataclass +class RoboTwinEnvConfig(EnvConfig): + """Configuration for RoboTwin 2.0 benchmark environments. + + RoboTwin 2.0 is a dual-arm manipulation benchmark with 50 tasks built on the + SAPIEN simulator. The robot is an Aloha-AgileX bimanual platform with 14 DOF + (7 per arm). All three cameras are enabled by default. + + See: https://robotwin-platform.github.io + Dataset: https://huggingface.co/datasets/lerobot/robotwin_unified + """ + + task: str = "beat_block_hammer" # single task or comma-separated list + fps: int = 25 + episode_length: int = 300 + obs_type: str = "pixels_agent_pos" + render_mode: str = "rgb_array" + # Available cameras from RoboTwin's aloha-agilex embodiment: head_camera + # (torso-mounted) + left_camera / right_camera (wrists). + camera_names: str = "head_camera,left_camera,right_camera" + # Match the D435 dims in task_config/demo_clean.yml (_camera_config.yml). + # Gym's vector-env concatenate pre-allocates buffers of this shape, so it + # must equal what SAPIEN actually renders. 
+ observation_height: int = 240 + observation_width: int = 320 + features: dict[str, PolicyFeature] = field( + default_factory=lambda: { + ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(14,)), + } + ) + features_map: dict[str, str] = field( + default_factory=lambda: { + ACTION: ACTION, + "pixels/head_camera": f"{OBS_IMAGES}.head_camera", + "pixels/left_camera": f"{OBS_IMAGES}.left_camera", + "pixels/right_camera": f"{OBS_IMAGES}.right_camera", + "agent_pos": OBS_STATE, + } + ) + + def __post_init__(self): + cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()] + for cam in cam_list: + self.features[f"pixels/{cam}"] = PolicyFeature( + type=FeatureType.VISUAL, + shape=(self.observation_height, self.observation_width, 3), + ) + # Keep features_map entry if already set (default_factory); add if missing. + key = f"pixels/{cam}" + if key not in self.features_map: + self.features_map[key] = f"{OBS_IMAGES}.{cam}" + + if self.obs_type == "pixels_agent_pos": + self.features["agent_pos"] = PolicyFeature( + type=FeatureType.STATE, + shape=(14,), # 14 DOF: 7 per arm + ) + elif self.obs_type != "pixels": + raise ValueError( + f"Unsupported obs_type '{self.obs_type}'. " + "RoboTwinEnvConfig supports 'pixels' and 'pixels_agent_pos'." 
+ ) + + @property + def gym_kwargs(self) -> dict: + return {} + + def create_envs(self, n_envs: int, use_async_envs: bool = True): + from lerobot.envs.robotwin import create_robotwin_envs + + if not self.task: + raise ValueError("RoboTwinEnvConfig requires `task` to be specified.") + + env_cls = _make_vec_env_cls(use_async_envs, n_envs) + cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()] + return create_robotwin_envs( + task=self.task, + n_envs=n_envs, + env_cls=env_cls, + camera_names=cam_list, + observation_height=self.observation_height, + observation_width=self.observation_width, + episode_length=self.episode_length, + ) + + +@EnvConfig.register_subclass("robomme") +@dataclass +class RoboMMEEnv(EnvConfig): + """RoboMME memory-augmented manipulation benchmark (ManiSkill/SAPIEN). + + 16 tasks across 4 suites: Counting, Permanence, Reference, Imitation. + Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes). + Benchmark: https://github.com/RoboMME/robomme_benchmark + + Requires the `robomme` git package installed separately (Linux only); + see docker/Dockerfile.benchmark.robomme for the canonical install. 
+ """ + + task: str = "PickXtimes" + fps: int = 10 + episode_length: int = 300 + action_space: str = "joint_angle" # or "ee_pose" (7-D) + dataset_split: str = "test" # "train" | "val" | "test" + task_ids: list[int] | None = None + features: dict[str, PolicyFeature] = field(default_factory=dict) + features_map: dict[str, str] = field( + default_factory=lambda: { + ACTION: ACTION, + "pixels/image": f"{OBS_IMAGES}.image", + "pixels/wrist_image": f"{OBS_IMAGES}.wrist_image", + "agent_pos": OBS_STATE, + } + ) + + def __post_init__(self): + action_dim = 8 if self.action_space == "joint_angle" else 7 + self.features = { + ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(action_dim,)), + "pixels/image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)), + "pixels/wrist_image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)), + "agent_pos": PolicyFeature(type=FeatureType.STATE, shape=(8,)), + } + + @property + def gym_kwargs(self) -> dict: + return {} + + def create_envs(self, n_envs: int, use_async_envs: bool = True): + from lerobot.envs.robomme import create_robomme_envs + + env_cls = _make_vec_env_cls(use_async_envs, n_envs) + return create_robomme_envs( + task=self.task, + n_envs=n_envs, + action_space_type=self.action_space, + dataset=self.dataset_split, + episode_length=self.episode_length, + task_ids=self.task_ids, + env_cls=env_cls, + ) diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py index ec90d0ffd..12be9e196 100644 --- a/src/lerobot/envs/libero.py +++ b/src/lerobot/envs/libero.py @@ -16,6 +16,7 @@ from __future__ import annotations import os +import re from collections import defaultdict from collections.abc import Callable, Iterable, Mapping, Sequence from functools import partial @@ -31,20 +32,7 @@ from libero.libero.envs import OffScreenRenderEnv from lerobot.types import RobotObservation -from .utils import _LazyAsyncVectorEnv - - -def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]: - 
"""Normalize camera_name into a non-empty list of strings.""" - if isinstance(camera_name, str): - cams = [c.strip() for c in camera_name.split(",") if c.strip()] - elif isinstance(camera_name, (list | tuple)): - cams = [str(c).strip() for c in camera_name if str(c).strip()] - else: - raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}") - if not cams: - raise ValueError("camera_name resolved to an empty list.") - return cams +from .utils import _LazyAsyncVectorEnv, parse_camera_names def _get_suite(name: str) -> benchmark.Benchmark: @@ -69,14 +57,34 @@ def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> list[i return ids -def get_task_init_states(task_suite: Any, i: int) -> np.ndarray: - init_states_path = ( - Path(get_libero_path("init_states")) - / task_suite.tasks[i].problem_folder - / task_suite.tasks[i].init_states_file - ) - init_states = torch.load(init_states_path, weights_only=False) # nosec B614 - return init_states +# LIBERO-plus perturbation variants encode the perturbation in the filename +# but on disk only the base `.pruned_init` exists — strip the suffix to match +# LIBERO-plus's own suite.get_task_init_states() (we reimplement it here so we +# can pass weights_only=False for PyTorch 2.6+ numpy pickles). +_LIBERO_PERTURBATION_SUFFIX_RE = re.compile(r"_(?:language|view|light)_[^.]*|_(?:table|tb)_\d+") + + +def get_task_init_states(task_suite: Any, i: int, is_libero_plus: bool = False) -> np.ndarray: + task = task_suite.tasks[i] + filename = Path(task.init_states_file) + root = Path(get_libero_path("init_states")) + + if not is_libero_plus: + init_states_path = root / task.problem_folder / filename.name + return torch.load(init_states_path, weights_only=False) # nosec B614 + + # LIBERO-plus: `_add_` / `_level` variants store extra-object layouts under + # libero_newobj/ as a flat array that must be reshaped to (1, -1). 
+ if "_add_" in filename.name or "_level" in filename.name: + init_states_path = root / "libero_newobj" / task.problem_folder / filename.name + init_states = torch.load(init_states_path, weights_only=False) # nosec B614 + return init_states.reshape(1, -1) + + # LIBERO-plus perturbation variants encode the perturbation in the filename + # but on disk only the base `.pruned_init` exists — strip the suffix to match. + stripped = _LIBERO_PERTURBATION_SUFFIX_RE.sub("", filename.stem) + filename.suffix + init_states_path = root / task.problem_folder / stripped + return torch.load(init_states_path, weights_only=False) # nosec B614 def get_libero_dummy_action(): @@ -118,9 +126,11 @@ class LiberoEnv(gym.Env): camera_name_mapping: dict[str, str] | None = None, num_steps_wait: int = 10, control_mode: str = "relative", + is_libero_plus: bool = False, ): super().__init__() self.task_id = task_id + self.is_libero_plus = is_libero_plus self.obs_type = obs_type self.render_mode = render_mode self.observation_width = observation_width @@ -128,7 +138,7 @@ class LiberoEnv(gym.Env): self.visualization_width = visualization_width self.visualization_height = visualization_height self.init_states = init_states - self.camera_name = _parse_camera_names( + self.camera_name = parse_camera_names( camera_name ) # agentview_image (main) or robot0_eye_in_hand_image (wrist) @@ -147,7 +157,11 @@ class LiberoEnv(gym.Env): self.episode_index = episode_index self.episode_length = episode_length # Load once and keep - self._init_states = get_task_init_states(task_suite, self.task_id) if self.init_states else None + self._init_states = ( + get_task_init_states(task_suite, self.task_id, is_libero_plus=self.is_libero_plus) + if self.init_states + else None + ) self._reset_stride = n_envs # when performing a reset, append `_reset_stride` to `init_state_id`. 
self.init_state_id = self.episode_index # tie each sub-env to a fixed init state @@ -380,6 +394,7 @@ def _make_env_fns( gym_kwargs: Mapping[str, Any], control_mode: str, camera_name_mapping: dict[str, str] | None = None, + is_libero_plus: bool = False, ) -> list[Callable[[], LiberoEnv]]: """Build n_envs factory callables for a single (suite, task_id).""" @@ -396,6 +411,7 @@ def _make_env_fns( n_envs=n_envs, control_mode=control_mode, camera_name_mapping=camera_name_mapping, + is_libero_plus=is_libero_plus, **local_kwargs, ) @@ -418,6 +434,7 @@ def create_libero_envs( control_mode: str = "relative", episode_length: int | None = None, camera_name_mapping: dict[str, str] | None = None, + is_libero_plus: bool = False, ) -> dict[str, dict[int, Any]]: """ Create vectorized LIBERO environments with a consistent return shape. @@ -437,7 +454,7 @@ def create_libero_envs( gym_kwargs = dict(gym_kwargs or {}) task_ids_filter = gym_kwargs.pop("task_ids", None) # optional: limit to specific tasks - camera_names = _parse_camera_names(camera_name) + camera_names = parse_camera_names(camera_name) suite_names = [s.strip() for s in str(task).split(",") if s.strip()] if not suite_names: raise ValueError("`task` must contain at least one LIBERO suite name.") @@ -462,6 +479,7 @@ def create_libero_envs( # Probe once and reuse to avoid creating a temp env per task. 
cached_obs_space: spaces.Space | None = None cached_act_space: spaces.Space | None = None + cached_metadata: dict[str, Any] | None = None for tid in selected: fns = _make_env_fns( @@ -475,12 +493,14 @@ def create_libero_envs( gym_kwargs=gym_kwargs, control_mode=control_mode, camera_name_mapping=camera_name_mapping, + is_libero_plus=is_libero_plus, ) if is_async: - lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space) + lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata) if cached_obs_space is None: cached_obs_space = lazy.observation_space cached_act_space = lazy.action_space + cached_metadata = lazy.metadata out[suite_name][tid] = lazy else: out[suite_name][tid] = env_cls(fns) diff --git a/src/lerobot/envs/metaworld.py b/src/lerobot/envs/metaworld.py index 1dc513a68..bffcf6b6e 100644 --- a/src/lerobot/envs/metaworld.py +++ b/src/lerobot/envs/metaworld.py @@ -311,6 +311,7 @@ def create_metaworld_envs( is_async = env_cls is gym.vector.AsyncVectorEnv cached_obs_space = None cached_act_space = None + cached_metadata = None out: dict[str, dict[int, Any]] = defaultdict(dict) for group in task_groups: @@ -324,10 +325,11 @@ def create_metaworld_envs( fns = [(lambda tn=task_name: MetaworldEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)] if is_async: - lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space) + lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata) if cached_obs_space is None: cached_obs_space = lazy.observation_space cached_act_space = lazy.action_space + cached_metadata = lazy.metadata out[group][tid] = lazy else: out[group][tid] = env_cls(fns) diff --git a/src/lerobot/envs/robocasa.py b/src/lerobot/envs/robocasa.py new file mode 100644 index 000000000..a84a7c766 --- /dev/null +++ b/src/lerobot/envs/robocasa.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import logging +from collections import defaultdict +from collections.abc import Callable, Sequence +from functools import partial +from typing import Any + +import gymnasium as gym +import numpy as np +from gymnasium import spaces + +from lerobot.types import RobotObservation + +from .utils import _LazyAsyncVectorEnv, parse_camera_names + +logger = logging.getLogger(__name__) + +# Dimensions for the flat action/state vectors used by the LeRobot wrapper. +# These correspond to the PandaOmron robot in RoboCasa365. +OBS_STATE_DIM = 16 # base_pos(3) + base_quat(4) + ee_pos_rel(3) + ee_quat_rel(4) + gripper_qpos(2) +ACTION_DIM = 12 # base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1) +ACTION_LOW = -1.0 +ACTION_HIGH = 1.0 + +# Default PandaOmron cameras. We surface these raw names directly as +# `observation.images.<camera_name>` so the LeRobot dataset/policy keys match +# RoboCasa's native convention (no implicit renaming). +DEFAULT_CAMERAS = [ + "robot0_agentview_left", + "robot0_eye_in_hand", + "robot0_agentview_right", +] + +# Object-mesh registries to sample from. RoboCasa's upstream default is +# ("objaverse", "lightwheel"), but the objaverse pack is huge (~30GB) and +# most users — including our CI image — only download the lightwheel pack +# (`--type objs_lw` in `download_kitchen_assets`).
When a sampled object +# category has zero candidates in every registry, robocasa crashes with +# `ValueError: Probabilities contain NaN` (0/0 divide in the probability +# normalization). Restricting to registries that are actually on disk +# avoids the NaN and matches what the asset download provides. +DEFAULT_OBJ_REGISTRIES: tuple[str, ...] = ("lightwheel",) + +# Task-group shortcuts accepted as `--env.task`. When the user passes one of +# these names, we expand it to the upstream RoboCasa task list and auto-set +# the dataset split. Individual task names (optionally comma-separated) still +# take precedence; this only triggers on an exact group-name match. +_TASK_GROUP_SPLITS = { + "atomic_seen": "target", + "composite_seen": "target", + "composite_unseen": "target", + "pretrain50": "pretrain", + "pretrain100": "pretrain", + "pretrain200": "pretrain", + "pretrain300": "pretrain", +} + + +def _resolve_tasks(task: str) -> tuple[list[str], str | None]: + """Resolve a `--env.task` value to (task_names, split_override). + + If `task` is a known task-group name (e.g. `atomic_seen`, `pretrain100`), + expand it via `robocasa.utils.dataset_registry.{TARGET,PRETRAINING}_TASKS` + and return the matching split. Otherwise treat `task` as a single task or + comma-separated list and leave the split untouched (None). + """ + key = task.strip() + if key in _TASK_GROUP_SPLITS: + from robocasa.utils.dataset_registry import PRETRAINING_TASKS, TARGET_TASKS + + combined = {**TARGET_TASKS, **PRETRAINING_TASKS} + if key not in combined: + raise ValueError( + f"Task group '{key}' is not available in this version of robocasa. " + f"Known groups: {sorted(combined.keys())}." 
+ ) + return list(combined[key]), _TASK_GROUP_SPLITS[key] + + names = [t.strip() for t in task.split(",") if t.strip()] + if not names: + raise ValueError("`task` must contain at least one RoboCasa task name.") + return names, None + + +def convert_action(flat_action: np.ndarray) -> dict[str, Any]: + """Split a flat (12,) action vector into a RoboCasa action dict. + + Layout: base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1) + """ + return { + "action.base_motion": flat_action[0:4], + "action.control_mode": flat_action[4:5], + "action.end_effector_position": flat_action[5:8], + "action.end_effector_rotation": flat_action[8:11], + "action.gripper_close": flat_action[11:12], + } + + +class RoboCasaEnv(gym.Env): + """LeRobot gym.Env wrapper for RoboCasa365 kitchen environments. + + Wraps RoboCasaGymEnv from the robocasa package and converts its + dict-based observations and actions into the flat arrays LeRobot expects. + Raw RoboCasa camera names are preserved verbatim under `pixels/<camera_name>`.
+ """ + + metadata = {"render_modes": ["rgb_array"], "render_fps": 20} + + def __init__( + self, + task: str, + camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS), + obs_type: str = "pixels_agent_pos", + render_mode: str = "rgb_array", + observation_width: int = 256, + observation_height: int = 256, + visualization_width: int = 512, + visualization_height: int = 512, + split: str | None = None, + episode_length: int | None = None, + obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES, + episode_index: int = 0, + ): + super().__init__() + self.task = task + self.obs_type = obs_type + self.render_mode = render_mode + self.observation_width = observation_width + self.observation_height = observation_height + self.visualization_width = visualization_width + self.visualization_height = visualization_height + self.split = split + self.obj_registries = tuple(obj_registries) + # Per-worker index (0..n_envs-1) used to spread the user-provided + # seed across factories so each sub-env explores a distinct layout + # even when the same seed is passed to `reset()`. + self.episode_index = int(episode_index) + + self.camera_name = parse_camera_names(camera_name) + + self._max_episode_steps = episode_length if episode_length is not None else 1000 + + # Deferred — created on first reset() inside the worker subprocess + # to avoid inheriting stale GPU/EGL contexts across fork(). 
+ self._env: Any = None + self.task_description = "" + + images = { + cam: spaces.Box( + low=0, + high=255, + shape=(self.observation_height, self.observation_width, 3), + dtype=np.uint8, + ) + for cam in self.camera_name + } + + if self.obs_type == "pixels": + self.observation_space = spaces.Dict({"pixels": spaces.Dict(images)}) + elif self.obs_type == "pixels_agent_pos": + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict(images), + "agent_pos": spaces.Box( + low=-np.inf, + high=np.inf, + shape=(OBS_STATE_DIM,), + dtype=np.float32, + ), + } + ) + else: + raise ValueError(f"Unsupported obs_type '{self.obs_type}'. Use 'pixels' or 'pixels_agent_pos'.") + + self.action_space = spaces.Box( + low=ACTION_LOW, + high=ACTION_HIGH, + shape=(ACTION_DIM,), + dtype=np.float32, + ) + + def _ensure_env(self) -> None: + """Create the underlying RoboCasaGymEnv on first use. + + Called inside the worker subprocess after fork(), so each worker gets + its own clean rendering context rather than inheriting a stale one from + the parent process (which causes crashes with AsyncVectorEnv). + """ + if self._env is not None: + return + from robocasa.wrappers.gym_wrapper import RoboCasaGymEnv + + # RoboCasaGymEnv defaults split="test", which create_env rejects + # (only None/"all"/"pretrain"/"target" are valid). Always pass a + # valid value so we don't hit that default. Extra kwargs are + # forwarded to the underlying kitchen env via create_env/robosuite.make. + self._env = RoboCasaGymEnv( + env_name=self.task, + camera_widths=self.observation_width, + camera_heights=self.observation_height, + split=self.split if self.split is not None else "all", + obj_registries=self.obj_registries, + ) + + ep_meta = self._env.env.get_ep_meta() + self.task_description = ep_meta.get("lang", self.task) + + def _format_raw_obs(self, raw_obs: dict) -> RobotObservation: + """Convert RoboCasaGymEnv observation dict to LeRobot format.""" + # RoboCasaGymEnv emits camera frames under "video.<cam>".
+ images = {cam: raw_obs[f"video.{cam}"] for cam in self.camera_name if f"video.{cam}" in raw_obs} + + if self.obs_type == "pixels": + return {"pixels": images} + + # `state.*` keys come from PandaOmronKeyConverter inside the wrapper. + agent_pos = np.concatenate( + [ + raw_obs.get("state.base_position", np.zeros(3)), + raw_obs.get("state.base_rotation", np.zeros(4)), + raw_obs.get("state.end_effector_position_relative", np.zeros(3)), + raw_obs.get("state.end_effector_rotation_relative", np.zeros(4)), + raw_obs.get("state.gripper_qpos", np.zeros(2)), + ], + axis=-1, + ).astype(np.float32) + + return {"pixels": images, "agent_pos": agent_pos} + + def render(self) -> np.ndarray: + self._ensure_env() + assert self._env is not None + return self._env.render() + + def reset(self, seed=None, **kwargs): + self._ensure_env() + assert self._env is not None + super().reset(seed=seed) + # Spread the seed across workers so n_envs factories don't all + # roll the same scene. With an explicit user seed we shift it by + # episode_index; with no seed we fall back to episode_index so + # each worker is still distinct rather than inheriting the same + # global RNG state. 
+ worker_seed = seed + self.episode_index if seed is not None else self.episode_index + raw_obs, info = self._env.reset(seed=worker_seed) + + ep_meta = self._env.env.get_ep_meta() + self.task_description = ep_meta.get("lang", self.task) + + observation = self._format_raw_obs(raw_obs) + info = {"is_success": False} + return observation, info + + def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]: + self._ensure_env() + assert self._env is not None + if action.ndim != 1: + raise ValueError( + f"Expected action to be 1-D (shape (action_dim,)), " + f"but got shape {action.shape} with ndim={action.ndim}" + ) + + action_dict = convert_action(action) + raw_obs, reward, done, truncated, info = self._env.step(action_dict) + + is_success = bool(info.get("success", False)) + terminated = done or is_success + info.update({"task": self.task, "done": done, "is_success": is_success}) + + observation = self._format_raw_obs(raw_obs) + if terminated: + info["final_info"] = { + "task": self.task, + "done": bool(done), + "is_success": bool(is_success), + } + self.reset() + + return observation, reward, terminated, truncated, info + + def close(self): + if self._env is not None: + self._env.close() + + +def _make_env_fns( + *, + task: str, + n_envs: int, + camera_names: list[str], + obs_type: str, + render_mode: str, + observation_width: int, + observation_height: int, + visualization_width: int, + visualization_height: int, + split: str | None, + episode_length: int | None, + obj_registries: Sequence[str], +) -> list[Callable[[], RoboCasaEnv]]: + """Build n_envs factory callables for a single task. + + Each factory carries a distinct ``episode_index`` (``0..n_envs-1``) so + ``RoboCasaEnv.reset()`` can derive a per-worker seed series from the + user-provided seed. 
+ """ + + def _make_env(episode_index: int) -> RoboCasaEnv: + return RoboCasaEnv( + task=task, + camera_name=camera_names, + obs_type=obs_type, + render_mode=render_mode, + observation_width=observation_width, + observation_height=observation_height, + visualization_width=visualization_width, + visualization_height=visualization_height, + split=split, + episode_length=episode_length, + obj_registries=obj_registries, + episode_index=episode_index, + ) + + return [partial(_make_env, i) for i in range(n_envs)] + + +def create_robocasa_envs( + task: str, + n_envs: int, + gym_kwargs: dict[str, Any] | None = None, + camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS), + env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None, + episode_length: int | None = None, + obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES, +) -> dict[str, dict[int, Any]]: + """Create vectorized RoboCasa365 environments with a consistent return shape. + + Returns: + dict[task_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories) + + `task` can be: + - a single task name (e.g. `CloseFridge`) + - a comma-separated list of task names (e.g. `CloseFridge,PickPlaceCoffee`) + - a benchmark-group shortcut (`atomic_seen`, `composite_seen`, + `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`, + `pretrain300`), which auto-expands to the upstream task list and + auto-sets the dataset `split` ("target" or "pretrain"). 
+ """ + if env_cls is None or not callable(env_cls): + raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.") + if not isinstance(n_envs, int) or n_envs <= 0: + raise ValueError(f"n_envs must be a positive int; got {n_envs}.") + + gym_kwargs = dict(gym_kwargs or {}) + obs_type = gym_kwargs.pop("obs_type", "pixels_agent_pos") + render_mode = gym_kwargs.pop("render_mode", "rgb_array") + observation_width = gym_kwargs.pop("observation_width", 256) + observation_height = gym_kwargs.pop("observation_height", 256) + visualization_width = gym_kwargs.pop("visualization_width", 512) + visualization_height = gym_kwargs.pop("visualization_height", 512) + split = gym_kwargs.pop("split", None) + + camera_names = parse_camera_names(camera_name) + task_names, group_split = _resolve_tasks(str(task)) + if group_split is not None and split is None: + split = group_split + + logger.info( + "Creating RoboCasa envs | tasks=%s | split=%s | n_envs(per task)=%d", + task_names, + split, + n_envs, + ) + + is_async = env_cls is gym.vector.AsyncVectorEnv + + cached_obs_space: spaces.Space | None = None + cached_act_space: spaces.Space | None = None + cached_metadata: dict[str, Any] | None = None + out: dict[str, dict[int, Any]] = defaultdict(dict) + + for task_name in task_names: + fns = _make_env_fns( + task=task_name, + n_envs=n_envs, + camera_names=camera_names, + obs_type=obs_type, + render_mode=render_mode, + observation_width=observation_width, + observation_height=observation_height, + visualization_width=visualization_width, + visualization_height=visualization_height, + split=split, + episode_length=episode_length, + obj_registries=obj_registries, + ) + + if is_async: + lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata) + if cached_obs_space is None: + cached_obs_space = lazy.observation_space + cached_act_space = lazy.action_space + cached_metadata = lazy.metadata + out[task_name][0] = lazy + else: + 
out[task_name][0] = env_cls(fns) + logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs) + + return {name: dict(task_map) for name, task_map in out.items()} diff --git a/src/lerobot/envs/robomme.py b/src/lerobot/envs/robomme.py new file mode 100644 index 000000000..69d665bd4 --- /dev/null +++ b/src/lerobot/envs/robomme.py @@ -0,0 +1,245 @@ +"""RoboMME environment wrapper for LeRobot evaluation. + +Wraps the RoboMME ``BenchmarkEnvBuilder`` into a Gymnasium-compatible +``VectorEnv`` suitable for ``lerobot_eval``. + +RoboMME tasks: + Counting: BinFill, PickXtimes, SwingXtimes, StopCube + Permanence: VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap + Reference: PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder + Imitation: MoveCube, InsertPeg, PatternLock, RouteStick + +Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes) +Install: see docker/Dockerfile.benchmark.robomme (Linux only — mani-skill vs numpy pin conflict) +Benchmark: https://github.com/RoboMME/robomme_benchmark +""" + +from __future__ import annotations + +from collections.abc import Callable, Sequence +from functools import partial +from typing import Any + +import gymnasium as gym +import numpy as np +from gymnasium import spaces + +from .utils import _LazyAsyncVectorEnv + +ROBOMME_TASKS = [ + "BinFill", + "PickXtimes", + "SwingXtimes", + "StopCube", + "VideoUnmask", + "VideoUnmaskSwap", + "ButtonUnmask", + "ButtonUnmaskSwap", + "PickHighlight", + "VideoRepick", + "VideoPlaceButton", + "VideoPlaceOrder", + "MoveCube", + "InsertPeg", + "PatternLock", + "RouteStick", +] + + +class RoboMMEGymEnv(gym.Env): + """Thin Gymnasium wrapper around a single RoboMME episode env.""" + + metadata = {"render_modes": ["rgb_array"], "render_fps": 10} + + def __init__( + self, + task: str = "PickXtimes", + action_space_type: str = "joint_angle", + dataset: str = "test", + episode_idx: int = 0, + max_steps: int = 300, + ): + super().__init__() + from robomme.env_record_wrapper 
import BenchmarkEnvBuilder + + self._task = task + self._action_space_type = action_space_type + self._dataset = dataset + self._episode_idx = episode_idx + self._max_steps = max_steps + self._max_episode_steps = max_steps + + self._builder = BenchmarkEnvBuilder( + env_id=task, + dataset=dataset, + action_space=action_space_type, + gui_render=False, + max_steps=max_steps, + ) + self._env = None + self._last_raw_obs: dict | None = None + + action_dim = 8 if action_space_type == "joint_angle" else 7 + self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_dim,), dtype=np.float32) + # `pixels` must be a nested Dict so `preprocess_observation()` in + # envs/utils.py picks it up and maps each camera to + # `observation.images.<camera_name>`. A flat layout (`pixels/image`, + # `pixels/wrist_image`) silently drops every image from the batch. + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict( + { + "image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8), + "wrist_image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8), + } + ), + "agent_pos": spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32), + } + ) + + def reset(self, *, seed=None, options=None): + super().reset(seed=seed) + self._env = self._builder.make_env_for_episode( + episode_idx=self._episode_idx, + max_steps=self._max_steps, + ) + obs, info = self._env.reset() + self._last_raw_obs = obs + return self._convert_obs(obs), self._convert_info(info) + + def step(self, action): + obs, reward, terminated, truncated, info = self._env.step(action) + self._last_raw_obs = obs + + terminated_bool = bool(terminated.item()) if hasattr(terminated, "item") else bool(terminated) + truncated_bool = bool(truncated.item()) if hasattr(truncated, "item") else bool(truncated) + + status = info.get("status", "ongoing") + is_success = status == "success" + conv_info = self._convert_info(info) + conv_info["is_success"] = is_success + + return self._convert_obs(obs), float(reward), terminated_bool,
truncated_bool, conv_info + + def render(self) -> np.ndarray | None: + """Return the front camera image from the last observation for video recording.""" + if self._last_raw_obs is None: + return np.zeros((256, 256, 3), dtype=np.uint8) + front = self._last_raw_obs.get("front_rgb_list") + if front is None: + return np.zeros((256, 256, 3), dtype=np.uint8) + frame = front[-1] if isinstance(front, list) else front + return np.asarray(frame, dtype=np.uint8) + + def _convert_obs(self, obs: dict) -> dict: + front_rgb = ( + obs["front_rgb_list"][-1] if isinstance(obs["front_rgb_list"], list) else obs["front_rgb_list"] + ) + wrist_rgb = ( + obs["wrist_rgb_list"][-1] if isinstance(obs["wrist_rgb_list"], list) else obs["wrist_rgb_list"] + ) + joint_state = ( + obs["joint_state_list"][-1] + if isinstance(obs["joint_state_list"], list) + else obs["joint_state_list"] + ) + gripper_state = ( + obs["gripper_state_list"][-1] + if isinstance(obs["gripper_state_list"], list) + else obs["gripper_state_list"] + ) + + front_rgb = np.asarray(front_rgb, dtype=np.uint8) + wrist_rgb = np.asarray(wrist_rgb, dtype=np.uint8) + joint = np.asarray(joint_state, dtype=np.float32).flatten()[:7] + gripper = np.asarray(gripper_state, dtype=np.float32).flatten()[:1] + state = np.concatenate([joint, gripper]) + + return { + "pixels": {"image": front_rgb, "wrist_image": wrist_rgb}, + "agent_pos": state, + } + + def _convert_info(self, info: dict) -> dict: + return { + "status": info.get("status", "ongoing"), + "task_goal": info.get("task_goal", ""), + } + + +def _make_env_fns( + *, + task: str, + n_envs: int, + action_space_type: str, + dataset: str, + episode_length: int, + task_id: int, +) -> list[Callable[[], RoboMMEGymEnv]]: + """Build n_envs factory callables for one RoboMME task id.""" + + def _make_one(episode_index: int) -> RoboMMEGymEnv: + return RoboMMEGymEnv( + task=task, + action_space_type=action_space_type, + dataset=dataset, + episode_idx=episode_index, + max_steps=episode_length, + ) + + 
return [partial(_make_one, task_id + i) for i in range(n_envs)] + + +def create_robomme_envs( + task: str, + n_envs: int = 1, + action_space_type: str = "joint_angle", + dataset: str = "test", + episode_length: int = 300, + task_ids: list[int] | None = None, + env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None, +) -> dict[str, dict[int, gym.vector.VectorEnv]]: + """Create vectorized RoboMME environments for evaluation. + + `task` may be a single RoboMME task name (e.g. "PickXtimes") or a + comma-separated list (e.g. "PickXtimes,BinFill,StopCube"). Each task + becomes its own suite in the returned mapping. + + Returns {suite_name: {task_id: VectorEnv}} matching lerobot's expected format. + """ + if env_cls is None or not callable(env_cls): + raise ValueError("env_cls must be a callable that wraps a list of env factory callables.") + if not isinstance(n_envs, int) or n_envs <= 0: + raise ValueError(f"n_envs must be a positive int; got {n_envs}.") + + if task_ids is None: + task_ids = [0] + + task_names = [t.strip() for t in task.split(",") if t.strip()] + is_async = env_cls is gym.vector.AsyncVectorEnv + cached_obs_space: spaces.Space | None = None + cached_act_space: spaces.Space | None = None + cached_metadata: dict[str, Any] | None = None + out: dict[str, dict[int, gym.vector.VectorEnv]] = {} + for task_name in task_names: + envs_by_task: dict[int, gym.vector.VectorEnv] = {} + for task_id in task_ids: + fns = _make_env_fns( + task=task_name, + n_envs=n_envs, + action_space_type=action_space_type, + dataset=dataset, + episode_length=episode_length, + task_id=task_id, + ) + if is_async: + lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata) + if cached_obs_space is None: + cached_obs_space = lazy.observation_space + cached_act_space = lazy.action_space + cached_metadata = lazy.metadata + envs_by_task[task_id] = lazy + else: + envs_by_task[task_id] = env_cls(fns) + out[task_name] = envs_by_task + return out diff --git 
a/src/lerobot/envs/robotwin.py b/src/lerobot/envs/robotwin.py new file mode 100644 index 000000000..823f14fa0 --- /dev/null +++ b/src/lerobot/envs/robotwin.py @@ -0,0 +1,488 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import importlib +import logging +from collections import defaultdict +from collections.abc import Callable, Sequence +from functools import partial +from typing import Any + +import gymnasium as gym +import numpy as np +import torch +from gymnasium import spaces + +from lerobot.types import RobotObservation + +from .utils import _LazyAsyncVectorEnv + +logger = logging.getLogger(__name__) + +# Camera names as used by RoboTwin 2.0. The wrapper appends "_rgb" when looking +# up keys in get_obs() output (e.g. "head_camera" → "head_camera_rgb"). +ROBOTWIN_CAMERA_NAMES: tuple[str, ...] = ( + "head_camera", + "left_camera", + "right_camera", +) + +ACTION_DIM = 14 # 7 DOF × 2 arms +ACTION_LOW = -1.0 +ACTION_HIGH = 1.0 +DEFAULT_EPISODE_LENGTH = 300 +# D435 dims from task_config/_camera_config.yml (what demo_clean.yml selects). +DEFAULT_CAMERA_H = 240 +DEFAULT_CAMERA_W = 320 + +# Task list from RoboTwin 2.0's `envs/` directory — mirrors upstream exactly +# (50 tasks as of main; earlier revisions had 60 with a different split). 
+# Keep this in sync with: +# gh api /repos/RoboTwin-Platform/RoboTwin/contents/envs --paginate \ +# | jq -r '.[].name' | grep -E '\.py$' | grep -v '^_' | sed 's/\.py$//' +ROBOTWIN_TASKS: tuple[str, ...] = ( + "adjust_bottle", + "beat_block_hammer", + "blocks_ranking_rgb", + "blocks_ranking_size", + "click_alarmclock", + "click_bell", + "dump_bin_bigbin", + "grab_roller", + "handover_block", + "handover_mic", + "hanging_mug", + "lift_pot", + "move_can_pot", + "move_pillbottle_pad", + "move_playingcard_away", + "move_stapler_pad", + "open_laptop", + "open_microwave", + "pick_diverse_bottles", + "pick_dual_bottles", + "place_a2b_left", + "place_a2b_right", + "place_bread_basket", + "place_bread_skillet", + "place_burger_fries", + "place_can_basket", + "place_cans_plasticbox", + "place_container_plate", + "place_dual_shoes", + "place_empty_cup", + "place_fan", + "place_mouse_pad", + "place_object_basket", + "place_object_scale", + "place_object_stand", + "place_phone_stand", + "place_shoe", + "press_stapler", + "put_bottles_dustbin", + "put_object_cabinet", + "rotate_qrcode", + "scan_object", + "shake_bottle", + "shake_bottle_horizontally", + "stack_blocks_three", + "stack_blocks_two", + "stack_bowls_three", + "stack_bowls_two", + "stamp_seal", + "turn_switch", +) + + +_ROBOTWIN_SETUP_CACHE: dict[str, dict[str, Any]] = {} + + +def _load_robotwin_setup_kwargs(task_name: str) -> dict[str, Any]: + """Build the kwargs dict RoboTwin's setup_demo expects. + + Mirrors the config loading done by RoboTwin's ``script/eval_policy.py``: + reads ``task_config/demo_clean.yml``, resolves the embodiment file from + ``_embodiment_config.yml``, loads the robot's own ``config.yml``, and + reads camera dimensions from ``_camera_config.yml``. + + Uses ``aloha-agilex`` single-robot dual-arm by default (the only embodiment + used by beat_block_hammer and most smoke-test tasks). 
+ """ + if task_name in _ROBOTWIN_SETUP_CACHE: + return dict(_ROBOTWIN_SETUP_CACHE[task_name]) + + import os + + import yaml # type: ignore[import-untyped] + from envs import CONFIGS_PATH # type: ignore[import-not-found] + + task_config = "demo_clean" + with open(os.path.join(CONFIGS_PATH, f"{task_config}.yml"), encoding="utf-8") as f: + args = yaml.safe_load(f) + + # Resolve embodiment — demo_clean.yml uses [aloha-agilex] (dual-arm single robot) + with open(os.path.join(CONFIGS_PATH, "_embodiment_config.yml"), encoding="utf-8") as f: + embodiment_types = yaml.safe_load(f) + embodiment = args.get("embodiment", ["aloha-agilex"]) + if len(embodiment) == 1: + robot_file = embodiment_types[embodiment[0]]["file_path"] + args["left_robot_file"] = robot_file + args["right_robot_file"] = robot_file + args["dual_arm_embodied"] = True + elif len(embodiment) == 3: + args["left_robot_file"] = embodiment_types[embodiment[0]]["file_path"] + args["right_robot_file"] = embodiment_types[embodiment[1]]["file_path"] + args["embodiment_dis"] = embodiment[2] + args["dual_arm_embodied"] = False + else: + raise ValueError(f"embodiment must have 1 or 3 items, got {len(embodiment)}") + + with open(os.path.join(args["left_robot_file"], "config.yml"), encoding="utf-8") as f: + args["left_embodiment_config"] = yaml.safe_load(f) + with open(os.path.join(args["right_robot_file"], "config.yml"), encoding="utf-8") as f: + args["right_embodiment_config"] = yaml.safe_load(f) + + # Camera dimensions + with open(os.path.join(CONFIGS_PATH, "_camera_config.yml"), encoding="utf-8") as f: + camera_config = yaml.safe_load(f) + head_cam = args["camera"]["head_camera_type"] + args["head_camera_h"] = camera_config[head_cam]["h"] + args["head_camera_w"] = camera_config[head_cam]["w"] + + # Headless overrides + args["render_freq"] = 0 + args["task_name"] = task_name + args["task_config"] = task_config + + _ROBOTWIN_SETUP_CACHE[task_name] = args + return dict(args) + + +def _load_robotwin_task(task_name: str) 
-> type: + """Dynamically import and return a RoboTwin 2.0 task class. + + RoboTwin tasks live in ``envs/<task_name>.py`` relative to the repository + root and are expected to be on ``sys.path`` after installation. + """ + try: + module = importlib.import_module(f"envs.{task_name}") + except ModuleNotFoundError as e: + raise ModuleNotFoundError( + f"Could not import RoboTwin task '{task_name}'. " + "Ensure RoboTwin 2.0 is installed and its 'envs/' directory is on PYTHONPATH. " + "See the RoboTwin installation guide: https://robotwin-platform.github.io/doc/usage/robotwin-install.html" + ) from e + task_cls = getattr(module, task_name, None) + if task_cls is None: + raise AttributeError(f"Task class '{task_name}' not found in envs/{task_name}.py") + return task_cls + + +class RoboTwinEnv(gym.Env): + """Gymnasium wrapper around a single RoboTwin 2.0 task. + + RoboTwin uses a custom SAPIEN-based API (``setup_demo`` / ``get_obs`` / + ``take_action`` / ``check_success``) rather than the standard gym interface. + This class bridges that API to Gymnasium so that ``lerobot-eval`` can drive + RoboTwin exactly like LIBERO or Meta-World. + + The underlying SAPIEN environment is created lazily on the first ``reset()`` + call *inside the worker process*. This is required for + ``gym.vector.AsyncVectorEnv`` compatibility: SAPIEN allocates EGL/GPU + contexts that must not be forked from the parent process. + + Observations + ------------ + The ``pixels`` dict uses the raw RoboTwin camera names as keys (e.g. + ``"head_camera"``, ``"left_camera"``). ``preprocess_observation`` in + ``envs/utils.py`` then converts these to ``observation.images.<camera_name>``. + + Actions + ------- + 14-dim float32 array in ``[-1, 1]`` (joint-space, 7 DOF per arm). + + Autograd + -------- + ``setup_demo`` and ``take_action`` drive CuRobo's Newton trajectory + optimizer, which calls ``cost.backward()`` internally. lerobot_eval wraps + the rollout in ``torch.no_grad()``, so both call sites re-enable grad.
+ """ + + metadata = {"render_modes": ["rgb_array"], "render_fps": 25} + + def __init__( + self, + task_name: str, + episode_index: int = 0, + n_envs: int = 1, + camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES, + observation_height: int | None = None, + observation_width: int | None = None, + episode_length: int = DEFAULT_EPISODE_LENGTH, + render_mode: str = "rgb_array", + ): + super().__init__() + self.task_name = task_name + self.task = task_name # used by add_envs_task() in utils.py + self.task_description = task_name.replace("_", " ") + self.episode_index = episode_index + self._reset_stride = n_envs + self.camera_names = list(camera_names) + # Default to D435 dims (the camera type baked into task_config/demo_clean.yml). + # The YAML-driven lookup is deferred to reset() so construction doesn't + # import RoboTwin's `envs` module — fast-tests run without RoboTwin installed. + self.observation_height = observation_height or DEFAULT_CAMERA_H + self.observation_width = observation_width or DEFAULT_CAMERA_W + self.episode_length = episode_length + self._max_episode_steps = episode_length # lerobot_eval.rollout reads this + self.render_mode = render_mode + + self._env: Any | None = None # deferred — created on first reset() inside worker + self._step_count: int = 0 + self._black_frame = np.zeros((self.observation_height, self.observation_width, 3), dtype=np.uint8) + + image_spaces = { + cam: spaces.Box( + low=0, + high=255, + shape=(self.observation_height, self.observation_width, 3), + dtype=np.uint8, + ) + for cam in self.camera_names + } + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict(image_spaces), + "agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(ACTION_DIM,), dtype=np.float32), + } + ) + self.action_space = spaces.Box( + low=ACTION_LOW, high=ACTION_HIGH, shape=(ACTION_DIM,), dtype=np.float32 + ) + + def _ensure_env(self) -> None: + """Create the SAPIEN environment on first use. 
+ + Called inside the worker subprocess after fork(), so each worker gets + its own EGL/GPU context rather than inheriting a stale one from the + parent process (which causes crashes with AsyncVectorEnv). + """ + if self._env is not None: + return + task_cls = _load_robotwin_task(self.task_name) + self._env = task_cls() + + def _get_obs(self) -> RobotObservation: + assert self._env is not None, "_get_obs called before _ensure_env()" + raw = self._env.get_obs() + cameras_raw = raw.get("observation", {}) + + images: dict[str, np.ndarray] = {} + for cam in self.camera_names: + cam_data = cameras_raw.get(cam) + img = cam_data.get("rgb") if cam_data else None + if img is None: + images[cam] = self._black_frame + continue + img = np.asarray(img, dtype=np.uint8) + if img.ndim == 2: + img = np.stack([img, img, img], axis=-1) + elif img.shape[-1] != 3: + img = img[..., :3] + images[cam] = img + + ja = raw.get("joint_action") or {} + vec = ja.get("vector") + if vec is not None: + arr = np.asarray(vec, dtype=np.float32).ravel() + joint_state = ( + arr[:ACTION_DIM] if arr.size >= ACTION_DIM else np.zeros(ACTION_DIM, dtype=np.float32) + ) + else: + joint_state = np.zeros(ACTION_DIM, dtype=np.float32) + + return {"pixels": images, "agent_pos": joint_state} + + def reset(self, seed: int | None = None, **kwargs) -> tuple[RobotObservation, dict]: + self._ensure_env() + super().reset(seed=seed) + assert self._env is not None # set by _ensure_env() above + + actual_seed = self.episode_index if seed is None else seed + setup_kwargs = _load_robotwin_setup_kwargs(self.task_name) + setup_kwargs.update(seed=actual_seed, is_test=True) + with torch.enable_grad(): + self._env.setup_demo(**setup_kwargs) + self.episode_index += self._reset_stride + self._step_count = 0 + + obs = self._get_obs() + return obs, {"is_success": False, "task": self.task_name} + + def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]: + assert self._env is not None, "step() 
called before reset()" + if action.ndim != 1 or action.shape[0] != ACTION_DIM: + raise ValueError(f"Expected 1-D action of shape ({ACTION_DIM},), got {action.shape}") + + with torch.enable_grad(): + if hasattr(self._env, "take_action"): + self._env.take_action(action) + else: + self._env.step(action) + + self._step_count += 1 + + is_success = bool(getattr(self._env, "eval_success", False)) + if not is_success and hasattr(self._env, "check_success"): + is_success = bool(self._env.check_success()) + + obs = self._get_obs() + reward = float(is_success) + terminated = is_success + truncated = self._step_count >= self.episode_length + + info: dict[str, Any] = { + "task": self.task_name, + "is_success": is_success, + "step": self._step_count, + } + if terminated or truncated: + info["final_info"] = { + "task": self.task_name, + "is_success": is_success, + } + self.reset() + + return obs, reward, terminated, truncated, info + + def render(self) -> np.ndarray: + self._ensure_env() + obs = self._get_obs() + # Prefer head camera for rendering; fall back to first available. 
+ if "head_camera" in obs["pixels"]: + return obs["pixels"]["head_camera"] + return next(iter(obs["pixels"].values())) + + def close(self) -> None: + if self._env is not None: + if hasattr(self._env, "close_env"): + import contextlib + + with contextlib.suppress(TypeError): + self._env.close_env() + self._env = None + + +# ---- Multi-task factory -------------------------------------------------------- + + +def _make_env_fns( + *, + task_name: str, + n_envs: int, + camera_names: list[str], + observation_height: int, + observation_width: int, + episode_length: int, +) -> list[Callable[[], RoboTwinEnv]]: + """Return n_envs factory callables for a single task.""" + + def _make_one(episode_index: int) -> RoboTwinEnv: + return RoboTwinEnv( + task_name=task_name, + episode_index=episode_index, + n_envs=n_envs, + camera_names=camera_names, + observation_height=observation_height, + observation_width=observation_width, + episode_length=episode_length, + ) + + return [partial(_make_one, i) for i in range(n_envs)] + + +def create_robotwin_envs( + task: str, + n_envs: int, + env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None, + camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES, + observation_height: int = DEFAULT_CAMERA_H, + observation_width: int = DEFAULT_CAMERA_W, + episode_length: int = DEFAULT_EPISODE_LENGTH, +) -> dict[str, dict[int, Any]]: + """Create vectorized RoboTwin 2.0 environments. + + Returns: + ``dict[task_name][0] -> VectorEnv`` — one entry per task, each wrapping + ``n_envs`` parallel rollouts. + + Args: + task: Comma-separated list of task names (e.g. ``"beat_block_hammer"`` + or ``"beat_block_hammer,click_bell"``). + n_envs: Number of parallel rollouts per task. + env_cls: Vector env constructor (e.g. ``gym.vector.AsyncVectorEnv``). + camera_names: Cameras to include in observations. + observation_height: Pixel height for all cameras. + observation_width: Pixel width for all cameras. + episode_length: Max steps before truncation. 
+ """ + if env_cls is None or not callable(env_cls): + raise ValueError("env_cls must be callable (e.g. gym.vector.AsyncVectorEnv).") + if not isinstance(n_envs, int) or n_envs <= 0: + raise ValueError(f"n_envs must be a positive int; got {n_envs}.") + + task_names = [t.strip() for t in str(task).split(",") if t.strip()] + if not task_names: + raise ValueError("`task` must contain at least one RoboTwin task name.") + + unknown = [t for t in task_names if t not in ROBOTWIN_TASKS] + if unknown: + raise ValueError(f"Unknown RoboTwin tasks: {unknown}. Available tasks: {sorted(ROBOTWIN_TASKS)}") + + logger.info( + "Creating RoboTwin envs | tasks=%s | n_envs(per task)=%d", + task_names, + n_envs, + ) + + is_async = env_cls is gym.vector.AsyncVectorEnv + cached_obs_space: spaces.Space | None = None + cached_act_space: spaces.Space | None = None + cached_metadata: dict[str, Any] | None = None + + out: dict[str, dict[int, Any]] = defaultdict(dict) + for task_name in task_names: + fns = _make_env_fns( + task_name=task_name, + n_envs=n_envs, + camera_names=list(camera_names), + observation_height=observation_height, + observation_width=observation_width, + episode_length=episode_length, + ) + if is_async: + lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata) + if cached_obs_space is None: + cached_obs_space = lazy.observation_space + cached_act_space = lazy.action_space + cached_metadata = lazy.metadata + out[task_name][0] = lazy + else: + out[task_name][0] = env_cls(fns) + logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs) + + return {k: dict(v) for k, v in out.items()} diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py index b0d834a05..6e6f352e9 100644 --- a/src/lerobot/envs/utils.py +++ b/src/lerobot/envs/utils.py @@ -34,6 +34,25 @@ from lerobot.utils.utils import get_channel_first_image_shape from .configs import EnvConfig +def parse_camera_names(camera_name: str | Sequence[str]) -> list[str]: + 
"""Normalize ``camera_name`` into a non-empty list of strings. + + Accepts a comma-separated string (``"cam_a,cam_b"``) or a sequence of + strings (tuples/lists). Whitespace is stripped; empty entries are + dropped. Raises ``TypeError`` for unsupported input types and + ``ValueError`` when the normalized list is empty. + """ + if isinstance(camera_name, str): + cams = [c.strip() for c in camera_name.split(",") if c.strip()] + elif isinstance(camera_name, (list | tuple)): + cams = [str(c).strip() for c in camera_name if str(c).strip()] + else: + raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}") + if not cams: + raise ValueError("camera_name resolved to an empty list.") + return cams + + def _convert_nested_dict(d): result = {} for k, v in d.items(): @@ -153,17 +172,20 @@ class _LazyAsyncVectorEnv: env_fns: list[Callable], observation_space=None, action_space=None, + metadata=None, ): self._env_fns = env_fns self._env: gym.vector.AsyncVectorEnv | None = None self.num_envs = len(env_fns) - if observation_space is not None and action_space is not None: + if observation_space is not None and action_space is not None and metadata is not None: self.observation_space = observation_space self.action_space = action_space + self.metadata = metadata else: tmp = env_fns[0]() self.observation_space = tmp.observation_space self.action_space = tmp.action_space + self.metadata = tmp.metadata tmp.close() self.single_observation_space = self.observation_space self.single_action_space = self.action_space @@ -172,6 +194,10 @@ class _LazyAsyncVectorEnv: if self._env is None: self._env = gym.vector.AsyncVectorEnv(self._env_fns, context="forkserver", shared_memory=True) + @property + def unwrapped(self): + return self + def reset(self, **kwargs): self._ensure() return self._env.reset(**kwargs) diff --git a/tests/envs/test_robotwin.py b/tests/envs/test_robotwin.py new file mode 100644 index 000000000..fcd45adbf --- /dev/null +++ 
b/tests/envs/test_robotwin.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for the RoboTwin 2.0 Gymnasium wrapper. + +These tests mock out the SAPIEN-based RoboTwin runtime (task modules + +YAML config loader) so they run without the full RoboTwin installation +(SAPIEN, CuRobo, mplib, asset downloads, etc.). +""" + +from __future__ import annotations + +from contextlib import contextmanager +from unittest.mock import MagicMock, patch + +import gymnasium as gym +import numpy as np +import pytest + +from lerobot.envs.robotwin import ( + ACTION_DIM, + ROBOTWIN_CAMERA_NAMES, + ROBOTWIN_TASKS, + RoboTwinEnv, + create_robotwin_envs, +) + +# --------------------------------------------------------------------------- +# Fixtures / helpers +# --------------------------------------------------------------------------- + + +def _make_mock_task_env( + height: int = 240, + width: int = 320, + cameras: tuple[str, ...] = ROBOTWIN_CAMERA_NAMES, +) -> MagicMock: + """Return a mock that mimics the RoboTwin task class API. + + RoboTwin's real get_obs returns + {"observation": {cam: {"rgb": img}}, "joint_action": {"vector": np.ndarray}, ...} + so the mock follows the same nested shape. 
+ """ + obs_dict = { + "observation": {cam: {"rgb": np.zeros((height, width, 3), dtype=np.uint8)} for cam in cameras}, + "joint_action": {"vector": np.zeros(ACTION_DIM, dtype=np.float32)}, + "endpose": {}, + } + + mock = MagicMock() + mock.get_obs.return_value = obs_dict + mock.setup_demo.return_value = None + mock.take_action.return_value = None + mock.eval_success = False + mock.check_success.return_value = False + mock.close_env.return_value = None + return mock + + +@contextmanager +def _patch_runtime(mock_task_instance: MagicMock): + """Patch both the task-class loader and the YAML config loader so the + env can construct + reset without a real RoboTwin install.""" + task_cls = MagicMock(return_value=mock_task_instance) + fake_setup = { + "head_camera_h": 240, + "head_camera_w": 320, + "left_embodiment_config": {}, + "right_embodiment_config": {}, + "left_robot_file": "", + "right_robot_file": "", + "dual_arm_embodied": True, + "render_freq": 0, + "task_name": "beat_block_hammer", + "task_config": "demo_clean", + } + with ( + patch("lerobot.envs.robotwin._load_robotwin_task", return_value=task_cls), + patch("lerobot.envs.robotwin._load_robotwin_setup_kwargs", return_value=fake_setup), + ): + yield + + +# --------------------------------------------------------------------------- +# RoboTwinEnv unit tests +# --------------------------------------------------------------------------- + + +class TestRoboTwinEnv: + def test_observation_space_shape(self): + """observation_space should have the configured h×w×3 for every camera.""" + h, w = 240, 320 + env = RoboTwinEnv( + task_name="beat_block_hammer", + observation_height=h, + observation_width=w, + camera_names=["head_camera", "left_camera"], + ) + pixels_space = env.observation_space["pixels"] + assert pixels_space["head_camera"].shape == (h, w, 3) + assert pixels_space["left_camera"].shape == (h, w, 3) + assert "right_camera" not in pixels_space + + def test_action_space(self): + env = 
RoboTwinEnv(task_name="beat_block_hammer") + assert env.action_space.shape == (ACTION_DIM,) + assert env.action_space.dtype == np.float32 + + def test_reset_returns_correct_obs_keys(self): + mock_task = _make_mock_task_env() + env = RoboTwinEnv(task_name="beat_block_hammer") + with _patch_runtime(mock_task): + obs, info = env.reset() + + assert "pixels" in obs + for cam in ROBOTWIN_CAMERA_NAMES: + assert cam in obs["pixels"], f"Missing camera '{cam}' in obs" + assert "agent_pos" in obs + assert obs["agent_pos"].shape == (ACTION_DIM,) + assert info["is_success"] is False + + def test_reset_calls_setup_demo(self): + mock_task = _make_mock_task_env() + env = RoboTwinEnv(task_name="beat_block_hammer") + with _patch_runtime(mock_task): + env.reset(seed=42) + # setup_demo receives the full YAML-derived kwargs plus seed + is_test; + # we only assert the caller-provided bits. + assert mock_task.setup_demo.call_count == 1 + call_kwargs = mock_task.setup_demo.call_args.kwargs + assert call_kwargs["seed"] == 42 + assert call_kwargs["is_test"] is True + + def test_step_returns_correct_types(self): + mock_task = _make_mock_task_env() + env = RoboTwinEnv(task_name="beat_block_hammer") + action = np.zeros(ACTION_DIM, dtype=np.float32) + with _patch_runtime(mock_task): + env.reset() + obs, reward, terminated, truncated, info = env.step(action) + + assert isinstance(obs, dict) + assert isinstance(reward, float) + assert isinstance(terminated, bool) + assert isinstance(truncated, bool) + assert isinstance(info, dict) + + def test_step_wrong_action_shape_raises(self): + mock_task = _make_mock_task_env() + env = RoboTwinEnv(task_name="beat_block_hammer") + bad_action = np.zeros(7, dtype=np.float32) # wrong dim + with _patch_runtime(mock_task): + env.reset() + with pytest.raises(ValueError, match="Expected 1-D action"): + env.step(bad_action) + + def test_success_terminates_episode(self): + mock_task = _make_mock_task_env() + mock_task.check_success.return_value = True + env = 
RoboTwinEnv(task_name="beat_block_hammer") + action = np.zeros(ACTION_DIM, dtype=np.float32) + with _patch_runtime(mock_task): + env.reset() + _, _, terminated, _, info = env.step(action) + assert terminated is True + assert info["is_success"] is True + + def test_truncation_after_episode_length(self): + mock_task = _make_mock_task_env() + env = RoboTwinEnv(task_name="beat_block_hammer", episode_length=2) + action = np.zeros(ACTION_DIM, dtype=np.float32) + with _patch_runtime(mock_task): + env.reset() + env.step(action) # step 1 + _, _, _, truncated, _ = env.step(action) # step 2 → truncated + assert truncated is True + + def test_close_calls_close_env(self): + mock_task = _make_mock_task_env() + env = RoboTwinEnv(task_name="beat_block_hammer") + with _patch_runtime(mock_task): + env.reset() + env.close() + mock_task.close_env.assert_called_once() + + def test_black_frame_for_missing_camera(self): + """If a camera key is absent from get_obs(), a black frame is returned.""" + # Mock exposes only head_camera; we ask for both head_camera + left_camera. 
+ mock_task = _make_mock_task_env(height=10, width=10, cameras=("head_camera",)) + env = RoboTwinEnv( + task_name="beat_block_hammer", + camera_names=["head_camera", "left_camera"], + observation_height=10, + observation_width=10, + ) + with _patch_runtime(mock_task): + obs, _ = env.reset() + assert obs["pixels"]["left_camera"].shape == (10, 10, 3) + assert obs["pixels"]["left_camera"].sum() == 0 + + def test_task_and_task_description_attributes(self): + env = RoboTwinEnv(task_name="beat_block_hammer") + assert env.task == "beat_block_hammer" + assert isinstance(env.task_description, str) + + def test_deferred_init_env_is_none_before_reset(self): + env = RoboTwinEnv(task_name="beat_block_hammer") + assert env._env is None # noqa: SLF001 (testing internal state) + + +# --------------------------------------------------------------------------- +# create_robotwin_envs tests +# --------------------------------------------------------------------------- + + +class TestCreateRoboTwinEnvs: + def test_returns_correct_structure(self): + mock_task = _make_mock_task_env() + with _patch_runtime(mock_task): + envs = create_robotwin_envs( + task="beat_block_hammer", + n_envs=1, + env_cls=gym.vector.SyncVectorEnv, + ) + assert "beat_block_hammer" in envs + assert 0 in envs["beat_block_hammer"] + assert isinstance(envs["beat_block_hammer"][0], gym.vector.SyncVectorEnv) + + def test_multi_task(self): + mock_task = _make_mock_task_env() + with _patch_runtime(mock_task): + envs = create_robotwin_envs( + task="beat_block_hammer,click_bell", + n_envs=1, + env_cls=gym.vector.SyncVectorEnv, + ) + assert set(envs.keys()) == {"beat_block_hammer", "click_bell"} + + def test_unknown_task_raises(self): + with pytest.raises(ValueError, match="Unknown RoboTwin tasks"): + create_robotwin_envs( + task="not_a_real_task", + n_envs=1, + env_cls=gym.vector.SyncVectorEnv, + ) + + def test_invalid_n_envs_raises(self): + with pytest.raises(ValueError, match="n_envs must be a positive int"): + 
create_robotwin_envs( + task="beat_block_hammer", + n_envs=0, + env_cls=gym.vector.SyncVectorEnv, + ) + + +# --------------------------------------------------------------------------- +# ROBOTWIN_TASKS list +# --------------------------------------------------------------------------- + + +def test_task_list_not_empty(): + assert len(ROBOTWIN_TASKS) >= 50 + + +def test_all_tasks_are_strings(): + assert all(isinstance(t, str) and t for t in ROBOTWIN_TASKS) + + +def test_no_duplicate_tasks(): + assert len(ROBOTWIN_TASKS) == len(set(ROBOTWIN_TASKS)) diff --git a/tests/test_robomme_env.py b/tests/test_robomme_env.py new file mode 100644 index 000000000..20646430a --- /dev/null +++ b/tests/test_robomme_env.py @@ -0,0 +1,232 @@ +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for the RoboMME env wrapper and config. + +RoboMME requires Linux + ManiSkill (Vulkan/SAPIEN), so tests that touch the +env wrapper mock the ``robomme`` package. Tests that only exercise the +dataclass config run without any mocking. 
+""" + +from __future__ import annotations + +import sys +from types import ModuleType +from unittest.mock import MagicMock + +import numpy as np + + +def _install_robomme_stub(): + """Register a minimal stub for the ``robomme`` package on sys.modules.""" + stub = ModuleType("robomme") + wrapper_stub = ModuleType("robomme.env_record_wrapper") + + class FakeBuilder: + def __init__(self, **kwargs): + pass + + def make_env_for_episode(self, episode_idx: int, max_steps: int): + env = MagicMock() + obs = { + "front_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)], + "wrist_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)], + "joint_state_list": [np.zeros(7, dtype=np.float32)], + "gripper_state_list": [np.zeros(2, dtype=np.float32)], + } + env.reset.return_value = (obs, {"status": "ongoing", "task_goal": "pick the cube"}) + env.step.return_value = (obs, 0.0, False, False, {"status": "ongoing", "task_goal": ""}) + return env + + wrapper_stub.BenchmarkEnvBuilder = FakeBuilder + stub.env_record_wrapper = wrapper_stub + sys.modules["robomme"] = stub + sys.modules["robomme.env_record_wrapper"] = wrapper_stub + + +def _uninstall_robomme_stub(): + sys.modules.pop("robomme", None) + sys.modules.pop("robomme.env_record_wrapper", None) + + +# --------------------------------------------------------------------------- +# Config tests (no sim required) +# --------------------------------------------------------------------------- + + +def test_robomme_env_config_defaults(): + from lerobot.envs.configs import RoboMMEEnv + + cfg = RoboMMEEnv() + assert cfg.task == "PickXtimes" + assert cfg.fps == 10 + assert cfg.episode_length == 300 + assert cfg.action_space == "joint_angle" + assert cfg.dataset_split == "test" + assert cfg.task_ids is None + + +def test_robomme_env_config_type(): + from lerobot.envs.configs import RoboMMEEnv + + cfg = RoboMMEEnv() + assert cfg.type == "robomme" + + +def test_robomme_features_map(): + from lerobot.envs.configs import RoboMMEEnv + from 
lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE + + cfg = RoboMMEEnv() + assert cfg.features_map[ACTION] == ACTION + assert cfg.features_map["pixels/image"] == f"{OBS_IMAGES}.image" + assert cfg.features_map["pixels/wrist_image"] == f"{OBS_IMAGES}.wrist_image" + assert cfg.features_map["agent_pos"] == OBS_STATE + + +def test_robomme_features_action_dim_joint_angle(): + from lerobot.envs.configs import RoboMMEEnv + from lerobot.utils.constants import ACTION + + cfg = RoboMMEEnv(action_space="joint_angle") + assert cfg.features[ACTION].shape == (8,) + + +def test_robomme_features_action_dim_ee_pose(): + """`ee_pose` uses a 7-D action; __post_init__ sets the correct shape.""" + from lerobot.envs.configs import RoboMMEEnv + from lerobot.utils.constants import ACTION + + cfg = RoboMMEEnv(action_space="ee_pose") + assert cfg.features[ACTION].shape == (7,) + + +# --------------------------------------------------------------------------- +# Obs conversion (pure Python, no sim) +# --------------------------------------------------------------------------- + + +def test_convert_obs_list_format(): + """_convert_obs takes the last element from list-format obs fields and + emits a nested ``pixels`` dict (image, wrist_image) plus ``agent_pos``. + + The nested layout is required so ``preprocess_observation()`` in + ``envs/utils.py`` maps each camera to ``observation.images.``. 
+ """ + _install_robomme_stub() + try: + from lerobot.envs.robomme import RoboMMEGymEnv + + env = RoboMMEGymEnv.__new__(RoboMMEGymEnv) + + front = np.full((256, 256, 3), 42, dtype=np.uint8) + wrist = np.full((256, 256, 3), 7, dtype=np.uint8) + joints = np.arange(7, dtype=np.float32) + gripper = np.array([0.5, 0.5], dtype=np.float32) + + obs_raw = { + "front_rgb_list": [np.zeros_like(front), front], + "wrist_rgb_list": [np.zeros_like(wrist), wrist], + "joint_state_list": [np.zeros(7, dtype=np.float32), joints], + "gripper_state_list": [np.zeros(2, dtype=np.float32), gripper], + } + + result = env._convert_obs(obs_raw) + np.testing.assert_array_equal(result["pixels"]["image"], front) + np.testing.assert_array_equal(result["pixels"]["wrist_image"], wrist) + assert result["agent_pos"].shape == (8,) + np.testing.assert_array_almost_equal(result["agent_pos"][:7], joints) + assert result["agent_pos"][7] == gripper[0] + finally: + _uninstall_robomme_stub() + + +def test_convert_obs_array_format(): + """_convert_obs also handles non-list (direct array) obs.""" + _install_robomme_stub() + try: + from lerobot.envs.robomme import RoboMMEGymEnv + + env = RoboMMEGymEnv.__new__(RoboMMEGymEnv) + + front = np.zeros((256, 256, 3), dtype=np.uint8) + obs_raw = { + "front_rgb_list": front, + "wrist_rgb_list": front, + "joint_state_list": np.zeros(7, dtype=np.float32), + "gripper_state_list": np.zeros(2, dtype=np.float32), + } + result = env._convert_obs(obs_raw) + assert result["pixels"]["image"].shape == (256, 256, 3) + assert result["pixels"]["wrist_image"].shape == (256, 256, 3) + assert result["agent_pos"].shape == (8,) + finally: + _uninstall_robomme_stub() + + +# --------------------------------------------------------------------------- +# create_robomme_envs (mocked sim) +# --------------------------------------------------------------------------- + + +def test_create_robomme_envs_returns_correct_structure(): + """Single task -> {task_name: {task_id: VectorEnv}} with one entry 
per task_id.""" + _install_robomme_stub() + try: + from lerobot.envs.robomme import create_robomme_envs + + env_cls = MagicMock(return_value=MagicMock()) + result = create_robomme_envs( + task="PickXtimes", + n_envs=1, + task_ids=[0, 1], + env_cls=env_cls, + ) + + assert "PickXtimes" in result + assert 0 in result["PickXtimes"] + assert 1 in result["PickXtimes"] + assert env_cls.call_count == 2 + finally: + _uninstall_robomme_stub() + + +def test_create_robomme_envs_multi_task(): + """Comma-separated task list produces one suite per task.""" + _install_robomme_stub() + try: + from lerobot.envs.robomme import create_robomme_envs + + env_cls = MagicMock(return_value=MagicMock()) + result = create_robomme_envs( + task="PickXtimes,BinFill,StopCube", + n_envs=1, + env_cls=env_cls, + ) + + assert set(result.keys()) == {"PickXtimes", "BinFill", "StopCube"} + finally: + _uninstall_robomme_stub() + + +def test_create_robomme_envs_raises_on_invalid_env_cls(): + _install_robomme_stub() + try: + import pytest + + from lerobot.envs.robomme import create_robomme_envs + + with pytest.raises(ValueError, match="env_cls must be a callable"): + create_robomme_envs(task="PickXtimes", n_envs=1, env_cls=None) + finally: + _uninstall_robomme_stub()