mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-31 19:01:28 +00:00
Merge branch 'main' into codex/model-profiling
This commit is contained in:
541
.github/workflows/benchmark_tests.yml
vendored
541
.github/workflows/benchmark_tests.yml
vendored
@@ -83,10 +83,13 @@ jobs:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
# Build the benchmark-specific image. The Dockerfile separates dep-install
|
||||
# from source-copy, so code-only changes skip the slow uv-sync layer
|
||||
@@ -115,7 +118,7 @@ jobs:
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=pepijn223/smolvla_libero \
|
||||
--policy.path=lerobot/smolvla_libero \
|
||||
--env.type=libero \
|
||||
--env.task=libero_spatial \
|
||||
--eval.batch_size=1 \
|
||||
@@ -144,7 +147,7 @@ jobs:
|
||||
--artifacts-dir /tmp/libero-artifacts \
|
||||
--env libero \
|
||||
--task libero_spatial \
|
||||
--policy pepijn223/smolvla_libero
|
||||
--policy lerobot/smolvla_libero
|
||||
|
||||
- name: Upload Libero rollout video
|
||||
if: always()
|
||||
@@ -238,10 +241,13 @@ jobs:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
- name: Build MetaWorld benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
@@ -264,7 +270,7 @@ jobs:
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=pepijn223/smolvla_metaworld \
|
||||
--policy.path=lerobot/smolvla_metaworld \
|
||||
--env.type=metaworld \
|
||||
--env.task=metaworld-push-v3 \
|
||||
--eval.batch_size=1 \
|
||||
@@ -293,7 +299,7 @@ jobs:
|
||||
--artifacts-dir /tmp/metaworld-artifacts \
|
||||
--env metaworld \
|
||||
--task metaworld-push-v3 \
|
||||
--policy pepijn223/smolvla_metaworld
|
||||
--policy lerobot/smolvla_metaworld
|
||||
|
||||
- name: Upload MetaWorld rollout video
|
||||
if: always()
|
||||
@@ -310,3 +316,530 @@ jobs:
|
||||
name: metaworld-metrics
|
||||
path: /tmp/metaworld-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── ROBOTWIN 2.0 ──────────────────────────────────────────────────────────
|
||||
# Isolated image: full RoboTwin 2.0 stack — SAPIEN, mplib, CuRobo,
|
||||
# pytorch3d, + simulation assets (~4 GB).
|
||||
# Build takes ~20 min on first run; subsequent runs hit the layer cache.
|
||||
# Requires an NVIDIA GPU runner with CUDA 12.1 drivers.
|
||||
robotwin-integration-test:
|
||||
name: RoboTwin 2.0 — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
ROBOTWIN_POLICY: lerobot/smolvla_robotwin
|
||||
ROBOTWIN_TASKS: beat_block_hammer,click_bell,handover_block,stack_blocks_two,click_alarmclock,open_microwave,adjust_bottle,lift_pot,stamp_seal,turn_switch
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
# Build the full-install image: SAPIEN, mplib, CuRobo, pytorch3d +
|
||||
# simulation assets (~4 GB). Layer cache lives in the runner's local
|
||||
# Docker daemon — reused across re-runs on the same machine.
|
||||
- name: Build RoboTwin 2.0 benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.robotwin
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-robotwin:ci
|
||||
cache-from: type=local,src=/tmp/.buildx-cache-robotwin
|
||||
cache-to: type=local,dest=/tmp/.buildx-cache-robotwin,mode=max
|
||||
|
||||
- name: Run RoboTwin 2.0 smoke eval (10 tasks, 1 episode each)
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
# Named container (no --rm) so we can docker cp artifacts out.
|
||||
docker run --name robotwin-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e ROBOTWIN_POLICY="${ROBOTWIN_POLICY}" \
|
||||
-e ROBOTWIN_TASKS="${ROBOTWIN_TASKS}" \
|
||||
lerobot-benchmark-robotwin:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
cd /opt/robotwin && lerobot-eval \
|
||||
--policy.path=\"\$ROBOTWIN_POLICY\" \
|
||||
--env.type=robotwin \
|
||||
--env.task=\"\$ROBOTWIN_TASKS\" \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={\"observation.images.head_camera\": \"observation.images.camera1\", \"observation.images.left_camera\": \"observation.images.camera2\", \"observation.images.right_camera\": \"observation.images.camera3\"}' \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
python /lerobot/scripts/ci/extract_task_descriptions.py \
|
||||
--env robotwin \
|
||||
--task \"\$ROBOTWIN_TASKS\" \
|
||||
--output /tmp/eval-artifacts/task_descriptions.json
|
||||
"
|
||||
|
||||
- name: Copy RoboTwin artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/robotwin-artifacts
|
||||
docker cp robotwin-eval:/tmp/eval-artifacts/. /tmp/robotwin-artifacts/ 2>/dev/null || true
|
||||
docker rm -f robotwin-eval || true
|
||||
|
||||
- name: Parse RoboTwin eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/robotwin-artifacts \
|
||||
--env robotwin \
|
||||
--task "${ROBOTWIN_TASKS}" \
|
||||
--policy "${ROBOTWIN_POLICY}"
|
||||
|
||||
- name: Upload RoboTwin rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: robotwin-rollout-video
|
||||
path: /tmp/robotwin-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload RoboTwin eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: robotwin-metrics
|
||||
path: /tmp/robotwin-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── ROBOCASA365 ──────────────────────────────────────────────────────────
|
||||
# Isolated image: robocasa + robosuite installed manually as editable
|
||||
# clones (no `lerobot[robocasa]` extra — robocasa's setup.py pins
|
||||
# `lerobot==0.3.3`, which would shadow this repo's lerobot).
|
||||
robocasa-integration-test:
|
||||
name: RoboCasa365 — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
- name: Build RoboCasa365 benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.robocasa
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-robocasa:ci
|
||||
|
||||
- name: Run RoboCasa365 smoke eval (10 atomic tasks, 1 episode each)
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
docker run --name robocasa-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||
-e MUJOCO_GL=egl \
|
||||
lerobot-benchmark-robocasa:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_robocasa \
|
||||
--env.type=robocasa \
|
||||
--env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={\"observation.images.robot0_agentview_left\": \"observation.images.camera1\", \"observation.images.robot0_eye_in_hand\": \"observation.images.camera2\", \"observation.images.robot0_agentview_right\": \"observation.images.camera3\"}' \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
python scripts/ci/extract_task_descriptions.py \
|
||||
--env robocasa \
|
||||
--task CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove,CloseToasterOvenDoor,SlideDishwasherRack,TurnOnSinkFaucet,NavigateKitchen,TurnOnElectricKettle \
|
||||
--output /tmp/eval-artifacts/task_descriptions.json
|
||||
"
|
||||
|
||||
- name: Copy RoboCasa365 artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/robocasa-artifacts
|
||||
docker cp robocasa-eval:/tmp/eval-artifacts/. /tmp/robocasa-artifacts/ 2>/dev/null || true
|
||||
docker rm -f robocasa-eval || true
|
||||
|
||||
- name: Parse RoboCasa365 eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/robocasa-artifacts \
|
||||
--env robocasa \
|
||||
--task atomic_smoke_10 \
|
||||
--policy lerobot/smolvla_robocasa
|
||||
|
||||
- name: Upload RoboCasa365 rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robocasa-rollout-video
|
||||
path: /tmp/robocasa-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload RoboCasa365 eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robocasa-metrics
|
||||
path: /tmp/robocasa-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── ROBOCEREBRA ───────────────────────────────────────────────────────────
|
||||
# Reuses the LIBERO simulator (libero_10 suite) with RoboCerebra camera
|
||||
# defaults (image/wrist_image). The image is layered on
|
||||
# huggingface/lerobot-gpu, which already ships [libero] as part of [all].
|
||||
robocerebra-integration-test:
|
||||
name: RoboCerebra — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
- name: Build RoboCerebra benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.robocerebra
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-robocerebra:ci
|
||||
cache-from: type=local,src=/tmp/.buildx-cache-robocerebra
|
||||
cache-to: type=local,dest=/tmp/.buildx-cache-robocerebra,mode=max
|
||||
|
||||
- name: Run RoboCerebra smoke eval (1 episode)
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
docker run --name robocerebra-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||
-e LIBERO_DATA_FOLDER=/tmp/libero_data \
|
||||
lerobot-benchmark-robocerebra:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_robocerebra \
|
||||
--env.type=libero \
|
||||
--env.task=libero_10 \
|
||||
--env.fps=20 \
|
||||
--env.obs_type=pixels_agent_pos \
|
||||
--env.observation_height=256 \
|
||||
--env.observation_width=256 \
|
||||
'--env.camera_name_mapping={\"agentview_image\": \"image\", \"robot0_eye_in_hand_image\": \"wrist_image\"}' \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
|
||||
--policy.empty_cameras=1 \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
python scripts/ci/extract_task_descriptions.py \
|
||||
--env libero --task libero_10 \
|
||||
--output /tmp/eval-artifacts/task_descriptions.json
|
||||
"
|
||||
|
||||
- name: Copy RoboCerebra artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/robocerebra-artifacts
|
||||
docker cp robocerebra-eval:/tmp/eval-artifacts/. /tmp/robocerebra-artifacts/ 2>/dev/null || true
|
||||
docker rm -f robocerebra-eval || true
|
||||
|
||||
- name: Parse RoboCerebra eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/robocerebra-artifacts \
|
||||
--env robocerebra \
|
||||
--task libero_10 \
|
||||
--policy lerobot/smolvla_robocerebra
|
||||
|
||||
- name: Upload RoboCerebra rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robocerebra-rollout-video
|
||||
path: /tmp/robocerebra-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload RoboCerebra eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robocerebra-metrics
|
||||
path: /tmp/robocerebra-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── ROBOMME ───────────────────────────────────────────────────────────────
|
||||
# Isolated image: mani-skill/SAPIEN/Vulkan chain with gymnasium and numpy
|
||||
# overrides (robomme can't be a pyproject extra due to numpy<2 pin).
|
||||
robomme-integration-test:
|
||||
name: RoboMME — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
ROBOMME_POLICY: lerobot/smolvla_robomme
|
||||
ROBOMME_TASKS: PickXtimes,BinFill,StopCube,MoveCube,InsertPeg,SwingXtimes,VideoUnmask,ButtonUnmask,PickHighlight,PatternLock
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
- name: Build RoboMME benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.robomme
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-robomme:ci
|
||||
|
||||
- name: Run RoboMME smoke eval (10 tasks, 1 episode each)
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
docker run --name robomme-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||
-e ROBOMME_POLICY="${ROBOMME_POLICY}" \
|
||||
-e ROBOMME_TASKS="${ROBOMME_TASKS}" \
|
||||
lerobot-benchmark-robomme:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=\"\$ROBOMME_POLICY\" \
|
||||
--env.type=robomme \
|
||||
--env.task=\"\$ROBOMME_TASKS\" \
|
||||
--env.dataset_split=test \
|
||||
--env.task_ids=[0] \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={\"observation.images.image\": \"observation.images.camera1\", \"observation.images.wrist_image\": \"observation.images.camera2\"}' \
|
||||
--policy.empty_cameras=3 \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
python scripts/ci/extract_task_descriptions.py \
|
||||
--env robomme --task \"\$ROBOMME_TASKS\" \
|
||||
--output /tmp/eval-artifacts/task_descriptions.json
|
||||
"
|
||||
|
||||
- name: Copy RoboMME artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/robomme-artifacts
|
||||
docker cp robomme-eval:/tmp/eval-artifacts/. /tmp/robomme-artifacts/ 2>/dev/null || true
|
||||
docker rm -f robomme-eval || true
|
||||
|
||||
- name: Parse RoboMME eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/robomme-artifacts \
|
||||
--env robomme \
|
||||
--task "${ROBOMME_TASKS}" \
|
||||
--policy "${ROBOMME_POLICY}"
|
||||
|
||||
- name: Upload RoboMME rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robomme-rollout-video
|
||||
path: /tmp/robomme-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload RoboMME eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: robomme-metrics
|
||||
path: /tmp/robomme-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
# ── LIBERO-plus ───────────────────────────────────────────────────────────
|
||||
# Isolated image: LIBERO-plus fork cloned into /home/user_lerobot on top of
|
||||
# huggingface/lerobot-gpu (see docker/Dockerfile.benchmark.libero_plus).
|
||||
libero-plus-integration-test:
|
||||
name: LIBERO-plus — build image + 1-episode eval
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
env:
|
||||
HF_USER_TOKEN: ${{ secrets.LEROBOT_HF_USER }}
|
||||
LIBERO_PLUS_SUITE: libero_spatial
|
||||
LIBERO_PLUS_POLICY: lerobot/smolvla_libero_plus
|
||||
LIBERO_PLUS_TASK_IDS: "[0,100,260,500,1000,1500,2000,2400]"
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
lfs: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
- name: Login to Docker Hub
|
||||
if: ${{ env.DOCKERHUB_USERNAME != '' }}
|
||||
uses: docker/login-action@v3 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_LEROBOT_PASSWORD }}
|
||||
env:
|
||||
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_LEROBOT_USERNAME }}
|
||||
|
||||
- name: Build LIBERO-plus benchmark image
|
||||
uses: docker/build-push-action@v6 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.benchmark.libero_plus
|
||||
push: false
|
||||
load: true
|
||||
tags: lerobot-benchmark-libero-plus:ci
|
||||
cache-from: type=local,src=/tmp/.buildx-cache-libero-plus
|
||||
cache-to: type=local,dest=/tmp/.buildx-cache-libero-plus,mode=max
|
||||
|
||||
- name: Run LIBERO-plus smoke eval (1 episode)
|
||||
if: env.HF_USER_TOKEN != ''
|
||||
run: |
|
||||
docker run --name libero-plus-eval --gpus all \
|
||||
--shm-size=4g \
|
||||
-e HF_HOME=/tmp/hf \
|
||||
-e HF_USER_TOKEN="${HF_USER_TOKEN}" \
|
||||
-e HF_HUB_DOWNLOAD_TIMEOUT=300 \
|
||||
-e LIBERO_PLUS_SUITE="${LIBERO_PLUS_SUITE}" \
|
||||
-e LIBERO_PLUS_POLICY="${LIBERO_PLUS_POLICY}" \
|
||||
-e LIBERO_PLUS_TASK_IDS="${LIBERO_PLUS_TASK_IDS}" \
|
||||
lerobot-benchmark-libero-plus:ci \
|
||||
bash -c "
|
||||
hf auth login --token \"\$HF_USER_TOKEN\" --add-to-git-credential 2>/dev/null || true
|
||||
lerobot-eval \
|
||||
--policy.path=\"\$LIBERO_PLUS_POLICY\" \
|
||||
--env.type=libero_plus \
|
||||
--env.task=\"\$LIBERO_PLUS_SUITE\" \
|
||||
--env.task_ids=\"\$LIBERO_PLUS_TASK_IDS\" \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--env.camera_name_mapping={\"agentview_image\": \"camera1\", \"robot0_eye_in_hand_image\": \"camera2\"}' \
|
||||
--policy.empty_cameras=1 \
|
||||
--output_dir=/tmp/eval-artifacts
|
||||
python scripts/ci/extract_task_descriptions.py \
|
||||
--env libero_plus --task \"\$LIBERO_PLUS_SUITE\" \
|
||||
--output /tmp/eval-artifacts/task_descriptions.json
|
||||
"
|
||||
|
||||
- name: Copy LIBERO-plus artifacts from container
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p /tmp/libero-plus-artifacts
|
||||
docker cp libero-plus-eval:/tmp/eval-artifacts/. /tmp/libero-plus-artifacts/ 2>/dev/null || true
|
||||
docker rm -f libero-plus-eval || true
|
||||
|
||||
- name: Parse LIBERO-plus eval metrics
|
||||
if: always()
|
||||
run: |
|
||||
python3 scripts/ci/parse_eval_metrics.py \
|
||||
--artifacts-dir /tmp/libero-plus-artifacts \
|
||||
--env libero_plus \
|
||||
--task "${LIBERO_PLUS_SUITE}" \
|
||||
--policy "${LIBERO_PLUS_POLICY}"
|
||||
|
||||
- name: Upload LIBERO-plus rollout video
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: libero-plus-rollout-video
|
||||
path: /tmp/libero-plus-artifacts/videos/
|
||||
if-no-files-found: warn
|
||||
|
||||
- name: Upload LIBERO-plus eval metrics
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4 # zizmor: ignore[unpinned-uses]
|
||||
with:
|
||||
name: libero-plus-metrics
|
||||
path: /tmp/libero-plus-artifacts/metrics.json
|
||||
if-no-files-found: warn
|
||||
|
||||
84
docker/Dockerfile.benchmark.libero_plus
Normal file
84
docker/Dockerfile.benchmark.libero_plus
Normal file
@@ -0,0 +1,84 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Benchmark image for LIBERO-plus integration tests.
|
||||
# Extends the nightly GPU image (which has lerobot[all]) with the LIBERO-plus
|
||||
# fork source + its 6.4 GB perturbation assets.
|
||||
#
|
||||
# Build: docker build -f docker/Dockerfile.benchmark.libero_plus -t lerobot-benchmark-libero-plus .
|
||||
# Run: docker run --gpus all --rm lerobot-benchmark-libero-plus lerobot-eval ...
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest
|
||||
ENV MUJOCO_GL=egl
|
||||
|
||||
# unzip for the 6.4 GB assets.zip; the rest are LIBERO-plus build-time extras
|
||||
# (wand / ImageMagick / fontconfig) not in the nightly base.
|
||||
USER root
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
unzip libexpat1 libfontconfig1-dev libmagickwand-dev \
|
||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
USER user_lerobot
|
||||
|
||||
# robosuite==1.4.1 is mandatory (the fork uses `single_arm_env` removed in
|
||||
# v1.5+). The rest are LIBERO-plus runtime deps pulled from its setup.py.
|
||||
# We install these explicitly instead of via the [libero_plus] extra because
|
||||
# the extra's `libero @ git+...` dep installs as a namespace package and then
|
||||
# clone and PYTHONPATH-override it below.
|
||||
RUN uv pip install --no-cache \
|
||||
"robosuite==1.4.1" \
|
||||
"bddl==1.0.1" \
|
||||
"easydict==1.13" \
|
||||
"mujoco==3.7.0" \
|
||||
"matplotlib==3.10.8" \
|
||||
"Wand==0.6.13" \
|
||||
"scikit-image==0.25.2" \
|
||||
"gym==0.26.2"
|
||||
|
||||
# Clone LIBERO-plus and make it importable as `libero`. The nightly base has
|
||||
# hf-libero (10 tasks) preinstalled via lerobot[libero]; uninstall it so
|
||||
# Python resolves `import libero` to the 2402-task LIBERO-plus module instead.
|
||||
# Pinned to the current upstream main SHA so benchmark builds stay reproducible.
|
||||
ARG LIBERO_PLUS_SHA=4976dc3
|
||||
ENV LIBERO_PLUS_ROOT=/home/user_lerobot/libero-plus/libero/libero
|
||||
RUN git clone https://github.com/sylvestf/LIBERO-plus.git /home/user_lerobot/libero-plus \
|
||||
&& git -C /home/user_lerobot/libero-plus checkout ${LIBERO_PLUS_SHA} \
|
||||
&& cd /home/user_lerobot/libero-plus && uv pip install --no-cache --no-deps -e "." \
|
||||
&& (uv pip uninstall hf-libero 2>/dev/null || true)
|
||||
ENV PYTHONPATH="/home/user_lerobot/libero-plus:${PYTHONPATH}"
|
||||
|
||||
# Perturbation textures/scenes: bddl_base_domain.py resolves XMLs via
|
||||
# DIR_PATH/../assets (package-relative, ignoring ~/.libero/config.yaml). All
|
||||
# 2402 tasks reference files that ship only in Sylvest/LIBERO-plus's
|
||||
# assets.zip (6.4 GB) under a deep author-internal prefix — extract and
|
||||
# flatten it under ${LIBERO_PLUS_ROOT}/assets.
|
||||
RUN python -c "\
|
||||
from huggingface_hub import hf_hub_download; \
|
||||
hf_hub_download(repo_id='Sylvest/LIBERO-plus', repo_type='dataset', \
|
||||
filename='assets.zip', local_dir='/tmp/libero-plus-dl')" \
|
||||
&& unzip -q /tmp/libero-plus-dl/assets.zip -d /tmp/libero-plus-dl/extract \
|
||||
&& ASSETS_DIR=$(find /tmp/libero-plus-dl/extract -type d -name assets | head -1) \
|
||||
&& mv "${ASSETS_DIR}" ${LIBERO_PLUS_ROOT}/assets \
|
||||
&& rm -rf /tmp/libero-plus-dl
|
||||
|
||||
# Point ~/.libero/config.yaml at the clone so LIBERO-plus's imports are
|
||||
# non-interactive (it calls input() when the config is missing).
|
||||
RUN mkdir -p /home/user_lerobot/.libero \
|
||||
&& printf "assets: ${LIBERO_PLUS_ROOT}/assets\nbddl_files: ${LIBERO_PLUS_ROOT}/bddl_files\ndatasets: ${LIBERO_PLUS_ROOT}/../datasets\ninit_states: ${LIBERO_PLUS_ROOT}/init_files\n" \
|
||||
> /home/user_lerobot/.libero/config.yaml
|
||||
|
||||
# Overlay the PR's source code on top of the nightly image.
|
||||
COPY --chown=user_lerobot:user_lerobot . .
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
71
docker/Dockerfile.benchmark.robocasa
Normal file
71
docker/Dockerfile.benchmark.robocasa
Normal file
@@ -0,0 +1,71 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Benchmark image for RoboCasa365 integration tests.
|
||||
# Extends the nightly GPU image (which already has all extras installed)
|
||||
# with the PR's source code and RoboCasa-specific asset setup.
|
||||
#
|
||||
# Build: docker build -f docker/Dockerfile.benchmark.robocasa -t lerobot-benchmark-robocasa .
|
||||
# Run: docker run --gpus all --rm lerobot-benchmark-robocasa lerobot-eval ...
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest
|
||||
|
||||
# Install robocasa + robosuite as editable clones. pip-installing from git
|
||||
# omits data files like robocasa/models/assets/box_links/box_links_assets.json
|
||||
# (not declared in package_data), which download_kitchen_assets needs at import.
|
||||
#
|
||||
# `--no-deps` on robocasa is deliberate: its setup.py pins `lerobot==0.3.3`
|
||||
# in install_requires, which would shadow the editable lerobot baked into
|
||||
# this image. We install robocasa's actual runtime deps explicitly instead.
|
||||
# Pinned SHAs for reproducible benchmark runs. Bump when you need an
|
||||
# upstream fix; don't rely on `main`/`master` drift.
|
||||
ARG ROBOCASA_SHA=56e355ccc64389dfc1b8a61a33b9127b975ba681
|
||||
ARG ROBOSUITE_SHA=aaa8b9b214ce8e77e82926d677b4d61d55e577ab
|
||||
RUN git clone https://github.com/robocasa/robocasa.git ~/robocasa && \
|
||||
git -C ~/robocasa checkout ${ROBOCASA_SHA} && \
|
||||
git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite && \
|
||||
git -C ~/robosuite checkout ${ROBOSUITE_SHA} && \
|
||||
uv pip install --no-cache -e ~/robocasa --no-deps && \
|
||||
uv pip install --no-cache -e ~/robosuite && \
|
||||
uv pip install --no-cache \
|
||||
"numpy==2.2.5" "numba==0.61.2" "scipy==1.15.3" "mujoco==3.3.1" \
|
||||
"pygame==2.6.1" "Pillow==12.2.0" "opencv-python==4.13.0.92" \
|
||||
"pyyaml==6.0.3" "pynput==1.8.1" "tqdm==4.67.3" "termcolor==3.3.0" \
|
||||
"imageio==2.37.3" "h5py==3.16.0" "lxml==6.0.4" "hidapi==0.14.0.post4" \
|
||||
"tianshou==0.4.10" "gymnasium==1.2.3"
|
||||
|
||||
# Set up robocasa macros and download kitchen assets. We need:
|
||||
# - tex : base environment textures
|
||||
# - tex_generative : AI-generated textures; kitchen fixture XMLs embed
|
||||
# refs to generative_textures/wall/tex*.png
|
||||
# unconditionally, so MjModel.from_xml_string fails
|
||||
# at reset time without them (even if the env is
|
||||
# constructed with generative_textures=None).
|
||||
# - fixtures_lw : lightwheel kitchen fixtures (fridge, counters...)
|
||||
# - objs_lw : lightwheel object meshes (stools, misc props)
|
||||
# We skip the objaverse/aigen object packs (~30GB combined) by pairing
|
||||
# this with --env.obj_registries=["lightwheel"] on the lerobot side.
|
||||
# The download script prompts interactively, so pipe 'y' to auto-accept.
|
||||
RUN python -m robocasa.scripts.setup_macros && \
|
||||
yes y | python -m robocasa.scripts.download_kitchen_assets \
|
||||
--type tex tex_generative fixtures_lw objs_lw
|
||||
|
||||
# Overlay the PR's source code on top of the nightly image.
|
||||
COPY --chown=user_lerobot:user_lerobot . .
|
||||
|
||||
# Re-install lerobot editably so the new source (with RoboCasaEnv registration)
|
||||
# replaces the stale package baked into the nightly image.
|
||||
RUN uv pip install --no-cache --no-deps -e .
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
43
docker/Dockerfile.benchmark.robocerebra
Normal file
43
docker/Dockerfile.benchmark.robocerebra
Normal file
@@ -0,0 +1,43 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Benchmark image for RoboCerebra integration tests.
|
||||
# RoboCerebra reuses LIBERO's simulator (libero_10 suite) with a different
|
||||
# rename_map, so this image is identical to the LIBERO benchmark image —
|
||||
# extends the nightly GPU base with LIBERO assets + the PR's source code.
|
||||
#
|
||||
# Build: docker build -f docker/Dockerfile.benchmark.robocerebra -t lerobot-benchmark-robocerebra .
|
||||
# Run: docker run --gpus all --rm lerobot-benchmark-robocerebra lerobot-eval ...
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest
|
||||
|
||||
# Pre-download lerobot/libero-assets from HF Hub so nothing is fetched at
|
||||
# runtime (which times out on CI). Point the libero config at the cached path.
|
||||
# libero/libero/__init__.py calls input() when ~/.libero/config.yaml is missing,
|
||||
# so we write the config before any libero import can happen.
|
||||
RUN LIBERO_DIR=$(python -c \
|
||||
"import importlib.util, os; s=importlib.util.find_spec('libero'); \
|
||||
print(os.path.join(os.path.dirname(s.origin), 'libero'))") && \
|
||||
mkdir -p /home/user_lerobot/.libero && \
|
||||
python -c "\
|
||||
from huggingface_hub import snapshot_download; \
|
||||
snapshot_download(repo_id='lerobot/libero-assets', repo_type='dataset', \
|
||||
local_dir='/home/user_lerobot/.libero/assets')" && \
|
||||
printf "assets: /home/user_lerobot/.libero/assets\nbddl_files: ${LIBERO_DIR}/bddl_files\ndatasets: ${LIBERO_DIR}/../datasets\ninit_states: ${LIBERO_DIR}/init_files\n" \
|
||||
> /home/user_lerobot/.libero/config.yaml
|
||||
|
||||
# Overlay the PR's source code on top of the nightly image.
|
||||
COPY --chown=user_lerobot:user_lerobot . .
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
56
docker/Dockerfile.benchmark.robomme
Normal file
56
docker/Dockerfile.benchmark.robomme
Normal file
@@ -0,0 +1,56 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Benchmark image for RoboMME integration tests.
|
||||
# Extends the nightly GPU image (which has lerobot[all]) with Vulkan system
|
||||
# libs for ManiSkill/SAPIEN and the robomme extra. robomme isn't in [all]
|
||||
# because mani-skill hard-pins gymnasium==0.29.1 and numpy<2.0.0 which
|
||||
# conflict with lerobot's defaults; both are safe at runtime:
|
||||
# - gymnasium 0.29.x has the same 5-tuple step() API as 1.x (since 0.26)
|
||||
# - numpy 1.26.4 is API-compatible with lerobot's actual usage.
|
||||
#
|
||||
# Build: docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-benchmark-robomme .
|
||||
# Run: docker run --gpus all --rm lerobot-benchmark-robomme lerobot-eval ...
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest
|
||||
|
||||
# NVIDIA Container Toolkit: expose Vulkan driver capability for headless rendering.
|
||||
ENV NVIDIA_DRIVER_CAPABILITIES=all \
|
||||
VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json
|
||||
|
||||
# ManiSkill/SAPIEN's renderer needs Vulkan, which isn't in the base image.
|
||||
USER root
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
libvulkan1 libvulkan-dev mesa-vulkan-drivers \
|
||||
&& mkdir -p /usr/share/vulkan/icd.d \
|
||||
&& echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
|
||||
> /usr/share/vulkan/icd.d/nvidia_icd.json \
|
||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
USER user_lerobot
|
||||
|
||||
# Install smolvla + av-dep via the PR's pyproject, then layer robomme on top
|
||||
# with gymnasium/numpy overrides. robomme isn't a pyproject extra because its
|
||||
# mani-skill pin conflicts with lerobot's base numpy>=2 (see pyproject.toml).
|
||||
COPY --chown=user_lerobot:user_lerobot setup.py pyproject.toml uv.lock README.md MANIFEST.in ./
|
||||
RUN printf 'gymnasium==0.29.1\nnumpy==1.26.4\n' > /tmp/robomme_override.txt \
|
||||
&& uv pip install --no-cache --override /tmp/robomme_override.txt \
|
||||
-e ".[smolvla,av-dep]" \
|
||||
"robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main" \
|
||||
&& python -c "import robomme; print('robomme import OK')"
|
||||
|
||||
# Overlay the PR's source code on top of the nightly image.
|
||||
COPY --chown=user_lerobot:user_lerobot . .
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
122
docker/Dockerfile.benchmark.robotwin
Normal file
122
docker/Dockerfile.benchmark.robotwin
Normal file
@@ -0,0 +1,122 @@
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Benchmark image for RoboTwin 2.0 integration tests.
|
||||
# Extends the nightly GPU image with the RoboTwin simulator stack:
|
||||
# sapien/mplib/pytorch3d + NVlabs CuRobo + embodiments.zip + objects.zip
|
||||
# (~3.96 GB of assets; background_texture.zip ~11 GB skipped for smoke eval).
|
||||
#
|
||||
# Build: docker build -f docker/Dockerfile.benchmark.robotwin -t lerobot-benchmark-robotwin .
|
||||
# Run: docker run --gpus all --rm lerobot-benchmark-robotwin \
|
||||
# lerobot-eval --env.type=robotwin --env.task=beat_block_hammer ...
|
||||
|
||||
FROM huggingface/lerobot-gpu:latest
|
||||
|
||||
ENV NVIDIA_DRIVER_CAPABILITIES=all \
|
||||
VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json \
|
||||
ROBOTWIN_ROOT=/opt/robotwin
|
||||
|
||||
# The nightly base is CUDA -base (no compiler, no Vulkan loader). CuRobo's
|
||||
# `pip install -e .` runs nvcc, and SAPIEN renders via Vulkan — add both.
|
||||
USER root
|
||||
# Pinned upstream SHA for reproducible benchmark runs. Bump when we need
|
||||
# an upstream fix; don't rely on `main` drift.
|
||||
ARG ROBOTWIN_SHA=0aeea2d669c0f8516f4d5785f0aa33ba812c14b4
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
cuda-nvcc-12-4 cuda-cudart-dev-12-4 \
|
||||
libvulkan1 vulkan-tools \
|
||||
&& mkdir -p /usr/share/vulkan/icd.d \
|
||||
&& echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libGLX_nvidia.so.0","api_version":"1.3.0"}}' \
|
||||
> /usr/share/vulkan/icd.d/nvidia_icd.json \
|
||||
&& git clone https://github.com/RoboTwin-Platform/RoboTwin.git ${ROBOTWIN_ROOT} \
|
||||
&& git -C ${ROBOTWIN_ROOT} checkout ${ROBOTWIN_SHA} \
|
||||
&& chown -R user_lerobot:user_lerobot ${ROBOTWIN_ROOT} \
|
||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
USER user_lerobot
|
||||
|
||||
# RoboTwin runtime deps (av is already in the base via [av-dep]).
|
||||
RUN uv pip install --no-cache \
|
||||
"sapien==3.0.0b1" "mplib==0.2.1" "transforms3d==0.4.2" "trimesh==4.4.3" \
|
||||
"open3d==0.19.0" "imageio==2.34.2" termcolor zarr pydantic h5py
|
||||
|
||||
# pytorch3d has no universal wheel; must be built from source (~10 min, cached).
|
||||
RUN uv pip install --no-cache --no-build-isolation \
|
||||
"git+https://github.com/facebookresearch/pytorch3d.git@stable"
|
||||
|
||||
# CuRobo — NVlabs motion generator; TORCH_CUDA_ARCH_LIST must be set or the
|
||||
# build aborts on an empty arch list. Pinned SHA for reproducibility.
|
||||
ARG CUROBO_SHA=ca941586c33b8482ed9c0e74d60f23efd64b516a
|
||||
RUN cd ${ROBOTWIN_ROOT}/envs \
|
||||
&& git clone https://github.com/NVlabs/curobo.git \
|
||||
&& git -C curobo checkout ${CUROBO_SHA} \
|
||||
&& cd curobo \
|
||||
&& TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" \
|
||||
uv pip install -e . --no-build-isolation --no-cache
|
||||
|
||||
# Upstream patches (mirror RoboTwin's script/_install.sh).
|
||||
# These patches target the exact versions pinned above; re-check when upgrading.
|
||||
# mplib==0.2.1: drop a broken `or collide` clause in planner.py.
|
||||
# Safe to remove once mplib > 0.2.1 ships with the fix upstream.
|
||||
# sapien==3.0.0b1: fix URDF loader encoding + .srdf extension check.
|
||||
# Safe to remove once sapien > 3.0.0b1 ships with the fix upstream.
|
||||
RUN python - <<'EOF'
|
||||
import pathlib, re, site
|
||||
for d in site.getsitepackages():
|
||||
p = pathlib.Path(d) / "mplib" / "planner.py"
|
||||
if p.exists():
|
||||
p.write_text(re.sub(r"\bor collide\b", "", p.read_text(), count=1))
|
||||
print(f"mplib patch applied: {p}")
|
||||
p = pathlib.Path(d) / "sapien" / "wrapper" / "urdf_loader.py"
|
||||
if p.exists():
|
||||
src = p.read_text().replace(
|
||||
"with open(srdf_path) as f:", 'with open(srdf_path, encoding="utf-8") as f:'
|
||||
).replace('"srdf"', '".srdf"')
|
||||
p.write_text(src)
|
||||
print(f"sapien patch applied: {p}")
|
||||
EOF
|
||||
|
||||
# Simulation assets from TianxingChen/RoboTwin2.0: embodiments (~220 MB) +
|
||||
# objects (~3.74 GB). background_texture (~11 GB) is intentionally skipped.
|
||||
# The dataset is public — no auth token needed.
|
||||
RUN python - <<'EOF'
|
||||
import os, pathlib, zipfile
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
assets_dir = pathlib.Path(os.environ["ROBOTWIN_ROOT"]) / "assets"
|
||||
assets_dir.mkdir(parents=True, exist_ok=True)
|
||||
for fname in ("embodiments.zip", "objects.zip"):
|
||||
local = hf_hub_download(
|
||||
repo_id="TianxingChen/RoboTwin2.0",
|
||||
repo_type="dataset",
|
||||
filename=fname,
|
||||
local_dir=str(assets_dir),
|
||||
)
|
||||
with zipfile.ZipFile(local, "r") as z:
|
||||
z.extractall(str(assets_dir))
|
||||
pathlib.Path(local).unlink()
|
||||
EOF
|
||||
|
||||
WORKDIR ${ROBOTWIN_ROOT}
|
||||
RUN python script/update_embodiment_config_path.py
|
||||
|
||||
ENV PYTHONPATH="${ROBOTWIN_ROOT}:${PYTHONPATH}"
|
||||
|
||||
# Return to the lerobot source directory (set by base image) before overlaying.
|
||||
WORKDIR /lerobot
|
||||
|
||||
# Overlay the PR's source code on top of the nightly image.
|
||||
COPY --chown=user_lerobot:user_lerobot . .
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -77,8 +77,18 @@
|
||||
title: Adding a New Benchmark
|
||||
- local: libero
|
||||
title: LIBERO
|
||||
- local: libero_plus
|
||||
title: LIBERO-plus
|
||||
- local: metaworld
|
||||
title: Meta-World
|
||||
- local: robotwin
|
||||
title: RoboTwin 2.0
|
||||
- local: robocasa
|
||||
title: RoboCasa365
|
||||
- local: robocerebra
|
||||
title: RoboCerebra
|
||||
- local: robomme
|
||||
title: RoboMME
|
||||
- local: envhub_isaaclab_arena
|
||||
title: NVIDIA IsaacLab Arena Environments
|
||||
title: "Benchmarks"
|
||||
|
||||
188
docs/source/libero_plus.mdx
Normal file
188
docs/source/libero_plus.mdx
Normal file
@@ -0,0 +1,188 @@
|
||||
# LIBERO-plus
|
||||
|
||||
LIBERO-plus is a **robustness benchmark** for Vision-Language-Action (VLA) models built on top of [LIBERO](./libero). It systematically stress-tests policies by applying **seven independent perturbation dimensions** to the original LIBERO task set, exposing failure modes that standard benchmarks miss.
|
||||
|
||||
- Paper: [In-depth Robustness Analysis of Vision-Language-Action Models](https://arxiv.org/abs/2510.13626)
|
||||
- GitHub: [sylvestf/LIBERO-plus](https://github.com/sylvestf/LIBERO-plus)
|
||||
- Dataset: [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus)
|
||||
|
||||

|
||||
|
||||
## Perturbation dimensions
|
||||
|
||||
LIBERO-plus creates ~10 000 task variants by perturbing each original LIBERO task along these axes:
|
||||
|
||||
| Dimension | What changes |
|
||||
| --------------------- | ----------------------------------------------------- |
|
||||
| Objects layout | Target position, presence of confounding objects |
|
||||
| Camera viewpoints | Camera position, orientation, field-of-view |
|
||||
| Robot initial states | Manipulator start pose |
|
||||
| Language instructions | LLM-rewritten task description (paraphrase / synonym) |
|
||||
| Light conditions | Intensity, direction, color, shadow |
|
||||
| Background textures | Scene surface and object appearance |
|
||||
| Sensor noise | Photometric distortions and image degradation |
|
||||
|
||||
## Available task suites
|
||||
|
||||
LIBERO-plus covers the same five suites as LIBERO:
|
||||
|
||||
| Suite | CLI name | Tasks | Max steps | Description |
|
||||
| -------------- | ---------------- | ----- | --------- | -------------------------------------------------- |
|
||||
| LIBERO-Spatial | `libero_spatial` | 10 | 280 | Tasks requiring reasoning about spatial relations |
|
||||
| LIBERO-Object | `libero_object` | 10 | 280 | Tasks centered on manipulating different objects |
|
||||
| LIBERO-Goal | `libero_goal` | 10 | 300 | Goal-conditioned tasks with changing targets |
|
||||
| LIBERO-90 | `libero_90` | 90 | 400 | Short-horizon tasks from the LIBERO-100 collection |
|
||||
| LIBERO-Long | `libero_10` | 10 | 520 | Long-horizon tasks from the LIBERO-100 collection |
|
||||
|
||||
<Tip warning={true}>
|
||||
Installing LIBERO-plus **replaces** vanilla LIBERO — it uninstalls `hf-libero`
|
||||
so that `import libero` resolves to the LIBERO-plus fork. You cannot have both
|
||||
installed at the same time. To switch back to vanilla LIBERO, uninstall the
|
||||
fork and reinstall with `pip install -e ".[libero]"`.
|
||||
</Tip>
|
||||
|
||||
## Installation
|
||||
|
||||
### System dependencies (Linux only)
|
||||
|
||||
```bash
|
||||
sudo apt install libexpat1 libfontconfig1-dev libmagickwand-dev
|
||||
```
|
||||
|
||||
### Python package
|
||||
|
||||
```bash
|
||||
pip install -e ".[libero]" "robosuite==1.4.1" bddl easydict mujoco wand scikit-image gym
|
||||
git clone https://github.com/sylvestf/LIBERO-plus.git
|
||||
cd LIBERO-plus && pip install --no-deps -e .
|
||||
pip uninstall -y hf-libero # so `import libero` resolves to the fork
|
||||
```
|
||||
|
||||
LIBERO-plus is installed from its GitHub fork rather than a pyproject extra — the fork ships as a namespace package that pip can't handle, so it must be cloned and added to `PYTHONPATH`. See `docker/Dockerfile.benchmark.libero_plus` for the canonical install. MuJoCo is required, so only Linux is supported.
|
||||
|
||||
<Tip>
|
||||
Set the MuJoCo rendering backend before running evaluation:
|
||||
|
||||
```bash
|
||||
export MUJOCO_GL=egl # headless / HPC / cloud
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
### Download LIBERO-plus assets
|
||||
|
||||
LIBERO-plus ships its extended asset pack separately. Download `assets.zip` from the [Hugging Face dataset](https://huggingface.co/datasets/Sylvest/LIBERO-plus/tree/main) and extract it into the LIBERO-plus package directory:
|
||||
|
||||
```bash
|
||||
# After installing the package, find where it was installed:
|
||||
python -c "import libero; print(libero.__file__)"
|
||||
# Then extract assets.zip into <package_root>/libero/assets/
|
||||
```
|
||||
|
||||
## Evaluation
|
||||
|
||||
### Default evaluation (recommended)
|
||||
|
||||
Evaluate across the four standard suites (10 episodes per task):
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-policy-id" \
|
||||
--env.type=libero_plus \
|
||||
--env.task=libero_spatial,libero_object,libero_goal,libero_10 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--env.max_parallel_tasks=1
|
||||
```
|
||||
|
||||
### Single-suite evaluation
|
||||
|
||||
Evaluate on one LIBERO-plus suite:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-policy-id" \
|
||||
--env.type=libero_plus \
|
||||
--env.task=libero_spatial \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10
|
||||
```
|
||||
|
||||
- `--env.task` picks the suite (`libero_spatial`, `libero_object`, etc.).
|
||||
- `--env.task_ids` restricts to specific task indices (`[0]`, `[1,2,3]`, etc.). Omit to run all tasks in the suite.
|
||||
- `--eval.batch_size` controls how many environments run in parallel.
|
||||
- `--eval.n_episodes` sets how many episodes to run per task.
|
||||
|
||||
### Multi-suite evaluation
|
||||
|
||||
Benchmark a policy across multiple suites at once by passing a comma-separated list:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-policy-id" \
|
||||
--env.type=libero_plus \
|
||||
--env.task=libero_spatial,libero_object \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10
|
||||
```
|
||||
|
||||
### Control mode
|
||||
|
||||
LIBERO-plus supports two control modes — `relative` (default) and `absolute`. Different VLA checkpoints are trained with different action parameterizations, so make sure the mode matches your policy:
|
||||
|
||||
```bash
|
||||
--env.control_mode=relative # or "absolute"
|
||||
```
|
||||
|
||||
### Policy inputs and outputs
|
||||
|
||||
**Observations:**
|
||||
|
||||
- `observation.state` — 8-dim proprioceptive features (eef position, axis-angle orientation, gripper qpos)
|
||||
- `observation.images.image` — main camera view (`agentview_image`), HWC uint8
|
||||
- `observation.images.image2` — wrist camera view (`robot0_eye_in_hand_image`), HWC uint8
|
||||
|
||||
**Actions:**
|
||||
|
||||
- Continuous control in `Box(-1, 1, shape=(7,))` — 6D end-effector delta + 1D gripper
|
||||
|
||||
### Recommended evaluation episodes
|
||||
|
||||
For reproducible benchmarking, use **10 episodes per task** across all four standard suites (Spatial, Object, Goal, Long). This gives 400 total episodes and matches the protocol used for published results.
|
||||
|
||||
## Training
|
||||
|
||||
### Dataset
|
||||
|
||||
A LeRobot-format training dataset for LIBERO-plus is available at:
|
||||
|
||||
- [lerobot/libero_plus](https://huggingface.co/datasets/lerobot/libero_plus)
|
||||
|
||||
### Example training command
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--policy.type=smolvla \
|
||||
--policy.repo_id=${HF_USER}/smolvla_libero_plus \
|
||||
--policy.load_vlm_weights=true \
|
||||
--dataset.repo_id=lerobot/libero_plus \
|
||||
--env.type=libero_plus \
|
||||
--env.task=libero_spatial \
|
||||
--output_dir=./outputs/ \
|
||||
--steps=100000 \
|
||||
--batch_size=4 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1 \
|
||||
--eval_freq=1000
|
||||
```
|
||||
|
||||
## Relationship to LIBERO
|
||||
|
||||
LIBERO-plus is a drop-in extension of LIBERO:
|
||||
|
||||
- Same Python gym interface (`LiberoEnv`, `LiberoProcessorStep`)
|
||||
- Same camera names and observation/action format
|
||||
- Same task suite names
|
||||
- Installs under the same `libero` Python package name (different GitHub repo)
|
||||
|
||||
To use the original LIBERO benchmark, see [LIBERO](./libero) and use `--env.type=libero`.
|
||||
188
docs/source/robocasa.mdx
Normal file
188
docs/source/robocasa.mdx
Normal file
@@ -0,0 +1,188 @@
|
||||
# RoboCasa365
|
||||
|
||||
[RoboCasa365](https://robocasa.ai) is a large-scale simulation framework for training and benchmarking **generalist robots** in everyday kitchen tasks. It ships 365 diverse manipulation tasks across 2,500 kitchen environments, 3,200+ object assets and 600+ hours of human demonstration data, on a PandaOmron 12-DOF mobile manipulator (Franka arm on a holonomic base).
|
||||
|
||||
- Paper: [RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots](https://arxiv.org/abs/2406.02523)
|
||||
- GitHub: [robocasa/robocasa](https://github.com/robocasa/robocasa)
|
||||
- Project website: [robocasa.ai](https://robocasa.ai)
|
||||
- Pretrained policy: [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa)
|
||||
- Single-task dataset (CloseFridge): [`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge)
|
||||
|
||||
<img
|
||||
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lerobot/robocasa-banner.webp"
|
||||
alt="RoboCasa365 benchmark overview"
|
||||
width="85%"
|
||||
/>
|
||||
|
||||
## Available tasks
|
||||
|
||||
RoboCasa365 organizes its 365 tasks into two families and three upstream benchmark groups that LeRobot exposes as first-class `--env.task` shortcuts:
|
||||
|
||||
| Family | Tasks | Description |
|
||||
| --------- | ----- | ------------------------------------------------------------------------------- |
|
||||
| Atomic | ~65 | Single-skill tasks: pick-and-place, door/drawer manipulation, appliance control |
|
||||
| Composite | ~300 | Multi-step tasks across 60+ categories: cooking, cleaning, organizing, etc. |
|
||||
|
||||
**Atomic task examples:** `CloseFridge`, `OpenDrawer`, `OpenCabinet`, `TurnOnMicrowave`, `TurnOffStove`, `NavigateKitchen`, `PickPlaceCounterToStove`.
|
||||
|
||||
**Composite task categories:** baking, boiling, brewing, chopping, clearing table, defrosting food, loading dishwasher, making tea, microwaving food, washing dishes, and more.
|
||||
|
||||
`--env.task` accepts three forms:
|
||||
|
||||
- a single task name (`CloseFridge`)
|
||||
- a comma-separated list (`CloseFridge,OpenBlenderLid,PickPlaceCoffee`)
|
||||
- a benchmark-group shortcut — `atomic_seen`, `composite_seen`, `composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`, `pretrain300` — which auto-expands to the upstream task list and auto-sets the dataset `split` (`target` or `pretrain`).
|
||||
|
||||
## Installation
|
||||
|
||||
RoboCasa and its dependency `robosuite` are not published on PyPI, and RoboCasa's own `setup.py` hardcodes `lerobot==0.3.3`, which conflicts with this repo's `lerobot`. LeRobot therefore does **not** expose a `robocasa` extra — install the two packages manually as editable clones (using `--no-deps` on `robocasa` to skip its shadowed `lerobot` pin):
|
||||
|
||||
```bash
|
||||
# After following the standard LeRobot installation instructions.
|
||||
|
||||
git clone https://github.com/robocasa/robocasa.git ~/robocasa
|
||||
git clone https://github.com/ARISE-Initiative/robosuite.git ~/robosuite
|
||||
pip install -e ~/robocasa --no-deps
|
||||
pip install -e ~/robosuite
|
||||
|
||||
# Robocasa's runtime deps (the ones its setup.py would have pulled, minus
|
||||
# the bad lerobot pin).
|
||||
pip install numpy numba scipy mujoco pygame Pillow opencv-python \
|
||||
pyyaml pynput tqdm termcolor imageio h5py lxml hidapi \
|
||||
tianshou gymnasium
|
||||
|
||||
python -m robocasa.scripts.setup_macros
|
||||
# Lightweight assets (lightwheel object meshes + textures). Enough for
|
||||
# the default env out of the box.
|
||||
python -m robocasa.scripts.download_kitchen_assets \
|
||||
--type tex tex_generative fixtures_lw objs_lw
|
||||
# Optional: full objaverse/aigen registries (~30GB) for richer object
|
||||
# variety. Enable at eval time via --env.obj_registries (see below).
|
||||
# python -m robocasa.scripts.download_kitchen_assets --type objs_objaverse
|
||||
```
|
||||
|
||||
<Tip>
|
||||
RoboCasa requires MuJoCo. Set the rendering backend before training or evaluation:
|
||||
|
||||
```bash
|
||||
export MUJOCO_GL=egl # for headless servers (HPC, cloud)
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
### Object registries
|
||||
|
||||
By default the env samples objects only from the `lightwheel` registry (what `--type objs_lw` ships), which avoids a `Probabilities contain NaN` crash when the objaverse / aigen packs aren't on disk. If you've downloaded the full asset set, enable the full registry at runtime:
|
||||
|
||||
```bash
|
||||
--env.obj_registries='[objaverse,lightwheel]'
|
||||
```
|
||||
|
||||
## Evaluation
|
||||
|
||||
All eval snippets below mirror the CI command (see `.github/workflows/benchmark_tests.yml`). The `--rename_map` argument maps RoboCasa's native camera keys (`robot0_agentview_left` / `robot0_eye_in_hand` / `robot0_agentview_right`) onto the three-camera (`camera1` / `camera2` / `camera3`) input layout the released `smolvla_robocasa` policy was trained on.
|
||||
|
||||
### Single-task evaluation (recommended for quick iteration)
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_robocasa \
|
||||
--env.type=robocasa \
|
||||
--env.task=CloseFridge \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=20 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
### Multi-task evaluation
|
||||
|
||||
Pass a comma-separated list of tasks:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_robocasa \
|
||||
--env.type=robocasa \
|
||||
--env.task=CloseFridge,OpenCabinet,OpenDrawer,TurnOnMicrowave,TurnOffStove \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=20 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
### Benchmark-group evaluation
|
||||
|
||||
Run an entire upstream group (e.g. all 18 `atomic_seen` tasks with `split=target`):
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_robocasa \
|
||||
--env.type=robocasa \
|
||||
--env.task=atomic_seen \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=20 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={"observation.images.robot0_agentview_left": "observation.images.camera1", "observation.images.robot0_eye_in_hand": "observation.images.camera2", "observation.images.robot0_agentview_right": "observation.images.camera3"}'
|
||||
```
|
||||
|
||||
### Recommended evaluation episodes
|
||||
|
||||
**20 episodes per task** for reproducible benchmarking. Matches the protocol used in published results.
|
||||
|
||||
## Policy inputs and outputs
|
||||
|
||||
**Observations** (raw RoboCasa camera names are preserved verbatim):
|
||||
|
||||
- `observation.state` — 16-dim proprioceptive state (base position, base quaternion, relative end-effector position, relative end-effector quaternion, gripper qpos)
|
||||
- `observation.images.robot0_agentview_left` — left agent view, 256×256 HWC uint8
|
||||
- `observation.images.robot0_eye_in_hand` — wrist camera view, 256×256 HWC uint8
|
||||
- `observation.images.robot0_agentview_right` — right agent view, 256×256 HWC uint8
|
||||
|
||||
**Actions:**
|
||||
|
||||
- Continuous control in `Box(-1, 1, shape=(12,))` — base motion (4D) + control mode (1D) + end-effector position (3D) + end-effector rotation (3D) + gripper (1D).
|
||||
|
||||
## Training
|
||||
|
||||
### Single-task example
|
||||
|
||||
A ready-to-use single-task dataset is on the Hub:
|
||||
[`pepijn223/robocasa_CloseFridge`](https://huggingface.co/datasets/pepijn223/robocasa_CloseFridge).
|
||||
|
||||
Fine-tune a SmolVLA base on `CloseFridge`:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--policy.type=smolvla \
|
||||
--policy.repo_id=${HF_USER}/smolvla_robocasa_CloseFridge \
|
||||
--policy.load_vlm_weights=true \
|
||||
--policy.push_to_hub=true \
|
||||
--dataset.repo_id=pepijn223/robocasa_CloseFridge \
|
||||
--env.type=robocasa \
|
||||
--env.task=CloseFridge \
|
||||
--output_dir=./outputs/smolvla_robocasa_CloseFridge \
|
||||
--steps=100000 \
|
||||
--batch_size=4 \
|
||||
--eval_freq=5000 \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=5 \
|
||||
--save_freq=10000
|
||||
```
|
||||
|
||||
Evaluate the resulting checkpoint:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=${HF_USER}/smolvla_robocasa_CloseFridge \
|
||||
--env.type=robocasa \
|
||||
--env.task=CloseFridge \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=20
|
||||
```
|
||||
|
||||
## Reproducing published results
|
||||
|
||||
The released checkpoint [`lerobot/smolvla_robocasa`](https://huggingface.co/lerobot/smolvla_robocasa) is evaluated with the commands in the [Evaluation](#evaluation) section. CI runs a 10-atomic-task smoke eval (one episode each) on every PR touching the benchmark, picking fixture-centric tasks that don't require the objaverse asset pack.
|
||||
99
docs/source/robocerebra.mdx
Normal file
99
docs/source/robocerebra.mdx
Normal file
@@ -0,0 +1,99 @@
|
||||
# RoboCerebra
|
||||
|
||||
[RoboCerebra](https://robocerebra-project.github.io/) is a long-horizon manipulation benchmark that evaluates **high-level reasoning, planning, and memory** in VLAs. Episodes chain multiple sub-goals with language-grounded intermediate instructions, built on top of LIBERO's simulator stack (MuJoCo + robosuite, Franka Panda 7-DOF).
|
||||
|
||||
- Paper: [RoboCerebra: A Large-scale Benchmark for Long-horizon Robotic Manipulation Evaluation](https://arxiv.org/abs/2506.06677)
|
||||
- Project website: [robocerebra-project.github.io](https://robocerebra-project.github.io/)
|
||||
- Dataset: [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) — LeRobot v3.0, 6,660 episodes / 571,116 frames at 20 fps, 1,728 language-grounded sub-tasks.
|
||||
- Pretrained policy: [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra)
|
||||
|
||||
## Available tasks
|
||||
|
||||
RoboCerebra reuses LIBERO's simulator, so evaluation runs against the LIBERO `libero_10` long-horizon suite:
|
||||
|
||||
| Suite | CLI name | Tasks | Description |
|
||||
| --------- | ----------- | ----- | ------------------------------------------------------------- |
|
||||
| LIBERO-10 | `libero_10` | 10 | Long-horizon kitchen/living room tasks chaining 3–6 sub-goals |
|
||||
|
||||
Each RoboCerebra episode in the dataset is segmented into multiple sub-tasks with natural-language instructions, which the unified dataset exposes as independent supervision signals.
|
||||
|
||||
## Installation
|
||||
|
||||
RoboCerebra piggybacks on LIBERO, so the `libero` extra is all you need:
|
||||
|
||||
```bash
|
||||
pip install -e ".[libero]"
|
||||
```
|
||||
|
||||
<Tip>
|
||||
RoboCerebra requires Linux (MuJoCo / robosuite). Set the rendering backend before training or evaluation:
|
||||
|
||||
```bash
|
||||
export MUJOCO_GL=egl # for headless servers (HPC, cloud)
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
## Evaluation
|
||||
|
||||
RoboCerebra eval runs against LIBERO's `libero_10` suite with RoboCerebra's camera naming (`image` + `wrist_image`) and an extra empty-camera slot so a three-view-trained policy receives the expected input layout:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=lerobot/smolvla_robocerebra \
|
||||
--env.type=libero \
|
||||
--env.task=libero_10 \
|
||||
--env.fps=20 \
|
||||
--env.obs_type=pixels_agent_pos \
|
||||
--env.observation_height=256 \
|
||||
--env.observation_width=256 \
|
||||
'--env.camera_name_mapping={"agentview_image": "image", "robot0_eye_in_hand_image": "wrist_image"}' \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10 \
|
||||
--eval.use_async_envs=false \
|
||||
--policy.device=cuda \
|
||||
'--rename_map={"observation.images.image": "observation.images.camera1", "observation.images.wrist_image": "observation.images.camera2"}' \
|
||||
--policy.empty_cameras=1
|
||||
```
|
||||
|
||||
### Recommended evaluation episodes
|
||||
|
||||
**10 episodes per task** across the `libero_10` suite (100 total) for reproducible benchmarking. Matches the protocol used in the RoboCerebra paper.
|
||||
|
||||
## Policy inputs and outputs
|
||||
|
||||
**Observations:**
|
||||
|
||||
- `observation.state` — 8-dim proprioceptive state (7 joint positions + gripper)
|
||||
- `observation.images.image` — third-person view, 256×256 HWC uint8
|
||||
- `observation.images.wrist_image` — wrist-mounted camera view, 256×256 HWC uint8
|
||||
|
||||
**Actions:**
|
||||
|
||||
- Continuous control in `Box(-1, 1, shape=(7,))` — end-effector delta (6D) + gripper (1D)
|
||||
|
||||
## Training
|
||||
|
||||
The unified dataset at [`lerobot/robocerebra_unified`](https://huggingface.co/datasets/lerobot/robocerebra_unified) exposes two RGB streams and language-grounded sub-task annotations:
|
||||
|
||||
| Feature | Shape | Description |
|
||||
| -------------------------------- | ------------- | -------------------- |
|
||||
| `observation.images.image` | (256, 256, 3) | Third-person view |
|
||||
| `observation.images.wrist_image` | (256, 256, 3) | Wrist-mounted camera |
|
||||
| `observation.state` | (8,) | Joint pos + gripper |
|
||||
| `action` | (7,) | EEF delta + gripper |
|
||||
|
||||
Fine-tune a SmolVLA base on it:
|
||||
|
||||
```bash
|
||||
lerobot-train \
|
||||
--policy.path=lerobot/smolvla_base \
|
||||
--dataset.repo_id=lerobot/robocerebra_unified \
|
||||
--env.type=libero \
|
||||
--env.task=libero_10 \
|
||||
--output_dir=outputs/smolvla_robocerebra
|
||||
```
|
||||
|
||||
## Reproducing published results
|
||||
|
||||
The released checkpoint [`lerobot/smolvla_robocerebra`](https://huggingface.co/lerobot/smolvla_robocerebra) was trained on `lerobot/robocerebra_unified` and evaluated with the command in the [Evaluation](#evaluation) section. CI runs the same command with `--eval.n_episodes=1` as a smoke test on every PR touching the benchmark.
|
||||
130
docs/source/robomme.mdx
Normal file
130
docs/source/robomme.mdx
Normal file
@@ -0,0 +1,130 @@
|
||||
# RoboMME
|
||||
|
||||
[RoboMME](https://robomme.github.io) is a memory-augmented manipulation benchmark built on ManiSkill (SAPIEN). It evaluates a robot's ability to retain and use information across an episode — counting, object permanence, reference, and imitation.
|
||||
|
||||
- **16 tasks** across 4 memory-skill suites
|
||||
- **1,600 training demos** (100 per task, 50 val, 50 test)
|
||||
- **Dataset**: [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) — LeRobot v3.0, 768K frames at 10 fps
|
||||
- **Simulator**: ManiSkill / SAPIEN, Panda arm, Linux only
|
||||
|
||||

|
||||
|
||||
## Tasks
|
||||
|
||||
| Suite | Tasks |
|
||||
| --------------------------------- | ------------------------------------------------------------- |
|
||||
| **Counting** (temporal memory) | BinFill, PickXtimes, SwingXtimes, StopCube |
|
||||
| **Permanence** (spatial memory) | VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap |
|
||||
| **Reference** (object memory) | PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder |
|
||||
| **Imitation** (procedural memory) | MoveCube, InsertPeg, PatternLock, RouteStick |
|
||||
|
||||
## Installation
|
||||
|
||||
> RoboMME requires **Linux** (ManiSkill/SAPIEN uses Vulkan rendering). Docker is recommended to isolate dependency conflicts.
|
||||
|
||||
### Native (Linux)
|
||||
|
||||
```bash
|
||||
pip install --override <(printf 'gymnasium==0.29.1\nnumpy==1.26.4\n') \
|
||||
-e '.[smolvla,av-dep]' \
|
||||
'robomme @ git+https://github.com/RoboMME/robomme_benchmark.git@main'
|
||||
```
|
||||
|
||||
> **Dependency note**: `mani-skill` (pulled by `robomme`) pins `gymnasium==0.29.1` and `numpy<2.0.0`, which conflict with lerobot's base `numpy>=2.0.0`. That's why `robomme` is not a pyproject extra — use the override install above, or the Docker approach below to avoid conflicts entirely.
|
||||
|
||||
### Docker (recommended)
|
||||
|
||||
```bash
|
||||
# Build base image first (from repo root)
|
||||
docker build -f docker/Dockerfile.eval-base -t lerobot-eval-base .
|
||||
|
||||
# Build RoboMME eval image (applies gymnasium + numpy pin overrides)
|
||||
docker build -f docker/Dockerfile.benchmark.robomme -t lerobot-robomme .
|
||||
```
|
||||
|
||||
The `docker/Dockerfile.benchmark.robomme` image overrides `gymnasium==0.29.1` and `numpy==1.26.4` after lerobot's install. Both versions are runtime-safe for lerobot's actual API usage.
|
||||
|
||||
## Running Evaluation
|
||||
|
||||
### Default (single task, single episode)
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=<your_policy_repo> \
|
||||
--env.type=robomme \
|
||||
--env.task=PickXtimes \
|
||||
--env.dataset_split=test \
|
||||
--env.task_ids=[0] \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=1
|
||||
```
|
||||
|
||||
### Multi-task evaluation
|
||||
|
||||
Evaluate multiple tasks in one run by comma-separating task names. Use `task_ids` to control which episodes are evaluated per task. Recommended: 50 episodes per task for the test split.
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path=<your_policy_repo> \
|
||||
--env.type=robomme \
|
||||
--env.task=PickXtimes,BinFill,StopCube,MoveCube,InsertPeg \
|
||||
--env.dataset_split=test \
|
||||
--env.task_ids=[0,1,2,3,4,5,6,7,8,9] \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=50
|
||||
```
|
||||
|
||||
### Key CLI options for `env.type=robomme`
|
||||
|
||||
| Option | Default | Description |
|
||||
| -------------------- | ------------- | -------------------------------------------------- |
|
||||
| `env.task` | `PickXtimes` | Any of the 16 task names above (comma-separated) |
|
||||
| `env.dataset_split` | `test` | `train`, `val`, or `test` |
|
||||
| `env.action_space` | `joint_angle` | `joint_angle` (8-D) or `ee_pose` (7-D) |
|
||||
| `env.episode_length` | `300` | Max steps per episode |
|
||||
| `env.task_ids` | `null` | List of episode indices to evaluate (null = `[0]`) |
|
||||
|
||||
## Dataset
|
||||
|
||||
The dataset [`lerobot/robomme`](https://huggingface.co/datasets/lerobot/robomme) is in **LeRobot v3.0 format** and can be loaded directly:
|
||||
|
||||
```python
|
||||
from lerobot.datasets.lerobot_dataset import LeRobotDataset
|
||||
|
||||
dataset = LeRobotDataset("lerobot/robomme")
|
||||
```
|
||||
|
||||
### Dataset features
|
||||
|
||||
| Feature | Shape | Description |
|
||||
| ------------------ | ------------- | ------------------------------- |
|
||||
| `image` | (256, 256, 3) | Front camera RGB |
|
||||
| `wrist_image` | (256, 256, 3) | Wrist camera RGB |
|
||||
| `actions` | (8,) | Joint angles + gripper |
|
||||
| `state` | (8,) | Joint positions + gripper state |
|
||||
| `simple_subgoal` | str | High-level language annotation |
|
||||
| `grounded_subgoal` | str | Grounded language annotation |
|
||||
| `episode_index` | int | Episode ID |
|
||||
| `frame_index` | int | Frame within episode |
|
||||
|
||||
### Feature key alignment (training)
|
||||
|
||||
The env wrapper exposes `pixels/image` and `pixels/wrist_image` as observation keys. The `features_map` in `RoboMMEEnv` maps these to `observation.images.image` and `observation.images.wrist_image` for the policy. State is exposed as `agent_pos` and maps to `observation.state`.
|
||||
|
||||
The dataset's `image` and `wrist_image` columns already align with the policy input keys, so no renaming is needed when fine-tuning.
|
||||
|
||||
## Action Spaces
|
||||
|
||||
| Type | Dim | Description |
|
||||
| ------------- | --- | --------------------------------------------------------- |
|
||||
| `joint_angle` | 8 | 7 joint angles + 1 gripper (−1 closed, +1 open, absolute) |
|
||||
| `ee_pose` | 7 | xyz + roll/pitch/yaw + gripper |
|
||||
|
||||
Set via `--env.action_space=joint_angle` (default) or `--env.action_space=ee_pose`.
|
||||
|
||||
## Platform Notes
|
||||
|
||||
- **Linux only**: ManiSkill requires SAPIEN/Vulkan. macOS and Windows are not supported.
|
||||
- **GPU recommended**: Rendering is CPU-capable but slow; CUDA + Vulkan gives full speed.
|
||||
- **gymnasium / numpy conflict**: See installation note above. Docker image handles this automatically.
|
||||
- **ManiSkill fork**: `robomme` depends on a specific ManiSkill fork (`YinpeiDai/ManiSkill`), pulled in automatically via the `robomme` package.
|
||||
223
docs/source/robotwin.mdx
Normal file
223
docs/source/robotwin.mdx
Normal file
@@ -0,0 +1,223 @@
|
||||
# RoboTwin 2.0
|
||||
|
||||
RoboTwin 2.0 is a **large-scale dual-arm manipulation benchmark** built on the SAPIEN physics engine. It provides a standardized evaluation protocol for bimanual robotic policies across 50 tasks (as of upstream `main`) with strong domain randomization (clutter, lighting, background, tabletop height, and language instructions).
|
||||
|
||||
- Paper: [RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation](https://arxiv.org/abs/2506.18088)
|
||||
- GitHub: [RoboTwin-Platform/RoboTwin](https://github.com/RoboTwin-Platform/RoboTwin)
|
||||
- Leaderboard: [robotwin-platform.github.io/leaderboard](https://robotwin-platform.github.io/leaderboard)
|
||||
- Dataset: [lerobot/robotwin_unified](https://huggingface.co/datasets/lerobot/robotwin_unified)
|
||||
|
||||

|
||||
|
||||
## Overview
|
||||
|
||||
| Property | Value |
|
||||
| ------------- | -------------------------------------------------------- |
|
||||
| Tasks | 50 dual-arm manipulation tasks |
|
||||
| Robot | Aloha-AgileX bimanual (14 DOF, 7 per arm) |
|
||||
| Action space | 14-dim joint-space, continuous in `[-1, 1]` |
|
||||
| Cameras | `head_camera`, `left_camera`, `right_camera` |
|
||||
| Simulator | SAPIEN (not MuJoCo) |
|
||||
| Eval protocol | 100 episodes/task, 50 demo_clean demonstrations |
|
||||
| Eval settings | **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) |
|
||||
|
||||
## Available tasks
|
||||
|
||||
RoboTwin 2.0 ships 50 dual-arm manipulation tasks in its upstream `envs/` directory. The canonical list is the `ROBOTWIN_TASKS` tuple in `src/lerobot/envs/robotwin.py`, mirrored verbatim from the upstream repo. Example tasks:
|
||||
|
||||
| Task | CLI name | Category |
|
||||
| ------------------------ | ------------------------ | ----------------- |
|
||||
| Beat block with hammer | `beat_block_hammer` | Tool use |
|
||||
| Click bell / alarm clock | `click_bell` | Precision press |
|
||||
| Stack blocks (2 / 3) | `stack_blocks_two/three` | Stacking |
|
||||
| Stack bowls (2 / 3) | `stack_bowls_two/three` | Stacking |
|
||||
| Handover block / mic | `handover_block` | Bimanual coord. |
|
||||
| Lift pot | `lift_pot` | Bimanual lift |
|
||||
| Shake bottle | `shake_bottle` | Continuous motion |
|
||||
| Turn switch | `turn_switch` | Articulated obj |
|
||||
| Stamp seal | `stamp_seal` | Precision place |
|
||||
| Scan object | `scan_object` | Mobile manip. |
|
||||
|
||||
Pass a comma-separated list to `--env.task` to run multiple tasks in a single eval sweep.
|
||||
|
||||
<Tip warning={true}>
|
||||
`open_laptop` is currently broken upstream (its `check_success()` uses
|
||||
`self.arm_tag`, which is only set inside the scripted-expert `play_once()`
|
||||
path and therefore unavailable during normal policy eval). Avoid it until the
|
||||
upstream bug is fixed, or patch the task to default `self.arm_tag = "left"` in
|
||||
`load_actors()`.
|
||||
</Tip>
|
||||
|
||||
## Dataset
|
||||
|
||||
The RoboTwin 2.0 dataset is available in **LeRobot v3.0 format** on the Hugging Face Hub:
|
||||
|
||||
```
|
||||
lerobot/robotwin_unified
|
||||
```
|
||||
|
||||
It contains over 100,000 pre-collected trajectories across all 50 tasks (79.6 GB, Apache 2.0 license). No format conversion is needed — it is already in the correct LeRobot v3.0 schema with video observations and action labels.
|
||||
|
||||
You can load it directly with the HF Datasets library:
|
||||
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
|
||||
ds = load_dataset("lerobot/robotwin_unified", split="train")
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
RoboTwin 2.0 requires **Linux** with an NVIDIA GPU (CUDA 12.1 recommended). Installation takes approximately 20 minutes.
|
||||
|
||||
### 1. Create a conda environment
|
||||
|
||||
```bash
|
||||
conda create -n robotwin python=3.10 -y
|
||||
conda activate robotwin
|
||||
```
|
||||
|
||||
### 2. Install LeRobot
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/lerobot.git
|
||||
cd lerobot
|
||||
pip install -e "."
|
||||
```
|
||||
|
||||
### 3. Install RoboTwin 2.0
|
||||
|
||||
```bash
|
||||
git clone https://github.com/RoboTwin-Platform/RoboTwin.git
|
||||
cd RoboTwin
|
||||
bash script/_install.sh
|
||||
bash script/_download_assets.sh
|
||||
```
|
||||
|
||||
The install script handles all Python dependencies including SAPIEN, CuRobo, mplib, and pytorch3d.
|
||||
|
||||
<Tip warning={true}>
|
||||
If the automated install fails, install manually:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable"
|
||||
cd envs && git clone https://github.com/NVlabs/curobo.git && cd curobo
|
||||
pip install -e . --no-build-isolation
|
||||
```
|
||||
|
||||
Then apply the required mplib fix: in `mplib/planner.py` line 807, remove `or collide` from the conditional.
|
||||
|
||||
</Tip>
|
||||
|
||||
### 4. Add RoboTwin to PYTHONPATH
|
||||
|
||||
The RoboTwin task modules must be importable by LeRobot. From within the `RoboTwin/` directory:
|
||||
|
||||
```bash
|
||||
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
|
||||
```
|
||||
|
||||
Add this to your shell profile to make it permanent.
|
||||
|
||||
## Evaluation
|
||||
|
||||
### Standard evaluation (recommended)
|
||||
|
||||
Evaluate a policy on a single task with the official protocol (100 episodes):
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=beat_block_hammer \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=100
|
||||
```
|
||||
|
||||
### Single-task quick check
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=beat_block_hammer \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=5
|
||||
```
|
||||
|
||||
### Multi-task sweep
|
||||
|
||||
Evaluate on several tasks in one run:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=beat_block_hammer,click_bell,handover_block,stack_blocks_two \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=100
|
||||
```
|
||||
|
||||
### Full benchmark (all 50 tasks)
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=adjust_bottle,beat_block_hammer,blocks_ranking_rgb,blocks_ranking_size,click_alarmclock,click_bell,dump_bin_bigbin,grab_roller,handover_block,handover_mic,hanging_mug,lift_pot,move_can_pot,move_pillbottle_pad,move_playingcard_away,move_stapler_pad,open_microwave,pick_diverse_bottles,pick_dual_bottles,place_a2b_left,place_a2b_right,place_bread_basket,place_bread_skillet,place_burger_fries,place_can_basket,place_cans_plasticbox,place_container_plate,place_dual_shoes,place_empty_cup,place_fan,place_mouse_pad,place_object_basket,place_object_scale,place_object_stand,place_phone_stand,place_shoe,press_stapler,put_bottles_dustbin,put_object_cabinet,rotate_qrcode,scan_object,shake_bottle,shake_bottle_horizontally,stack_blocks_three,stack_blocks_two,stack_bowls_three,stack_bowls_two,stamp_seal,turn_switch \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=100
|
||||
```
|
||||
|
||||
<Tip>
|
||||
`open_laptop` is intentionally omitted above because of the upstream
|
||||
`self.arm_tag` bug (see the **Available tasks** section). Re-add it once the
|
||||
upstream fix lands.
|
||||
</Tip>
|
||||
|
||||
## Camera configuration
|
||||
|
||||
By default, all three cameras are included:
|
||||
|
||||
| Camera key | Description |
|
||||
| -------------- | ------------------------------ |
|
||||
| `head_camera` | Torso-mounted overhead view |
|
||||
| `left_camera` | Left arm wrist-mounted camera |
|
||||
| `right_camera` | Right arm wrist-mounted camera |
|
||||
|
||||
To use a subset of cameras, override `--env.camera_names`:
|
||||
|
||||
```bash
|
||||
lerobot-eval \
|
||||
--policy.path="your-hf-policy-id" \
|
||||
--env.type=robotwin \
|
||||
--env.task=beat_block_hammer \
|
||||
--env.camera_names="head_camera,left_camera" \
|
||||
--eval.batch_size=1 \
|
||||
--eval.n_episodes=10
|
||||
```
|
||||
|
||||
## Environment config reference
|
||||
|
||||
Key parameters for `RoboTwinEnvConfig`:
|
||||
|
||||
| Parameter | Default | Description |
|
||||
| -------------------- | ---------------------------------------- | ---------------------------------- |
|
||||
| `task` | `"beat_block_hammer"` | Comma-separated task name(s) |
|
||||
| `fps` | `25` | Simulation FPS |
|
||||
| `episode_length` | `300` | Max steps per episode |
|
||||
| `obs_type` | `"pixels_agent_pos"` | `"pixels"` or `"pixels_agent_pos"` |
|
||||
| `camera_names` | `"head_camera,left_camera,right_camera"` | Comma-separated active cameras |
|
||||
| `observation_height` | `240` | Camera pixel height |
|
||||
| `observation_width` | `320` | Camera pixel width |
|
||||
|
||||
## Leaderboard submission
|
||||
|
||||
Results can be submitted to the [RoboTwin 2.0 leaderboard](https://robotwin-platform.github.io/leaderboard). The official protocol requires:
|
||||
|
||||
- Training on 50 `demo_clean` demonstrations per task
|
||||
- Evaluating 100 episodes per task
|
||||
- Reporting success rate separately for **Easy** (`demo_clean`) and **Hard** (`demo_randomized`) settings
|
||||
|
||||
For submission instructions, refer to the [RoboTwin 2.0 documentation](https://robotwin-platform.github.io/doc/).
|
||||
@@ -212,6 +212,15 @@ aloha = ["lerobot[dataset]", "gym-aloha>=0.1.2,<0.2.0", "lerobot[scipy-dep]"]
|
||||
pusht = ["lerobot[dataset]", "gym-pusht>=0.1.5,<0.2.0", "pymunk>=6.6.0,<7.0.0"] # TODO: Fix pymunk version in gym-pusht instead
|
||||
libero = ["lerobot[dataset]", "lerobot[transformers-dep]", "hf-libero>=0.1.3,<0.2.0; sys_platform == 'linux'", "lerobot[scipy-dep]"]
|
||||
metaworld = ["lerobot[dataset]", "metaworld==3.0.0", "lerobot[scipy-dep]"]
|
||||
# NOTE: robomme is NOT a pyproject extra — mani-skill hard-pins numpy<2
|
||||
# which conflicts with lerobot's numpy>=2 base pin, so the two trees can't
|
||||
# resolve into a single env. Install it only in the RoboMME Docker image
|
||||
# via `uv pip install --override` (see docker/Dockerfile.benchmark.robomme).
|
||||
# NOTE: robocasa is NOT exposed as a `lerobot` extra. Its setup.py pins
|
||||
# `lerobot==0.3.3` in install_requires, which cyclically shadows our own
|
||||
# workspace `lerobot` and makes the graph unsolvable under any resolver
|
||||
# (uv, pip). Install it manually alongside robosuite — see
|
||||
# docs/source/robocasa.mdx for the recipe.
|
||||
|
||||
# All
|
||||
all = [
|
||||
|
||||
@@ -31,9 +31,23 @@ from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# LIBERO-plus derives task.language by space-joining the perturbation-variant
|
||||
# filename (grab_language_from_filename in libero/libero/benchmark/__init__.py),
|
||||
# so non-_language_ variants inherit a trailing metadata blob like
|
||||
# "view 0 0 100 0 0 initstate 0 noise 45" or "add 16". Strip those tokens so
|
||||
# the description matches the base instruction used in the training dataset.
|
||||
_LIBERO_PERTURBATION_TAIL_RE = re.compile(
|
||||
r"(?:\s(?:view|initstate|noise|add|tb|table|light|level)(?:\s\d+)+)+$"
|
||||
)
|
||||
|
||||
|
||||
def _strip_libero_perturbation_tail(instruction: str) -> str:
|
||||
return _LIBERO_PERTURBATION_TAIL_RE.sub("", instruction).strip()
|
||||
|
||||
|
||||
def _libero_descriptions(task_suite: str) -> dict[str, str]:
|
||||
from libero.libero import benchmark # type: ignore[import-untyped]
|
||||
@@ -47,7 +61,10 @@ def _libero_descriptions(task_suite: str) -> dict[str, str]:
|
||||
)
|
||||
return {}
|
||||
suite = suite_dict[task_suite]()
|
||||
return {f"{task_suite}_{i}": suite.get_task(i).language for i in range(suite.n_tasks)}
|
||||
return {
|
||||
f"{task_suite}_{i}": _strip_libero_perturbation_tail(suite.get_task(i).language)
|
||||
for i in range(suite.n_tasks)
|
||||
}
|
||||
|
||||
|
||||
def _metaworld_descriptions(task_name: str) -> dict[str, str]:
|
||||
@@ -57,19 +74,103 @@ def _metaworld_descriptions(task_name: str) -> dict[str, str]:
|
||||
return {f"{task_name}_0": label}
|
||||
|
||||
|
||||
def _robotwin_descriptions(task_names: str) -> dict[str, str]:
|
||||
"""Return descriptions for each requested RoboTwin task. Reads
|
||||
`description/task_instruction/<task>.json` from the RoboTwin clone
|
||||
(cwd is /opt/robotwin in CI). Falls back to the task name if missing."""
|
||||
out: dict[str, str] = {}
|
||||
root = Path("description/task_instruction")
|
||||
for name in (t.strip() for t in task_names.split(",") if t.strip()):
|
||||
desc_file = root / f"{name}.json"
|
||||
desc = name.replace("_", " ")
|
||||
if desc_file.is_file():
|
||||
data = json.loads(desc_file.read_text())
|
||||
full = data.get("full_description") or desc
|
||||
# Strip the schema placeholders ({A}, {a}) — keep the sentence readable.
|
||||
desc = full.replace("<", "").replace(">", "")
|
||||
out[f"{name}_0"] = desc
|
||||
return out
|
||||
|
||||
|
||||
def _robocasa_descriptions(task_spec: str) -> dict[str, str]:
|
||||
"""For each task in the comma-separated list, emit a cleaned-name label.
|
||||
|
||||
RoboCasa episodes carry their language instruction in the env's
|
||||
`ep_meta['lang']`, populated per reset. Pulling it requires spinning
|
||||
up the full kitchen env per task (~seconds each); we use the task
|
||||
name as the key here and let the eval's episode info carry the
|
||||
actual instruction.
|
||||
"""
|
||||
out: dict[str, str] = {}
|
||||
for task in (t.strip() for t in task_spec.split(",") if t.strip()):
|
||||
# Split CamelCase into words: "CloseFridge" → "close fridge".
|
||||
label = "".join(f" {c.lower()}" if c.isupper() else c for c in task).strip()
|
||||
out[f"{task}_0"] = label or task
|
||||
return out
|
||||
|
||||
|
||||
_ROBOMME_DESCRIPTIONS = {
|
||||
"BinFill": "Fill the target bin with the correct number of cubes",
|
||||
"PickXtimes": "Pick the indicated cube the specified number of times",
|
||||
"SwingXtimes": "Swing the object the specified number of times",
|
||||
"StopCube": "Grasp and stop the moving cube",
|
||||
"VideoUnmask": "Pick the cube shown in the reference video",
|
||||
"VideoUnmaskSwap": "Pick the cube matching the reference video after a swap",
|
||||
"ButtonUnmask": "Press the button indicated by the reference",
|
||||
"ButtonUnmaskSwap": "Press the correct button after objects are swapped",
|
||||
"PickHighlight": "Pick the highlighted cube",
|
||||
"VideoRepick": "Repick the cube shown in the reference video",
|
||||
"VideoPlaceButton": "Place the cube on the button shown in the video",
|
||||
"VideoPlaceOrder": "Place cubes in the order shown in the video",
|
||||
"MoveCube": "Move the cube to the target location",
|
||||
"InsertPeg": "Insert the peg into the target hole",
|
||||
"PatternLock": "Unlock the pattern by pressing buttons in sequence",
|
||||
"RouteStick": "Route the stick through the required waypoints",
|
||||
}
|
||||
|
||||
|
||||
def _robomme_descriptions(task_names: str, task_ids: list[int] | None = None) -> dict[str, str]:
|
||||
"""Return descriptions for each requested RoboMME task. Keys match the
|
||||
video filename pattern `<task>_<task_id>` used by the eval script."""
|
||||
if task_ids is None:
|
||||
task_ids = [0]
|
||||
out: dict[str, str] = {}
|
||||
for name in (t.strip() for t in task_names.split(",") if t.strip()):
|
||||
desc = _ROBOMME_DESCRIPTIONS.get(name, name)
|
||||
for tid in task_ids:
|
||||
out[f"{name}_{tid}"] = desc
|
||||
return out
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--env", required=True, help="Environment family (libero, metaworld, ...)")
|
||||
parser.add_argument("--task", required=True, help="Task/suite name (e.g. libero_spatial)")
|
||||
parser.add_argument(
|
||||
"--task-ids",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Comma-separated task IDs (e.g. '0,1,2'). Default: [0]",
|
||||
)
|
||||
parser.add_argument("--output", required=True, help="Path to write task_descriptions.json")
|
||||
args = parser.parse_args()
|
||||
|
||||
task_ids: list[int] | None = None
|
||||
if args.task_ids:
|
||||
task_ids = [int(x.strip()) for x in args.task_ids.split(",")]
|
||||
|
||||
descriptions: dict[str, str] = {}
|
||||
try:
|
||||
if args.env == "libero":
|
||||
if args.env == ("libero", "libero_plus"):
|
||||
descriptions = _libero_descriptions(args.task)
|
||||
elif args.env == "metaworld":
|
||||
descriptions = _metaworld_descriptions(args.task)
|
||||
elif args.env == "robotwin":
|
||||
descriptions = _robotwin_descriptions(args.task)
|
||||
elif args.env == "robocasa":
|
||||
descriptions = _robocasa_descriptions(args.task)
|
||||
elif args.env == "robomme":
|
||||
descriptions = _robomme_descriptions(args.task, task_ids=task_ids)
|
||||
else:
|
||||
print(
|
||||
f"[extract_task_descriptions] No description extractor for env '{args.env}'.",
|
||||
|
||||
@@ -331,6 +331,7 @@ class LiberoEnv(EnvConfig):
|
||||
camera_name_mapping: dict[str, str] | None = None
|
||||
observation_height: int = 360
|
||||
observation_width: int = 360
|
||||
is_libero_plus: bool = False
|
||||
features: dict[str, PolicyFeature] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(7,)),
|
||||
@@ -432,6 +433,7 @@ class LiberoEnv(EnvConfig):
|
||||
control_mode=self.control_mode,
|
||||
episode_length=self.episode_length,
|
||||
camera_name_mapping=self.camera_name_mapping,
|
||||
is_libero_plus=self.is_libero_plus,
|
||||
)
|
||||
|
||||
def get_env_processors(self):
|
||||
@@ -496,6 +498,81 @@ class MetaworldEnv(EnvConfig):
|
||||
)
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("robocasa")
|
||||
@dataclass
|
||||
class RoboCasaEnv(EnvConfig):
|
||||
task: str = "CloseFridge"
|
||||
fps: int = 20
|
||||
episode_length: int = 1000
|
||||
obs_type: str = "pixels_agent_pos"
|
||||
render_mode: str = "rgb_array"
|
||||
camera_name: str = "robot0_agentview_left,robot0_eye_in_hand,robot0_agentview_right"
|
||||
observation_height: int = 256
|
||||
observation_width: int = 256
|
||||
visualization_height: int = 512
|
||||
visualization_width: int = 512
|
||||
split: str | None = None
|
||||
# Object-mesh registries to sample from. Upstream default is
|
||||
# ("objaverse", "lightwheel"), but objaverse is ~30GB and the CI image
|
||||
# only ships the lightwheel pack. Override to include objaverse once
|
||||
# you've run `python -m robocasa.scripts.download_kitchen_assets
|
||||
# --type objaverse` locally.
|
||||
obj_registries: list[str] = field(default_factory=lambda: ["lightwheel"])
|
||||
features: dict[str, PolicyFeature] = field(
|
||||
default_factory=lambda: {ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(12,))}
|
||||
)
|
||||
features_map: dict[str, str] = field(default_factory=lambda: {ACTION: ACTION, "agent_pos": OBS_STATE})
|
||||
|
||||
def __post_init__(self):
|
||||
if self.obs_type not in ("pixels", "pixels_agent_pos"):
|
||||
raise ValueError(f"Unsupported obs_type: {self.obs_type}")
|
||||
|
||||
# Preserve raw RoboCasa camera names end-to-end (e.g.
|
||||
# `observation.images.robot0_agentview_left`). This matches the
|
||||
# naming convention used by the RoboCasa datasets on the Hub, so
|
||||
# trained policies don't need a `--rename_map` at eval time.
|
||||
cams = [c.strip() for c in self.camera_name.split(",") if c.strip()]
|
||||
for cam in cams:
|
||||
self.features[f"pixels/{cam}"] = PolicyFeature(
|
||||
type=FeatureType.VISUAL,
|
||||
shape=(self.observation_height, self.observation_width, 3),
|
||||
)
|
||||
self.features_map[f"pixels/{cam}"] = f"{OBS_IMAGES}.{cam}"
|
||||
|
||||
if self.obs_type == "pixels_agent_pos":
|
||||
self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(16,))
|
||||
|
||||
@property
|
||||
def gym_kwargs(self) -> dict:
|
||||
kwargs: dict[str, Any] = {
|
||||
"obs_type": self.obs_type,
|
||||
"render_mode": self.render_mode,
|
||||
"observation_height": self.observation_height,
|
||||
"observation_width": self.observation_width,
|
||||
"visualization_height": self.visualization_height,
|
||||
"visualization_width": self.visualization_width,
|
||||
}
|
||||
if self.split is not None:
|
||||
kwargs["split"] = self.split
|
||||
return kwargs
|
||||
|
||||
def create_envs(self, n_envs: int, use_async_envs: bool = False):
|
||||
from .robocasa import create_robocasa_envs
|
||||
|
||||
if self.task is None:
|
||||
raise ValueError("RoboCasaEnv requires a task to be specified")
|
||||
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
|
||||
return create_robocasa_envs(
|
||||
task=self.task,
|
||||
n_envs=n_envs,
|
||||
camera_name=self.camera_name,
|
||||
gym_kwargs=self.gym_kwargs,
|
||||
env_cls=env_cls,
|
||||
episode_length=self.episode_length,
|
||||
obj_registries=tuple(self.obj_registries),
|
||||
)
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("isaaclab_arena")
|
||||
@dataclass
|
||||
class IsaaclabArenaEnv(HubEnvConfig):
|
||||
@@ -574,3 +651,171 @@ class IsaaclabArenaEnv(HubEnvConfig):
|
||||
),
|
||||
PolicyProcessorPipeline(steps=[]),
|
||||
)
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("libero_plus")
|
||||
@dataclass
|
||||
class LiberoPlusEnv(LiberoEnv):
|
||||
"""Config for LIBERO-plus robustness benchmark evaluation.
|
||||
|
||||
LIBERO-plus extends LIBERO with 7 perturbation dimensions (camera viewpoints,
|
||||
object layouts, robot initial states, language instructions, lighting, background
|
||||
textures, sensor noise) producing ~10k task variants.
|
||||
|
||||
The gym interface is identical to LIBERO so this class reuses ``LiberoEnv``
|
||||
entirely — only the registered name and default task suite differ.
|
||||
|
||||
Install: see docker/Dockerfile.benchmark.libero_plus — LIBERO-plus ships
|
||||
as a namespace package from a git fork and must be cloned + PYTHONPATH'd
|
||||
rather than installed as a pyproject extra.
|
||||
|
||||
See Also:
|
||||
https://github.com/sylvestf/LIBERO-plus
|
||||
"""
|
||||
|
||||
task: str = "libero_spatial"
|
||||
is_libero_plus: bool = True
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("robotwin")
|
||||
@dataclass
|
||||
class RoboTwinEnvConfig(EnvConfig):
|
||||
"""Configuration for RoboTwin 2.0 benchmark environments.
|
||||
|
||||
RoboTwin 2.0 is a dual-arm manipulation benchmark with 50 tasks built on the
|
||||
SAPIEN simulator. The robot is an Aloha-AgileX bimanual platform with 14 DOF
|
||||
(7 per arm). All three cameras are enabled by default.
|
||||
|
||||
See: https://robotwin-platform.github.io
|
||||
Dataset: https://huggingface.co/datasets/lerobot/robotwin_unified
|
||||
"""
|
||||
|
||||
task: str = "beat_block_hammer" # single task or comma-separated list
|
||||
fps: int = 25
|
||||
episode_length: int = 300
|
||||
obs_type: str = "pixels_agent_pos"
|
||||
render_mode: str = "rgb_array"
|
||||
# Available cameras from RoboTwin's aloha-agilex embodiment: head_camera
|
||||
# (torso-mounted) + left_camera / right_camera (wrists).
|
||||
camera_names: str = "head_camera,left_camera,right_camera"
|
||||
# Match the D435 dims in task_config/demo_clean.yml (_camera_config.yml).
|
||||
# Gym's vector-env concatenate pre-allocates buffers of this shape, so it
|
||||
# must equal what SAPIEN actually renders.
|
||||
observation_height: int = 240
|
||||
observation_width: int = 320
|
||||
features: dict[str, PolicyFeature] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(14,)),
|
||||
}
|
||||
)
|
||||
features_map: dict[str, str] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: ACTION,
|
||||
"pixels/head_camera": f"{OBS_IMAGES}.head_camera",
|
||||
"pixels/left_camera": f"{OBS_IMAGES}.left_camera",
|
||||
"pixels/right_camera": f"{OBS_IMAGES}.right_camera",
|
||||
"agent_pos": OBS_STATE,
|
||||
}
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()]
|
||||
for cam in cam_list:
|
||||
self.features[f"pixels/{cam}"] = PolicyFeature(
|
||||
type=FeatureType.VISUAL,
|
||||
shape=(self.observation_height, self.observation_width, 3),
|
||||
)
|
||||
# Keep features_map entry if already set (default_factory); add if missing.
|
||||
key = f"pixels/{cam}"
|
||||
if key not in self.features_map:
|
||||
self.features_map[key] = f"{OBS_IMAGES}.{cam}"
|
||||
|
||||
if self.obs_type == "pixels_agent_pos":
|
||||
self.features["agent_pos"] = PolicyFeature(
|
||||
type=FeatureType.STATE,
|
||||
shape=(14,), # 14 DOF: 7 per arm
|
||||
)
|
||||
elif self.obs_type != "pixels":
|
||||
raise ValueError(
|
||||
f"Unsupported obs_type '{self.obs_type}'. "
|
||||
"RoboTwinEnvConfig supports 'pixels' and 'pixels_agent_pos'."
|
||||
)
|
||||
|
||||
@property
|
||||
def gym_kwargs(self) -> dict:
|
||||
return {}
|
||||
|
||||
def create_envs(self, n_envs: int, use_async_envs: bool = True):
|
||||
from lerobot.envs.robotwin import create_robotwin_envs
|
||||
|
||||
if not self.task:
|
||||
raise ValueError("RoboTwinEnvConfig requires `task` to be specified.")
|
||||
|
||||
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
|
||||
cam_list = [c.strip() for c in self.camera_names.split(",") if c.strip()]
|
||||
return create_robotwin_envs(
|
||||
task=self.task,
|
||||
n_envs=n_envs,
|
||||
env_cls=env_cls,
|
||||
camera_names=cam_list,
|
||||
observation_height=self.observation_height,
|
||||
observation_width=self.observation_width,
|
||||
episode_length=self.episode_length,
|
||||
)
|
||||
|
||||
|
||||
@EnvConfig.register_subclass("robomme")
|
||||
@dataclass
|
||||
class RoboMMEEnv(EnvConfig):
|
||||
"""RoboMME memory-augmented manipulation benchmark (ManiSkill/SAPIEN).
|
||||
|
||||
16 tasks across 4 suites: Counting, Permanence, Reference, Imitation.
|
||||
Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes).
|
||||
Benchmark: https://github.com/RoboMME/robomme_benchmark
|
||||
|
||||
Requires the `robomme` git package installed separately (Linux only);
|
||||
see docker/Dockerfile.benchmark.robomme for the canonical install.
|
||||
"""
|
||||
|
||||
task: str = "PickXtimes"
|
||||
fps: int = 10
|
||||
episode_length: int = 300
|
||||
action_space: str = "joint_angle" # or "ee_pose" (7-D)
|
||||
dataset_split: str = "test" # "train" | "val" | "test"
|
||||
task_ids: list[int] | None = None
|
||||
features: dict[str, PolicyFeature] = field(default_factory=dict)
|
||||
features_map: dict[str, str] = field(
|
||||
default_factory=lambda: {
|
||||
ACTION: ACTION,
|
||||
"pixels/image": f"{OBS_IMAGES}.image",
|
||||
"pixels/wrist_image": f"{OBS_IMAGES}.wrist_image",
|
||||
"agent_pos": OBS_STATE,
|
||||
}
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
action_dim = 8 if self.action_space == "joint_angle" else 7
|
||||
self.features = {
|
||||
ACTION: PolicyFeature(type=FeatureType.ACTION, shape=(action_dim,)),
|
||||
"pixels/image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
|
||||
"pixels/wrist_image": PolicyFeature(type=FeatureType.VISUAL, shape=(256, 256, 3)),
|
||||
"agent_pos": PolicyFeature(type=FeatureType.STATE, shape=(8,)),
|
||||
}
|
||||
|
||||
@property
|
||||
def gym_kwargs(self) -> dict:
|
||||
return {}
|
||||
|
||||
def create_envs(self, n_envs: int, use_async_envs: bool = True):
|
||||
from lerobot.envs.robomme import create_robomme_envs
|
||||
|
||||
env_cls = _make_vec_env_cls(use_async_envs, n_envs)
|
||||
return create_robomme_envs(
|
||||
task=self.task,
|
||||
n_envs=n_envs,
|
||||
action_space_type=self.action_space,
|
||||
dataset=self.dataset_split,
|
||||
episode_length=self.episode_length,
|
||||
task_ids=self.task_ids,
|
||||
env_cls=env_cls,
|
||||
)
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable, Iterable, Mapping, Sequence
|
||||
from functools import partial
|
||||
@@ -31,20 +32,7 @@ from libero.libero.envs import OffScreenRenderEnv
|
||||
|
||||
from lerobot.types import RobotObservation
|
||||
|
||||
from .utils import _LazyAsyncVectorEnv
|
||||
|
||||
|
||||
def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
|
||||
"""Normalize camera_name into a non-empty list of strings."""
|
||||
if isinstance(camera_name, str):
|
||||
cams = [c.strip() for c in camera_name.split(",") if c.strip()]
|
||||
elif isinstance(camera_name, (list | tuple)):
|
||||
cams = [str(c).strip() for c in camera_name if str(c).strip()]
|
||||
else:
|
||||
raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
|
||||
if not cams:
|
||||
raise ValueError("camera_name resolved to an empty list.")
|
||||
return cams
|
||||
from .utils import _LazyAsyncVectorEnv, parse_camera_names
|
||||
|
||||
|
||||
def _get_suite(name: str) -> benchmark.Benchmark:
|
||||
@@ -69,14 +57,34 @@ def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> list[i
|
||||
return ids
|
||||
|
||||
|
||||
def get_task_init_states(task_suite: Any, i: int) -> np.ndarray:
|
||||
init_states_path = (
|
||||
Path(get_libero_path("init_states"))
|
||||
/ task_suite.tasks[i].problem_folder
|
||||
/ task_suite.tasks[i].init_states_file
|
||||
)
|
||||
init_states = torch.load(init_states_path, weights_only=False) # nosec B614
|
||||
return init_states
|
||||
# LIBERO-plus perturbation variants encode the perturbation in the filename
|
||||
# but on disk only the base `.pruned_init` exists — strip the suffix to match
|
||||
# LIBERO-plus's own suite.get_task_init_states() (we reimplement it here so we
|
||||
# can pass weights_only=False for PyTorch 2.6+ numpy pickles).
|
||||
_LIBERO_PERTURBATION_SUFFIX_RE = re.compile(r"_(?:language|view|light)_[^.]*|_(?:table|tb)_\d+")
|
||||
|
||||
|
||||
def get_task_init_states(task_suite: Any, i: int, is_libero_plus: bool = False) -> np.ndarray:
|
||||
task = task_suite.tasks[i]
|
||||
filename = Path(task.init_states_file)
|
||||
root = Path(get_libero_path("init_states"))
|
||||
|
||||
if not is_libero_plus:
|
||||
init_states_path = root / task.problem_folder / filename.name
|
||||
return torch.load(init_states_path, weights_only=False) # nosec B614
|
||||
|
||||
# LIBERO-plus: `_add_` / `_level` variants store extra-object layouts under
|
||||
# libero_newobj/ as a flat array that must be reshaped to (1, -1).
|
||||
if "_add_" in filename.name or "_level" in filename.name:
|
||||
init_states_path = root / "libero_newobj" / task.problem_folder / filename.name
|
||||
init_states = torch.load(init_states_path, weights_only=False) # nosec B614
|
||||
return init_states.reshape(1, -1)
|
||||
|
||||
# LIBERO-plus perturbation variants encode the perturbation in the filename
|
||||
# but on disk only the base `.pruned_init` exists — strip the suffix to match.
|
||||
stripped = _LIBERO_PERTURBATION_SUFFIX_RE.sub("", filename.stem) + filename.suffix
|
||||
init_states_path = root / task.problem_folder / stripped
|
||||
return torch.load(init_states_path, weights_only=False) # nosec B614
|
||||
|
||||
|
||||
def get_libero_dummy_action():
|
||||
@@ -118,9 +126,11 @@ class LiberoEnv(gym.Env):
|
||||
camera_name_mapping: dict[str, str] | None = None,
|
||||
num_steps_wait: int = 10,
|
||||
control_mode: str = "relative",
|
||||
is_libero_plus: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
self.task_id = task_id
|
||||
self.is_libero_plus = is_libero_plus
|
||||
self.obs_type = obs_type
|
||||
self.render_mode = render_mode
|
||||
self.observation_width = observation_width
|
||||
@@ -128,7 +138,7 @@ class LiberoEnv(gym.Env):
|
||||
self.visualization_width = visualization_width
|
||||
self.visualization_height = visualization_height
|
||||
self.init_states = init_states
|
||||
self.camera_name = _parse_camera_names(
|
||||
self.camera_name = parse_camera_names(
|
||||
camera_name
|
||||
) # agentview_image (main) or robot0_eye_in_hand_image (wrist)
|
||||
|
||||
@@ -147,7 +157,11 @@ class LiberoEnv(gym.Env):
|
||||
self.episode_index = episode_index
|
||||
self.episode_length = episode_length
|
||||
# Load once and keep
|
||||
self._init_states = get_task_init_states(task_suite, self.task_id) if self.init_states else None
|
||||
self._init_states = (
|
||||
get_task_init_states(task_suite, self.task_id, is_libero_plus=self.is_libero_plus)
|
||||
if self.init_states
|
||||
else None
|
||||
)
|
||||
self._reset_stride = n_envs # when performing a reset, append `_reset_stride` to `init_state_id`.
|
||||
|
||||
self.init_state_id = self.episode_index # tie each sub-env to a fixed init state
|
||||
@@ -380,6 +394,7 @@ def _make_env_fns(
|
||||
gym_kwargs: Mapping[str, Any],
|
||||
control_mode: str,
|
||||
camera_name_mapping: dict[str, str] | None = None,
|
||||
is_libero_plus: bool = False,
|
||||
) -> list[Callable[[], LiberoEnv]]:
|
||||
"""Build n_envs factory callables for a single (suite, task_id)."""
|
||||
|
||||
@@ -396,6 +411,7 @@ def _make_env_fns(
|
||||
n_envs=n_envs,
|
||||
control_mode=control_mode,
|
||||
camera_name_mapping=camera_name_mapping,
|
||||
is_libero_plus=is_libero_plus,
|
||||
**local_kwargs,
|
||||
)
|
||||
|
||||
@@ -418,6 +434,7 @@ def create_libero_envs(
|
||||
control_mode: str = "relative",
|
||||
episode_length: int | None = None,
|
||||
camera_name_mapping: dict[str, str] | None = None,
|
||||
is_libero_plus: bool = False,
|
||||
) -> dict[str, dict[int, Any]]:
|
||||
"""
|
||||
Create vectorized LIBERO environments with a consistent return shape.
|
||||
@@ -437,7 +454,7 @@ def create_libero_envs(
|
||||
gym_kwargs = dict(gym_kwargs or {})
|
||||
task_ids_filter = gym_kwargs.pop("task_ids", None) # optional: limit to specific tasks
|
||||
|
||||
camera_names = _parse_camera_names(camera_name)
|
||||
camera_names = parse_camera_names(camera_name)
|
||||
suite_names = [s.strip() for s in str(task).split(",") if s.strip()]
|
||||
if not suite_names:
|
||||
raise ValueError("`task` must contain at least one LIBERO suite name.")
|
||||
@@ -462,6 +479,7 @@ def create_libero_envs(
|
||||
# Probe once and reuse to avoid creating a temp env per task.
|
||||
cached_obs_space: spaces.Space | None = None
|
||||
cached_act_space: spaces.Space | None = None
|
||||
cached_metadata: dict[str, Any] | None = None
|
||||
|
||||
for tid in selected:
|
||||
fns = _make_env_fns(
|
||||
@@ -475,12 +493,14 @@ def create_libero_envs(
|
||||
gym_kwargs=gym_kwargs,
|
||||
control_mode=control_mode,
|
||||
camera_name_mapping=camera_name_mapping,
|
||||
is_libero_plus=is_libero_plus,
|
||||
)
|
||||
if is_async:
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
|
||||
if cached_obs_space is None:
|
||||
cached_obs_space = lazy.observation_space
|
||||
cached_act_space = lazy.action_space
|
||||
cached_metadata = lazy.metadata
|
||||
out[suite_name][tid] = lazy
|
||||
else:
|
||||
out[suite_name][tid] = env_cls(fns)
|
||||
|
||||
@@ -311,6 +311,7 @@ def create_metaworld_envs(
|
||||
is_async = env_cls is gym.vector.AsyncVectorEnv
|
||||
cached_obs_space = None
|
||||
cached_act_space = None
|
||||
cached_metadata = None
|
||||
out: dict[str, dict[int, Any]] = defaultdict(dict)
|
||||
|
||||
for group in task_groups:
|
||||
@@ -324,10 +325,11 @@ def create_metaworld_envs(
|
||||
fns = [(lambda tn=task_name: MetaworldEnv(task=tn, **gym_kwargs)) for _ in range(n_envs)]
|
||||
|
||||
if is_async:
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space)
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
|
||||
if cached_obs_space is None:
|
||||
cached_obs_space = lazy.observation_space
|
||||
cached_act_space = lazy.action_space
|
||||
cached_metadata = lazy.metadata
|
||||
out[group][tid] = lazy
|
||||
else:
|
||||
out[group][tid] = env_cls(fns)
|
||||
|
||||
425
src/lerobot/envs/robocasa.py
Normal file
425
src/lerobot/envs/robocasa.py
Normal file
@@ -0,0 +1,425 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable, Sequence
|
||||
from functools import partial
|
||||
from typing import Any
|
||||
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
from gymnasium import spaces
|
||||
|
||||
from lerobot.types import RobotObservation
|
||||
|
||||
from .utils import _LazyAsyncVectorEnv, parse_camera_names
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Dimensions for the flat action/state vectors used by the LeRobot wrapper.
|
||||
# These correspond to the PandaOmron robot in RoboCasa365.
|
||||
OBS_STATE_DIM = 16 # base_pos(3) + base_quat(4) + ee_pos_rel(3) + ee_quat_rel(4) + gripper_qpos(2)
|
||||
ACTION_DIM = 12 # base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
|
||||
ACTION_LOW = -1.0
|
||||
ACTION_HIGH = 1.0
|
||||
|
||||
# Default PandaOmron cameras. We surface these raw names directly as
|
||||
# `observation.images.<name>` so the LeRobot dataset/policy keys match
|
||||
# RoboCasa's native convention (no implicit renaming).
|
||||
DEFAULT_CAMERAS = [
|
||||
"robot0_agentview_left",
|
||||
"robot0_eye_in_hand",
|
||||
"robot0_agentview_right",
|
||||
]
|
||||
|
||||
# Object-mesh registries to sample from. RoboCasa's upstream default is
|
||||
# ("objaverse", "lightwheel"), but the objaverse pack is huge (~30GB) and
|
||||
# most users — including our CI image — only download the lightwheel pack
|
||||
# (`--type objs_lw` in `download_kitchen_assets`). When a sampled object
|
||||
# category has zero candidates in every registry, robocasa crashes with
|
||||
# `ValueError: Probabilities contain NaN` (0/0 divide in the probability
|
||||
# normalization). Restricting to registries that are actually on disk
|
||||
# avoids the NaN and matches what the asset download provides.
|
||||
DEFAULT_OBJ_REGISTRIES: tuple[str, ...] = ("lightwheel",)
|
||||
|
||||
# Task-group shortcuts accepted as `--env.task`. When the user passes one of
|
||||
# these names, we expand it to the upstream RoboCasa task list and auto-set
|
||||
# the dataset split. Individual task names (optionally comma-separated) still
|
||||
# take precedence; this only triggers on an exact group-name match.
|
||||
_TASK_GROUP_SPLITS = {
|
||||
"atomic_seen": "target",
|
||||
"composite_seen": "target",
|
||||
"composite_unseen": "target",
|
||||
"pretrain50": "pretrain",
|
||||
"pretrain100": "pretrain",
|
||||
"pretrain200": "pretrain",
|
||||
"pretrain300": "pretrain",
|
||||
}
|
||||
|
||||
|
||||
def _resolve_tasks(task: str) -> tuple[list[str], str | None]:
|
||||
"""Resolve a `--env.task` value to (task_names, split_override).
|
||||
|
||||
If `task` is a known task-group name (e.g. `atomic_seen`, `pretrain100`),
|
||||
expand it via `robocasa.utils.dataset_registry.{TARGET,PRETRAINING}_TASKS`
|
||||
and return the matching split. Otherwise treat `task` as a single task or
|
||||
comma-separated list and leave the split untouched (None).
|
||||
"""
|
||||
key = task.strip()
|
||||
if key in _TASK_GROUP_SPLITS:
|
||||
from robocasa.utils.dataset_registry import PRETRAINING_TASKS, TARGET_TASKS
|
||||
|
||||
combined = {**TARGET_TASKS, **PRETRAINING_TASKS}
|
||||
if key not in combined:
|
||||
raise ValueError(
|
||||
f"Task group '{key}' is not available in this version of robocasa. "
|
||||
f"Known groups: {sorted(combined.keys())}."
|
||||
)
|
||||
return list(combined[key]), _TASK_GROUP_SPLITS[key]
|
||||
|
||||
names = [t.strip() for t in task.split(",") if t.strip()]
|
||||
if not names:
|
||||
raise ValueError("`task` must contain at least one RoboCasa task name.")
|
||||
return names, None
|
||||
|
||||
|
||||
def convert_action(flat_action: np.ndarray) -> dict[str, Any]:
|
||||
"""Split a flat (12,) action vector into a RoboCasa action dict.
|
||||
|
||||
Layout: base_motion(4) + control_mode(1) + ee_pos(3) + ee_rot(3) + gripper(1)
|
||||
"""
|
||||
return {
|
||||
"action.base_motion": flat_action[0:4],
|
||||
"action.control_mode": flat_action[4:5],
|
||||
"action.end_effector_position": flat_action[5:8],
|
||||
"action.end_effector_rotation": flat_action[8:11],
|
||||
"action.gripper_close": flat_action[11:12],
|
||||
}
|
||||
|
||||
|
||||
class RoboCasaEnv(gym.Env):
|
||||
"""LeRobot gym.Env wrapper for RoboCasa365 kitchen environments.
|
||||
|
||||
Wraps RoboCasaGymEnv from the robocasa package and converts its
|
||||
dict-based observations and actions into the flat arrays LeRobot expects.
|
||||
Raw RoboCasa camera names are preserved verbatim under `pixels/<cam>`.
|
||||
"""
|
||||
|
||||
metadata = {"render_modes": ["rgb_array"], "render_fps": 20}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
task: str,
|
||||
camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
|
||||
obs_type: str = "pixels_agent_pos",
|
||||
render_mode: str = "rgb_array",
|
||||
observation_width: int = 256,
|
||||
observation_height: int = 256,
|
||||
visualization_width: int = 512,
|
||||
visualization_height: int = 512,
|
||||
split: str | None = None,
|
||||
episode_length: int | None = None,
|
||||
obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
|
||||
episode_index: int = 0,
|
||||
):
|
||||
super().__init__()
|
||||
self.task = task
|
||||
self.obs_type = obs_type
|
||||
self.render_mode = render_mode
|
||||
self.observation_width = observation_width
|
||||
self.observation_height = observation_height
|
||||
self.visualization_width = visualization_width
|
||||
self.visualization_height = visualization_height
|
||||
self.split = split
|
||||
self.obj_registries = tuple(obj_registries)
|
||||
# Per-worker index (0..n_envs-1) used to spread the user-provided
|
||||
# seed across factories so each sub-env explores a distinct layout
|
||||
# even when the same seed is passed to `reset()`.
|
||||
self.episode_index = int(episode_index)
|
||||
|
||||
self.camera_name = parse_camera_names(camera_name)
|
||||
|
||||
self._max_episode_steps = episode_length if episode_length is not None else 1000
|
||||
|
||||
# Deferred — created on first reset() inside the worker subprocess
|
||||
# to avoid inheriting stale GPU/EGL contexts across fork().
|
||||
self._env: Any = None
|
||||
self.task_description = ""
|
||||
|
||||
images = {
|
||||
cam: spaces.Box(
|
||||
low=0,
|
||||
high=255,
|
||||
shape=(self.observation_height, self.observation_width, 3),
|
||||
dtype=np.uint8,
|
||||
)
|
||||
for cam in self.camera_name
|
||||
}
|
||||
|
||||
if self.obs_type == "pixels":
|
||||
self.observation_space = spaces.Dict({"pixels": spaces.Dict(images)})
|
||||
elif self.obs_type == "pixels_agent_pos":
|
||||
self.observation_space = spaces.Dict(
|
||||
{
|
||||
"pixels": spaces.Dict(images),
|
||||
"agent_pos": spaces.Box(
|
||||
low=-np.inf,
|
||||
high=np.inf,
|
||||
shape=(OBS_STATE_DIM,),
|
||||
dtype=np.float32,
|
||||
),
|
||||
}
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported obs_type '{self.obs_type}'. Use 'pixels' or 'pixels_agent_pos'.")
|
||||
|
||||
self.action_space = spaces.Box(
|
||||
low=ACTION_LOW,
|
||||
high=ACTION_HIGH,
|
||||
shape=(ACTION_DIM,),
|
||||
dtype=np.float32,
|
||||
)
|
||||
|
||||
def _ensure_env(self) -> None:
|
||||
"""Create the underlying RoboCasaGymEnv on first use.
|
||||
|
||||
Called inside the worker subprocess after fork(), so each worker gets
|
||||
its own clean rendering context rather than inheriting a stale one from
|
||||
the parent process (which causes crashes with AsyncVectorEnv).
|
||||
"""
|
||||
if self._env is not None:
|
||||
return
|
||||
from robocasa.wrappers.gym_wrapper import RoboCasaGymEnv
|
||||
|
||||
# RoboCasaGymEnv defaults split="test", which create_env rejects
|
||||
# (only None/"all"/"pretrain"/"target" are valid). Always pass a
|
||||
# valid value so we don't hit that default. Extra kwargs are
|
||||
# forwarded to the underlying kitchen env via create_env/robosuite.make.
|
||||
self._env = RoboCasaGymEnv(
|
||||
env_name=self.task,
|
||||
camera_widths=self.observation_width,
|
||||
camera_heights=self.observation_height,
|
||||
split=self.split if self.split is not None else "all",
|
||||
obj_registries=self.obj_registries,
|
||||
)
|
||||
|
||||
ep_meta = self._env.env.get_ep_meta()
|
||||
self.task_description = ep_meta.get("lang", self.task)
|
||||
|
||||
def _format_raw_obs(self, raw_obs: dict) -> RobotObservation:
|
||||
"""Convert RoboCasaGymEnv observation dict to LeRobot format."""
|
||||
# RoboCasaGymEnv emits camera frames under "video.<cam>".
|
||||
images = {cam: raw_obs[f"video.{cam}"] for cam in self.camera_name if f"video.{cam}" in raw_obs}
|
||||
|
||||
if self.obs_type == "pixels":
|
||||
return {"pixels": images}
|
||||
|
||||
# `state.*` keys come from PandaOmronKeyConverter inside the wrapper.
|
||||
agent_pos = np.concatenate(
|
||||
[
|
||||
raw_obs.get("state.base_position", np.zeros(3)),
|
||||
raw_obs.get("state.base_rotation", np.zeros(4)),
|
||||
raw_obs.get("state.end_effector_position_relative", np.zeros(3)),
|
||||
raw_obs.get("state.end_effector_rotation_relative", np.zeros(4)),
|
||||
raw_obs.get("state.gripper_qpos", np.zeros(2)),
|
||||
],
|
||||
axis=-1,
|
||||
).astype(np.float32)
|
||||
|
||||
return {"pixels": images, "agent_pos": agent_pos}
|
||||
|
||||
def render(self) -> np.ndarray:
|
||||
self._ensure_env()
|
||||
assert self._env is not None
|
||||
return self._env.render()
|
||||
|
||||
def reset(self, seed=None, **kwargs):
|
||||
self._ensure_env()
|
||||
assert self._env is not None
|
||||
super().reset(seed=seed)
|
||||
# Spread the seed across workers so n_envs factories don't all
|
||||
# roll the same scene. With an explicit user seed we shift it by
|
||||
# episode_index; with no seed we fall back to episode_index so
|
||||
# each worker is still distinct rather than inheriting the same
|
||||
# global RNG state.
|
||||
worker_seed = seed + self.episode_index if seed is not None else self.episode_index
|
||||
raw_obs, info = self._env.reset(seed=worker_seed)
|
||||
|
||||
ep_meta = self._env.env.get_ep_meta()
|
||||
self.task_description = ep_meta.get("lang", self.task)
|
||||
|
||||
observation = self._format_raw_obs(raw_obs)
|
||||
info = {"is_success": False}
|
||||
return observation, info
|
||||
|
||||
def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
|
||||
self._ensure_env()
|
||||
assert self._env is not None
|
||||
if action.ndim != 1:
|
||||
raise ValueError(
|
||||
f"Expected action to be 1-D (shape (action_dim,)), "
|
||||
f"but got shape {action.shape} with ndim={action.ndim}"
|
||||
)
|
||||
|
||||
action_dict = convert_action(action)
|
||||
raw_obs, reward, done, truncated, info = self._env.step(action_dict)
|
||||
|
||||
is_success = bool(info.get("success", False))
|
||||
terminated = done or is_success
|
||||
info.update({"task": self.task, "done": done, "is_success": is_success})
|
||||
|
||||
observation = self._format_raw_obs(raw_obs)
|
||||
if terminated:
|
||||
info["final_info"] = {
|
||||
"task": self.task,
|
||||
"done": bool(done),
|
||||
"is_success": bool(is_success),
|
||||
}
|
||||
self.reset()
|
||||
|
||||
return observation, reward, terminated, truncated, info
|
||||
|
||||
def close(self):
|
||||
if self._env is not None:
|
||||
self._env.close()
|
||||
|
||||
|
||||
def _make_env_fns(
|
||||
*,
|
||||
task: str,
|
||||
n_envs: int,
|
||||
camera_names: list[str],
|
||||
obs_type: str,
|
||||
render_mode: str,
|
||||
observation_width: int,
|
||||
observation_height: int,
|
||||
visualization_width: int,
|
||||
visualization_height: int,
|
||||
split: str | None,
|
||||
episode_length: int | None,
|
||||
obj_registries: Sequence[str],
|
||||
) -> list[Callable[[], RoboCasaEnv]]:
|
||||
"""Build n_envs factory callables for a single task.
|
||||
|
||||
Each factory carries a distinct ``episode_index`` (``0..n_envs-1``) so
|
||||
``RoboCasaEnv.reset()`` can derive a per-worker seed series from the
|
||||
user-provided seed.
|
||||
"""
|
||||
|
||||
def _make_env(episode_index: int) -> RoboCasaEnv:
|
||||
return RoboCasaEnv(
|
||||
task=task,
|
||||
camera_name=camera_names,
|
||||
obs_type=obs_type,
|
||||
render_mode=render_mode,
|
||||
observation_width=observation_width,
|
||||
observation_height=observation_height,
|
||||
visualization_width=visualization_width,
|
||||
visualization_height=visualization_height,
|
||||
split=split,
|
||||
episode_length=episode_length,
|
||||
obj_registries=obj_registries,
|
||||
episode_index=episode_index,
|
||||
)
|
||||
|
||||
return [partial(_make_env, i) for i in range(n_envs)]
|
||||
|
||||
|
||||
def create_robocasa_envs(
|
||||
task: str,
|
||||
n_envs: int,
|
||||
gym_kwargs: dict[str, Any] | None = None,
|
||||
camera_name: str | Sequence[str] = ",".join(DEFAULT_CAMERAS),
|
||||
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
|
||||
episode_length: int | None = None,
|
||||
obj_registries: Sequence[str] = DEFAULT_OBJ_REGISTRIES,
|
||||
) -> dict[str, dict[int, Any]]:
|
||||
"""Create vectorized RoboCasa365 environments with a consistent return shape.
|
||||
|
||||
Returns:
|
||||
dict[task_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories)
|
||||
|
||||
`task` can be:
|
||||
- a single task name (e.g. `CloseFridge`)
|
||||
- a comma-separated list of task names (e.g. `CloseFridge,PickPlaceCoffee`)
|
||||
- a benchmark-group shortcut (`atomic_seen`, `composite_seen`,
|
||||
`composite_unseen`, `pretrain50`, `pretrain100`, `pretrain200`,
|
||||
`pretrain300`), which auto-expands to the upstream task list and
|
||||
auto-sets the dataset `split` ("target" or "pretrain").
|
||||
"""
|
||||
if env_cls is None or not callable(env_cls):
|
||||
raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
|
||||
if not isinstance(n_envs, int) or n_envs <= 0:
|
||||
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
|
||||
|
||||
gym_kwargs = dict(gym_kwargs or {})
|
||||
obs_type = gym_kwargs.pop("obs_type", "pixels_agent_pos")
|
||||
render_mode = gym_kwargs.pop("render_mode", "rgb_array")
|
||||
observation_width = gym_kwargs.pop("observation_width", 256)
|
||||
observation_height = gym_kwargs.pop("observation_height", 256)
|
||||
visualization_width = gym_kwargs.pop("visualization_width", 512)
|
||||
visualization_height = gym_kwargs.pop("visualization_height", 512)
|
||||
split = gym_kwargs.pop("split", None)
|
||||
|
||||
camera_names = parse_camera_names(camera_name)
|
||||
task_names, group_split = _resolve_tasks(str(task))
|
||||
if group_split is not None and split is None:
|
||||
split = group_split
|
||||
|
||||
logger.info(
|
||||
"Creating RoboCasa envs | tasks=%s | split=%s | n_envs(per task)=%d",
|
||||
task_names,
|
||||
split,
|
||||
n_envs,
|
||||
)
|
||||
|
||||
is_async = env_cls is gym.vector.AsyncVectorEnv
|
||||
|
||||
cached_obs_space: spaces.Space | None = None
|
||||
cached_act_space: spaces.Space | None = None
|
||||
cached_metadata: dict[str, Any] | None = None
|
||||
out: dict[str, dict[int, Any]] = defaultdict(dict)
|
||||
|
||||
for task_name in task_names:
|
||||
fns = _make_env_fns(
|
||||
task=task_name,
|
||||
n_envs=n_envs,
|
||||
camera_names=camera_names,
|
||||
obs_type=obs_type,
|
||||
render_mode=render_mode,
|
||||
observation_width=observation_width,
|
||||
observation_height=observation_height,
|
||||
visualization_width=visualization_width,
|
||||
visualization_height=visualization_height,
|
||||
split=split,
|
||||
episode_length=episode_length,
|
||||
obj_registries=obj_registries,
|
||||
)
|
||||
|
||||
if is_async:
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
|
||||
if cached_obs_space is None:
|
||||
cached_obs_space = lazy.observation_space
|
||||
cached_act_space = lazy.action_space
|
||||
cached_metadata = lazy.metadata
|
||||
out[task_name][0] = lazy
|
||||
else:
|
||||
out[task_name][0] = env_cls(fns)
|
||||
logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs)
|
||||
|
||||
return {name: dict(task_map) for name, task_map in out.items()}
|
||||
245
src/lerobot/envs/robomme.py
Normal file
245
src/lerobot/envs/robomme.py
Normal file
@@ -0,0 +1,245 @@
|
||||
"""RoboMME environment wrapper for LeRobot evaluation.
|
||||
|
||||
Wraps the RoboMME ``BenchmarkEnvBuilder`` into a Gymnasium-compatible
|
||||
``VectorEnv`` suitable for ``lerobot_eval``.
|
||||
|
||||
RoboMME tasks:
|
||||
Counting: BinFill, PickXtimes, SwingXtimes, StopCube
|
||||
Permanence: VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap
|
||||
Reference: PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder
|
||||
Imitation: MoveCube, InsertPeg, PatternLock, RouteStick
|
||||
|
||||
Dataset: lerobot/robomme (LeRobot v3.0, 1,600 episodes)
|
||||
Install: see docker/Dockerfile.benchmark.robomme (Linux only — mani-skill vs numpy pin conflict)
|
||||
Benchmark: https://github.com/RoboMME/robomme_benchmark
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Callable, Sequence
|
||||
from functools import partial
|
||||
from typing import Any
|
||||
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
from gymnasium import spaces
|
||||
|
||||
from .utils import _LazyAsyncVectorEnv
|
||||
|
||||
ROBOMME_TASKS = [
|
||||
"BinFill",
|
||||
"PickXtimes",
|
||||
"SwingXtimes",
|
||||
"StopCube",
|
||||
"VideoUnmask",
|
||||
"VideoUnmaskSwap",
|
||||
"ButtonUnmask",
|
||||
"ButtonUnmaskSwap",
|
||||
"PickHighlight",
|
||||
"VideoRepick",
|
||||
"VideoPlaceButton",
|
||||
"VideoPlaceOrder",
|
||||
"MoveCube",
|
||||
"InsertPeg",
|
||||
"PatternLock",
|
||||
"RouteStick",
|
||||
]
|
||||
|
||||
|
||||
class RoboMMEGymEnv(gym.Env):
|
||||
"""Thin Gymnasium wrapper around a single RoboMME episode env."""
|
||||
|
||||
metadata = {"render_modes": ["rgb_array"], "render_fps": 10}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
task: str = "PickXtimes",
|
||||
action_space_type: str = "joint_angle",
|
||||
dataset: str = "test",
|
||||
episode_idx: int = 0,
|
||||
max_steps: int = 300,
|
||||
):
|
||||
super().__init__()
|
||||
from robomme.env_record_wrapper import BenchmarkEnvBuilder
|
||||
|
||||
self._task = task
|
||||
self._action_space_type = action_space_type
|
||||
self._dataset = dataset
|
||||
self._episode_idx = episode_idx
|
||||
self._max_steps = max_steps
|
||||
self._max_episode_steps = max_steps
|
||||
|
||||
self._builder = BenchmarkEnvBuilder(
|
||||
env_id=task,
|
||||
dataset=dataset,
|
||||
action_space=action_space_type,
|
||||
gui_render=False,
|
||||
max_steps=max_steps,
|
||||
)
|
||||
self._env = None
|
||||
self._last_raw_obs: dict | None = None
|
||||
|
||||
action_dim = 8 if action_space_type == "joint_angle" else 7
|
||||
self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_dim,), dtype=np.float32)
|
||||
# `pixels` must be a nested Dict so `preprocess_observation()` in
|
||||
# envs/utils.py picks it up and maps each camera to
|
||||
# `observation.images.<cam>`. A flat layout (`pixels/image`,
|
||||
# `pixels/wrist_image`) silently drops every image from the batch.
|
||||
self.observation_space = spaces.Dict(
|
||||
{
|
||||
"pixels": spaces.Dict(
|
||||
{
|
||||
"image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
|
||||
"wrist_image": spaces.Box(0, 255, shape=(256, 256, 3), dtype=np.uint8),
|
||||
}
|
||||
),
|
||||
"agent_pos": spaces.Box(-np.inf, np.inf, shape=(8,), dtype=np.float32),
|
||||
}
|
||||
)
|
||||
|
||||
def reset(self, *, seed=None, options=None):
|
||||
super().reset(seed=seed)
|
||||
self._env = self._builder.make_env_for_episode(
|
||||
episode_idx=self._episode_idx,
|
||||
max_steps=self._max_steps,
|
||||
)
|
||||
obs, info = self._env.reset()
|
||||
self._last_raw_obs = obs
|
||||
return self._convert_obs(obs), self._convert_info(info)
|
||||
|
||||
def step(self, action):
|
||||
obs, reward, terminated, truncated, info = self._env.step(action)
|
||||
self._last_raw_obs = obs
|
||||
|
||||
terminated_bool = bool(terminated.item()) if hasattr(terminated, "item") else bool(terminated)
|
||||
truncated_bool = bool(truncated.item()) if hasattr(truncated, "item") else bool(truncated)
|
||||
|
||||
status = info.get("status", "ongoing")
|
||||
is_success = status == "success"
|
||||
conv_info = self._convert_info(info)
|
||||
conv_info["is_success"] = is_success
|
||||
|
||||
return self._convert_obs(obs), float(reward), terminated_bool, truncated_bool, conv_info
|
||||
|
||||
def render(self) -> np.ndarray | None:
|
||||
"""Return the front camera image from the last observation for video recording."""
|
||||
if self._last_raw_obs is None:
|
||||
return np.zeros((256, 256, 3), dtype=np.uint8)
|
||||
front = self._last_raw_obs.get("front_rgb_list")
|
||||
if front is None:
|
||||
return np.zeros((256, 256, 3), dtype=np.uint8)
|
||||
frame = front[-1] if isinstance(front, list) else front
|
||||
return np.asarray(frame, dtype=np.uint8)
|
||||
|
||||
def _convert_obs(self, obs: dict) -> dict:
|
||||
front_rgb = (
|
||||
obs["front_rgb_list"][-1] if isinstance(obs["front_rgb_list"], list) else obs["front_rgb_list"]
|
||||
)
|
||||
wrist_rgb = (
|
||||
obs["wrist_rgb_list"][-1] if isinstance(obs["wrist_rgb_list"], list) else obs["wrist_rgb_list"]
|
||||
)
|
||||
joint_state = (
|
||||
obs["joint_state_list"][-1]
|
||||
if isinstance(obs["joint_state_list"], list)
|
||||
else obs["joint_state_list"]
|
||||
)
|
||||
gripper_state = (
|
||||
obs["gripper_state_list"][-1]
|
||||
if isinstance(obs["gripper_state_list"], list)
|
||||
else obs["gripper_state_list"]
|
||||
)
|
||||
|
||||
front_rgb = np.asarray(front_rgb, dtype=np.uint8)
|
||||
wrist_rgb = np.asarray(wrist_rgb, dtype=np.uint8)
|
||||
joint = np.asarray(joint_state, dtype=np.float32).flatten()[:7]
|
||||
gripper = np.asarray(gripper_state, dtype=np.float32).flatten()[:1]
|
||||
state = np.concatenate([joint, gripper])
|
||||
|
||||
return {
|
||||
"pixels": {"image": front_rgb, "wrist_image": wrist_rgb},
|
||||
"agent_pos": state,
|
||||
}
|
||||
|
||||
def _convert_info(self, info: dict) -> dict:
|
||||
return {
|
||||
"status": info.get("status", "ongoing"),
|
||||
"task_goal": info.get("task_goal", ""),
|
||||
}
|
||||
|
||||
|
||||
def _make_env_fns(
|
||||
*,
|
||||
task: str,
|
||||
n_envs: int,
|
||||
action_space_type: str,
|
||||
dataset: str,
|
||||
episode_length: int,
|
||||
task_id: int,
|
||||
) -> list[Callable[[], RoboMMEGymEnv]]:
|
||||
"""Build n_envs factory callables for one RoboMME task id."""
|
||||
|
||||
def _make_one(episode_index: int) -> RoboMMEGymEnv:
|
||||
return RoboMMEGymEnv(
|
||||
task=task,
|
||||
action_space_type=action_space_type,
|
||||
dataset=dataset,
|
||||
episode_idx=episode_index,
|
||||
max_steps=episode_length,
|
||||
)
|
||||
|
||||
return [partial(_make_one, task_id + i) for i in range(n_envs)]
|
||||
|
||||
|
||||
def create_robomme_envs(
|
||||
task: str,
|
||||
n_envs: int = 1,
|
||||
action_space_type: str = "joint_angle",
|
||||
dataset: str = "test",
|
||||
episode_length: int = 300,
|
||||
task_ids: list[int] | None = None,
|
||||
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
|
||||
) -> dict[str, dict[int, gym.vector.VectorEnv]]:
|
||||
"""Create vectorized RoboMME environments for evaluation.
|
||||
|
||||
`task` may be a single RoboMME task name (e.g. "PickXtimes") or a
|
||||
comma-separated list (e.g. "PickXtimes,BinFill,StopCube"). Each task
|
||||
becomes its own suite in the returned mapping.
|
||||
|
||||
Returns {suite_name: {task_id: VectorEnv}} matching lerobot's expected format.
|
||||
"""
|
||||
if env_cls is None or not callable(env_cls):
|
||||
raise ValueError("env_cls must be a callable that wraps a list of env factory callables.")
|
||||
if not isinstance(n_envs, int) or n_envs <= 0:
|
||||
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
|
||||
|
||||
if task_ids is None:
|
||||
task_ids = [0]
|
||||
|
||||
task_names = [t.strip() for t in task.split(",") if t.strip()]
|
||||
is_async = env_cls is gym.vector.AsyncVectorEnv
|
||||
cached_obs_space: spaces.Space | None = None
|
||||
cached_act_space: spaces.Space | None = None
|
||||
cached_metadata: dict[str, Any] | None = None
|
||||
out: dict[str, dict[int, gym.vector.VectorEnv]] = {}
|
||||
for task_name in task_names:
|
||||
envs_by_task: dict[int, gym.vector.VectorEnv] = {}
|
||||
for task_id in task_ids:
|
||||
fns = _make_env_fns(
|
||||
task=task_name,
|
||||
n_envs=n_envs,
|
||||
action_space_type=action_space_type,
|
||||
dataset=dataset,
|
||||
episode_length=episode_length,
|
||||
task_id=task_id,
|
||||
)
|
||||
if is_async:
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
|
||||
if cached_obs_space is None:
|
||||
cached_obs_space = lazy.observation_space
|
||||
cached_act_space = lazy.action_space
|
||||
cached_metadata = lazy.metadata
|
||||
envs_by_task[task_id] = lazy
|
||||
else:
|
||||
envs_by_task[task_id] = env_cls(fns)
|
||||
out[task_name] = envs_by_task
|
||||
return out
|
||||
488
src/lerobot/envs/robotwin.py
Normal file
488
src/lerobot/envs/robotwin.py
Normal file
@@ -0,0 +1,488 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable, Sequence
|
||||
from functools import partial
|
||||
from typing import Any
|
||||
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
import torch
|
||||
from gymnasium import spaces
|
||||
|
||||
from lerobot.types import RobotObservation
|
||||
|
||||
from .utils import _LazyAsyncVectorEnv
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Camera names as used by RoboTwin 2.0. The wrapper appends "_rgb" when looking
|
||||
# up keys in get_obs() output (e.g. "head_camera" → "head_camera_rgb").
|
||||
ROBOTWIN_CAMERA_NAMES: tuple[str, ...] = (
|
||||
"head_camera",
|
||||
"left_camera",
|
||||
"right_camera",
|
||||
)
|
||||
|
||||
ACTION_DIM = 14 # 7 DOF × 2 arms
|
||||
ACTION_LOW = -1.0
|
||||
ACTION_HIGH = 1.0
|
||||
DEFAULT_EPISODE_LENGTH = 300
|
||||
# D435 dims from task_config/_camera_config.yml (what demo_clean.yml selects).
|
||||
DEFAULT_CAMERA_H = 240
|
||||
DEFAULT_CAMERA_W = 320
|
||||
|
||||
# Task list from RoboTwin 2.0's `envs/` directory — mirrors upstream exactly
|
||||
# (50 tasks as of main; earlier revisions had 60 with a different split).
|
||||
# Keep this in sync with:
|
||||
# gh api /repos/RoboTwin-Platform/RoboTwin/contents/envs --paginate \
|
||||
# | jq -r '.[].name' | grep -E '\.py$' | grep -v '^_' | sed 's/\.py$//'
|
||||
ROBOTWIN_TASKS: tuple[str, ...] = (
|
||||
"adjust_bottle",
|
||||
"beat_block_hammer",
|
||||
"blocks_ranking_rgb",
|
||||
"blocks_ranking_size",
|
||||
"click_alarmclock",
|
||||
"click_bell",
|
||||
"dump_bin_bigbin",
|
||||
"grab_roller",
|
||||
"handover_block",
|
||||
"handover_mic",
|
||||
"hanging_mug",
|
||||
"lift_pot",
|
||||
"move_can_pot",
|
||||
"move_pillbottle_pad",
|
||||
"move_playingcard_away",
|
||||
"move_stapler_pad",
|
||||
"open_laptop",
|
||||
"open_microwave",
|
||||
"pick_diverse_bottles",
|
||||
"pick_dual_bottles",
|
||||
"place_a2b_left",
|
||||
"place_a2b_right",
|
||||
"place_bread_basket",
|
||||
"place_bread_skillet",
|
||||
"place_burger_fries",
|
||||
"place_can_basket",
|
||||
"place_cans_plasticbox",
|
||||
"place_container_plate",
|
||||
"place_dual_shoes",
|
||||
"place_empty_cup",
|
||||
"place_fan",
|
||||
"place_mouse_pad",
|
||||
"place_object_basket",
|
||||
"place_object_scale",
|
||||
"place_object_stand",
|
||||
"place_phone_stand",
|
||||
"place_shoe",
|
||||
"press_stapler",
|
||||
"put_bottles_dustbin",
|
||||
"put_object_cabinet",
|
||||
"rotate_qrcode",
|
||||
"scan_object",
|
||||
"shake_bottle",
|
||||
"shake_bottle_horizontally",
|
||||
"stack_blocks_three",
|
||||
"stack_blocks_two",
|
||||
"stack_bowls_three",
|
||||
"stack_bowls_two",
|
||||
"stamp_seal",
|
||||
"turn_switch",
|
||||
)
|
||||
|
||||
|
||||
_ROBOTWIN_SETUP_CACHE: dict[str, dict[str, Any]] = {}
|
||||
|
||||
|
||||
def _load_robotwin_setup_kwargs(task_name: str) -> dict[str, Any]:
|
||||
"""Build the kwargs dict RoboTwin's setup_demo expects.
|
||||
|
||||
Mirrors the config loading done by RoboTwin's ``script/eval_policy.py``:
|
||||
reads ``task_config/demo_clean.yml``, resolves the embodiment file from
|
||||
``_embodiment_config.yml``, loads the robot's own ``config.yml``, and
|
||||
reads camera dimensions from ``_camera_config.yml``.
|
||||
|
||||
Uses ``aloha-agilex`` single-robot dual-arm by default (the only embodiment
|
||||
used by beat_block_hammer and most smoke-test tasks).
|
||||
"""
|
||||
if task_name in _ROBOTWIN_SETUP_CACHE:
|
||||
return dict(_ROBOTWIN_SETUP_CACHE[task_name])
|
||||
|
||||
import os
|
||||
|
||||
import yaml # type: ignore[import-untyped]
|
||||
from envs import CONFIGS_PATH # type: ignore[import-not-found]
|
||||
|
||||
task_config = "demo_clean"
|
||||
with open(os.path.join(CONFIGS_PATH, f"{task_config}.yml"), encoding="utf-8") as f:
|
||||
args = yaml.safe_load(f)
|
||||
|
||||
# Resolve embodiment — demo_clean.yml uses [aloha-agilex] (dual-arm single robot)
|
||||
with open(os.path.join(CONFIGS_PATH, "_embodiment_config.yml"), encoding="utf-8") as f:
|
||||
embodiment_types = yaml.safe_load(f)
|
||||
embodiment = args.get("embodiment", ["aloha-agilex"])
|
||||
if len(embodiment) == 1:
|
||||
robot_file = embodiment_types[embodiment[0]]["file_path"]
|
||||
args["left_robot_file"] = robot_file
|
||||
args["right_robot_file"] = robot_file
|
||||
args["dual_arm_embodied"] = True
|
||||
elif len(embodiment) == 3:
|
||||
args["left_robot_file"] = embodiment_types[embodiment[0]]["file_path"]
|
||||
args["right_robot_file"] = embodiment_types[embodiment[1]]["file_path"]
|
||||
args["embodiment_dis"] = embodiment[2]
|
||||
args["dual_arm_embodied"] = False
|
||||
else:
|
||||
raise ValueError(f"embodiment must have 1 or 3 items, got {len(embodiment)}")
|
||||
|
||||
with open(os.path.join(args["left_robot_file"], "config.yml"), encoding="utf-8") as f:
|
||||
args["left_embodiment_config"] = yaml.safe_load(f)
|
||||
with open(os.path.join(args["right_robot_file"], "config.yml"), encoding="utf-8") as f:
|
||||
args["right_embodiment_config"] = yaml.safe_load(f)
|
||||
|
||||
# Camera dimensions
|
||||
with open(os.path.join(CONFIGS_PATH, "_camera_config.yml"), encoding="utf-8") as f:
|
||||
camera_config = yaml.safe_load(f)
|
||||
head_cam = args["camera"]["head_camera_type"]
|
||||
args["head_camera_h"] = camera_config[head_cam]["h"]
|
||||
args["head_camera_w"] = camera_config[head_cam]["w"]
|
||||
|
||||
# Headless overrides
|
||||
args["render_freq"] = 0
|
||||
args["task_name"] = task_name
|
||||
args["task_config"] = task_config
|
||||
|
||||
_ROBOTWIN_SETUP_CACHE[task_name] = args
|
||||
return dict(args)
|
||||
|
||||
|
||||
def _load_robotwin_task(task_name: str) -> type:
|
||||
"""Dynamically import and return a RoboTwin 2.0 task class.
|
||||
|
||||
RoboTwin tasks live in ``envs/<task_name>.py`` relative to the repository
|
||||
root and are expected to be on ``sys.path`` after installation.
|
||||
"""
|
||||
try:
|
||||
module = importlib.import_module(f"envs.{task_name}")
|
||||
except ModuleNotFoundError as e:
|
||||
raise ModuleNotFoundError(
|
||||
f"Could not import RoboTwin task '{task_name}'. "
|
||||
"Ensure RoboTwin 2.0 is installed and its 'envs/' directory is on PYTHONPATH. "
|
||||
"See the RoboTwin installation guide: https://robotwin-platform.github.io/doc/usage/robotwin-install.html"
|
||||
) from e
|
||||
task_cls = getattr(module, task_name, None)
|
||||
if task_cls is None:
|
||||
raise AttributeError(f"Task class '{task_name}' not found in envs/{task_name}.py")
|
||||
return task_cls
|
||||
|
||||
|
||||
class RoboTwinEnv(gym.Env):
|
||||
"""Gymnasium wrapper around a single RoboTwin 2.0 task.
|
||||
|
||||
RoboTwin uses a custom SAPIEN-based API (``setup_demo`` / ``get_obs`` /
|
||||
``take_action`` / ``check_success``) rather than the standard gym interface.
|
||||
This class bridges that API to Gymnasium so that ``lerobot-eval`` can drive
|
||||
RoboTwin exactly like LIBERO or Meta-World.
|
||||
|
||||
The underlying SAPIEN environment is created lazily on the first ``reset()``
|
||||
call *inside the worker process*. This is required for
|
||||
``gym.vector.AsyncVectorEnv`` compatibility: SAPIEN allocates EGL/GPU
|
||||
contexts that must not be forked from the parent process.
|
||||
|
||||
Observations
|
||||
------------
|
||||
The ``pixels`` dict uses the raw RoboTwin camera names as keys (e.g.
|
||||
``"head_camera"``, ``"left_camera"``). ``preprocess_observation`` in
|
||||
``envs/utils.py`` then converts these to ``observation.images.<cam>``.
|
||||
|
||||
Actions
|
||||
-------
|
||||
14-dim float32 array in ``[-1, 1]`` (joint-space, 7 DOF per arm).
|
||||
|
||||
Autograd
|
||||
--------
|
||||
``setup_demo`` and ``take_action`` drive CuRobo's Newton trajectory
|
||||
optimizer, which calls ``cost.backward()`` internally. lerobot_eval wraps
|
||||
the rollout in ``torch.no_grad()``, so both call sites re-enable grad.
|
||||
"""
|
||||
|
||||
metadata = {"render_modes": ["rgb_array"], "render_fps": 25}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
task_name: str,
|
||||
episode_index: int = 0,
|
||||
n_envs: int = 1,
|
||||
camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
|
||||
observation_height: int | None = None,
|
||||
observation_width: int | None = None,
|
||||
episode_length: int = DEFAULT_EPISODE_LENGTH,
|
||||
render_mode: str = "rgb_array",
|
||||
):
|
||||
super().__init__()
|
||||
self.task_name = task_name
|
||||
self.task = task_name # used by add_envs_task() in utils.py
|
||||
self.task_description = task_name.replace("_", " ")
|
||||
self.episode_index = episode_index
|
||||
self._reset_stride = n_envs
|
||||
self.camera_names = list(camera_names)
|
||||
# Default to D435 dims (the camera type baked into task_config/demo_clean.yml).
|
||||
# The YAML-driven lookup is deferred to reset() so construction doesn't
|
||||
# import RoboTwin's `envs` module — fast-tests run without RoboTwin installed.
|
||||
self.observation_height = observation_height or DEFAULT_CAMERA_H
|
||||
self.observation_width = observation_width or DEFAULT_CAMERA_W
|
||||
self.episode_length = episode_length
|
||||
self._max_episode_steps = episode_length # lerobot_eval.rollout reads this
|
||||
self.render_mode = render_mode
|
||||
|
||||
self._env: Any | None = None # deferred — created on first reset() inside worker
|
||||
self._step_count: int = 0
|
||||
self._black_frame = np.zeros((self.observation_height, self.observation_width, 3), dtype=np.uint8)
|
||||
|
||||
image_spaces = {
|
||||
cam: spaces.Box(
|
||||
low=0,
|
||||
high=255,
|
||||
shape=(self.observation_height, self.observation_width, 3),
|
||||
dtype=np.uint8,
|
||||
)
|
||||
for cam in self.camera_names
|
||||
}
|
||||
self.observation_space = spaces.Dict(
|
||||
{
|
||||
"pixels": spaces.Dict(image_spaces),
|
||||
"agent_pos": spaces.Box(low=-np.inf, high=np.inf, shape=(ACTION_DIM,), dtype=np.float32),
|
||||
}
|
||||
)
|
||||
self.action_space = spaces.Box(
|
||||
low=ACTION_LOW, high=ACTION_HIGH, shape=(ACTION_DIM,), dtype=np.float32
|
||||
)
|
||||
|
||||
def _ensure_env(self) -> None:
|
||||
"""Create the SAPIEN environment on first use.
|
||||
|
||||
Called inside the worker subprocess after fork(), so each worker gets
|
||||
its own EGL/GPU context rather than inheriting a stale one from the
|
||||
parent process (which causes crashes with AsyncVectorEnv).
|
||||
"""
|
||||
if self._env is not None:
|
||||
return
|
||||
task_cls = _load_robotwin_task(self.task_name)
|
||||
self._env = task_cls()
|
||||
|
||||
def _get_obs(self) -> RobotObservation:
|
||||
assert self._env is not None, "_get_obs called before _ensure_env()"
|
||||
raw = self._env.get_obs()
|
||||
cameras_raw = raw.get("observation", {})
|
||||
|
||||
images: dict[str, np.ndarray] = {}
|
||||
for cam in self.camera_names:
|
||||
cam_data = cameras_raw.get(cam)
|
||||
img = cam_data.get("rgb") if cam_data else None
|
||||
if img is None:
|
||||
images[cam] = self._black_frame
|
||||
continue
|
||||
img = np.asarray(img, dtype=np.uint8)
|
||||
if img.ndim == 2:
|
||||
img = np.stack([img, img, img], axis=-1)
|
||||
elif img.shape[-1] != 3:
|
||||
img = img[..., :3]
|
||||
images[cam] = img
|
||||
|
||||
ja = raw.get("joint_action") or {}
|
||||
vec = ja.get("vector")
|
||||
if vec is not None:
|
||||
arr = np.asarray(vec, dtype=np.float32).ravel()
|
||||
joint_state = (
|
||||
arr[:ACTION_DIM] if arr.size >= ACTION_DIM else np.zeros(ACTION_DIM, dtype=np.float32)
|
||||
)
|
||||
else:
|
||||
joint_state = np.zeros(ACTION_DIM, dtype=np.float32)
|
||||
|
||||
return {"pixels": images, "agent_pos": joint_state}
|
||||
|
||||
def reset(self, seed: int | None = None, **kwargs) -> tuple[RobotObservation, dict]:
|
||||
self._ensure_env()
|
||||
super().reset(seed=seed)
|
||||
assert self._env is not None # set by _ensure_env() above
|
||||
|
||||
actual_seed = self.episode_index if seed is None else seed
|
||||
setup_kwargs = _load_robotwin_setup_kwargs(self.task_name)
|
||||
setup_kwargs.update(seed=actual_seed, is_test=True)
|
||||
with torch.enable_grad():
|
||||
self._env.setup_demo(**setup_kwargs)
|
||||
self.episode_index += self._reset_stride
|
||||
self._step_count = 0
|
||||
|
||||
obs = self._get_obs()
|
||||
return obs, {"is_success": False, "task": self.task_name}
|
||||
|
||||
def step(self, action: np.ndarray) -> tuple[RobotObservation, float, bool, bool, dict[str, Any]]:
|
||||
assert self._env is not None, "step() called before reset()"
|
||||
if action.ndim != 1 or action.shape[0] != ACTION_DIM:
|
||||
raise ValueError(f"Expected 1-D action of shape ({ACTION_DIM},), got {action.shape}")
|
||||
|
||||
with torch.enable_grad():
|
||||
if hasattr(self._env, "take_action"):
|
||||
self._env.take_action(action)
|
||||
else:
|
||||
self._env.step(action)
|
||||
|
||||
self._step_count += 1
|
||||
|
||||
is_success = bool(getattr(self._env, "eval_success", False))
|
||||
if not is_success and hasattr(self._env, "check_success"):
|
||||
is_success = bool(self._env.check_success())
|
||||
|
||||
obs = self._get_obs()
|
||||
reward = float(is_success)
|
||||
terminated = is_success
|
||||
truncated = self._step_count >= self.episode_length
|
||||
|
||||
info: dict[str, Any] = {
|
||||
"task": self.task_name,
|
||||
"is_success": is_success,
|
||||
"step": self._step_count,
|
||||
}
|
||||
if terminated or truncated:
|
||||
info["final_info"] = {
|
||||
"task": self.task_name,
|
||||
"is_success": is_success,
|
||||
}
|
||||
self.reset()
|
||||
|
||||
return obs, reward, terminated, truncated, info
|
||||
|
||||
def render(self) -> np.ndarray:
|
||||
self._ensure_env()
|
||||
obs = self._get_obs()
|
||||
# Prefer head camera for rendering; fall back to first available.
|
||||
if "head_camera" in obs["pixels"]:
|
||||
return obs["pixels"]["head_camera"]
|
||||
return next(iter(obs["pixels"].values()))
|
||||
|
||||
def close(self) -> None:
|
||||
if self._env is not None:
|
||||
if hasattr(self._env, "close_env"):
|
||||
import contextlib
|
||||
|
||||
with contextlib.suppress(TypeError):
|
||||
self._env.close_env()
|
||||
self._env = None
|
||||
|
||||
|
||||
# ---- Multi-task factory --------------------------------------------------------
|
||||
|
||||
|
||||
def _make_env_fns(
|
||||
*,
|
||||
task_name: str,
|
||||
n_envs: int,
|
||||
camera_names: list[str],
|
||||
observation_height: int,
|
||||
observation_width: int,
|
||||
episode_length: int,
|
||||
) -> list[Callable[[], RoboTwinEnv]]:
|
||||
"""Return n_envs factory callables for a single task."""
|
||||
|
||||
def _make_one(episode_index: int) -> RoboTwinEnv:
|
||||
return RoboTwinEnv(
|
||||
task_name=task_name,
|
||||
episode_index=episode_index,
|
||||
n_envs=n_envs,
|
||||
camera_names=camera_names,
|
||||
observation_height=observation_height,
|
||||
observation_width=observation_width,
|
||||
episode_length=episode_length,
|
||||
)
|
||||
|
||||
return [partial(_make_one, i) for i in range(n_envs)]
|
||||
|
||||
|
||||
def create_robotwin_envs(
|
||||
task: str,
|
||||
n_envs: int,
|
||||
env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
|
||||
camera_names: Sequence[str] = ROBOTWIN_CAMERA_NAMES,
|
||||
observation_height: int = DEFAULT_CAMERA_H,
|
||||
observation_width: int = DEFAULT_CAMERA_W,
|
||||
episode_length: int = DEFAULT_EPISODE_LENGTH,
|
||||
) -> dict[str, dict[int, Any]]:
|
||||
"""Create vectorized RoboTwin 2.0 environments.
|
||||
|
||||
Returns:
|
||||
``dict[task_name][0] -> VectorEnv`` — one entry per task, each wrapping
|
||||
``n_envs`` parallel rollouts.
|
||||
|
||||
Args:
|
||||
task: Comma-separated list of task names (e.g. ``"beat_block_hammer"``
|
||||
or ``"beat_block_hammer,click_bell"``).
|
||||
n_envs: Number of parallel rollouts per task.
|
||||
env_cls: Vector env constructor (e.g. ``gym.vector.AsyncVectorEnv``).
|
||||
camera_names: Cameras to include in observations.
|
||||
observation_height: Pixel height for all cameras.
|
||||
observation_width: Pixel width for all cameras.
|
||||
episode_length: Max steps before truncation.
|
||||
"""
|
||||
if env_cls is None or not callable(env_cls):
|
||||
raise ValueError("env_cls must be callable (e.g. gym.vector.AsyncVectorEnv).")
|
||||
if not isinstance(n_envs, int) or n_envs <= 0:
|
||||
raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
|
||||
|
||||
task_names = [t.strip() for t in str(task).split(",") if t.strip()]
|
||||
if not task_names:
|
||||
raise ValueError("`task` must contain at least one RoboTwin task name.")
|
||||
|
||||
unknown = [t for t in task_names if t not in ROBOTWIN_TASKS]
|
||||
if unknown:
|
||||
raise ValueError(f"Unknown RoboTwin tasks: {unknown}. Available tasks: {sorted(ROBOTWIN_TASKS)}")
|
||||
|
||||
logger.info(
|
||||
"Creating RoboTwin envs | tasks=%s | n_envs(per task)=%d",
|
||||
task_names,
|
||||
n_envs,
|
||||
)
|
||||
|
||||
is_async = env_cls is gym.vector.AsyncVectorEnv
|
||||
cached_obs_space: spaces.Space | None = None
|
||||
cached_act_space: spaces.Space | None = None
|
||||
cached_metadata: dict[str, Any] | None = None
|
||||
|
||||
out: dict[str, dict[int, Any]] = defaultdict(dict)
|
||||
for task_name in task_names:
|
||||
fns = _make_env_fns(
|
||||
task_name=task_name,
|
||||
n_envs=n_envs,
|
||||
camera_names=list(camera_names),
|
||||
observation_height=observation_height,
|
||||
observation_width=observation_width,
|
||||
episode_length=episode_length,
|
||||
)
|
||||
if is_async:
|
||||
lazy = _LazyAsyncVectorEnv(fns, cached_obs_space, cached_act_space, cached_metadata)
|
||||
if cached_obs_space is None:
|
||||
cached_obs_space = lazy.observation_space
|
||||
cached_act_space = lazy.action_space
|
||||
cached_metadata = lazy.metadata
|
||||
out[task_name][0] = lazy
|
||||
else:
|
||||
out[task_name][0] = env_cls(fns)
|
||||
logger.info("Built vec env | task=%s | n_envs=%d", task_name, n_envs)
|
||||
|
||||
return {k: dict(v) for k, v in out.items()}
|
||||
@@ -34,6 +34,25 @@ from lerobot.utils.utils import get_channel_first_image_shape
|
||||
from .configs import EnvConfig
|
||||
|
||||
|
||||
def parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
|
||||
"""Normalize ``camera_name`` into a non-empty list of strings.
|
||||
|
||||
Accepts a comma-separated string (``"cam_a,cam_b"``) or a sequence of
|
||||
strings (tuples/lists). Whitespace is stripped; empty entries are
|
||||
dropped. Raises ``TypeError`` for unsupported input types and
|
||||
``ValueError`` when the normalized list is empty.
|
||||
"""
|
||||
if isinstance(camera_name, str):
|
||||
cams = [c.strip() for c in camera_name.split(",") if c.strip()]
|
||||
elif isinstance(camera_name, (list | tuple)):
|
||||
cams = [str(c).strip() for c in camera_name if str(c).strip()]
|
||||
else:
|
||||
raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
|
||||
if not cams:
|
||||
raise ValueError("camera_name resolved to an empty list.")
|
||||
return cams
|
||||
|
||||
|
||||
def _convert_nested_dict(d):
|
||||
result = {}
|
||||
for k, v in d.items():
|
||||
@@ -153,17 +172,20 @@ class _LazyAsyncVectorEnv:
|
||||
env_fns: list[Callable],
|
||||
observation_space=None,
|
||||
action_space=None,
|
||||
metadata=None,
|
||||
):
|
||||
self._env_fns = env_fns
|
||||
self._env: gym.vector.AsyncVectorEnv | None = None
|
||||
self.num_envs = len(env_fns)
|
||||
if observation_space is not None and action_space is not None:
|
||||
if observation_space is not None and action_space is not None and metadata is not None:
|
||||
self.observation_space = observation_space
|
||||
self.action_space = action_space
|
||||
self.metadata = metadata
|
||||
else:
|
||||
tmp = env_fns[0]()
|
||||
self.observation_space = tmp.observation_space
|
||||
self.action_space = tmp.action_space
|
||||
self.metadata = tmp.metadata
|
||||
tmp.close()
|
||||
self.single_observation_space = self.observation_space
|
||||
self.single_action_space = self.action_space
|
||||
@@ -172,6 +194,10 @@ class _LazyAsyncVectorEnv:
|
||||
if self._env is None:
|
||||
self._env = gym.vector.AsyncVectorEnv(self._env_fns, context="forkserver", shared_memory=True)
|
||||
|
||||
@property
|
||||
def unwrapped(self):
|
||||
return self
|
||||
|
||||
def reset(self, **kwargs):
|
||||
self._ensure()
|
||||
return self._env.reset(**kwargs)
|
||||
|
||||
282
tests/envs/test_robotwin.py
Normal file
282
tests/envs/test_robotwin.py
Normal file
@@ -0,0 +1,282 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Unit tests for the RoboTwin 2.0 Gymnasium wrapper.
|
||||
|
||||
These tests mock out the SAPIEN-based RoboTwin runtime (task modules +
|
||||
YAML config loader) so they run without the full RoboTwin installation
|
||||
(SAPIEN, CuRobo, mplib, asset downloads, etc.).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from contextlib import contextmanager
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from lerobot.envs.robotwin import (
|
||||
ACTION_DIM,
|
||||
ROBOTWIN_CAMERA_NAMES,
|
||||
ROBOTWIN_TASKS,
|
||||
RoboTwinEnv,
|
||||
create_robotwin_envs,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures / helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_mock_task_env(
|
||||
height: int = 240,
|
||||
width: int = 320,
|
||||
cameras: tuple[str, ...] = ROBOTWIN_CAMERA_NAMES,
|
||||
) -> MagicMock:
|
||||
"""Return a mock that mimics the RoboTwin task class API.
|
||||
|
||||
RoboTwin's real get_obs returns
|
||||
{"observation": {cam: {"rgb": img}}, "joint_action": {"vector": np.ndarray}, ...}
|
||||
so the mock follows the same nested shape.
|
||||
"""
|
||||
obs_dict = {
|
||||
"observation": {cam: {"rgb": np.zeros((height, width, 3), dtype=np.uint8)} for cam in cameras},
|
||||
"joint_action": {"vector": np.zeros(ACTION_DIM, dtype=np.float32)},
|
||||
"endpose": {},
|
||||
}
|
||||
|
||||
mock = MagicMock()
|
||||
mock.get_obs.return_value = obs_dict
|
||||
mock.setup_demo.return_value = None
|
||||
mock.take_action.return_value = None
|
||||
mock.eval_success = False
|
||||
mock.check_success.return_value = False
|
||||
mock.close_env.return_value = None
|
||||
return mock
|
||||
|
||||
|
||||
@contextmanager
|
||||
def _patch_runtime(mock_task_instance: MagicMock):
|
||||
"""Patch both the task-class loader and the YAML config loader so the
|
||||
env can construct + reset without a real RoboTwin install."""
|
||||
task_cls = MagicMock(return_value=mock_task_instance)
|
||||
fake_setup = {
|
||||
"head_camera_h": 240,
|
||||
"head_camera_w": 320,
|
||||
"left_embodiment_config": {},
|
||||
"right_embodiment_config": {},
|
||||
"left_robot_file": "",
|
||||
"right_robot_file": "",
|
||||
"dual_arm_embodied": True,
|
||||
"render_freq": 0,
|
||||
"task_name": "beat_block_hammer",
|
||||
"task_config": "demo_clean",
|
||||
}
|
||||
with (
|
||||
patch("lerobot.envs.robotwin._load_robotwin_task", return_value=task_cls),
|
||||
patch("lerobot.envs.robotwin._load_robotwin_setup_kwargs", return_value=fake_setup),
|
||||
):
|
||||
yield
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# RoboTwinEnv unit tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRoboTwinEnv:
|
||||
def test_observation_space_shape(self):
|
||||
"""observation_space should have the configured h×w×3 for every camera."""
|
||||
h, w = 240, 320
|
||||
env = RoboTwinEnv(
|
||||
task_name="beat_block_hammer",
|
||||
observation_height=h,
|
||||
observation_width=w,
|
||||
camera_names=["head_camera", "left_camera"],
|
||||
)
|
||||
pixels_space = env.observation_space["pixels"]
|
||||
assert pixels_space["head_camera"].shape == (h, w, 3)
|
||||
assert pixels_space["left_camera"].shape == (h, w, 3)
|
||||
assert "right_camera" not in pixels_space
|
||||
|
||||
def test_action_space(self):
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
assert env.action_space.shape == (ACTION_DIM,)
|
||||
assert env.action_space.dtype == np.float32
|
||||
|
||||
def test_reset_returns_correct_obs_keys(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
with _patch_runtime(mock_task):
|
||||
obs, info = env.reset()
|
||||
|
||||
assert "pixels" in obs
|
||||
for cam in ROBOTWIN_CAMERA_NAMES:
|
||||
assert cam in obs["pixels"], f"Missing camera '{cam}' in obs"
|
||||
assert "agent_pos" in obs
|
||||
assert obs["agent_pos"].shape == (ACTION_DIM,)
|
||||
assert info["is_success"] is False
|
||||
|
||||
def test_reset_calls_setup_demo(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
with _patch_runtime(mock_task):
|
||||
env.reset(seed=42)
|
||||
# setup_demo receives the full YAML-derived kwargs plus seed + is_test;
|
||||
# we only assert the caller-provided bits.
|
||||
assert mock_task.setup_demo.call_count == 1
|
||||
call_kwargs = mock_task.setup_demo.call_args.kwargs
|
||||
assert call_kwargs["seed"] == 42
|
||||
assert call_kwargs["is_test"] is True
|
||||
|
||||
def test_step_returns_correct_types(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
action = np.zeros(ACTION_DIM, dtype=np.float32)
|
||||
with _patch_runtime(mock_task):
|
||||
env.reset()
|
||||
obs, reward, terminated, truncated, info = env.step(action)
|
||||
|
||||
assert isinstance(obs, dict)
|
||||
assert isinstance(reward, float)
|
||||
assert isinstance(terminated, bool)
|
||||
assert isinstance(truncated, bool)
|
||||
assert isinstance(info, dict)
|
||||
|
||||
def test_step_wrong_action_shape_raises(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
bad_action = np.zeros(7, dtype=np.float32) # wrong dim
|
||||
with _patch_runtime(mock_task):
|
||||
env.reset()
|
||||
with pytest.raises(ValueError, match="Expected 1-D action"):
|
||||
env.step(bad_action)
|
||||
|
||||
def test_success_terminates_episode(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
mock_task.check_success.return_value = True
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
action = np.zeros(ACTION_DIM, dtype=np.float32)
|
||||
with _patch_runtime(mock_task):
|
||||
env.reset()
|
||||
_, _, terminated, _, info = env.step(action)
|
||||
assert terminated is True
|
||||
assert info["is_success"] is True
|
||||
|
||||
def test_truncation_after_episode_length(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer", episode_length=2)
|
||||
action = np.zeros(ACTION_DIM, dtype=np.float32)
|
||||
with _patch_runtime(mock_task):
|
||||
env.reset()
|
||||
env.step(action) # step 1
|
||||
_, _, _, truncated, _ = env.step(action) # step 2 → truncated
|
||||
assert truncated is True
|
||||
|
||||
def test_close_calls_close_env(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
with _patch_runtime(mock_task):
|
||||
env.reset()
|
||||
env.close()
|
||||
mock_task.close_env.assert_called_once()
|
||||
|
||||
def test_black_frame_for_missing_camera(self):
|
||||
"""If a camera key is absent from get_obs(), a black frame is returned."""
|
||||
# Mock exposes only head_camera; we ask for both head_camera + left_camera.
|
||||
mock_task = _make_mock_task_env(height=10, width=10, cameras=("head_camera",))
|
||||
env = RoboTwinEnv(
|
||||
task_name="beat_block_hammer",
|
||||
camera_names=["head_camera", "left_camera"],
|
||||
observation_height=10,
|
||||
observation_width=10,
|
||||
)
|
||||
with _patch_runtime(mock_task):
|
||||
obs, _ = env.reset()
|
||||
assert obs["pixels"]["left_camera"].shape == (10, 10, 3)
|
||||
assert obs["pixels"]["left_camera"].sum() == 0
|
||||
|
||||
def test_task_and_task_description_attributes(self):
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
assert env.task == "beat_block_hammer"
|
||||
assert isinstance(env.task_description, str)
|
||||
|
||||
def test_deferred_init_env_is_none_before_reset(self):
|
||||
env = RoboTwinEnv(task_name="beat_block_hammer")
|
||||
assert env._env is None # noqa: SLF001 (testing internal state)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# create_robotwin_envs tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCreateRoboTwinEnvs:
|
||||
def test_returns_correct_structure(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
with _patch_runtime(mock_task):
|
||||
envs = create_robotwin_envs(
|
||||
task="beat_block_hammer",
|
||||
n_envs=1,
|
||||
env_cls=gym.vector.SyncVectorEnv,
|
||||
)
|
||||
assert "beat_block_hammer" in envs
|
||||
assert 0 in envs["beat_block_hammer"]
|
||||
assert isinstance(envs["beat_block_hammer"][0], gym.vector.SyncVectorEnv)
|
||||
|
||||
def test_multi_task(self):
|
||||
mock_task = _make_mock_task_env()
|
||||
with _patch_runtime(mock_task):
|
||||
envs = create_robotwin_envs(
|
||||
task="beat_block_hammer,click_bell",
|
||||
n_envs=1,
|
||||
env_cls=gym.vector.SyncVectorEnv,
|
||||
)
|
||||
assert set(envs.keys()) == {"beat_block_hammer", "click_bell"}
|
||||
|
||||
def test_unknown_task_raises(self):
|
||||
with pytest.raises(ValueError, match="Unknown RoboTwin tasks"):
|
||||
create_robotwin_envs(
|
||||
task="not_a_real_task",
|
||||
n_envs=1,
|
||||
env_cls=gym.vector.SyncVectorEnv,
|
||||
)
|
||||
|
||||
def test_invalid_n_envs_raises(self):
|
||||
with pytest.raises(ValueError, match="n_envs must be a positive int"):
|
||||
create_robotwin_envs(
|
||||
task="beat_block_hammer",
|
||||
n_envs=0,
|
||||
env_cls=gym.vector.SyncVectorEnv,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ROBOTWIN_TASKS list
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_task_list_not_empty():
|
||||
assert len(ROBOTWIN_TASKS) >= 50
|
||||
|
||||
|
||||
def test_all_tasks_are_strings():
|
||||
assert all(isinstance(t, str) and t for t in ROBOTWIN_TASKS)
|
||||
|
||||
|
||||
def test_no_duplicate_tasks():
|
||||
assert len(ROBOTWIN_TASKS) == len(set(ROBOTWIN_TASKS))
|
||||
232
tests/test_robomme_env.py
Normal file
232
tests/test_robomme_env.py
Normal file
@@ -0,0 +1,232 @@
|
||||
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Unit tests for the RoboMME env wrapper and config.
|
||||
|
||||
RoboMME requires Linux + ManiSkill (Vulkan/SAPIEN), so tests that touch the
|
||||
env wrapper mock the ``robomme`` package. Tests that only exercise the
|
||||
dataclass config run without any mocking.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from types import ModuleType
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def _install_robomme_stub():
|
||||
"""Register a minimal stub for the ``robomme`` package on sys.modules."""
|
||||
stub = ModuleType("robomme")
|
||||
wrapper_stub = ModuleType("robomme.env_record_wrapper")
|
||||
|
||||
class FakeBuilder:
|
||||
def __init__(self, **kwargs):
|
||||
pass
|
||||
|
||||
def make_env_for_episode(self, episode_idx: int, max_steps: int):
|
||||
env = MagicMock()
|
||||
obs = {
|
||||
"front_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
|
||||
"wrist_rgb_list": [np.zeros((256, 256, 3), dtype=np.uint8)],
|
||||
"joint_state_list": [np.zeros(7, dtype=np.float32)],
|
||||
"gripper_state_list": [np.zeros(2, dtype=np.float32)],
|
||||
}
|
||||
env.reset.return_value = (obs, {"status": "ongoing", "task_goal": "pick the cube"})
|
||||
env.step.return_value = (obs, 0.0, False, False, {"status": "ongoing", "task_goal": ""})
|
||||
return env
|
||||
|
||||
wrapper_stub.BenchmarkEnvBuilder = FakeBuilder
|
||||
stub.env_record_wrapper = wrapper_stub
|
||||
sys.modules["robomme"] = stub
|
||||
sys.modules["robomme.env_record_wrapper"] = wrapper_stub
|
||||
|
||||
|
||||
def _uninstall_robomme_stub():
|
||||
sys.modules.pop("robomme", None)
|
||||
sys.modules.pop("robomme.env_record_wrapper", None)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config tests (no sim required)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_robomme_env_config_defaults():
|
||||
from lerobot.envs.configs import RoboMMEEnv
|
||||
|
||||
cfg = RoboMMEEnv()
|
||||
assert cfg.task == "PickXtimes"
|
||||
assert cfg.fps == 10
|
||||
assert cfg.episode_length == 300
|
||||
assert cfg.action_space == "joint_angle"
|
||||
assert cfg.dataset_split == "test"
|
||||
assert cfg.task_ids is None
|
||||
|
||||
|
||||
def test_robomme_env_config_type():
|
||||
from lerobot.envs.configs import RoboMMEEnv
|
||||
|
||||
cfg = RoboMMEEnv()
|
||||
assert cfg.type == "robomme"
|
||||
|
||||
|
||||
def test_robomme_features_map():
|
||||
from lerobot.envs.configs import RoboMMEEnv
|
||||
from lerobot.utils.constants import ACTION, OBS_IMAGES, OBS_STATE
|
||||
|
||||
cfg = RoboMMEEnv()
|
||||
assert cfg.features_map[ACTION] == ACTION
|
||||
assert cfg.features_map["pixels/image"] == f"{OBS_IMAGES}.image"
|
||||
assert cfg.features_map["pixels/wrist_image"] == f"{OBS_IMAGES}.wrist_image"
|
||||
assert cfg.features_map["agent_pos"] == OBS_STATE
|
||||
|
||||
|
||||
def test_robomme_features_action_dim_joint_angle():
|
||||
from lerobot.envs.configs import RoboMMEEnv
|
||||
from lerobot.utils.constants import ACTION
|
||||
|
||||
cfg = RoboMMEEnv(action_space="joint_angle")
|
||||
assert cfg.features[ACTION].shape == (8,)
|
||||
|
||||
|
||||
def test_robomme_features_action_dim_ee_pose():
|
||||
"""`ee_pose` uses a 7-D action; __post_init__ sets the correct shape."""
|
||||
from lerobot.envs.configs import RoboMMEEnv
|
||||
from lerobot.utils.constants import ACTION
|
||||
|
||||
cfg = RoboMMEEnv(action_space="ee_pose")
|
||||
assert cfg.features[ACTION].shape == (7,)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Obs conversion (pure Python, no sim)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_convert_obs_list_format():
|
||||
"""_convert_obs takes the last element from list-format obs fields and
|
||||
emits a nested ``pixels`` dict (image, wrist_image) plus ``agent_pos``.
|
||||
|
||||
The nested layout is required so ``preprocess_observation()`` in
|
||||
``envs/utils.py`` maps each camera to ``observation.images.<cam>``.
|
||||
"""
|
||||
_install_robomme_stub()
|
||||
try:
|
||||
from lerobot.envs.robomme import RoboMMEGymEnv
|
||||
|
||||
env = RoboMMEGymEnv.__new__(RoboMMEGymEnv)
|
||||
|
||||
front = np.full((256, 256, 3), 42, dtype=np.uint8)
|
||||
wrist = np.full((256, 256, 3), 7, dtype=np.uint8)
|
||||
joints = np.arange(7, dtype=np.float32)
|
||||
gripper = np.array([0.5, 0.5], dtype=np.float32)
|
||||
|
||||
obs_raw = {
|
||||
"front_rgb_list": [np.zeros_like(front), front],
|
||||
"wrist_rgb_list": [np.zeros_like(wrist), wrist],
|
||||
"joint_state_list": [np.zeros(7, dtype=np.float32), joints],
|
||||
"gripper_state_list": [np.zeros(2, dtype=np.float32), gripper],
|
||||
}
|
||||
|
||||
result = env._convert_obs(obs_raw)
|
||||
np.testing.assert_array_equal(result["pixels"]["image"], front)
|
||||
np.testing.assert_array_equal(result["pixels"]["wrist_image"], wrist)
|
||||
assert result["agent_pos"].shape == (8,)
|
||||
np.testing.assert_array_almost_equal(result["agent_pos"][:7], joints)
|
||||
assert result["agent_pos"][7] == gripper[0]
|
||||
finally:
|
||||
_uninstall_robomme_stub()
|
||||
|
||||
|
||||
def test_convert_obs_array_format():
|
||||
"""_convert_obs also handles non-list (direct array) obs."""
|
||||
_install_robomme_stub()
|
||||
try:
|
||||
from lerobot.envs.robomme import RoboMMEGymEnv
|
||||
|
||||
env = RoboMMEGymEnv.__new__(RoboMMEGymEnv)
|
||||
|
||||
front = np.zeros((256, 256, 3), dtype=np.uint8)
|
||||
obs_raw = {
|
||||
"front_rgb_list": front,
|
||||
"wrist_rgb_list": front,
|
||||
"joint_state_list": np.zeros(7, dtype=np.float32),
|
||||
"gripper_state_list": np.zeros(2, dtype=np.float32),
|
||||
}
|
||||
result = env._convert_obs(obs_raw)
|
||||
assert result["pixels"]["image"].shape == (256, 256, 3)
|
||||
assert result["pixels"]["wrist_image"].shape == (256, 256, 3)
|
||||
assert result["agent_pos"].shape == (8,)
|
||||
finally:
|
||||
_uninstall_robomme_stub()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# create_robomme_envs (mocked sim)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_create_robomme_envs_returns_correct_structure():
|
||||
"""Single task -> {task_name: {task_id: VectorEnv}} with one entry per task_id."""
|
||||
_install_robomme_stub()
|
||||
try:
|
||||
from lerobot.envs.robomme import create_robomme_envs
|
||||
|
||||
env_cls = MagicMock(return_value=MagicMock())
|
||||
result = create_robomme_envs(
|
||||
task="PickXtimes",
|
||||
n_envs=1,
|
||||
task_ids=[0, 1],
|
||||
env_cls=env_cls,
|
||||
)
|
||||
|
||||
assert "PickXtimes" in result
|
||||
assert 0 in result["PickXtimes"]
|
||||
assert 1 in result["PickXtimes"]
|
||||
assert env_cls.call_count == 2
|
||||
finally:
|
||||
_uninstall_robomme_stub()
|
||||
|
||||
|
||||
def test_create_robomme_envs_multi_task():
|
||||
"""Comma-separated task list produces one suite per task."""
|
||||
_install_robomme_stub()
|
||||
try:
|
||||
from lerobot.envs.robomme import create_robomme_envs
|
||||
|
||||
env_cls = MagicMock(return_value=MagicMock())
|
||||
result = create_robomme_envs(
|
||||
task="PickXtimes,BinFill,StopCube",
|
||||
n_envs=1,
|
||||
env_cls=env_cls,
|
||||
)
|
||||
|
||||
assert set(result.keys()) == {"PickXtimes", "BinFill", "StopCube"}
|
||||
finally:
|
||||
_uninstall_robomme_stub()
|
||||
|
||||
|
||||
def test_create_robomme_envs_raises_on_invalid_env_cls():
|
||||
_install_robomme_stub()
|
||||
try:
|
||||
import pytest
|
||||
|
||||
from lerobot.envs.robomme import create_robomme_envs
|
||||
|
||||
with pytest.raises(ValueError, match="env_cls must be a callable"):
|
||||
create_robomme_envs(task="PickXtimes", n_envs=1, env_cls=None)
|
||||
finally:
|
||||
_uninstall_robomme_stub()
|
||||
Reference in New Issue
Block a user