From adaa0470a0eb331c50b0fb1b53cf3bb3c337ee25 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Thu, 7 May 2026 11:16:43 +0200 Subject: [PATCH] fix(ci): cap VLABench smoke eval at 50 steps per task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VLABench's default episode_length is 500 steps; with 10 tasks at ~1 it/s the smoke eval took ~80 minutes of rollouts on top of the image build. The eval is a pipeline smoke test (running_success_rate stays at 0% on this short rollout anyway), so we don't need full episodes — cap each task at 50 steps to bring total rollout time down ~10x. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/benchmark_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/benchmark_tests.yml b/.github/workflows/benchmark_tests.yml index b07c8f8da..913e16a2d 100644 --- a/.github/workflows/benchmark_tests.yml +++ b/.github/workflows/benchmark_tests.yml @@ -900,6 +900,7 @@ jobs: --policy.path=lerobot/smolvla_vlabench \ --env.type=vlabench \ --env.task=select_fruit,select_toy,select_book,select_painting,select_drink,select_ingredient,select_billiards,select_poker,add_condiment,insert_flower \ + --env.episode_length=50 \ --eval.batch_size=1 \ --eval.n_episodes=1 \ --eval.use_async_envs=false \