From 7fc0cdf68a7fb216fd401b095f025f5c74512e22 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Fri, 20 Mar 2026 22:48:06 -0700
Subject: [PATCH] fix(eval): skip multi-instance orchestration when
 runtime=docker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_orchestrate_multi_instance_eval spawns extra lerobot-eval processes that each
call run_eval_in_docker again, creating N^2 containers. For docker runtime,
instance_count directly controls how many env-worker containers are spawned by
run_eval_in_docker — no process-level orchestration is needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/lerobot/scripts/lerobot_eval.py | 51 +++++++++++++++--------------
 1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/src/lerobot/scripts/lerobot_eval.py b/src/lerobot/scripts/lerobot_eval.py
index be2b5e220..7c8549688 100644
--- a/src/lerobot/scripts/lerobot_eval.py
+++ b/src/lerobot/scripts/lerobot_eval.py
@@ -74,9 +74,9 @@ from tqdm import trange
 
 from lerobot.configs import parser
 from lerobot.configs.eval import EvalPipelineConfig
-from lerobot.envs.lazy_vec_env import LazyVectorEnv
 from lerobot.envs.docker_runtime import run_eval_in_docker
 from lerobot.envs.factory import make_env, make_env_pre_post_processors
+from lerobot.envs.lazy_vec_env import LazyVectorEnv
 from lerobot.envs.utils import (
     add_envs_task,
     check_env_attributes_and_types,
@@ -87,14 +87,14 @@ from lerobot.policies.factory import make_policy, make_pre_post_processors
 from lerobot.policies.pretrained import PreTrainedPolicy
 from lerobot.processor import PolicyAction, PolicyProcessorPipeline
 from lerobot.utils.constants import ACTION, DONE, OBS_STR, REWARD
-from lerobot.utils.import_utils import register_third_party_plugins
-from lerobot.utils.io_utils import write_video
-from lerobot.utils.random_utils import set_seed
 from lerobot.utils.hf_eval_results import (
     build_eval_results_rows,
     default_eval_date,
     upload_eval_results_yaml,
 )
+from lerobot.utils.import_utils import register_third_party_plugins
+from lerobot.utils.io_utils import write_video
+from lerobot.utils.random_utils import set_seed
 from lerobot.utils.utils import (
     get_safe_torch_device,
     init_logging,
@@ -613,7 +613,10 @@ def push_eval_to_hub(
 @parser.wrap()
 def eval_main(cfg: EvalPipelineConfig):
     logging.info(pformat(asdict(cfg)))
-    if cfg.eval.instance_count > 1 and cfg.eval.instance_id == 0:
+    # Multi-instance orchestration only applies to local runtime.
+    # For docker runtime, instance_count controls the number of env containers
+    # spawned directly by run_eval_in_docker — no extra lerobot-eval processes needed.
+    if cfg.eval.runtime == "local" and cfg.eval.instance_count > 1 and cfg.eval.instance_id == 0:
         _orchestrate_multi_instance_eval(cfg)
     else:
         _run_eval_worker(cfg)
@@ -720,7 +723,10 @@ def _run_eval_worker(cfg: EvalPipelineConfig) -> dict:
     env_preprocessor, env_postprocessor = make_env_pre_post_processors(env_cfg=cfg.env, policy_cfg=cfg.policy)
 
     try:
-        with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext():
+        with (
+            torch.no_grad(),
+            torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(),
+        ):
             info = eval_policy_all(
                 envs=envs,
                 policy=policy,
@@ -1003,16 +1009,18 @@ def _eval_task_batch(
 
         results: list[tuple[str, int, TaskMetrics]] = []
         for tg, tid, start_i, end_i in task_slices:
-            results.append((
-                tg,
-                tid,
-                TaskMetrics(
-                    sum_rewards=batch_sum_rewards[start_i:end_i].tolist(),
-                    max_rewards=batch_max_rewards[start_i:end_i].tolist(),
-                    successes=batch_successes[start_i:end_i].tolist(),
-                    video_paths=video_paths_per_task.get((tg, tid), []),
-                ),
-            ))
+            results.append(
+                (
+                    tg,
+                    tid,
+                    TaskMetrics(
+                        sum_rewards=batch_sum_rewards[start_i:end_i].tolist(),
+                        max_rewards=batch_max_rewards[start_i:end_i].tolist(),
+                        successes=batch_successes[start_i:end_i].tolist(),
+                        video_paths=video_paths_per_task.get((tg, tid), []),
+                    ),
+                )
+            )
         return results
     finally:
         combined_env.close()
@@ -1137,8 +1145,7 @@ def eval_policy_all(
         total_tasks = len(tasks)
         tasks = [task for idx, task in enumerate(tasks) if idx % instance_count == instance_id]
         logging.info(
-            f"Instance shard {instance_id + 1}/{instance_count}: "
-            f"{len(tasks)}/{total_tasks} tasks assigned."
+            f"Instance shard {instance_id + 1}/{instance_count}: {len(tasks)}/{total_tasks} tasks assigned."
         )
 
     # accumulators: track metrics at both per-group level and across all groups
@@ -1236,13 +1243,9 @@ def eval_policy_all(
     else:
         # Threaded fallback for cases where batched lazy mode cannot be used.
         if all_lazy and n_episodes != 1:
-            logging.info(
-                "Task scheduler mode: threaded (lazy batching disabled because n_episodes != 1)"
-            )
+            logging.info("Task scheduler mode: threaded (lazy batching disabled because n_episodes != 1)")
         elif all_lazy and not single_factory_per_task:
-            logging.info(
-                "Task scheduler mode: threaded (lazy batching disabled because eval.batch_size > 1)"
-            )
+            logging.info("Task scheduler mode: threaded (lazy batching disabled because eval.batch_size > 1)")
         else:
             logging.info(f"Task scheduler mode: threaded (max_workers={max_parallel_tasks})")
         with cf.ThreadPoolExecutor(max_workers=max_parallel_tasks) as executor: