[Port HIl-Serl] Refactor gym-manipulator (#1034)

2026-06-02 11:51:25 +00:00 · 2025-04-25 16:34:54 +02:00
parent a8da4a347e
commit bd4db8d747
13 changed files with 624 additions and 946 deletions
--- a/lerobot/scripts/server/actor_server.py
+++ b/lerobot/scripts/server/actor_server.py
@@ -20,13 +20,11 @@ from functools import lru_cache
 from queue import Empty
 from statistics import mean, quantiles

-# from lerobot.scripts.eval import eval_policy
 import grpc
 import torch
 from torch import nn
 from torch.multiprocessing import Event, Queue

-# TODO: Remove the import of maniskill
 from lerobot.common.policies.factory import make_policy
 from lerobot.common.policies.sac.modeling_sac import SACPolicy
 from lerobot.common.robot_devices.utils import busy_wait
@@ -39,20 +37,21 @@ from lerobot.common.utils.utils import (
 from lerobot.configs import parser
 from lerobot.configs.train import TrainPipelineConfig
 from lerobot.scripts.server import hilserl_pb2, hilserl_pb2_grpc, learner_service
-from lerobot.scripts.server.buffer import (
-    Transition,
-    bytes_to_state_dict,
-    move_state_dict_to_device,
-    move_transition_to_device,
-    python_object_to_bytes,
-    transitions_to_bytes,
-)
+from lerobot.scripts.server.buffer import Transition
 from lerobot.scripts.server.gym_manipulator import make_robot_env
 from lerobot.scripts.server.network_utils import (
+    bytes_to_state_dict,
+    python_object_to_bytes,
    receive_bytes_in_chunks,
    send_bytes_in_chunks,
+    transitions_to_bytes,
+)
+from lerobot.scripts.server.utils import (
+    get_last_item_from_queue,
+    move_state_dict_to_device,
+    move_transition_to_device,
+    setup_process_handlers,
 )
-from lerobot.scripts.server.utils import get_last_item_from_queue, setup_process_handlers

 ACTOR_SHUTDOWN_TIMEOUT = 30

@@ -134,21 +133,8 @@ def actor_cli(cfg: TrainPipelineConfig):
    interactions_process.start()
    receive_policy_process.start()

-    # HACK: FOR MANISKILL we do not have a reward classifier
-    # TODO: Remove this once we merge into main
-    reward_classifier = None
-    # if (
-    #     cfg.env.reward_classifier["pretrained_path"] is not None
-    #     and cfg.env.reward_classifier["config_path"] is not None
-    # ):
-    #     reward_classifier = get_classifier(
-    #         pretrained_path=cfg.env.reward_classifier["pretrained_path"],
-    #         config_path=cfg.env.reward_classifier["config_path"],
-    #     )
-
    act_with_policy(
        cfg=cfg,
-        reward_classifier=reward_classifier,
        shutdown_event=shutdown_event,
        parameters_queue=parameters_queue,
        transitions_queue=transitions_queue,
@@ -183,7 +169,6 @@ def actor_cli(cfg: TrainPipelineConfig):

 def act_with_policy(
    cfg: TrainPipelineConfig,
-    reward_classifier: nn.Module,
    shutdown_event: any,  # Event,
    parameters_queue: Queue,
    transitions_queue: Queue,
@@ -197,7 +182,6 @@ def act_with_policy(

    Args:
        cfg: Configuration settings for the interaction process.
-        reward_classifier: Reward classifier to use for the interaction process.
        shutdown_event: Event to check if the process should shutdown.
        parameters_queue: Queue to receive updated network parameters from the learner.
        transitions_queue: Queue to send transitions to the learner.
@@ -262,16 +246,10 @@ def act_with_policy(

            log_policy_frequency_issue(policy_fps=policy_fps, cfg=cfg, interaction_step=interaction_step)

-            next_obs, reward, done, truncated, info = online_env.step(action.squeeze(dim=0).cpu().numpy())
        else:
-            # TODO (azouitine): Make a custom space for torch tensor
            action = online_env.action_space.sample()
-            next_obs, reward, done, truncated, info = online_env.step(action)

-            # HACK: We have only one env but we want to batch it, it will be resolved with the torch box
-            action = (
-                torch.from_numpy(action[0]).to(device, non_blocking=device.type == "cuda").unsqueeze(dim=0)
-            )
+        next_obs, reward, done, truncated, info = online_env.step(action)

        sum_reward_episode += float(reward)
        # Increment total steps counter for intervention rate
@@ -286,11 +264,6 @@ def act_with_policy(
            # Increment intervention steps counter
            episode_intervention_steps += 1

-        # Check for NaN values in observations
-        for key, tensor in obs.items():
-            if torch.isnan(tensor).any():
-                logging.error(f"[ACTOR] NaN values found in obs[{key}] at step {interaction_step}")
-
        list_transition_to_send_to_learner.append(
            Transition(
                state=obs,