fix(hil-serl): discrete critic send through network (#1468)

Co-authored-by: Khalil Meftah <kmeftah.khalil@gmail.com> Co-authored-by: jpizarrom <jpizarrom@gmail.com>
2026-06-03 12:21:27 +00:00 · 2025-07-09 16:22:40 +02:00
parent cf86b9300d
commit ce2b9724bf
6 changed files with 81 additions and 51 deletions
--- a/src/lerobot/scripts/rl/actor.py
+++ b/src/lerobot/scripts/rl/actor.py
@@ -317,7 +317,7 @@ def act_with_policy(
        if done or truncated:
            logging.info(f"[ACTOR] Global step {interaction_step}: Episode reward: {sum_reward_episode}")

-            update_policy_parameters(policy=policy.actor, parameters_queue=parameters_queue, device=device)
+            update_policy_parameters(policy=policy, parameters_queue=parameters_queue, device=device)

            if len(list_transition_to_send_to_learner) > 0:
                push_transitions_to_transport_queue(
@@ -642,9 +642,29 @@ def update_policy_parameters(policy: SACPolicy, parameters_queue: Queue, device)
    bytes_state_dict = get_last_item_from_queue(parameters_queue, block=False)
    if bytes_state_dict is not None:
        logging.info("[ACTOR] Load new parameters from Learner.")
-        state_dict = bytes_to_state_dict(bytes_state_dict)
-        state_dict = move_state_dict_to_device(state_dict, device=device)
-        policy.load_state_dict(state_dict)
+        state_dicts = bytes_to_state_dict(bytes_state_dict)
+
+        # TODO: check encoder parameter synchronization possible issues:
+        # 1. When shared_encoder=True, we're loading stale encoder params from actor's state_dict
+        #    instead of the updated encoder params from critic (which is optimized separately)
+        # 2. When freeze_vision_encoder=True, we waste bandwidth sending/loading frozen params
+        # 3. Need to handle encoder params correctly for both actor and discrete_critic
+        # Potential fixes:
+        # - Send critic's encoder state when shared_encoder=True
+        # - Skip encoder params entirely when freeze_vision_encoder=True
+        # - Ensure discrete_critic gets correct encoder state (currently uses encoder_critic)
+
+        # Load actor state dict
+        actor_state_dict = move_state_dict_to_device(state_dicts["policy"], device=device)
+        policy.actor.load_state_dict(actor_state_dict)
+
+        # Load discrete critic if present
+        if hasattr(policy, "discrete_critic") and "discrete_critic" in state_dicts:
+            discrete_critic_state_dict = move_state_dict_to_device(
+                state_dicts["discrete_critic"], device=device
+            )
+            policy.discrete_critic.load_state_dict(discrete_critic_state_dict)
+            logging.info("[ACTOR] Loaded discrete critic parameters from Learner.")


 #################################################
--- a/src/lerobot/scripts/rl/learner.py
+++ b/src/lerobot/scripts/rl/learner.py
@@ -1109,8 +1109,18 @@ def check_nan_in_transition(

 def push_actor_policy_to_queue(parameters_queue: Queue, policy: nn.Module):
    logging.debug("[LEARNER] Pushing actor policy to the queue")
-    state_dict = move_state_dict_to_device(policy.actor.state_dict(), device="cpu")
-    state_bytes = state_to_bytes(state_dict)
+
+    # Create a dictionary to hold all the state dicts
+    state_dicts = {"policy": move_state_dict_to_device(policy.actor.state_dict(), device="cpu")}
+
+    # Add discrete critic if it exists
+    if hasattr(policy, "discrete_critic") and policy.discrete_critic is not None:
+        state_dicts["discrete_critic"] = move_state_dict_to_device(
+            policy.discrete_critic.state_dict(), device="cpu"
+        )
+        logging.debug("[LEARNER] Including discrete critic in state dict push")
+
+    state_bytes = state_to_bytes(state_dicts)
    parameters_queue.put(state_bytes)