[HIL-SERL] Migrate threading to multiprocessing (#759)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
Eugene Mironov
2025-03-05 17:19:31 +07:00
committed by AdilZouitine
parent 38f5fa4523
commit db78fee9de
14 changed files with 900 additions and 492 deletions

View File

@@ -13,22 +13,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import logging
import pickle
import queue
from statistics import mean, quantiles
import signal
from functools import lru_cache
from lerobot.scripts.server.utils import setup_process_handlers
# from lerobot.scripts.eval import eval_policy
from threading import Thread
import grpc
import hydra
import torch
from omegaconf import DictConfig
from torch import nn
import time
# TODO: Remove the import of maniskill
# from lerobot.common.envs.factory import make_maniskill_env
@@ -47,157 +44,184 @@ from lerobot.scripts.server.buffer import (
Transition,
move_state_dict_to_device,
move_transition_to_device,
bytes_buffer_size,
python_object_to_bytes,
transitions_to_bytes,
bytes_to_state_dict,
)
from lerobot.scripts.server.network_utils import (
receive_bytes_in_chunks,
send_bytes_in_chunks,
)
from lerobot.scripts.server.gym_manipulator import get_classifier, make_robot_env
from lerobot.scripts.server import learner_service
from threading import Event
from torch.multiprocessing import Queue, Event
from queue import Empty
logging.basicConfig(level=logging.INFO)
from lerobot.common.utils.utils import init_logging
parameters_queue = queue.Queue(maxsize=1)
message_queue = queue.Queue(maxsize=1_000_000)
from lerobot.scripts.server.utils import get_last_item_from_queue
ACTOR_SHUTDOWN_TIMEOUT = 30
class ActorInformation:
"""
This helper class is used to differentiate between two types of messages that are placed in the same queue during streaming:
- **Transition Data:** Contains experience tuples (observation, action, reward, next observation) collected during interaction.
- **Interaction Messages:** Encapsulates statistics related to the interaction process.
Attributes:
transition (Optional): Transition data to be sent to the learner.
interaction_message (Optional): Iteraction message providing additional statistics for logging.
"""
def __init__(self, transition=None, interaction_message=None):
self.transition = transition
self.interaction_message = interaction_message
def receive_policy(
learner_client: hilserl_pb2_grpc.LearnerServiceStub,
shutdown_event: Event,
parameters_queue: queue.Queue,
cfg: DictConfig,
parameters_queue: Queue,
shutdown_event: any, # Event,
learner_client: hilserl_pb2_grpc.LearnerServiceStub | None = None,
grpc_channel: grpc.Channel | None = None,
):
logging.info("[ACTOR] Start receiving parameters from the Learner")
bytes_buffer = io.BytesIO()
step = 0
if not use_threads(cfg):
# Setup process handlers to handle shutdown signal
# But use shutdown event from the main process
setup_process_handlers(False)
if grpc_channel is None or learner_client is None:
learner_client, grpc_channel = learner_service_client(
host=cfg.actor_learner_config.learner_host,
port=cfg.actor_learner_config.learner_port,
)
try:
for model_update in learner_client.StreamParameters(hilserl_pb2.Empty()):
if shutdown_event.is_set():
logging.info("[ACTOR] Shutting down policy streaming receiver")
return hilserl_pb2.Empty()
if model_update.transfer_state == hilserl_pb2.TransferState.TRANSFER_BEGIN:
bytes_buffer.seek(0)
bytes_buffer.truncate(0)
bytes_buffer.write(model_update.parameter_bytes)
logging.info("Received model update at step 0")
step = 0
continue
elif (
model_update.transfer_state == hilserl_pb2.TransferState.TRANSFER_MIDDLE
):
bytes_buffer.write(model_update.parameter_bytes)
step += 1
logging.info(f"Received model update at step {step}")
elif model_update.transfer_state == hilserl_pb2.TransferState.TRANSFER_END:
bytes_buffer.write(model_update.parameter_bytes)
logging.info(
f"Received model update at step end size {bytes_buffer_size(bytes_buffer)}"
)
state_dict = torch.load(bytes_buffer)
bytes_buffer.seek(0)
bytes_buffer.truncate(0)
step = 0
logging.info("Model updated")
parameters_queue.put(state_dict)
iterator = learner_client.StreamParameters(hilserl_pb2.Empty())
receive_bytes_in_chunks(
iterator,
parameters_queue,
shutdown_event,
log_prefix="[ACTOR] parameters",
)
except grpc.RpcError as e:
logging.error(f"[ACTOR] gRPC error: {e}")
if not use_threads(cfg):
grpc_channel.close()
logging.info("[ACTOR] Received policy loop stopped")
def transitions_stream(
shutdown_event: Event, transitions_queue: Queue
) -> hilserl_pb2.Empty:
while not shutdown_event.is_set():
try:
message = transitions_queue.get(block=True, timeout=5)
except Empty:
logging.debug("[ACTOR] Transition queue is empty")
continue
yield from send_bytes_in_chunks(
message, hilserl_pb2.Transition, log_prefix="[ACTOR] Send transitions"
)
return hilserl_pb2.Empty()
def transitions_stream(shutdown_event: Event, message_queue: queue.Queue):
def interactions_stream(
shutdown_event: any, # Event,
interactions_queue: Queue,
) -> hilserl_pb2.Empty:
while not shutdown_event.is_set():
try:
message = message_queue.get(block=True, timeout=5)
except queue.Empty:
logging.debug("[ACTOR] Transition queue is empty")
message = interactions_queue.get(block=True, timeout=5)
except Empty:
logging.debug("[ACTOR] Interaction queue is empty")
continue
if message.transition is not None:
transition_to_send_to_learner: list[Transition] = [
move_transition_to_device(transition=T, device="cpu")
for T in message.transition
]
# Check for NaNs in transitions before sending to learner
for transition in transition_to_send_to_learner:
for key, value in transition["state"].items():
if torch.isnan(value).any():
logging.warning(f"Found NaN values in transition {key}")
buf = io.BytesIO()
torch.save(transition_to_send_to_learner, buf)
transition_bytes = buf.getvalue()
transition_message = hilserl_pb2.Transition(
transition_bytes=transition_bytes
)
response = hilserl_pb2.ActorInformation(transition=transition_message)
elif message.interaction_message is not None:
content = hilserl_pb2.InteractionMessage(
interaction_message_bytes=pickle.dumps(message.interaction_message)
)
response = hilserl_pb2.ActorInformation(interaction_message=content)
yield response
yield from send_bytes_in_chunks(
message,
hilserl_pb2.InteractionMessage,
log_prefix="[ACTOR] Send interactions",
)
return hilserl_pb2.Empty()
def send_transitions(
learner_client: hilserl_pb2_grpc.LearnerServiceStub,
shutdown_event: Event,
message_queue: queue.Queue,
):
cfg: DictConfig,
transitions_queue: Queue,
shutdown_event: any, # Event,
learner_client: hilserl_pb2_grpc.LearnerServiceStub | None = None,
grpc_channel: grpc.Channel | None = None,
) -> hilserl_pb2.Empty:
"""
Streams data from the actor to the learner.
Sends transitions to the learner.
This function continuously retrieves messages from the queue and processes them based on their type:
This function continuously retrieves messages from the queue and processes:
- **Transition Data:**
- A batch of transitions (observation, action, reward, next observation) is collected.
- Transitions are moved to the CPU and serialized using PyTorch.
- The serialized data is wrapped in a `hilserl_pb2.Transition` message and sent to the learner.
- **Interaction Messages:**
- Contains useful statistics about episodic rewards and policy timings.
- The message is serialized using `pickle` and sent to the learner.
Yields:
hilserl_pb2.ActorInformation: The response message containing either transition data or an interaction message.
"""
if not use_threads(cfg):
# Setup process handlers to handle shutdown signal
# But use shutdown event from the main process
setup_process_handlers(False)
if grpc_channel is None or learner_client is None:
learner_client, grpc_channel = learner_service_client(
host=cfg.actor_learner_config.learner_host,
port=cfg.actor_learner_config.learner_port,
)
try:
learner_client.ReceiveTransitions(
transitions_stream(shutdown_event, message_queue)
learner_client.SendTransitions(
transitions_stream(shutdown_event, transitions_queue)
)
except grpc.RpcError as e:
logging.error(f"[ACTOR] gRPC error: {e}")
logging.info("[ACTOR] Finished streaming transitions")
if not use_threads(cfg):
grpc_channel.close()
logging.info("[ACTOR] Transitions process stopped")
def send_interactions(
cfg: DictConfig,
interactions_queue: Queue,
shutdown_event: any, # Event,
learner_client: hilserl_pb2_grpc.LearnerServiceStub | None = None,
grpc_channel: grpc.Channel | None = None,
) -> hilserl_pb2.Empty:
"""
Sends interactions to the learner.
This function continuously retrieves messages from the queue and processes:
- **Interaction Messages:**
- Contains useful statistics about episodic rewards and policy timings.
- The message is serialized using `pickle` and sent to the learner.
"""
if not use_threads(cfg):
# Setup process handlers to handle shutdown signal
# But use shutdown event from the main process
setup_process_handlers(False)
if grpc_channel is None or learner_client is None:
learner_client, grpc_channel = learner_service_client(
host=cfg.actor_learner_config.learner_host,
port=cfg.actor_learner_config.learner_port,
)
try:
learner_client.SendInteractions(
interactions_stream(shutdown_event, interactions_queue)
)
except grpc.RpcError as e:
logging.error(f"[ACTOR] gRPC error: {e}")
logging.info("[ACTOR] Finished streaming interactions")
if not use_threads(cfg):
grpc_channel.close()
logging.info("[ACTOR] Interactions process stopped")
@lru_cache(maxsize=1)
def learner_service_client(
@@ -217,7 +241,7 @@ def learner_service_client(
{
"name": [{}], # Applies to ALL methods in ALL services
"retryPolicy": {
"maxAttempts": 7, # Max retries (total attempts = 5)
"maxAttempts": 5, # Max retries (total attempts = 5)
"initialBackoff": "0.1s", # First retry after 0.1s
"maxBackoff": "2s", # Max wait time between retries
"backoffMultiplier": 2, # Exponential backoff factor
@@ -242,20 +266,27 @@ def learner_service_client(
],
)
stub = hilserl_pb2_grpc.LearnerServiceStub(channel)
logging.info("[LEARNER] Learner service client created")
logging.info("[ACTOR] Learner service client created")
return stub, channel
def update_policy_parameters(policy: SACPolicy, parameters_queue: queue.Queue, device):
def update_policy_parameters(policy: SACPolicy, parameters_queue: Queue, device):
if not parameters_queue.empty():
logging.info("[ACTOR] Load new parameters from Learner.")
state_dict = parameters_queue.get()
bytes_state_dict = get_last_item_from_queue(parameters_queue)
state_dict = bytes_to_state_dict(bytes_state_dict)
state_dict = move_state_dict_to_device(state_dict, device=device)
policy.load_state_dict(state_dict)
def act_with_policy(
cfg: DictConfig, robot: Robot, reward_classifier: nn.Module, shutdown_event: Event
cfg: DictConfig,
robot: Robot,
reward_classifier: nn.Module,
shutdown_event: any, # Event,
parameters_queue: Queue,
transitions_queue: Queue,
interactions_queue: Queue,
):
"""
Executes policy interaction within the environment.
@@ -317,7 +348,7 @@ def act_with_policy(
for interaction_step in range(cfg.training.online_steps):
if shutdown_event.is_set():
logging.info("[ACTOR] Shutdown signal received. Exiting...")
logging.info("[ACTOR] Shutting down act_with_policy")
return
if interaction_step >= cfg.training.online_step_before_learning:
@@ -394,10 +425,9 @@ def act_with_policy(
)
if len(list_transition_to_send_to_learner) > 0:
send_transitions_in_chunks(
push_transitions_to_transport_queue(
transitions=list_transition_to_send_to_learner,
message_queue=message_queue,
chunk_size=4,
transitions_queue=transitions_queue,
)
list_transition_to_send_to_learner = []
@@ -405,9 +435,9 @@ def act_with_policy(
list_policy_time.clear()
# Send episodic reward to the learner
message_queue.put(
ActorInformation(
interaction_message={
interactions_queue.put(
python_object_to_bytes(
{
"Episodic reward": sum_reward_episode,
"Interaction step": interaction_step,
"Episode intervention": int(episode_intervention),
@@ -420,7 +450,7 @@ def act_with_policy(
obs, info = online_env.reset()
def send_transitions_in_chunks(transitions: list, message_queue, chunk_size: int = 100):
def push_transitions_to_transport_queue(transitions: list, transitions_queue):
"""Send transitions to learner in smaller chunks to avoid network issues.
Args:
@@ -428,10 +458,16 @@ def send_transitions_in_chunks(transitions: list, message_queue, chunk_size: int
message_queue: Queue to send messages to learner
chunk_size: Size of each chunk to send
"""
for i in range(0, len(transitions), chunk_size):
chunk = transitions[i : i + chunk_size]
logging.debug(f"[ACTOR] Sending chunk of {len(chunk)} transitions to Learner.")
message_queue.put(ActorInformation(transition=chunk))
transition_to_send_to_learner = []
for transition in transitions:
tr = move_transition_to_device(transition=transition, device="cpu")
for key, value in tr["state"].items():
if torch.isnan(value).any():
logging.warning(f"Found NaN values in transition {key}")
transition_to_send_to_learner.append(tr)
transitions_queue.put(transitions_to_bytes(transition_to_send_to_learner))
def get_frequency_stats(list_policy_time: list[float]) -> dict[str, float]:
@@ -458,39 +494,96 @@ def log_policy_frequency_issue(
)
def establish_learner_connection(
stub,
shutdown_event: any, # Event,
attempts=30,
):
for _ in range(attempts):
if shutdown_event.is_set():
logging.info("[ACTOR] Shutting down establish_learner_connection")
return False
# Force a connection attempt and check state
try:
logging.info("[ACTOR] Send ready message to Learner")
if stub.Ready(hilserl_pb2.Empty()) == hilserl_pb2.Empty():
return True
except grpc.RpcError as e:
logging.error(f"[ACTOR] Waiting for Learner to be ready... {e}")
time.sleep(2)
return False
def use_threads(cfg: DictConfig) -> bool:
return cfg.actor_learner_config.concurrency.actor == "threads"
@hydra.main(version_base="1.2", config_name="default", config_path="../../configs")
def actor_cli(cfg: dict):
if not use_threads(cfg):
import torch.multiprocessing as mp
mp.set_start_method("spawn")
init_logging(log_file="actor.log")
robot = make_robot(cfg=cfg.robot)
shutdown_event = Event()
# Define signal handler
def signal_handler(signum, frame):
logging.info("Shutdown signal received. Cleaning up...")
shutdown_event.set()
signal.signal(signal.SIGINT, signal_handler) # Ctrl+C
signal.signal(signal.SIGTERM, signal_handler) # Termination request (kill)
signal.signal(signal.SIGHUP, signal_handler) # Terminal closed/Hangup
signal.signal(signal.SIGQUIT, signal_handler) # Ctrl+\
shutdown_event = setup_process_handlers(use_threads(cfg))
learner_client, grpc_channel = learner_service_client(
host=cfg.actor_learner_config.learner_host,
port=cfg.actor_learner_config.learner_port,
)
receive_policy_thread = Thread(
logging.info("[ACTOR] Establishing connection with Learner")
if not establish_learner_connection(learner_client, shutdown_event):
logging.error("[ACTOR] Failed to establish connection with Learner")
return
if not use_threads(cfg):
# If we use multithreading, we can reuse the channel
grpc_channel.close()
grpc_channel = None
logging.info("[ACTOR] Connection with Learner established")
parameters_queue = Queue()
transitions_queue = Queue()
interactions_queue = Queue()
concurrency_entity = None
if use_threads(cfg):
from threading import Thread
concurrency_entity = Thread
else:
from multiprocessing import Process
concurrency_entity = Process
receive_policy_process = concurrency_entity(
target=receive_policy,
args=(learner_client, shutdown_event, parameters_queue),
args=(cfg, parameters_queue, shutdown_event, grpc_channel),
daemon=True,
)
transitions_thread = Thread(
transitions_process = concurrency_entity(
target=send_transitions,
args=(learner_client, shutdown_event, message_queue),
args=(cfg, transitions_queue, shutdown_event, grpc_channel),
daemon=True,
)
interactions_process = concurrency_entity(
target=send_interactions,
args=(cfg, interactions_queue, shutdown_event, grpc_channel),
daemon=True,
)
transitions_process.start()
interactions_process.start()
receive_policy_process.start()
# HACK: FOR MANISKILL we do not have a reward classifier
# TODO: Remove this once we merge into main
reward_classifier = None
@@ -503,26 +596,35 @@ def actor_cli(cfg: dict):
config_path=cfg.env.reward_classifier.config_path,
)
policy_thread = Thread(
target=act_with_policy,
daemon=True,
args=(cfg, robot, reward_classifier, shutdown_event),
act_with_policy(
cfg,
robot,
reward_classifier,
shutdown_event,
parameters_queue,
transitions_queue,
interactions_queue,
)
logging.info("[ACTOR] Policy process joined")
transitions_thread.start()
policy_thread.start()
receive_policy_thread.start()
logging.info("[ACTOR] Closing queues")
transitions_queue.close()
interactions_queue.close()
parameters_queue.close()
shutdown_event.wait()
logging.info("[ACTOR] Shutdown event received")
grpc_channel.close()
transitions_process.join()
logging.info("[ACTOR] Transitions process joined")
interactions_process.join()
logging.info("[ACTOR] Interactions process joined")
receive_policy_process.join()
logging.info("[ACTOR] Receive policy process joined")
policy_thread.join()
logging.info("[ACTOR] Policy thread joined")
transitions_thread.join()
logging.info("[ACTOR] Transitions thread joined")
receive_policy_thread.join()
logging.info("[ACTOR] Receive policy thread joined")
logging.info("[ACTOR] join queues")
transitions_queue.cancel_join_thread()
interactions_queue.cancel_join_thread()
parameters_queue.cancel_join_thread()
logging.info("[ACTOR] queues closed")
if __name__ == "__main__":