src/lerobot/scripts/train_accelerate.py

#!/usr/bin/env python

# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time
from pprint import pformat
from typing import Any

import torch
from accelerate import Accelerator
from accelerate.utils import set_seed as accelerate_set_seed
from termcolor import colored
from torch.optim import Optimizer

from lerobot.configs import parser
from lerobot.configs.train import TrainPipelineConfig
from lerobot.datasets.factory import make_dataset
from lerobot.datasets.sampler import EpisodeAwareSampler
from lerobot.envs.factory import make_env
from lerobot.optim.factory import make_optimizer_and_scheduler
from lerobot.policies.factory import make_policy
from lerobot.policies.pretrained import PreTrainedPolicy
from lerobot.scripts.eval import eval_policy
from lerobot.utils.logging_utils import AverageMeter, MetricsTracker
from lerobot.utils.train_utils import (
    get_step_checkpoint_dir,
    get_step_identifier,
    load_training_state,
    save_checkpoint,
    update_last_checkpoint,
)
from lerobot.utils.utils import (
    format_big_number,
    has_method,
    init_logging,
)


def update_policy(
    train_metrics: MetricsTracker,
    policy: PreTrainedPolicy,
    batch: Any,
    optimizer: Optimizer,
    grad_clip_norm: float,
    accelerator: Accelerator,
    lr_scheduler=None,
) -> tuple[MetricsTracker, dict]:
    """Run one forward/backward pass on ``batch`` and record the resulting metrics.

    Args:
        train_metrics: Tracker whose ``loss``, ``grad_norm``, ``lr`` and ``update_s`` fields are set.
        policy: Policy to optimize; its ``forward`` must return ``(loss, output_dict)``.
            May be the wrapped model returned by ``accelerator.prepare``.
        batch: Batch passed unchanged to ``policy.forward``.
        optimizer: Optimizer for ``policy``. Assumed to have gone through ``accelerator.prepare`` so that
            it cooperates with gradient accumulation -- TODO confirm for callers outside this file.
        grad_clip_norm: Maximum gradient norm; clipping is skipped when this is not positive.
        accelerator: Accelerator driving autocast, backward, clipping and cross-process gathering.
        lr_scheduler: Optional scheduler stepped right after the optimizer.

    Returns:
        The updated ``train_metrics`` and the ``output_dict`` returned by the policy.
    """
    start_time = time.perf_counter()
    policy.train()

    # The Accelerator scales the loss for gradient accumulation inside `backward`, so the whole step has
    # to run inside `accumulate()`; that context is also what keeps `sync_gradients` up to date.
    with accelerator.accumulate(policy):
        # Use accelerator's autocast context if mixed precision is enabled
        with accelerator.autocast():
            loss, output_dict = policy.forward(batch)
            # TODO(rcadene): policy.unnormalize_outputs(out_dict)

        # Use accelerator for backward pass
        accelerator.backward(loss)

        # Gradient clipping - accelerator handles unscaling automatically
        if accelerator.sync_gradients and grad_clip_norm > 0:
            grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)
        else:
            # Keep the placeholder on the same device as the loss: it is gathered across processes
            # below, and collective backends generally cannot gather CPU tensors.
            grad_norm = torch.tensor(0.0, device=loss.device)

        optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()
        optimizer.zero_grad()

    # Update policy-specific buffers if needed. Look the method up on the unwrapped policy: wrapper
    # modules (e.g. DistributedDataParallel) do not expose the wrapped policy's own methods.
    unwrapped_policy = accelerator.unwrap_model(policy)
    if has_method(unwrapped_policy, "update"):
        unwrapped_policy.update()

    # Gather metrics across all processes
    loss_value = accelerator.gather(loss.detach()).mean().item()
    grad_norm_value = accelerator.gather(grad_norm).mean().item()

    train_metrics.loss = loss_value
    train_metrics.grad_norm = grad_norm_value
    train_metrics.lr = optimizer.param_groups[0]["lr"]
    train_metrics.update_s = time.perf_counter() - start_time
    return train_metrics, output_dict


@parser.wrap()
def train(cfg: TrainPipelineConfig):
    """Train a policy on a fixed dataset with HuggingFace Accelerate.

    Builds the dataset, policy, optimizer and dataloader from ``cfg``, hands them to
    ``accelerator.prepare`` and then runs ``cfg.steps`` policy updates (continuing from the
    checkpointed step when ``cfg.resume`` is set). Logging, checkpoint writing and evaluation
    are performed on the main process only, at the frequencies given by ``cfg.log_freq``,
    ``cfg.save_freq`` and ``cfg.eval_freq``.

    Args:
        cfg: Training pipeline configuration.
    """
    cfg.validate()
    logging.info(pformat(cfg.to_dict()))

    # Initialize accelerator
    from accelerate.utils import DistributedDataParallelKwargs

    from lerobot.utils.wandb_utils import cfg_to_group, get_wandb_run_id_from_filesystem

    # Build the Accelerator exactly once, with the real configuration.
    ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
    accelerator = Accelerator(
        mixed_precision="bf16" if cfg.policy.use_amp else "no",
        gradient_accumulation_steps=cfg.policy.gradient_accumulation_steps,
        log_with="wandb" if cfg.wandb.enable else None,
        kwargs_handlers=[ddp_kwargs],
        project_dir=cfg.output_dir,
    )

    # Only set up tracking (and look for a previous run id on disk) when wandb is actually enabled.
    if cfg.wandb.enable:
        if cfg.wandb.run_id:
            wandb_run_id = cfg.wandb.run_id
        elif cfg.resume:
            wandb_run_id = get_wandb_run_id_from_filesystem(cfg.output_dir)
        else:
            wandb_run_id = None

        accelerator.init_trackers(
            project_name=cfg.wandb.project,
            init_kwargs={
                "wandb": {
                    "entity": cfg.wandb.entity,
                    "name": cfg.job_name,
                    "notes": cfg.wandb.notes,
                    "tags": cfg_to_group(cfg, return_list=True),
                    "dir": cfg.output_dir,
                    "config": cfg.to_dict(),
                    "save_code": False,
                    "job_type": "train_eval",
                    "mode": cfg.wandb.mode if cfg.wandb.mode in ["online", "offline", "disabled"] else "online",
                    "resume": "must" if cfg.resume else None,
                    "id": wandb_run_id,
                }
            },
        )

    # Set seed for reproducibility
    if cfg.seed is not None:
        accelerate_set_seed(cfg.seed)

    # Setup device - accelerator handles device placement
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True

    # Create dataset
    if accelerator.is_main_process:
        logging.info("Creating dataset")
    dataset = make_dataset(cfg)

    # Create evaluation environment (only on main process)
    eval_env = None
    if cfg.eval_freq > 0 and cfg.env is not None and accelerator.is_main_process:
        logging.info("Creating env")
        eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)

    # Create policy
    if accelerator.is_main_process:
        logging.info("Creating policy")

    # Use accelerator's device instead of cfg.policy.device
    with accelerator.main_process_first():
        policy = make_policy(
            cfg=cfg.policy,
            ds_meta=dataset.meta,
        )

    # Create optimizer and scheduler
    if accelerator.is_main_process:
        logging.info("Creating optimizer and scheduler")
    optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy)

    step = 0  # number of policy updates

    if cfg.resume:
        step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler)

    # Prepare dataloader. When an episode-aware sampler is used it does the shuffling itself, and a
    # DataLoader cannot be given both a sampler and shuffle=True.
    if hasattr(cfg.policy, "drop_n_last_frames"):
        shuffle = False
        sampler = EpisodeAwareSampler(
            dataset.episode_data_index,
            drop_n_last_frames=cfg.policy.drop_n_last_frames,
            shuffle=True,
        )
    else:
        shuffle = True
        sampler = None

    dataloader = torch.utils.data.DataLoader(
        dataset,
        num_workers=cfg.num_workers,
        batch_size=cfg.batch_size,
        shuffle=shuffle,
        sampler=sampler,
        pin_memory=True,
        drop_last=True,  # Important for distributed training
    )

    # Prepare for distributed training
    policy, optimizer, dataloader, lr_scheduler = accelerator.prepare(
        policy, optimizer, dataloader, lr_scheduler
    )

    # Log training info (only on main process)
    if accelerator.is_main_process:
        num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad)
        num_total_params = sum(p.numel() for p in policy.parameters())

        logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}")
        if cfg.env is not None:
            logging.info(f"{cfg.env.task=}")
        logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})")
        logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})")
        logging.info(f"{dataset.num_episodes=}")
        logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})")
        logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")
        logging.info(f"Number of processes: {accelerator.num_processes}")
        logging.info(f"Device: {accelerator.device}")
        logging.info(f"Mixed precision: {accelerator.mixed_precision}")

    # Create metrics trackers
    train_metrics = {
        "loss": AverageMeter("loss", ":.3f"),
        "grad_norm": AverageMeter("grdn", ":.3f"),
        "lr": AverageMeter("lr", ":0.1e"),
        "update_s": AverageMeter("updt_s", ":.3f"),
        "dataloading_s": AverageMeter("data_s", ":.3f"),
    }

    train_tracker = MetricsTracker(
        cfg.batch_size * accelerator.num_processes,  # Account for all processes
        dataset.num_frames,
        dataset.num_episodes,
        train_metrics,
        initial_step=step,
    )

    # Training loop
    policy.train()
    if accelerator.is_main_process:
        logging.info("Start offline training on a fixed dataset")

    # Create iterator from dataloader
    dl_iter = iter(dataloader)

    for _ in range(step, cfg.steps):
        start_time = time.perf_counter()
        # Get next batch, restarting the dataloader once it is exhausted
        try:
            batch = next(dl_iter)
        except StopIteration:
            dl_iter = iter(dataloader)
            batch = next(dl_iter)
        train_tracker.dataloading_s = time.perf_counter() - start_time

        # Update policy
        train_tracker, output_dict = update_policy(
            train_tracker,
            policy,
            batch,
            optimizer,
            cfg.optimizer.grad_clip_norm,
            accelerator,
            lr_scheduler=lr_scheduler,
        )

        # Increment step counter
        step += 1
        train_tracker.step()

        # Determine if we should log, save, or evaluate. A non-positive frequency disables the
        # periodic action; the final step is always a saving step.
        is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0
        is_saving_step = step == cfg.steps or (cfg.save_freq > 0 and step % cfg.save_freq == 0)
        is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0

        # Logging (only on main process)
        if is_log_step and accelerator.is_main_process:
            logging.info(train_tracker)
            wandb_log_dict = train_tracker.to_dict()
            if output_dict:
                wandb_log_dict.update(output_dict)
            accelerator.log({f"train/{k}": v for k, v in wandb_log_dict.items()}, step=step)
            train_tracker.reset_averages()

        # Checkpointing (written by the main process only)
        if cfg.save_checkpoint and is_saving_step:
            # All processes wait here so the saved state is from the same step everywhere
            accelerator.wait_for_everyone()

            if accelerator.is_main_process:
                logging.info(f"Checkpoint policy after step {step}")
                checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step)

                unwrapped_policy = accelerator.unwrap_model(policy)
                save_checkpoint(checkpoint_dir, step, cfg, unwrapped_policy, optimizer, lr_scheduler)
                update_last_checkpoint(checkpoint_dir)

            # All processes sync again after saving
            accelerator.wait_for_everyone()

        # Evaluation (run by the main process only)
        if cfg.env and is_eval_step:
            if accelerator.is_main_process:
                step_id = get_step_identifier(step, cfg.steps)
                logging.info(f"Eval policy at step {step}")

                # Unwrap model for evaluation
                unwrapped_policy = accelerator.unwrap_model(policy)
                unwrapped_policy.eval()

                with torch.no_grad():
                    eval_info = eval_policy(
                        eval_env,
                        unwrapped_policy,
                        cfg.eval.n_episodes,
                        videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",
                        max_episodes_rendered=4,
                        start_seed=cfg.seed,
                    )

                eval_metrics = {
                    "avg_sum_reward": AverageMeter("∑rwrd", ":.3f"),
                    "pc_success": AverageMeter("success", ":.1f"),
                    "eval_s": AverageMeter("eval_s", ":.3f"),
                }
                eval_tracker = MetricsTracker(
                    cfg.batch_size * accelerator.num_processes,
                    dataset.num_frames,
                    dataset.num_episodes,
                    eval_metrics,
                    initial_step=step,
                )
                eval_tracker.eval_s = eval_info["aggregated"].pop("eval_s")
                eval_tracker.avg_sum_reward = eval_info["aggregated"].pop("avg_sum_reward")
                eval_tracker.pc_success = eval_info["aggregated"].pop("pc_success")
                logging.info(eval_tracker)

                wandb_log_dict = {**eval_tracker.to_dict(), **eval_info}
                accelerator.log({f"eval/{k}": v for k, v in wandb_log_dict.items()}, step=step)

                # Set back to training mode
                policy.train()

            # Keep the other processes from starting the next update while the main one evaluates
            accelerator.wait_for_everyone()

    # Wait for all processes to finish
    accelerator.wait_for_everyone()

    # Cleanup
    if eval_env is not None and accelerator.is_main_process:
        eval_env.close()

    if accelerator.is_main_process:
        logging.info("End of training")

    # Called on every process; the Accelerator decides internally what runs where.
    accelerator.end_training()


# Script entry point: only runs when the file is executed directly, not when imported.
if __name__ == "__main__":
    # Configure logging before `train` emits its first messages.
    init_logging()
    # Called without arguments: `train` is wrapped by `@parser.wrap()`, which presumably builds
    # the `cfg` argument from the command line -- confirm against `lerobot.configs.parser`.
    train()
new things 2025-09-10 11:32:54 +02:00			`#!/usr/bin/env python`

			`# Copyright 2024 The HuggingFace Inc. team. All rights reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`import logging`
			`import time`
			`from pprint import pformat`
			`from typing import Any`

			`import torch`
			`from accelerate import Accelerator`
			`from accelerate.utils import set_seed as accelerate_set_seed`
			`from termcolor import colored`
			`from torch.optim import Optimizer`

[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2025-09-11 11:51:53 +00:00			`from lerobot.configs import parser`
			`from lerobot.configs.train import TrainPipelineConfig`
new things 2025-09-10 11:32:54 +02:00			`from lerobot.datasets.factory import make_dataset`
			`from lerobot.datasets.sampler import EpisodeAwareSampler`
			`from lerobot.envs.factory import make_env`
			`from lerobot.optim.factory import make_optimizer_and_scheduler`
			`from lerobot.policies.factory import make_policy`
			`from lerobot.policies.pretrained import PreTrainedPolicy`
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2025-09-11 11:51:53 +00:00			`from lerobot.scripts.eval import eval_policy`
new things 2025-09-10 11:32:54 +02:00			`from lerobot.utils.logging_utils import AverageMeter, MetricsTracker`
			`from lerobot.utils.train_utils import (`
			`get_step_checkpoint_dir,`
			`get_step_identifier,`
			`load_training_state,`
			`save_checkpoint,`
			`update_last_checkpoint,`
			`)`
			`from lerobot.utils.utils import (`
			`format_big_number,`
			`has_method,`
			`init_logging,`
			`)`


			`def update_policy(`
			`train_metrics: MetricsTracker,`
			`policy: PreTrainedPolicy,`
			`batch: Any,`
			`optimizer: Optimizer,`
			`grad_clip_norm: float,`
			`accelerator: Accelerator,`
			`lr_scheduler=None,`
			`) -> tuple[MetricsTracker, dict]:`
			`start_time = time.perf_counter()`
			`policy.train()`

			`# Use accelerator's autocast context if mixed precision is enabled`
			`with accelerator.autocast():`
			`loss, output_dict = policy.forward(batch)`
			`# TODO(rcadene): policy.unnormalize_outputs(out_dict)`

			`# Use accelerator for backward pass`
			`accelerator.backward(loss)`

			`# Gradient clipping - accelerator handles unscaling automatically`
			`if accelerator.sync_gradients and grad_clip_norm > 0:`
			`grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)`
			`else:`
			`grad_norm = torch.tensor(0.0)`

			`optimizer.step()`
			`lr_scheduler.step() if lr_scheduler is not None else None`
			`optimizer.zero_grad()`

			`# Update policy-specific buffers if needed`
			`if has_method(policy, "update"):`
			`policy.update()`

			`# Gather metrics across all processes`
			`loss_value = accelerator.gather(loss.detach()).mean().item()`
			`grad_norm_value = accelerator.gather(grad_norm).mean().item()`

			`train_metrics.loss = loss_value`
			`train_metrics.grad_norm = grad_norm_value`
			`train_metrics.lr = optimizer.param_groups[0]["lr"]`
			`train_metrics.update_s = time.perf_counter() - start_time`
			`return train_metrics, output_dict`


			`@parser.wrap()`
			`def train(cfg: TrainPipelineConfig):`
			`cfg.validate()`
			`logging.info(pformat(cfg.to_dict()))`

			`# Initialize accelerator`
			`from accelerate.utils import DistributedDataParallelKwargs`
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2025-09-11 11:51:53 +00:00
new things 2025-09-10 11:32:54 +02:00			`# added by jade 2 lines`
			`ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False)`
			`accelerator = Accelerator(..., kwargs_handlers=[ddp_kwargs])`

			`from lerobot.utils.wandb_utils import cfg_to_group, get_wandb_run_id_from_filesystem`

			`ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)`
			`accelerator = Accelerator(`
			`mixed_precision="bf16" if cfg.policy.use_amp else "no",`
			`gradient_accumulation_steps=cfg.policy.gradient_accumulation_steps,`
			`log_with="wandb" if cfg.wandb.enable else None,`
			`kwargs_handlers=[ddp_kwargs],`
			`project_dir=cfg.output_dir,`
			`)`

			`accelerator.init_trackers(`
			`project_name=cfg.wandb.project,`
			`init_kwargs={`
			`"wandb": {`
			`"entity": cfg.wandb.entity,`
			`"name": cfg.job_name,`
			`"notes": cfg.wandb.notes,`
			`"tags": cfg_to_group(cfg, return_list=True),`
			`"dir": cfg.output_dir,`
			`"config": cfg.to_dict(),`
			`"save_code": False,`
			`"job_type": "train_eval",`
			`"mode": cfg.wandb.mode if cfg.wandb.mode in ["online", "offline", "disabled"] else "online",`
			`"resume": "must" if cfg.resume else None,`
			`"id": cfg.wandb.run_id`
			`if cfg.wandb.run_id`
			`else (get_wandb_run_id_from_filesystem(cfg.output_dir) if cfg.resume else None),`
			`}`
			`},`
			`)`

			`# Set seed for reproducibility`
			`if cfg.seed is not None:`
			`accelerate_set_seed(cfg.seed)`

			`# Setup device - accelerator handles device placement`
			`torch.backends.cudnn.benchmark = True`
			`torch.backends.cuda.matmul.allow_tf32 = True`

			`# Create dataset`
			`if accelerator.is_main_process:`
			`logging.info("Creating dataset")`
			`dataset = make_dataset(cfg)`
			`print("c")`
			`# Create evaluation environment (only on main process)`
			`eval_env = None`
			`if cfg.eval_freq > 0 and cfg.env is not None and accelerator.is_main_process:`
			`logging.info("Creating env")`
			`eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)`

			`# Create policy`
			`if accelerator.is_main_process:`
			`logging.info("Creating policy")`

			`# Use accelerator's device instead of cfg.policy.device`
			`with accelerator.main_process_first():`
			`policy = make_policy(`
			`cfg=cfg.policy,`
			`ds_meta=dataset.meta,`
			`)`

			`# Create optimizer and scheduler`
			`if accelerator.is_main_process:`
			`logging.info("Creating optimizer and scheduler")`
			`optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy)`

			`step = 0 # number of policy updates`

			`if cfg.resume:`
			`step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler)`

			`# Prepare dataloader`
			`if hasattr(cfg.policy, "drop_n_last_frames"):`
			`shuffle = False`
			`sampler = EpisodeAwareSampler(`
			`dataset.episode_data_index,`
			`drop_n_last_frames=cfg.policy.drop_n_last_frames,`
			`shuffle=True,`
			`)`
			`else:`
			`shuffle = True`
			`sampler = None`

			`dataloader = torch.utils.data.DataLoader(`
			`dataset,`
			`num_workers=cfg.num_workers,`
			`batch_size=cfg.batch_size,`
			`shuffle=shuffle,`
			`sampler=sampler,`
			`pin_memory=True,`
			`drop_last=True, # Important for distributed training`
			`)`

			`# Prepare for distributed training`
			`policy, optimizer, dataloader, lr_scheduler = accelerator.prepare(`
			`policy, optimizer, dataloader, lr_scheduler`
			`)`

			`# Log training info (only on main process)`
			`if accelerator.is_main_process:`
			`num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad)`
			`num_total_params = sum(p.numel() for p in policy.parameters())`

			`logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}")`
			`if cfg.env is not None:`
			`logging.info(f"{cfg.env.task=}")`
			`logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})")`
			`logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})")`
			`logging.info(f"{dataset.num_episodes=}")`
			`logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})")`
			`logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")`
			`logging.info(f"Number of processes: {accelerator.num_processes}")`
			`logging.info(f"Device: {accelerator.device}")`
			`logging.info(f"Mixed precision: {accelerator.mixed_precision}")`

			`# Create metrics trackers`
			`train_metrics = {`
			`"loss": AverageMeter("loss", ":.3f"),`
			`"grad_norm": AverageMeter("grdn", ":.3f"),`
			`"lr": AverageMeter("lr", ":0.1e"),`
			`"update_s": AverageMeter("updt_s", ":.3f"),`
			`"dataloading_s": AverageMeter("data_s", ":.3f"),`
			`}`

			`train_tracker = MetricsTracker(`
			`cfg.batch_size * accelerator.num_processes, # Account for all processes`
			`dataset.num_frames,`
			`dataset.num_episodes,`
			`train_metrics,`
			`initial_step=step,`
			`)`

			`# Training loop`
			`policy.train()`
			`if accelerator.is_main_process:`
			`logging.info("Start offline training on a fixed dataset")`

			`# Create iterator from dataloader`
			`dl_iter = iter(dataloader)`

			`for current_step in range(step, cfg.steps):`
			`start_time = time.perf_counter()`
			`# Get next batch, cycling through dataloader if needed`
			`try:`
			`batch = next(dl_iter)`
			`print("data laoder batch keys: ", batch.keys())`
			`breakpoint()`
			`except StopIteration:`
			`dl_iter = iter(dataloader)`
			`batch = next(dl_iter)`
			`train_tracker.dataloading_s = time.perf_counter() - start_time`
			`# Update policy`
			`train_tracker, output_dict = update_policy(`
			`train_tracker,`
			`policy,`
			`batch,`
			`optimizer,`
			`cfg.optimizer.grad_clip_norm,`
			`accelerator,`
			`lr_scheduler=lr_scheduler,`
			`)`

			`# Increment step counter`
			`step += 1`
			`train_tracker.step()`

			`# Determine if we should log, save, or evaluate`
			`is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0`
			`is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps`
			`is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0`

			`# Logging (only on main process)`
			`if is_log_step and accelerator.is_main_process:`
			`logging.info(train_tracker)`
			`wandb_log_dict = train_tracker.to_dict()`
			`if output_dict:`
			`wandb_log_dict.update(output_dict)`
			`for k, v in wandb_log_dict.items():`
			`accelerator.log({f"{'train'}/{k}": v}, step=step)`
			`train_tracker.reset_averages()`

			`# Checkpointing (only on main process)`
			`if cfg.save_checkpoint and is_saving_step:`
			`# ✅ all processes wait here`
			`accelerator.wait_for_everyone()`

			`if accelerator.is_main_process:`
			`logging.info(f"Checkpoint policy after step {step}")`
			`checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step)`

			`unwrapped_policy = accelerator.unwrap_model(policy)`
			`save_checkpoint(checkpoint_dir, step, cfg, unwrapped_policy, optimizer, lr_scheduler)`
			`update_last_checkpoint(checkpoint_dir)`

			`# ✅ all processes sync again after saving`
			`accelerator.wait_for_everyone()`

			`# if wandb_logger:`
			`# wandb_logger.log_policy(checkpoint_dir)`

			`# Evaluation (only on main process)`
			`if cfg.env and is_eval_step and accelerator.is_main_process:`
			`step_id = get_step_identifier(step, cfg.steps)`
			`logging.info(f"Eval policy at step {step}")`

			`# Unwrap model for evaluation`
			`unwrapped_policy = accelerator.unwrap_model(policy)`
			`unwrapped_policy.eval()`

			`with torch.no_grad():`
			`eval_info = eval_policy(`
			`eval_env,`
			`unwrapped_policy,`
			`cfg.eval.n_episodes,`
			`videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",`
			`max_episodes_rendered=4,`
			`start_seed=cfg.seed,`
			`)`

			`eval_metrics = {`
			`"avg_sum_reward": AverageMeter("∑rwrd", ":.3f"),`
			`"pc_success": AverageMeter("success", ":.1f"),`
			`"eval_s": AverageMeter("eval_s", ":.3f"),`
			`}`
			`eval_tracker = MetricsTracker(`
			`cfg.batch_size * accelerator.num_processes,`
			`dataset.num_frames,`
			`dataset.num_episodes,`
			`eval_metrics,`
			`initial_step=step,`
			`)`
			`eval_tracker.eval_s = eval_info["aggregated"].pop("eval_s")`
			`eval_tracker.avg_sum_reward = eval_info["aggregated"].pop("avg_sum_reward")`
			`eval_tracker.pc_success = eval_info["aggregated"].pop("pc_success")`
			`logging.info(eval_tracker)`

			`wandb_log_dict = {**eval_tracker.to_dict(), **eval_info}`
			`for k, v in wandb_log_dict.items():`
			`accelerator.log({f"{'eval'}/{k}": v}, step=step)`

			`# Set back to training mode`
			`policy.train()`

			`# Wait for all processes to finish`
			`accelerator.wait_for_everyone()`

			`# Cleanup`
			`if eval_env and accelerator.is_main_process:`
			`eval_env.close()`

			`if accelerator.is_main_process:`
			`logging.info("End of training")`
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2025-09-11 11:51:53 +00:00			`accelerator.end_training() # added by jade`
new things 2025-09-10 11:32:54 +02:00

			`if __name__ == "__main__":`
			`init_logging()`
			`train()`