src/lerobot/policies/diffusion/configuration_diffusion.py

#!/usr/bin/env python

# Copyright 2024 Columbia Artificial Intelligence, Robotics Lab,
# and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field

from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.types import NormalizationMode
from lerobot.optim.optimizers import AdamConfig
from lerobot.optim.schedulers import DiffuserSchedulerConfig


@PreTrainedConfig.register_subclass("diffusion")
@dataclass
class DiffusionConfig(PreTrainedConfig):
    """Configuration class for DiffusionPolicy.

    Defaults are configured for training with PushT providing proprioceptive and single camera observations.

    The parameters you will most likely need to change are the ones which depend on the environment / sensors,
    i.e. the input / output feature definitions. This class does not declare them itself; it only reads them
    through `image_features` and `env_state_feature` (see `validate_features`).
    NOTE(review): those attributes are presumably provided by `PreTrainedConfig` - confirm.

    Notes on the inputs and outputs:
        - "observation.state" is required as an input key.
        - Either:
            - At least one key starting with "observation.image" is required as an input.
              AND/OR
            - The key "observation.environment_state" is required as input.
        - If there are multiple keys beginning with "observation.image" they are treated as multiple camera
          views. Right now we only support all images having the same shape.
        - "action" is required as an output key.

    Args:
        n_obs_steps: Number of environment steps worth of observations to pass to the policy (takes the
            current step and additional steps going back).
        horizon: Diffusion model action prediction size as detailed in `DiffusionPolicy.select_action`.
        n_action_steps: The number of action steps to run in the environment for one invocation of the policy.
            See `DiffusionPolicy.select_action` for more details.
        normalization_mapping: A dictionary keyed by feature category ("VISUAL", "STATE", "ACTION") whose
            value is the `NormalizationMode` to apply to that category. By default visual features use
            `MEAN_STD` (subtract the mean and divide by the standard deviation) while state and action use
            `MIN_MAX` (rescale to a [-1, 1] range).
        drop_n_last_frames: Number of frames at the end of an episode that are not sampled. The default of 7
            equals `horizon - n_action_steps - n_obs_steps + 1` for the default values of those fields; it is
            a plain field and is NOT recomputed when those fields are changed.
        vision_backbone: Name of the torchvision resnet backbone to use for encoding images.
        crop_shape: (H, W) shape to crop images to as a preprocessing step for the vision backbone. Must fit
            within the image size. If None, no cropping is done.
        crop_is_random: Whether the crop should be random at training time (it's always a center crop in eval
            mode).
        pretrained_backbone_weights: Pretrained weights from torchvision to initialize the backbone.
            `None` means no pretrained weights.
        use_group_norm: Whether to replace batch normalization with group normalization in the backbone.
            The group sizes are set to be about 16 (to be precise, feature_dim // 16).
        spatial_softmax_num_keypoints: Number of keypoints for SpatialSoftmax.
        use_separate_rgb_encoder_per_camera: Whether to use a separate RGB encoder for each camera view.
        down_dims: Feature dimension for each stage of temporal downsampling in the diffusion modeling Unet.
            You may provide a variable number of dimensions, therefore also controlling the degree of
            downsampling.
        kernel_size: The convolutional kernel size of the diffusion modeling Unet.
        n_groups: Number of groups used in the group norm of the Unet's convolutional blocks.
        diffusion_step_embed_dim: The Unet is conditioned on the diffusion timestep via a small non-linear
            network. This is the output dimension of that network, i.e., the embedding dimension.
        use_film_scale_modulation: FiLM (https://huggingface.co/papers/1709.07871) is used for the Unet conditioning.
            Bias modulation is used by default, while this parameter indicates whether to also use scale
            modulation.
        noise_scheduler_type: Name of the noise scheduler to use. Supported options: ["DDPM", "DDIM"].
        num_train_timesteps: Number of diffusion steps for the forward diffusion schedule.
        beta_schedule: Name of the diffusion beta schedule as per DDPMScheduler from Hugging Face diffusers.
        beta_start: Beta value for the first forward-diffusion step.
        beta_end: Beta value for the last forward-diffusion step.
        prediction_type: The type of prediction that the diffusion modeling Unet makes. Choose from "epsilon"
            or "sample". These have equivalent outcomes from a latent variable modeling perspective, but
            "epsilon" has been shown to work better in many deep neural network settings.
        clip_sample: Whether to clip the sample to [-`clip_sample_range`, +`clip_sample_range`] for each
            denoising step at inference time. WARNING: you will need to make sure your action-space is
            normalized to fit within this range.
        clip_sample_range: The magnitude of the clipping range as described above.
        num_inference_steps: Number of reverse diffusion steps to use at inference time (steps are evenly
            spaced). If not provided, this defaults to be the same as `num_train_timesteps`.
        do_mask_loss_for_padding: Whether to mask the loss when there are copy-padded actions. See
            `LeRobotDataset` and `load_previous_and_future_frames` for more information. Note, this defaults
            to False as the original Diffusion Policy implementation does the same.
        optimizer_lr: Learning rate forwarded to `AdamConfig` by `get_optimizer_preset`.
        optimizer_betas: Beta coefficients forwarded to `AdamConfig` by `get_optimizer_preset`.
        optimizer_eps: Epsilon forwarded to `AdamConfig` by `get_optimizer_preset`.
        optimizer_weight_decay: Weight decay forwarded to `AdamConfig` by `get_optimizer_preset`.
        scheduler_name: Scheduler name forwarded to `DiffuserSchedulerConfig` by `get_scheduler_preset`.
        scheduler_warmup_steps: Number of warmup steps forwarded to `DiffuserSchedulerConfig` by
            `get_scheduler_preset`.
    """

    # Inputs / output structure.
    n_obs_steps: int = 2
    horizon: int = 16
    n_action_steps: int = 8

    normalization_mapping: dict[str, NormalizationMode] = field(
        default_factory=lambda: {
            "VISUAL": NormalizationMode.MEAN_STD,
            "STATE": NormalizationMode.MIN_MAX,
            "ACTION": NormalizationMode.MIN_MAX,
        }
    )

    # The original implementation doesn't sample frames for the last 7 steps,
    # which avoids excessive padding and leads to improved training results.
    drop_n_last_frames: int = 7  # horizon - n_action_steps - n_obs_steps + 1

    # Architecture / modeling.
    # Vision backbone.
    vision_backbone: str = "resnet18"
    crop_shape: tuple[int, int] | None = (84, 84)
    crop_is_random: bool = True
    pretrained_backbone_weights: str | None = None
    use_group_norm: bool = True
    spatial_softmax_num_keypoints: int = 32
    use_separate_rgb_encoder_per_camera: bool = False
    # Unet.
    down_dims: tuple[int, ...] = (512, 1024, 2048)
    kernel_size: int = 5
    n_groups: int = 8
    diffusion_step_embed_dim: int = 128
    use_film_scale_modulation: bool = True
    # Noise scheduler.
    noise_scheduler_type: str = "DDPM"
    num_train_timesteps: int = 100
    beta_schedule: str = "squaredcos_cap_v2"
    beta_start: float = 0.0001
    beta_end: float = 0.02
    prediction_type: str = "epsilon"
    clip_sample: bool = True
    clip_sample_range: float = 1.0

    # Inference
    num_inference_steps: int | None = None

    # Loss computation
    do_mask_loss_for_padding: bool = False

    # Training presets
    optimizer_lr: float = 1e-4
    optimizer_betas: tuple = (0.95, 0.999)
    optimizer_eps: float = 1e-8
    optimizer_weight_decay: float = 1e-6
    scheduler_name: str = "cosine"
    scheduler_warmup_steps: int = 500

    def __post_init__(self):
        """Input validation (not exhaustive).

        Runs the parent `__post_init__` first, then checks the fields declared on this class.

        Raises:
            ValueError: If `vision_backbone` does not start with "resnet", if `prediction_type` or
                `noise_scheduler_type` is not one of the supported values, or if `horizon` is not an
                integer multiple of `2 ** len(down_dims)`.
        """
        super().__post_init__()

        if not self.vision_backbone.startswith("resnet"):
            raise ValueError(
                f"`vision_backbone` must be one of the ResNet variants. Got {self.vision_backbone}."
            )

        supported_prediction_types = ["epsilon", "sample"]
        if self.prediction_type not in supported_prediction_types:
            raise ValueError(
                f"`prediction_type` must be one of {supported_prediction_types}. Got {self.prediction_type}."
            )

        supported_noise_schedulers = ["DDPM", "DDIM"]
        if self.noise_scheduler_type not in supported_noise_schedulers:
            raise ValueError(
                f"`noise_scheduler_type` must be one of {supported_noise_schedulers}. "
                f"Got {self.noise_scheduler_type}."
            )

        # Check that the horizon size and U-Net downsampling is compatible.
        # U-Net downsamples by 2 with each stage.
        downsampling_factor = 2 ** len(self.down_dims)
        if self.horizon % downsampling_factor != 0:
            raise ValueError(
                "The horizon should be an integer multiple of the downsampling factor (which is determined "
                f"by `len(down_dims)`). Got {self.horizon=} and {self.down_dims=}"
            )

    def get_optimizer_preset(self) -> AdamConfig:
        """Return an `AdamConfig` built from the `optimizer_*` fields of this config."""
        return AdamConfig(
            lr=self.optimizer_lr,
            betas=self.optimizer_betas,
            eps=self.optimizer_eps,
            weight_decay=self.optimizer_weight_decay,
        )

    def get_scheduler_preset(self) -> DiffuserSchedulerConfig:
        """Return a `DiffuserSchedulerConfig` built from the `scheduler_*` fields of this config."""
        return DiffuserSchedulerConfig(
            name=self.scheduler_name,
            num_warmup_steps=self.scheduler_warmup_steps,
        )

    def validate_features(self) -> None:
        """Check that the configured input features are usable by the policy.

        Raises:
            ValueError: If there is neither an image feature nor an environment state feature, if
                `crop_shape` is larger than any image feature, or if the image features do not all
                share the same shape.
        """
        if not self.image_features and self.env_state_feature is None:
            raise ValueError("You must provide at least one image or the environment state among the inputs.")

        if self.crop_shape is not None:
            for key, image_ft in self.image_features.items():
                # `crop_shape` is (H, W); it is compared against axes 1 and 2 of the feature shape, so
                # axis 0 is skipped (presumably the channel axis - TODO confirm).
                if self.crop_shape[0] > image_ft.shape[1] or self.crop_shape[1] > image_ft.shape[2]:
                    raise ValueError(
                        f"`crop_shape` should fit within the images shapes. Got {self.crop_shape} "
                        f"for `crop_shape` and {image_ft.shape} for "
                        f"`{key}`."
                    )

        # Check that all input images have the same shape. The guard keeps `next()` from raising
        # `StopIteration` when the policy is configured without any image feature.
        if self.image_features:
            first_image_key, first_image_ft = next(iter(self.image_features.items()))
            for key, image_ft in self.image_features.items():
                if image_ft.shape != first_image_ft.shape:
                    raise ValueError(
                        f"`{key}` does not match `{first_image_key}`, but we expect all image shapes to match."
                    )

    @property
    def observation_delta_indices(self) -> list:
        """Offsets `1 - n_obs_steps, ..., 0` (e.g. `[-1, 0]` for `n_obs_steps=2`)."""
        return list(range(1 - self.n_obs_steps, 1))

    @property
    def action_delta_indices(self) -> list:
        """`horizon` consecutive offsets starting at `1 - n_obs_steps` (same start as the observations)."""
        return list(range(1 - self.n_obs_steps, 1 - self.n_obs_steps + self.horizon))

    @property
    def reward_delta_indices(self) -> None:
        """Always `None`."""
        return None
Add copyrights (#157) 2024-05-15 12:13:09 +02:00			`#!/usr/bin/env python`

			`# Copyright 2024 Columbia Artificial Intelligence, Robotics Lab,`
			`# and The HuggingFace Inc. team. All rights reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
Move normalization to policy for act and diffusion (#90) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com> 2024-04-25 11:47:38 +02:00			`from dataclasses import dataclass, field`
backup wip 2024-04-15 19:06:44 +01:00
Simplify configs (#550) Co-authored-by: Remi <remi.cadene@huggingface.co> Co-authored-by: HUANG TZU-CHUN <137322177+tc-huang@users.noreply.github.com> 2025-01-31 13:57:37 +01:00			`from lerobot.configs.policies import PreTrainedConfig`
			`from lerobot.configs.types import NormalizationMode`
Package folder structure (#1417) * Move files * Replace imports & paths * Update relative paths * Update doc symlinks * Update instructions paths * Fix imports * Update grpc files * Update more instructions * Downgrade grpc-tools * Update manifest * Update more paths * Update config paths * Update CI paths * Update bandit exclusions * Remove walkthrough section 2025-07-01 16:34:46 +02:00			`from lerobot.optim.optimizers import AdamConfig`
			`from lerobot.optim.schedulers import DiffuserSchedulerConfig`
backup wip 2024-04-15 19:06:44 +01:00
Simplify configs (#550) Co-authored-by: Remi <remi.cadene@huggingface.co> Co-authored-by: HUANG TZU-CHUN <137322177+tc-huang@users.noreply.github.com> 2025-01-31 13:57:37 +01:00
			`@PreTrainedConfig.register_subclass("diffusion")`
backup wip 2024-04-15 19:06:44 +01:00			`@dataclass`
Simplify configs (#550) Co-authored-by: Remi <remi.cadene@huggingface.co> Co-authored-by: HUANG TZU-CHUN <137322177+tc-huang@users.noreply.github.com> 2025-01-31 13:57:37 +01:00			`class DiffusionConfig(PreTrainedConfig):`
Refactor TD-MPC (#103) Co-authored-by: Cadene <re.cadene@gmail.com> Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com> 2024-05-01 16:40:04 +01:00			`"""Configuration class for DiffusionPolicy.`
backup wip 2024-04-15 19:06:44 +01:00
			`Defaults are configured for training with PushT providing proprioceptive and single camera observations.`

			`The parameters you will most likely need to change are the ones which depend on the environment / sensors.`
Move normalization to policy for act and diffusion (#90) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com> 2024-04-25 11:47:38 +02:00			Those are: `input_shapes` and `output_shapes`.
backup wip 2024-04-15 19:06:44 +01:00
Add real-world support for ACT on Aloha/Aloha2 (#228) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com> 2024-05-31 15:31:02 +02:00			`Notes on the inputs and outputs:`
			`- "observation.state" is required as an input key.`
Train diffusion pusht_keypoints (#307) Co-authored-by: Remi <re.cadene@gmail.com> 2024-07-09 12:35:50 +01:00			`- Either:`
			`- At least one key starting with "observation.image is required as an input.`
			`AND/OR`
			`- The key "observation.environment_state" is required as input.`
Add multi-image support to diffusion policy (#218) 2024-06-17 08:11:20 +01:00			`- If there are multiple keys beginning with "observation.image" they are treated as multiple camera`
			`views. Right now we only support all images having the same shape.`
Add real-world support for ACT on Aloha/Aloha2 (#228) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com> 2024-05-31 15:31:02 +02:00			`- "action" is required as an output key.`

backup wip 2024-04-15 19:06:44 +01:00			`Args:`
			`n_obs_steps: Number of environment steps worth of observations to pass to the policy (takes the`
			`current step and additional steps going back).`
backup wip 2024-04-16 12:51:32 +01:00			horizon: Diffusion model action prediction size as detailed in `DiffusionPolicy.select_action`.
			`n_action_steps: The number of action steps to run in the environment for one invocation of the policy.`
			See `DiffusionPolicy.select_action` for more details.
Add real-world support for ACT on Aloha/Aloha2 (#228) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com> 2024-05-31 15:31:02 +02:00			`input_shapes: A dictionary defining the shapes of the input data for the policy. The key represents`
			`the input data name, and the value is a list indicating the dimensions of the corresponding data.`
			`For example, "observation.image" refers to an input from a camera with dimensions [3, 96, 96],`
			indicating it has three color channels and 96x96 resolution. Importantly, `input_shapes` doesn't
			`include batch dimension or temporal dimension.`
			`output_shapes: A dictionary defining the shapes of the output data for the policy. The key represents`
			`the output data name, and the value is a list indicating the dimensions of the corresponding data.`
			`For example, "action" refers to an output shape of [14], indicating 14-dimensional actions.`
			Importantly, `output_shapes` doesn't include batch dimension or temporal dimension.
Refactor TD-MPC (#103) Co-authored-by: Cadene <re.cadene@gmail.com> Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com> 2024-05-01 16:40:04 +01:00			`input_normalization_modes: A dictionary with key representing the modality (e.g. "observation.state"),`
			`and the value specifies the normalization mode to apply. The two available modes are "mean_std"`
			`which subtracts the mean and divides by the standard deviation and "min_max" which rescale in a`
			`[-1, 1] range.`
			output_normalization_modes: Similar dictionary as `normalize_input_modes`, but to unnormalize to the
			`original scale. Note that this is also used for normalizing the training targets.`
backup wip 2024-04-16 12:51:32 +01:00			`vision_backbone: Name of the torchvision resnet backbone to use for encoding images.`
			`crop_shape: (H, W) shape to crop images to as a preprocessing step for the vision backbone. Must fit`
			`within the image size. If None, no cropping is done.`
			`crop_is_random: Whether the crop should be random at training time (it's always a center crop in eval`
			`mode).`
Add typos checks (#770) 2025-02-25 23:51:15 +01:00			`pretrained_backbone_weights: Pretrained weights from torchvision to initialize the backbone.`
Remove warnings (#111) - Replace `use_pretrained_backbone` with `pretrained_backbone_weights` - Bump diffusers' minimum version `0.26.3` -> `0.27.2` - Add ignore flags in CI's pytest - Change Box observation spaces in simulation environments - Set `version_base="1.2"` in Hydra initializations - Bump einops' minimum version `0.7.0` -> `0.8.0` 2024-04-29 00:31:33 +02:00			`None` means no pretrained weights.
backup wip 2024-04-16 12:51:32 +01:00			`use_group_norm: Whether to replace batch normalization with group normalization in the backbone.`
			`The group sizes are set to be about 16 (to be precise, feature_dim // 16).`
			`spatial_softmax_num_keypoints: Number of keypoints for SpatialSoftmax.`
feat: enable to use multiple rgb encoders per camera in diffusion policy (#484) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com> 2024-10-30 19:00:05 +09:00			`use_separate_rgb_encoders_per_camera: Whether to use a separate RGB encoder for each camera view.`
backup wip 2024-04-16 12:51:32 +01:00			`down_dims: Feature dimension for each stage of temporal downsampling in the diffusion modeling Unet.`
			`You may provide a variable number of dimensions, therefore also controlling the degree of`
			`downsampling.`
			`kernel_size: The convolutional kernel size of the diffusion modeling Unet.`
			`n_groups: Number of groups used in the group norm of the Unet's convolutional blocks.`
			`diffusion_step_embed_dim: The Unet is conditioned on the diffusion timestep via a small non-linear`
			`network. This is the output dimension of that network, i.e., the embedding dimension.`
Use HF Papers (#1120) 2025-06-12 09:58:59 +02:00			`use_film_scale_modulation: FiLM (https://huggingface.co/papers/1709.07871) is used for the Unet conditioning.`
backup wip 2024-04-16 12:51:32 +01:00			`Bias modulation is used be default, while this parameter indicates whether to also use scale`
			`modulation.`
Support for DDIMScheduler in Diffusion Policy (#146) 2024-05-08 13:05:16 -04:00			`noise_scheduler_type: Name of the noise scheduler to use. Supported options: ["DDPM", "DDIM"].`
backup wip 2024-04-16 12:51:32 +01:00			`num_train_timesteps: Number of diffusion steps for the forward diffusion schedule.`
			`beta_schedule: Name of the diffusion beta schedule as per DDPMScheduler from Hugging Face diffusers.`
			`beta_start: Beta value for the first forward-diffusion step.`
			`beta_end: Beta value for the last forward-diffusion step.`
			`prediction_type: The type of prediction that the diffusion modeling Unet makes. Choose from "epsilon"`
			`or "sample". These have equivalent outcomes from a latent variable modeling perspective, but`
			`"epsilon" has been shown to work better in many deep neural network settings.`
			clip_sample: Whether to clip the sample to [-`clip_sample_range`, +`clip_sample_range`] for each
			`denoising step at inference time. WARNING: you will need to make sure your action-space is`
			`normalized to fit within this range.`
			`clip_sample_range: The magnitude of the clipping range as described above.`
			`num_inference_steps: Number of reverse diffusion steps to use at inference time (steps are evenly`
			spaced). If not provided, this defaults to be the same as `num_train_timesteps`.
Remove loss masking from diffusion policy (#135) 2024-05-06 07:27:01 +01:00			`do_mask_loss_for_padding: Whether to mask the loss when there are copy-padded actions. See`
Add typos checks (#770) 2025-02-25 23:51:15 +01:00			`LeRobotDataset` and `load_previous_and_future_frames` for more information. Note, this defaults
Remove loss masking from diffusion policy (#135) 2024-05-06 07:27:01 +01:00			`to False as the original Diffusion Policy implementation does the same.`
backup wip 2024-04-15 19:06:44 +01:00			`"""`

			`# Inputs / output structure.`
			`n_obs_steps: int = 2`
			`horizon: int = 16`
			`n_action_steps: int = 8`

Simplify configs (#550) Co-authored-by: Remi <remi.cadene@huggingface.co> Co-authored-by: HUANG TZU-CHUN <137322177+tc-huang@users.noreply.github.com> 2025-01-31 13:57:37 +01:00			`normalization_mapping: dict[str, NormalizationMode] = field(`
Move normalization to policy for act and diffusion (#90) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com> 2024-04-25 11:47:38 +02:00			`default_factory=lambda: {`
Simplify configs (#550) Co-authored-by: Remi <remi.cadene@huggingface.co> Co-authored-by: HUANG TZU-CHUN <137322177+tc-huang@users.noreply.github.com> 2025-01-31 13:57:37 +01:00			`"VISUAL": NormalizationMode.MEAN_STD,`
			`"STATE": NormalizationMode.MIN_MAX,`
			`"ACTION": NormalizationMode.MIN_MAX,`
Move normalization to policy for act and diffusion (#90) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com> 2024-04-25 11:47:38 +02:00			`}`
			`)`

Simplify configs (#550) Co-authored-by: Remi <remi.cadene@huggingface.co> Co-authored-by: HUANG TZU-CHUN <137322177+tc-huang@users.noreply.github.com> 2025-01-31 13:57:37 +01:00			`# The original implementation doesn't sample frames for the last 7 steps,`
			`# which avoids excessive padding and leads to improved training results.`
			`drop_n_last_frames: int = 7 # horizon - n_action_steps - n_obs_steps + 1`
backup wip 2024-04-15 19:06:44 +01:00
			`# Architecture / modeling.`
			`# Vision backbone.`
			`vision_backbone: str = "resnet18"`
backup wip 2024-04-16 12:51:32 +01:00			`crop_shape: tuple[int, int] \| None = (84, 84)`
backup wip 2024-04-15 19:06:44 +01:00			`crop_is_random: bool = True`
Remove warnings (#111) - Replace `use_pretrained_backbone` with `pretrained_backbone_weights` - Bump diffusers' minimum version `0.26.3` -> `0.27.2` - Add ignore flags in CI's pytest - Change Box observation spaces in simulation environments - Set `version_base="1.2"` in Hydra initializations - Bump einops' minimum version `0.7.0` -> `0.8.0` 2024-04-29 00:31:33 +02:00			`pretrained_backbone_weights: str \| None = None`
backup wip 2024-04-15 19:06:44 +01:00			`use_group_norm: bool = True`
			`spatial_softmax_num_keypoints: int = 32`
feat: enable to use multiple rgb encoders per camera in diffusion policy (#484) Co-authored-by: Alexander Soare <alexander.soare159@gmail.com> 2024-10-30 19:00:05 +09:00			`use_separate_rgb_encoder_per_camera: bool = False`
backup wip 2024-04-15 19:06:44 +01:00			`# Unet.`
			`down_dims: tuple[int, ...] = (512, 1024, 2048)`
			`kernel_size: int = 5`
			`n_groups: int = 8`
			`diffusion_step_embed_dim: int = 128`
backup wip 2024-04-16 12:51:32 +01:00			`use_film_scale_modulation: bool = True`
backup wip 2024-04-15 19:06:44 +01:00			`# Noise scheduler.`
Support for DDIMScheduler in Diffusion Policy (#146) 2024-05-08 13:05:16 -04:00			`noise_scheduler_type: str = "DDPM"`
backup wip 2024-04-15 19:06:44 +01:00			`num_train_timesteps: int = 100`
			`beta_schedule: str = "squaredcos_cap_v2"`
			`beta_start: float = 0.0001`
			`beta_end: float = 0.02`
			`prediction_type: str = "epsilon"`
backup wip 2024-04-16 12:51:32 +01:00			`clip_sample: bool = True`
			`clip_sample_range: float = 1.0`
backup wip 2024-04-15 19:06:44 +01:00
			`# Inference`
backup wip 2024-04-16 12:51:32 +01:00			`num_inference_steps: int \| None = None`
backup wip 2024-04-15 19:06:44 +01:00
Remove loss masking from diffusion policy (#135) 2024-05-06 07:27:01 +01:00			`# Loss computation`
			`do_mask_loss_for_padding: bool = False`

Simplify configs (#550) Co-authored-by: Remi <remi.cadene@huggingface.co> Co-authored-by: HUANG TZU-CHUN <137322177+tc-huang@users.noreply.github.com> 2025-01-31 13:57:37 +01:00			`# Training presets`
			`optimizer_lr: float = 1e-4`
			`optimizer_betas: tuple = (0.95, 0.999)`
			`optimizer_eps: float = 1e-8`
			`optimizer_weight_decay: float = 1e-6`
			`scheduler_name: str = "cosine"`
			`scheduler_warmup_steps: int = 500`

backup wip 2024-04-15 19:06:44 +01:00			`def __post_init__(self):`
Simplify configs (#550) Co-authored-by: Remi <remi.cadene@huggingface.co> Co-authored-by: HUANG TZU-CHUN <137322177+tc-huang@users.noreply.github.com> 2025-01-31 13:57:37 +01:00			`super().__post_init__()`

backup wip 2024-04-15 19:06:44 +01:00			`"""Input validation (not exhaustive)."""`
			`if not self.vision_backbone.startswith("resnet"):`
backup wip 2024-04-16 12:51:32 +01:00			`raise ValueError(`
			f"`vision_backbone` must be one of the ResNet variants. Got {self.vision_backbone}."
			`)`
Train diffusion pusht_keypoints (#307) Co-authored-by: Remi <re.cadene@gmail.com> 2024-07-09 12:35:50 +01:00
backup wip 2024-04-16 12:51:32 +01:00			`supported_prediction_types = ["epsilon", "sample"]`
			`if self.prediction_type not in supported_prediction_types:`
			`raise ValueError(`
			f"`prediction_type` must be one of {supported_prediction_types}. Got {self.prediction_type}."
			`)`
Support for DDIMScheduler in Diffusion Policy (#146) 2024-05-08 13:05:16 -04:00			`supported_noise_schedulers = ["DDPM", "DDIM"]`
			`if self.noise_scheduler_type not in supported_noise_schedulers:`
			`raise ValueError(`
			f"`noise_scheduler_type` must be one of {supported_noise_schedulers}. "
			`f"Got {self.noise_scheduler_type}."`
			`)`
Raise ValueError if horizon is incompatible with downsampling (#422) 2024-09-09 17:22:46 +01:00
			`# Check that the horizon size and U-Net downsampling is compatible.`
			`# U-Net downsamples by 2 with each stage.`
			`downsampling_factor = 2 ** len(self.down_dims)`
			`if self.horizon % downsampling_factor != 0:`
			`raise ValueError(`
			`"The horizon should be an integer multiple of the downsampling factor (which is determined "`
			f"by `len(down_dims)`). Got {self.horizon=} and {self.down_dims=}"
			`)`
Simplify configs (#550) Co-authored-by: Remi <remi.cadene@huggingface.co> Co-authored-by: HUANG TZU-CHUN <137322177+tc-huang@users.noreply.github.com> 2025-01-31 13:57:37 +01:00
			`def get_optimizer_preset(self) -> AdamConfig:`
			`return AdamConfig(`
			`lr=self.optimizer_lr,`
			`betas=self.optimizer_betas,`
			`eps=self.optimizer_eps,`
			`weight_decay=self.optimizer_weight_decay,`
			`)`

			`def get_scheduler_preset(self) -> DiffuserSchedulerConfig:`
			`return DiffuserSchedulerConfig(`
			`name=self.scheduler_name,`
			`num_warmup_steps=self.scheduler_warmup_steps,`
			`)`

			`def validate_features(self) -> None:`
			`if len(self.image_features) == 0 and self.env_state_feature is None:`
			`raise ValueError("You must provide at least one image or the environment state among the inputs.")`

			`if self.crop_shape is not None:`
			`for key, image_ft in self.image_features.items():`
			`if self.crop_shape[0] > image_ft.shape[1] or self.crop_shape[1] > image_ft.shape[2]:`
			`raise ValueError(`
			f"`crop_shape` should fit within the images shapes. Got {self.crop_shape} "
			f"for `crop_shape` and {image_ft.shape} for "
			f"`{key}`."
			`)`

			`# Check that all input images have the same shape.`
fix(DiffusionPolicy): Fix bug where training without image features would crash with exception, fix environment state docs (#1617) * Fix bug in diffusion config validation when not using image features * Fix DiffusionPolicy docstring about shape of env state 2025-07-29 04:40:16 -07:00			`if len(self.image_features) > 0:`
			`first_image_key, first_image_ft = next(iter(self.image_features.items()))`
			`for key, image_ft in self.image_features.items():`
			`if image_ft.shape != first_image_ft.shape:`
			`raise ValueError(`
			f"`{key}` does not match `{first_image_key}`, but we expect all image shapes to match."
			`)`
Simplify configs (#550) Co-authored-by: Remi <remi.cadene@huggingface.co> Co-authored-by: HUANG TZU-CHUN <137322177+tc-huang@users.noreply.github.com> 2025-01-31 13:57:37 +01:00
			`@property`
			`def observation_delta_indices(self) -> list:`
			`return list(range(1 - self.n_obs_steps, 1))`

			`@property`
			`def action_delta_indices(self) -> list:`
			`return list(range(1 - self.n_obs_steps, 1 - self.n_obs_steps + self.horizon))`

			`@property`
			`def reward_delta_indices(self) -> None:`
			`return None`