diff --git a/src/lerobot/configs/default.py b/src/lerobot/configs/default.py index 53cfe58e7..9aa41d562 100644 --- a/src/lerobot/configs/default.py +++ b/src/lerobot/configs/default.py @@ -37,6 +37,21 @@ class DatasetConfig: revision: str | None = None use_imagenet_stats: bool = True video_backend: str = field(default_factory=get_safe_default_codec) + # Multi-dataset support + sampling_weights: str | None = None + max_action_dim: int | None = None + max_state_dim: int | None = None + max_num_images: int | None = None + max_image_dim: int | None = None + train_on_all_features: bool = False + features_version: int = 0 + discard_first_n_frames: int = 0 + min_fps: int = 1 + max_fps: int = 100 + discard_first_idle_frames: bool = False + motion_threshold: float = 5e-2 + motion_window_size: int = 10 + motion_buffer: int = 3 @dataclass diff --git a/src/lerobot/datasets/compute_stats.py b/src/lerobot/datasets/compute_stats.py index bfe7b18b4..bf466d5a7 100644 --- a/src/lerobot/datasets/compute_stats.py +++ b/src/lerobot/datasets/compute_stats.py @@ -125,9 +125,30 @@ def _assert_type_and_shape(stats_list: list[dict[str, dict]]): def aggregate_feature_stats(stats_ft_list: list[dict[str, dict]]) -> dict[str, dict[str, np.ndarray]]: """Aggregates stats for a single feature.""" - means = np.stack([s["mean"] for s in stats_ft_list]) - variances = np.stack([s["std"] ** 2 for s in stats_ft_list]) - counts = np.stack([s["count"] for s in stats_ft_list]) + # Filter out stats that don't have required keys + valid_stats = [] + for s in stats_ft_list: + if all(key in s for key in ["mean", "std", "count", "min", "max"]): + valid_stats.append(s) + else: + # If count is missing, add it with a default value + if "count" not in s: + s["count"] = np.array([1]) # Default count + valid_stats.append(s) + + if not valid_stats: + # If no valid stats, return empty stats + return { + "min": np.array([0]), + "max": np.array([0]), + "mean": np.array([0]), + "std": np.array([0]), + "count": np.array([0]), + } + + means = np.stack([s["mean"] for s in valid_stats]) + variances = np.stack([s["std"] ** 2 for s in valid_stats]) + counts = np.stack([s["count"] for s in valid_stats]) total_count = counts.sum(axis=0) # Prepare weighted mean by matching number of dimensions @@ -144,8 +165,8 @@ def aggregate_feature_stats(stats_ft_list: list[dict[str, dict]]) -> dict[str, d total_variance = weighted_variances.sum(axis=0) / total_count return { - "min": np.min(np.stack([s["min"] for s in stats_ft_list]), axis=0), - "max": np.max(np.stack([s["max"] for s in stats_ft_list]), axis=0), + "min": np.min(np.stack([s["min"] for s in valid_stats]), axis=0), + "max": np.max(np.stack([s["max"] for s in valid_stats]), axis=0), "mean": total_mean, "std": np.sqrt(total_variance), "count": total_count, diff --git a/src/lerobot/datasets/factory.py b/src/lerobot/datasets/factory.py index c3e1f684d..48c0b1c39 100644 --- a/src/lerobot/datasets/factory.py +++ b/src/lerobot/datasets/factory.py @@ -32,7 +32,7 @@ IMAGENET_STATS = { "std": [[[0.229]], [[0.224]], [[0.225]]], # (c,1,1) } -from lerobot.common.datasets.utils_must import EPISODES_DATASET_MAPPING, FEATURE_KEYS_MAPPING +from lerobot.datasets.utils_must import EPISODES_DATASET_MAPPING, FEATURE_KEYS_MAPPING def resolve_delta_timestamps( @@ -106,6 +106,7 @@ def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDatas image_transforms=image_transforms, revision=revision, video_backend=cfg.dataset.video_backend, + download_videos=True, feature_keys_mapping=feature_keys_mapping, max_action_dim=cfg.dataset.max_action_dim, max_state_dim=cfg.dataset.max_state_dim, @@ -132,6 +133,7 @@ def make_dataset(cfg: TrainPipelineConfig) -> LeRobotDataset | MultiLeRobotDatas delta_timestamps=delta_timestamps, image_transforms=image_transforms, video_backend=cfg.dataset.video_backend, + download_videos=True, sampling_weights=sampling_weights, feature_keys_mapping=feature_keys_mapping, max_action_dim=cfg.policy.max_action_dim, diff --git a/src/lerobot/policies/smolvla2/modeling_smolvla2.py b/src/lerobot/policies/smolvla2/modeling_smolvla2.py index 47a7f95eb..9670781f9 100644 --- a/src/lerobot/policies/smolvla2/modeling_smolvla2.py +++ b/src/lerobot/policies/smolvla2/modeling_smolvla2.py @@ -65,13 +65,13 @@ from torch import Tensor, nn from transformers import AutoProcessor from lerobot.constants import ACTION, OBS_STATE -from lerobot.datasets import IMAGES_ORDER +from lerobot.configs.datasets import IMAGES_ORDER from lerobot.policies.normalize import ( Normalize, Unnormalize, ) from lerobot.policies.pretrained import PreTrainedPolicy -from lerobot.policies.smolvla.smolvlm_with_expert import SmolVLMWithExpertModel +from lerobot.policies.smolvla2.smolvlm_with_expert2 import SmolVLMWithExpertModel from lerobot.policies.smolvla2.configuration_smolvla2 import SmolVLA2Config from lerobot.policies.utils import ( populate_queues, @@ -389,10 +389,41 @@ class SmolVLA2Policy(PreTrainedPolicy): def get_optim_params(self) -> dict: return self.parameters() + def _get_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: + for k in batch: + if k in self._queues: + batch[k] = torch.stack(list(self._queues[k]), dim=1) + + images, img_masks = self.prepare_images(batch) + state = self.prepare_state(batch) + lang_tokens, lang_masks = self.prepare_language(batch) + + actions = self.model.sample_actions(images, img_masks, lang_tokens, lang_masks, state, noise=noise) + + # Unpad actions + original_action_dim = self.config.action_feature.shape[0] + actions = actions[:, :, :original_action_dim] + + actions = self.unnormalize_outputs({ACTION: actions})[ACTION] + + if self.config.adapt_to_pi_aloha: + actions = self._pi_aloha_encode_actions(actions) + + return actions + def merge_peft_model_weights(self) -> None: if "lora" in self.config.peft_method: self.model.vlm_with_expert.merge_lora_weights() + def predict_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: + self.eval() + + batch = self._prepare_batch(batch) + self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION]) + + actions = self._get_action_chunk(batch, noise) + return actions + @torch.no_grad def select_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: """Select a single action given environment observations. @@ -691,7 +722,6 @@ class VLAFlowMatching(nn.Module): model_id=self.config.vlm_model_name, freeze_vision_encoder=self.config.freeze_vision_encoder, train_expert_only=self.config.train_expert_only, - attention_implementation=self.config.attention_implementation, load_vlm_weights=self.config.load_vlm_weights, attention_mode=self.config.attention_mode, num_expert_layers=self.config.num_expert_layers, diff --git a/src/lerobot/policies/smolvla2/smolvlm_with_expert2.py b/src/lerobot/policies/smolvla2/smolvlm_with_expert2.py index f1030cc4b..8196bb4b8 100644 --- a/src/lerobot/policies/smolvla2/smolvlm_with_expert2.py +++ b/src/lerobot/policies/smolvla2/smolvlm_with_expert2.py @@ -24,6 +24,7 @@ from transformers import ( AutoProcessor, SmolVLMForConditionalGeneration, ) +from peft import LoraConfig, TaskType, get_peft_model def apply_rope(x, positions, max_wavelength=10_000): diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py index a077eff74..2caf8c18f 100644 --- a/src/lerobot/scripts/train.py +++ b/src/lerobot/scripts/train.py @@ -175,11 +175,18 @@ def train(cfg: TrainPipelineConfig): else: shuffle = True sampler = None - + keys_to_max_dim = getattr(dataset.meta, "keys_to_max_dim", {}) + keys_to_max_dim = { + "action": (32,), + "observation.state": (32,), + "observation.image": (3, 1080, 1920), + "observation.image2": (3, 1080, 1920), +} collate_fn = partial(multidataset_collate_fn, keys_to_max_dim=keys_to_max_dim) dataloader = torch.utils.data.DataLoader( dataset, + collate_fn=collate_fn, num_workers=cfg.num_workers, batch_size=cfg.batch_size, shuffle=shuffle,