add update_chunk_settings method for LeRobotDatasetMetadata. Introduce tests for chunk settings updates and validation of parameters.

This commit is contained in:
Michel Aractingi
2025-08-18 00:00:23 +02:00
parent c7a3b02625
commit db36f01e8b
4 changed files with 193 additions and 8 deletions

View File

@@ -152,11 +152,11 @@ def update_meta_data(
def aggregate_datasets(
repo_ids: list[str],
aggr_repo_id: str,
roots: list[Path] = None,
aggr_root: Path = None,
data_files_size_in_mb: float = None,
video_files_size_in_mb: float = None,
chunk_size: int = None,
roots: list[Path] | None = None,
aggr_root: Path | None = None,
data_files_size_in_mb: float | None = None,
video_files_size_in_mb: float | None = None,
chunk_size: int | None = None,
):
"""Aggregates multiple LeRobot datasets into a single unified dataset.

View File

@@ -358,6 +358,53 @@ class LeRobotDatasetMetadata:
video_path = self.root / self.get_video_file_path(ep_index=0, vid_key=key)
self.info["features"][key]["info"] = get_video_info(video_path)
def update_chunk_settings(
self,
chunks_size: int | None = None,
data_files_size_in_mb: int | None = None,
video_files_size_in_mb: int | None = None,
) -> None:
"""Update chunk and file size settings after dataset creation.
This allows users to customize storage organization without modifying the constructor.
These settings control how episodes are chunked and how large files can grow before
creating new ones.
Args:
chunks_size: Maximum number of files per chunk directory. If None, keeps current value.
data_files_size_in_mb: Maximum size for data parquet files in MB. If None, keeps current value.
video_files_size_in_mb: Maximum size for video files in MB. If None, keeps current value.
"""
if chunks_size is not None:
if chunks_size <= 0:
raise ValueError(f"chunks_size must be positive, got {chunks_size}")
self.info["chunks_size"] = chunks_size
if data_files_size_in_mb is not None:
if data_files_size_in_mb <= 0:
raise ValueError(f"data_files_size_in_mb must be positive, got {data_files_size_in_mb}")
self.info["data_files_size_in_mb"] = data_files_size_in_mb
if video_files_size_in_mb is not None:
if video_files_size_in_mb <= 0:
raise ValueError(f"video_files_size_in_mb must be positive, got {video_files_size_in_mb}")
self.info["video_files_size_in_mb"] = video_files_size_in_mb
# Update the info file on disk
write_info(self.info, self.root)
def get_chunk_settings(self) -> dict[str, int]:
"""Get current chunk and file size settings.
Returns:
Dict containing chunks_size, data_files_size_in_mb, and video_files_size_in_mb.
"""
return {
"chunks_size": self.chunks_size,
"data_files_size_in_mb": self.data_files_size_in_mb,
"video_files_size_in_mb": self.video_files_size_in_mb,
}
def __repr__(self):
feature_keys = list(self.features)
return (

View File

@@ -540,6 +540,9 @@ def create_empty_dataset_info(
features: dict,
use_videos: bool,
robot_type: str | None = None,
chunks_size: int | None = None,
data_files_size_in_mb: int | None = None,
video_files_size_in_mb: int | None = None,
) -> dict:
return {
"codebase_version": codebase_version,
@@ -547,9 +550,9 @@ def create_empty_dataset_info(
"total_episodes": 0,
"total_frames": 0,
"total_tasks": 0,
"chunks_size": DEFAULT_CHUNK_SIZE,
"data_files_size_in_mb": DEFAULT_DATA_FILE_SIZE_IN_MB,
"video_files_size_in_mb": DEFAULT_VIDEO_FILE_SIZE_IN_MB,
"chunks_size": chunks_size or DEFAULT_CHUNK_SIZE,
"data_files_size_in_mb": data_files_size_in_mb or DEFAULT_DATA_FILE_SIZE_IN_MB,
"video_files_size_in_mb": video_files_size_in_mb or DEFAULT_VIDEO_FILE_SIZE_IN_MB,
"fps": fps,
"splits": {},
"data_path": DEFAULT_DATA_PATH,