mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-03 20:31:25 +00:00
Merge branch 'main' into tmp/fold_training
This commit is contained in:
@@ -117,6 +117,9 @@ def update_meta_data(
|
||||
Adjusts all indices and timestamps to account for previously aggregated
|
||||
data and videos in the destination dataset.
|
||||
|
||||
For data file indices, uses the 'src_to_dst' mapping from aggregate_data()
|
||||
to correctly map source file indices to their destination locations.
|
||||
|
||||
Args:
|
||||
df: DataFrame containing the metadata to be updated.
|
||||
dst_meta: Destination dataset metadata.
|
||||
@@ -130,8 +133,50 @@ def update_meta_data(
|
||||
|
||||
df["meta/episodes/chunk_index"] = df["meta/episodes/chunk_index"] + meta_idx["chunk"]
|
||||
df["meta/episodes/file_index"] = df["meta/episodes/file_index"] + meta_idx["file"]
|
||||
df["data/chunk_index"] = df["data/chunk_index"] + data_idx["chunk"]
|
||||
df["data/file_index"] = df["data/file_index"] + data_idx["file"]
|
||||
|
||||
# Update data file indices using source-to-destination mapping
|
||||
# This is critical for handling datasets that are already results of a merge
|
||||
data_src_to_dst = data_idx.get("src_to_dst", {})
|
||||
if data_src_to_dst:
|
||||
# Store original indices for lookup
|
||||
df["_orig_data_chunk"] = df["data/chunk_index"].copy()
|
||||
df["_orig_data_file"] = df["data/file_index"].copy()
|
||||
|
||||
# Vectorized mapping from (src_chunk, src_file) to (dst_chunk, dst_file)
|
||||
# This is much faster than per-row iteration for large metadata tables
|
||||
mapping_index = pd.MultiIndex.from_tuples(
|
||||
list(data_src_to_dst.keys()),
|
||||
names=["chunk_index", "file_index"],
|
||||
)
|
||||
mapping_values = list(data_src_to_dst.values())
|
||||
mapping_df = pd.DataFrame(
|
||||
mapping_values,
|
||||
index=mapping_index,
|
||||
columns=["dst_chunk", "dst_file"],
|
||||
)
|
||||
|
||||
# Construct a MultiIndex for each row based on original data indices
|
||||
row_index = pd.MultiIndex.from_arrays(
|
||||
[df["_orig_data_chunk"], df["_orig_data_file"]],
|
||||
names=["chunk_index", "file_index"],
|
||||
)
|
||||
|
||||
# Align mapping to rows; missing keys fall back to the default destination
|
||||
reindexed = mapping_df.reindex(row_index)
|
||||
reindexed[["dst_chunk", "dst_file"]] = reindexed[["dst_chunk", "dst_file"]].fillna(
|
||||
{"dst_chunk": data_idx["chunk"], "dst_file": data_idx["file"]}
|
||||
)
|
||||
|
||||
# Assign mapped destination indices back to the DataFrame
|
||||
df["data/chunk_index"] = reindexed["dst_chunk"].to_numpy()
|
||||
df["data/file_index"] = reindexed["dst_file"].to_numpy()
|
||||
|
||||
# Clean up temporary columns
|
||||
df = df.drop(columns=["_orig_data_chunk", "_orig_data_file"])
|
||||
else:
|
||||
# Fallback to simple offset (backward compatibility for single-file sources)
|
||||
df["data/chunk_index"] = df["data/chunk_index"] + data_idx["chunk"]
|
||||
df["data/file_index"] = df["data/file_index"] + data_idx["file"]
|
||||
for key, video_idx in videos_idx.items():
|
||||
# Store original video file indices before updating
|
||||
orig_chunk_col = f"videos/{key}/chunk_index"
|
||||
@@ -147,8 +192,7 @@ def update_meta_data(
|
||||
if src_to_dst:
|
||||
# Map each episode to its correct destination file and apply offset
|
||||
for idx in df.index:
|
||||
# Convert to Python int to avoid numpy type mismatch in dict lookup
|
||||
src_key = (int(df.at[idx, "_orig_chunk"]), int(df.at[idx, "_orig_file"]))
|
||||
src_key = (df.at[idx, "_orig_chunk"], df.at[idx, "_orig_file"])
|
||||
|
||||
# Get destination chunk/file for this source file
|
||||
dst_chunk, dst_file = src_to_dst.get(src_key, (video_idx["chunk"], video_idx["file"]))
|
||||
@@ -164,8 +208,7 @@ def update_meta_data(
|
||||
df[orig_chunk_col] = video_idx["chunk"]
|
||||
df[orig_file_col] = video_idx["file"]
|
||||
for idx in df.index:
|
||||
# Convert to Python int to avoid numpy type mismatch in dict lookup
|
||||
src_key = (int(df.at[idx, "_orig_chunk"]), int(df.at[idx, "_orig_file"]))
|
||||
src_key = (df.at[idx, "_orig_chunk"], df.at[idx, "_orig_file"])
|
||||
offset = src_to_offset.get(src_key, 0)
|
||||
df.at[idx, f"videos/{key}/from_timestamp"] += offset
|
||||
df.at[idx, f"videos/{key}/to_timestamp"] += offset
|
||||
@@ -263,6 +306,10 @@ def aggregate_datasets(
|
||||
|
||||
meta_idx = aggregate_metadata(src_meta, dst_meta, meta_idx, data_idx, videos_idx)
|
||||
|
||||
# Clear the src_to_dst mapping after processing each source dataset
|
||||
# to avoid interference between different source datasets
|
||||
data_idx.pop("src_to_dst", None)
|
||||
|
||||
dst_meta.info["total_episodes"] += src_meta.total_episodes
|
||||
dst_meta.info["total_frames"] += src_meta.total_frames
|
||||
|
||||
@@ -313,10 +360,6 @@ def aggregate_videos(src_meta, dst_meta, videos_idx, video_files_size_in_mb, chu
|
||||
dst_file_durations = video_idx["dst_file_durations"]
|
||||
|
||||
for src_chunk_idx, src_file_idx in unique_chunk_file_pairs:
|
||||
# Convert to Python int to ensure consistent dict keys
|
||||
src_chunk_idx = int(src_chunk_idx)
|
||||
src_file_idx = int(src_file_idx)
|
||||
|
||||
src_path = src_meta.root / DEFAULT_VIDEO_PATH.format(
|
||||
video_key=key,
|
||||
chunk_index=src_chunk_idx,
|
||||
@@ -389,10 +432,16 @@ def aggregate_data(src_meta, dst_meta, data_idx, data_files_size_in_mb, chunk_si
|
||||
Reads source data files, updates indices to match the aggregated dataset,
|
||||
and writes them to the destination with proper file rotation.
|
||||
|
||||
Tracks a `src_to_dst` mapping from source (chunk, file) to destination (chunk, file)
|
||||
which is critical for correctly updating episode metadata when source datasets
|
||||
have multiple data files (e.g., from a previous merge operation).
|
||||
|
||||
Args:
|
||||
src_meta: Source dataset metadata.
|
||||
dst_meta: Destination dataset metadata.
|
||||
data_idx: Dictionary tracking data chunk and file indices.
|
||||
data_files_size_in_mb: Maximum size for data files in MB.
|
||||
chunk_size: Maximum number of files per chunk.
|
||||
|
||||
Returns:
|
||||
dict: Updated data_idx with current chunk and file indices.
|
||||
@@ -410,6 +459,10 @@ def aggregate_data(src_meta, dst_meta, data_idx, data_files_size_in_mb, chunk_si
|
||||
# retrieve features schema for proper image typing in parquet
|
||||
hf_features = get_hf_features_from_features(dst_meta.features) if contains_images else None
|
||||
|
||||
# Track source to destination file mapping for metadata update
|
||||
# This is critical for handling datasets that are already results of a merge
|
||||
src_to_dst: dict[tuple[int, int], tuple[int, int]] = {}
|
||||
|
||||
for src_chunk_idx, src_file_idx in unique_chunk_file_ids:
|
||||
src_path = src_meta.root / DEFAULT_DATA_PATH.format(
|
||||
chunk_index=src_chunk_idx, file_index=src_file_idx
|
||||
@@ -422,7 +475,9 @@ def aggregate_data(src_meta, dst_meta, data_idx, data_files_size_in_mb, chunk_si
|
||||
df = pd.read_parquet(src_path)
|
||||
df = update_data_df(df, src_meta, dst_meta)
|
||||
|
||||
data_idx = append_or_create_parquet_file(
|
||||
# Write data and get the actual destination file it was written to
|
||||
# This avoids duplicating the rotation logic here
|
||||
data_idx, (dst_chunk, dst_file) = append_or_create_parquet_file(
|
||||
df,
|
||||
src_path,
|
||||
data_idx,
|
||||
@@ -434,6 +489,12 @@ def aggregate_data(src_meta, dst_meta, data_idx, data_files_size_in_mb, chunk_si
|
||||
hf_features=hf_features,
|
||||
)
|
||||
|
||||
# Record the mapping from source to actual destination
|
||||
src_to_dst[(src_chunk_idx, src_file_idx)] = (dst_chunk, dst_file)
|
||||
|
||||
# Add the mapping to data_idx for use in metadata update
|
||||
data_idx["src_to_dst"] = src_to_dst
|
||||
|
||||
return data_idx
|
||||
|
||||
|
||||
@@ -474,7 +535,7 @@ def aggregate_metadata(src_meta, dst_meta, meta_idx, data_idx, videos_idx):
|
||||
videos_idx,
|
||||
)
|
||||
|
||||
meta_idx = append_or_create_parquet_file(
|
||||
meta_idx, _ = append_or_create_parquet_file(
|
||||
df,
|
||||
src_path,
|
||||
meta_idx,
|
||||
@@ -502,7 +563,7 @@ def append_or_create_parquet_file(
|
||||
contains_images: bool = False,
|
||||
aggr_root: Path = None,
|
||||
hf_features: datasets.Features | None = None,
|
||||
):
|
||||
) -> tuple[dict[str, int], tuple[int, int]]:
|
||||
"""Appends data to an existing parquet file or creates a new one based on size constraints.
|
||||
|
||||
Manages file rotation when size limits are exceeded to prevent individual files
|
||||
@@ -520,9 +581,11 @@ def append_or_create_parquet_file(
|
||||
hf_features: Optional HuggingFace Features schema for proper image typing.
|
||||
|
||||
Returns:
|
||||
dict: Updated index dictionary with current chunk and file indices.
|
||||
tuple: (updated_idx, (dst_chunk, dst_file)) where updated_idx is the index dict
|
||||
and (dst_chunk, dst_file) is the actual destination file the data was written to.
|
||||
"""
|
||||
dst_path = aggr_root / default_path.format(chunk_index=idx["chunk"], file_index=idx["file"])
|
||||
dst_chunk, dst_file = idx["chunk"], idx["file"]
|
||||
dst_path = aggr_root / default_path.format(chunk_index=dst_chunk, file_index=dst_file)
|
||||
|
||||
if not dst_path.exists():
|
||||
dst_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
@@ -530,14 +593,15 @@ def append_or_create_parquet_file(
|
||||
to_parquet_with_hf_images(df, dst_path, features=hf_features)
|
||||
else:
|
||||
df.to_parquet(dst_path)
|
||||
return idx
|
||||
return idx, (dst_chunk, dst_file)
|
||||
|
||||
src_size = get_parquet_file_size_in_mb(src_path)
|
||||
dst_size = get_parquet_file_size_in_mb(dst_path)
|
||||
|
||||
if dst_size + src_size >= max_mb:
|
||||
idx["chunk"], idx["file"] = update_chunk_file_indices(idx["chunk"], idx["file"], chunk_size)
|
||||
new_path = aggr_root / default_path.format(chunk_index=idx["chunk"], file_index=idx["file"])
|
||||
dst_chunk, dst_file = idx["chunk"], idx["file"]
|
||||
new_path = aggr_root / default_path.format(chunk_index=dst_chunk, file_index=dst_file)
|
||||
new_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
final_df = df
|
||||
target_path = new_path
|
||||
@@ -556,7 +620,7 @@ def append_or_create_parquet_file(
|
||||
else:
|
||||
final_df.to_parquet(target_path)
|
||||
|
||||
return idx
|
||||
return idx, (dst_chunk, dst_file)
|
||||
|
||||
|
||||
def finalize_aggregation(aggr_meta, all_metadata):
|
||||
|
||||
@@ -1396,6 +1396,132 @@ BYTES_PER_KIB = 1024
|
||||
BYTES_PER_MIB = BYTES_PER_KIB * BYTES_PER_KIB
|
||||
|
||||
|
||||
def modify_tasks(
|
||||
dataset: LeRobotDataset,
|
||||
new_task: str | None = None,
|
||||
episode_tasks: dict[int, str] | None = None,
|
||||
) -> LeRobotDataset:
|
||||
"""Modify tasks in a LeRobotDataset.
|
||||
|
||||
This function allows you to either:
|
||||
1. Set a single task for the entire dataset (using `new_task`)
|
||||
2. Set specific tasks for specific episodes (using `episode_tasks`)
|
||||
|
||||
You can combine both: `new_task` sets the default, and `episode_tasks` overrides
|
||||
specific episodes.
|
||||
|
||||
The dataset is modified in-place, updating only the task-related files:
|
||||
- meta/tasks.parquet
|
||||
- data/**/*.parquet (task_index column)
|
||||
- meta/episodes/**/*.parquet (tasks column)
|
||||
- meta/info.json (total_tasks)
|
||||
|
||||
Args:
|
||||
dataset: The source LeRobotDataset to modify.
|
||||
new_task: A single task string to apply to all episodes. If None and episode_tasks
|
||||
is also None, raises an error.
|
||||
episode_tasks: Optional dict mapping episode indices to their task strings.
|
||||
Overrides `new_task` for specific episodes.
|
||||
|
||||
|
||||
Examples:
|
||||
Set a single task for all episodes:
|
||||
dataset = modify_tasks(dataset, new_task="Pick up the cube")
|
||||
|
||||
Set different tasks for specific episodes:
|
||||
dataset = modify_tasks(
|
||||
dataset,
|
||||
episode_tasks={0: "Task A", 1: "Task B", 2: "Task A"}
|
||||
)
|
||||
|
||||
Set a default task with overrides:
|
||||
dataset = modify_tasks(
|
||||
dataset,
|
||||
new_task="Default task",
|
||||
episode_tasks={5: "Special task for episode 5"}
|
||||
)
|
||||
"""
|
||||
if new_task is None and episode_tasks is None:
|
||||
raise ValueError("Must specify at least one of new_task or episode_tasks")
|
||||
|
||||
if episode_tasks is not None:
|
||||
valid_indices = set(range(dataset.meta.total_episodes))
|
||||
invalid = set(episode_tasks.keys()) - valid_indices
|
||||
if invalid:
|
||||
raise ValueError(f"Invalid episode indices: {invalid}")
|
||||
|
||||
# Ensure episodes metadata is loaded
|
||||
if dataset.meta.episodes is None:
|
||||
dataset.meta.episodes = load_episodes(dataset.root)
|
||||
|
||||
# Build the mapping from episode index to task string
|
||||
episode_to_task: dict[int, str] = {}
|
||||
for ep_idx in range(dataset.meta.total_episodes):
|
||||
if episode_tasks and ep_idx in episode_tasks:
|
||||
episode_to_task[ep_idx] = episode_tasks[ep_idx]
|
||||
elif new_task is not None:
|
||||
episode_to_task[ep_idx] = new_task
|
||||
else:
|
||||
# Keep original task if not overridden and no default provided
|
||||
original_tasks = dataset.meta.episodes[ep_idx]["tasks"]
|
||||
if not original_tasks:
|
||||
raise ValueError(f"Episode {ep_idx} has no tasks and no default task was provided")
|
||||
episode_to_task[ep_idx] = original_tasks[0]
|
||||
|
||||
# Collect all unique tasks and create new task mapping
|
||||
unique_tasks = sorted(set(episode_to_task.values()))
|
||||
new_task_df = pd.DataFrame({"task_index": list(range(len(unique_tasks)))}, index=unique_tasks)
|
||||
task_to_index = {task: idx for idx, task in enumerate(unique_tasks)}
|
||||
|
||||
logging.info(f"Modifying tasks in {dataset.repo_id}")
|
||||
logging.info(f"New tasks: {unique_tasks}")
|
||||
|
||||
root = dataset.root
|
||||
|
||||
# Update data files - modify task_index column
|
||||
logging.info("Updating data files...")
|
||||
data_dir = root / DATA_DIR
|
||||
|
||||
for parquet_path in tqdm(sorted(data_dir.rglob("*.parquet")), desc="Updating data"):
|
||||
df = pd.read_parquet(parquet_path)
|
||||
|
||||
# Build a mapping from episode_index to new task_index for rows in this file
|
||||
episode_indices_in_file = df["episode_index"].unique()
|
||||
ep_to_new_task_idx = {
|
||||
ep_idx: task_to_index[episode_to_task[ep_idx]] for ep_idx in episode_indices_in_file
|
||||
}
|
||||
|
||||
# Update task_index column
|
||||
df["task_index"] = df["episode_index"].map(ep_to_new_task_idx)
|
||||
df.to_parquet(parquet_path, index=False)
|
||||
|
||||
# Update episodes metadata - modify tasks column
|
||||
logging.info("Updating episodes metadata...")
|
||||
episodes_dir = root / "meta" / "episodes"
|
||||
|
||||
for parquet_path in tqdm(sorted(episodes_dir.rglob("*.parquet")), desc="Updating episodes"):
|
||||
df = pd.read_parquet(parquet_path)
|
||||
|
||||
# Update tasks column
|
||||
df["tasks"] = df["episode_index"].apply(lambda ep_idx: [episode_to_task[ep_idx]])
|
||||
df.to_parquet(parquet_path, index=False)
|
||||
|
||||
# Write new tasks.parquet
|
||||
write_tasks(new_task_df, root)
|
||||
|
||||
# Update info.json
|
||||
dataset.meta.info["total_tasks"] = len(unique_tasks)
|
||||
write_info(dataset.meta.info, root)
|
||||
|
||||
# Reload metadata to reflect changes
|
||||
dataset.meta.tasks = new_task_df
|
||||
dataset.meta.episodes = load_episodes(root)
|
||||
|
||||
logging.info(f"Tasks: {unique_tasks}")
|
||||
|
||||
return dataset
|
||||
|
||||
|
||||
def convert_image_to_video_dataset(
|
||||
dataset: LeRobotDataset,
|
||||
output_dir: Path,
|
||||
|
||||
@@ -57,6 +57,7 @@ from lerobot.datasets.utils import (
|
||||
load_info,
|
||||
load_nested_dataset,
|
||||
load_stats,
|
||||
load_subtasks,
|
||||
load_tasks,
|
||||
update_chunk_file_indices,
|
||||
validate_episode_buffer,
|
||||
@@ -162,6 +163,7 @@ class LeRobotDatasetMetadata:
|
||||
self.info = load_info(self.root)
|
||||
check_version_compatibility(self.repo_id, self._version, CODEBASE_VERSION)
|
||||
self.tasks = load_tasks(self.root)
|
||||
self.subtasks = load_subtasks(self.root)
|
||||
self.episodes = load_episodes(self.root)
|
||||
self.stats = load_stats(self.root)
|
||||
|
||||
@@ -518,6 +520,7 @@ class LeRobotDatasetMetadata:
|
||||
_validate_feature_names(features)
|
||||
|
||||
obj.tasks = None
|
||||
obj.subtasks = None
|
||||
obj.episodes = None
|
||||
obj.stats = None
|
||||
obj.info = create_empty_dataset_info(
|
||||
@@ -1075,6 +1078,12 @@ class LeRobotDataset(torch.utils.data.Dataset):
|
||||
# Add task as a string
|
||||
task_idx = item["task_index"].item()
|
||||
item["task"] = self.meta.tasks.iloc[task_idx].name
|
||||
|
||||
# add subtask information if available
|
||||
if "subtask_index" in self.features and self.meta.subtasks is not None:
|
||||
subtask_idx = item["subtask_index"].item()
|
||||
item["subtask"] = self.meta.subtasks.iloc[subtask_idx].name
|
||||
|
||||
return item
|
||||
|
||||
def __repr__(self):
|
||||
|
||||
@@ -60,6 +60,7 @@ VIDEO_DIR = "videos"
|
||||
|
||||
CHUNK_FILE_PATTERN = "chunk-{chunk_index:03d}/file-{file_index:03d}"
|
||||
DEFAULT_TASKS_PATH = "meta/tasks.parquet"
|
||||
DEFAULT_SUBTASKS_PATH = "meta/subtasks.parquet"
|
||||
DEFAULT_EPISODES_PATH = EPISODES_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
|
||||
DEFAULT_DATA_PATH = DATA_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
|
||||
DEFAULT_VIDEO_PATH = VIDEO_DIR + "/{video_key}/" + CHUNK_FILE_PATTERN + ".mp4"
|
||||
@@ -353,6 +354,14 @@ def load_tasks(local_dir: Path) -> pandas.DataFrame:
|
||||
return tasks
|
||||
|
||||
|
||||
def load_subtasks(local_dir: Path) -> pandas.DataFrame | None:
|
||||
"""Load subtasks from subtasks.parquet if it exists."""
|
||||
subtasks_path = local_dir / DEFAULT_SUBTASKS_PATH
|
||||
if subtasks_path.exists():
|
||||
return pd.read_parquet(subtasks_path)
|
||||
return None
|
||||
|
||||
|
||||
def write_episodes(episodes: Dataset, local_dir: Path) -> None:
|
||||
"""Write episode metadata to a parquet file in the LeRobot v3.0 format.
|
||||
This function writes episode-level metadata to a single parquet file.
|
||||
|
||||
Reference in New Issue
Block a user