From cbc8bfb2e618a16b7d1cb46bdc0f8ac6073c1b29 Mon Sep 17 00:00:00 2001 From: Caroline Pascal Date: Wed, 4 Mar 2026 17:59:03 +0100 Subject: [PATCH] chore(docstrings): updating v2.1-v3.0 conversion script docstrings to match the new task label (#3077) * chore(docstrings): updating v2.1-v3.0 conversion script docstrings to match the new task label * chore(task): renaming the default index label in the tasks DataFrame to task * Revert "chore(docstrings): updating v2.1-v3.0 conversion script docstrings to match the new task label" This reverts commit f55de3255278f23f18b5d955565f6768d094951d. * chore(docstrings): updating docstrings to match dataset v3.0 architecture * chore(format): formatting code --- src/lerobot/datasets/aggregate.py | 4 +++- src/lerobot/datasets/dataset_tools.py | 4 +++- src/lerobot/datasets/lerobot_dataset.py | 2 +- src/lerobot/datasets/utils.py | 1 + .../datasets/v30/convert_dataset_v21_to_v30.py | 11 ++++++----- tests/fixtures/dataset_factories.py | 2 +- 6 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/lerobot/datasets/aggregate.py b/src/lerobot/datasets/aggregate.py index 7020545d2..b32116233 100644 --- a/src/lerobot/datasets/aggregate.py +++ b/src/lerobot/datasets/aggregate.py @@ -289,7 +289,9 @@ def aggregate_datasets( logging.info("Find all tasks") unique_tasks = pd.concat([m.tasks for m in all_metadata]).index.unique() - dst_meta.tasks = pd.DataFrame({"task_index": range(len(unique_tasks))}, index=unique_tasks) + dst_meta.tasks = pd.DataFrame( + {"task_index": range(len(unique_tasks))}, index=pd.Index(unique_tasks, name="task") + ) meta_idx = {"chunk": 0, "file": 0} data_idx = {"chunk": 0, "file": 0} diff --git a/src/lerobot/datasets/dataset_tools.py b/src/lerobot/datasets/dataset_tools.py index c900d7479..546b3d67f 100644 --- a/src/lerobot/datasets/dataset_tools.py +++ b/src/lerobot/datasets/dataset_tools.py @@ -1475,7 +1475,9 @@ def modify_tasks( # Collect all unique tasks and create new task mapping unique_tasks = 
sorted(set(episode_to_task.values())) - new_task_df = pd.DataFrame({"task_index": list(range(len(unique_tasks)))}, index=unique_tasks) + new_task_df = pd.DataFrame( + {"task_index": list(range(len(unique_tasks)))}, index=pd.Index(unique_tasks, name="task") + ) task_to_index = {task: idx for idx, task in enumerate(unique_tasks)} logging.info(f"Modifying tasks in {dataset.repo_id}") diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index 76d44de07..26f0c769c 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -314,7 +314,7 @@ class LeRobotDatasetMetadata: if self.tasks is None: new_tasks = tasks task_indices = range(len(tasks)) - self.tasks = pd.DataFrame({"task_index": task_indices}, index=tasks) + self.tasks = pd.DataFrame({"task_index": task_indices}, index=pd.Index(tasks, name="task")) else: new_tasks = [task for task in tasks if task not in self.tasks.index] new_task_indices = range(len(self.tasks), len(self.tasks) + len(new_tasks)) diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py index da186bf30..a56740191 100644 --- a/src/lerobot/datasets/utils.py +++ b/src/lerobot/datasets/utils.py @@ -341,6 +341,7 @@ def write_tasks(tasks: pandas.DataFrame, local_dir: Path) -> None: def load_tasks(local_dir: Path) -> pandas.DataFrame: tasks = pd.read_parquet(local_dir / DEFAULT_TASKS_PATH) + tasks.index.name = "task" return tasks diff --git a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py index 5362c52f4..3ae9093b9 100644 --- a/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py +++ b/src/lerobot/datasets/v30/convert_dataset_v21_to_v30.py @@ -108,7 +108,7 @@ episodes.jsonl {"episode_index": 1, "tasks": ["Put the blue block in the green bowl"], "length": 266} NEW -meta/episodes/chunk-000/episodes_000.parquet +meta/episodes/chunk-000/file_000.parquet episode_index | video_chunk_index | 
video_file_index | data_chunk_index | data_file_index | tasks | length ------------------------- OLD @@ -116,15 +116,16 @@ tasks.jsonl {"task_index": 1, "task": "Put the blue block in the green bowl"} NEW -meta/tasks/chunk-000/file_000.parquet +meta/tasks.parquet task_index | task ------------------------- OLD episodes_stats.jsonl +{"episode_index": 1, "stats": {"feature_name": {"min": ..., "max": ..., "mean": ..., "std": ..., "count": ...}}} NEW -meta/episodes_stats/chunk-000/file_000.parquet -episode_index | mean | std | min | max +meta/episodes/chunk-000/file_000.parquet +episode_index | feature_name/min | feature_name/max | feature_name/mean | feature_name/std | feature_name/count ------------------------- UPDATE meta/info.json @@ -173,7 +174,7 @@ def convert_tasks(root, new_root): tasks, _ = legacy_load_tasks(root) task_indices = tasks.keys() task_strings = tasks.values() - df_tasks = pd.DataFrame({"task_index": task_indices}, index=task_strings) + df_tasks = pd.DataFrame({"task_index": task_indices}, index=pd.Index(task_strings, name="task")) write_tasks(df_tasks, new_root) diff --git a/tests/fixtures/dataset_factories.py b/tests/fixtures/dataset_factories.py index c33fdcb72..f8dd01fec 100644 --- a/tests/fixtures/dataset_factories.py +++ b/tests/fixtures/dataset_factories.py @@ -222,7 +222,7 @@ def tasks_factory(): def _create_tasks(total_tasks: int = 3) -> pd.DataFrame: ids = list(range(total_tasks)) tasks = [f"Perform action {i}." for i in ids] - df = pd.DataFrame({"task_index": ids}, index=tasks) + df = pd.DataFrame({"task_index": ids}, index=pd.Index(tasks, name="task")) return df return _create_tasks