# lerobot-clone/src/lerobot/scripts/lerobot_edit_dataset.py

#!/usr/bin/env python

# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Edit LeRobot datasets using various transformation tools.

This script allows you to delete episodes, split datasets, merge datasets,
remove features, modify tasks, and convert image datasets to video format.
When new_repo_id is specified, creates a new dataset.

Path semantics (v2): --root and --new_root are exact dataset folders containing
meta/, data/, videos/. When omitted, defaults to $HF_LEROBOT_HOME/{repo_id}.

Usage Examples:

Delete episodes 0, 2, and 5 from a dataset:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --operation.type delete_episodes \
        --operation.episode_indices "[0, 2, 5]"

Delete episodes from a local dataset at a specific path:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --root /path/to/pusht \
        --operation.type delete_episodes \
        --operation.episode_indices "[0, 2, 5]"

Delete episodes and save to a new dataset at a specific path and with a new repo_id:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --new_repo_id lerobot/pusht_filtered \
        --new_root /path/to/pusht_filtered \
        --operation.type delete_episodes \
        --operation.episode_indices "[0, 2, 5]"

Split dataset by fractions (pusht_train, pusht_val):
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --operation.type split \
        --operation.splits '{"train": 0.8, "val": 0.2}'

Split dataset by fractions and save split datasets to a specific folder (base_folder/train, base_folder/val):
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --new_root /path/to/base_folder \
        --operation.type split \
        --operation.splits '{"train": 0.8, "val": 0.2}'

Split dataset by episode indices:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --operation.type split \
        --operation.splits '{"train": [0, 1, 2, 3], "val": [4, 5]}'

Split into more than two splits:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --operation.type split \
        --operation.splits '{"train": 0.6, "val": 0.2, "test": 0.2}'

Merge multiple datasets:
    lerobot-edit-dataset \
        --new_repo_id lerobot/pusht_merged \
        --operation.type merge \
        --operation.repo_ids "['lerobot/pusht_train', 'lerobot/pusht_val']"

Merge multiple datasets to a specific output path:
    lerobot-edit-dataset \
        --new_repo_id lerobot/pusht_merged \
        --new_root /path/to/pusht_merged \
        --operation.type merge \
        --operation.repo_ids "['lerobot/pusht_train', 'lerobot/pusht_val']"

Merge multiple datasets from a list of local dataset paths:
    lerobot-edit-dataset \
        --new_repo_id lerobot/pusht_merged \
        --operation.type merge \
        --operation.repo_ids "['pusht_train', 'pusht_val']" \
        --operation.roots "['/path/to/pusht_train', '/path/to/pusht_val']"

Remove camera feature:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --operation.type remove_feature \
        --operation.feature_names "['observation.image']"

Modify tasks - set a single task for all episodes (WARNING: modifies in-place):
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --operation.type modify_tasks \
        --operation.new_task "Pick up the cube and place it"

Modify tasks - set different tasks for specific episodes (WARNING: modifies in-place):
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --operation.type modify_tasks \
        --operation.episode_tasks '{"0": "Task A", "1": "Task B", "2": "Task A"}'

Modify tasks - set default task with overrides for specific episodes (WARNING: modifies in-place):
    lerobot-edit-dataset \
        --repo_id lerobot/pusht \
        --operation.type modify_tasks \
        --operation.new_task "Default task" \
        --operation.episode_tasks '{"5": "Special task for episode 5"}'

Convert image dataset to video format and save locally:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht_image \
        --new_root /path/to/output/pusht_video \
        --operation.type convert_image_to_video

Convert image dataset to video format and save with new repo_id:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht_image \
        --new_repo_id lerobot/pusht_video \
        --operation.type convert_image_to_video

Convert image dataset to video format and push to hub:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht_image \
        --new_repo_id lerobot/pusht_video \
        --operation.type convert_image_to_video \
        --push_to_hub true

Show dataset information:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht_image \
        --operation.type info \
        --operation.show_features true

Show dataset information without feature details:
    lerobot-edit-dataset \
        --repo_id lerobot/pusht_image \
        --operation.type info \
        --operation.show_features false

Using JSON config file:
    lerobot-edit-dataset \
        --config_path path/to/edit_config.json
"""

import abc
import logging
import shutil
import sys
from dataclasses import dataclass
from pathlib import Path

import draccus

from lerobot.configs import parser
from lerobot.datasets.dataset_tools import (
    convert_image_to_video_dataset,
    delete_episodes,
    merge_datasets,
    modify_tasks,
    remove_feature,
    split_dataset,
)
from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.utils.constants import HF_LEROBOT_HOME
from lerobot.utils.utils import init_logging


@dataclass
class OperationConfig(draccus.ChoiceRegistry, abc.ABC):
    """Base class for the operation-specific configs of this script.

    Concrete operations subclass it and register themselves under a name via
    ``OperationConfig.register_subclass(...)``; that name is what ``type`` returns.
    """

    @property
    def type(self) -> str:
        """Return the choice name under which this config's class is registered."""
        return self.get_choice_name(self.__class__)


@OperationConfig.register_subclass("delete_episodes")
@dataclass
class DeleteEpisodesConfig(OperationConfig):
    """Options for the ``delete_episodes`` operation."""

    # Indices of the episodes to delete. handle_delete_episodes rejects a missing or empty list.
    episode_indices: list[int] | None = None


@OperationConfig.register_subclass("split")
@dataclass
class SplitConfig(OperationConfig):
    """Options for the ``split`` operation."""

    # Maps each split name to either a fraction or an explicit list of episode indices.
    # handle_split rejects a missing or empty mapping.
    splits: dict[str, float | list[int]] | None = None


@OperationConfig.register_subclass("merge")
@dataclass
class MergeConfig(OperationConfig):
    """Options for the ``merge`` operation."""

    # Identifiers of the datasets to merge. handle_merge rejects a missing or empty list.
    repo_ids: list[str] | None = None
    # Optional local dataset directories, paired positionally with repo_ids.
    # When given, it must have the same length as repo_ids.
    roots: list[str] | None = None


@OperationConfig.register_subclass("remove_feature")
@dataclass
class RemoveFeatureConfig(OperationConfig):
    """Options for the ``remove_feature`` operation."""

    # Names of the features to remove. handle_remove_feature rejects a missing or empty list.
    feature_names: list[str] | None = None


@OperationConfig.register_subclass("modify_tasks")
@dataclass
class ModifyTasksConfig(OperationConfig):
    """Options for the ``modify_tasks`` operation.

    At least one of ``new_task`` and ``episode_tasks`` must be set (neither may both be None).
    """

    # Task passed to modify_tasks as `new_task`; logged by the handler as the "Default task".
    new_task: str | None = None
    # Per-episode tasks keyed by episode index. Keys arrive as strings from the CLI and are
    # converted with int() by handle_modify_tasks before being passed on.
    episode_tasks: dict[str, str] | None = None


@OperationConfig.register_subclass("convert_image_to_video")
@dataclass
class ConvertImageToVideoConfig(OperationConfig):
    """Options for the ``convert_image_to_video`` operation.

    Apart from ``output_dir``, every field is forwarded unchanged to
    ``convert_image_to_video_dataset`` by ``handle_convert_image_to_video``.
    """

    # Deprecated output location: using it logs a warning recommending --new_root, and it is
    # only honoured when neither --new_root nor --new_repo_id is given.
    output_dir: str | None = None
    # Video codec name (presumably an encoder name understood by the video backend — TODO confirm).
    vcodec: str = "libsvtav1"
    # Pixel format of the encoded video.
    pix_fmt: str = "yuv420p"
    # Forwarded as `g` (presumably the GOP / keyframe interval — TODO confirm in dataset_tools).
    g: int = 2
    # Forwarded as `crf` (presumably the encoder's constant rate factor — TODO confirm).
    crf: int = 30
    # Forwarded as `fast_decode`; exact semantics are defined by convert_image_to_video_dataset.
    fast_decode: int = 0
    # Forwarded as `episode_indices` (presumably restricts conversion to these episodes; None for all — TODO confirm).
    episode_indices: list[int] | None = None
    # Forwarded as `num_workers`.
    num_workers: int = 4
    # Forwarded as `max_episodes_per_batch` (None leaves the choice to convert_image_to_video_dataset).
    max_episodes_per_batch: int | None = None
    # Forwarded as `max_frames_per_batch` (None leaves the choice to convert_image_to_video_dataset).
    max_frames_per_batch: int | None = None


@OperationConfig.register_subclass("info")
@dataclass
class InfoConfig(OperationConfig):
    """Options for the ``info`` operation."""

    # When true, handle_info also prints the dataset's features as indented JSON.
    show_features: bool = False


@dataclass
class EditDatasetConfig:
    """Top-level configuration of the dataset editing script."""

    # Operation configuration.
    operation: OperationConfig
    # Input dataset identifier. Required for every operation except merge.
    repo_id: str | None = None
    # Root directory where the input dataset is stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id.
    root: str | None = None
    # Edited dataset identifier. When both new_repo_id (resp. new_root) and repo_id (resp. root) are identical, modifications are applied in-place and a backup of the original dataset is created. Required for the merge operation.
    new_repo_id: str | None = None
    # Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/new_repo_id. For the split operation, this is the base directory for the split datasets.
    new_root: str | None = None
    # Upload dataset to Hugging Face hub.
    push_to_hub: bool = False


def get_output_path(
    repo_id: str,
    new_repo_id: str | None,
    root: Path | str | None,
    new_root: Path | str | None,
) -> tuple[str, Path]:
    input_path = Path(root) if root else HF_LEROBOT_HOME / repo_id

    output_repo_id = new_repo_id if new_repo_id else repo_id
    output_path = Path(new_root) if new_root else HF_LEROBOT_HOME / output_repo_id

    # In case of in-place modification, create a backup of the original dataset (if it exists)
    if output_path == input_path:
        backup_path = input_path.with_name(input_path.name + "_old")

        if input_path.exists():
            if backup_path.exists():
                shutil.rmtree(backup_path)
            shutil.move(input_path, backup_path)

    return output_repo_id, output_path


def handle_delete_episodes(cfg: EditDatasetConfig) -> None:
    """Run the ``delete_episodes`` operation described by ``cfg``.

    Loads the input dataset, resolves the output location (which backs up the input
    directory when editing in-place), deletes the requested episodes and optionally
    pushes the result to the hub.

    Raises:
        ValueError: If ``cfg.operation`` is not a ``DeleteEpisodesConfig`` or no
            episode indices were given.
    """
    operation = cfg.operation
    if not isinstance(operation, DeleteEpisodesConfig):
        raise ValueError("Operation config must be DeleteEpisodesConfig")

    episodes_to_drop = operation.episode_indices
    if not episodes_to_drop:
        raise ValueError("episode_indices must be specified for delete_episodes operation")

    # The dataset is loaded before get_output_path, which may move its directory.
    source = LeRobotDataset(cfg.repo_id, root=cfg.root)
    target_repo_id, target_dir = get_output_path(
        cfg.repo_id,
        new_repo_id=cfg.new_repo_id,
        root=cfg.root,
        new_root=cfg.new_root,
    )

    if target_dir == source.root:
        # In-place edit: the original data now lives in the "<name>_old" backup, so read from there.
        source.root = source.root.with_name(f"{source.root.name}_old")

    logging.info(f"Deleting episodes {episodes_to_drop} from {cfg.repo_id}")
    result = delete_episodes(
        source,
        episode_indices=episodes_to_drop,
        output_dir=target_dir,
        repo_id=target_repo_id,
    )

    logging.info(f"Dataset saved to {target_dir}")
    logging.info(f"Episodes: {result.meta.total_episodes}, Frames: {result.meta.total_frames}")

    if not cfg.push_to_hub:
        return

    logging.info(f"Pushing to hub as {target_repo_id}")
    LeRobotDataset(target_repo_id, root=target_dir).push_to_hub()


def handle_split(cfg: EditDatasetConfig) -> None:
    """Run the ``split`` operation described by ``cfg``.

    Split names are derived from ``--repo_id``; ``--new_repo_id`` is ignored (with a
    warning) and ``--new_root`` is handed to ``split_dataset`` as the output directory.
    Each resulting split is optionally pushed to the hub.

    Raises:
        ValueError: If ``cfg.operation`` is not a ``SplitConfig`` or no splits were given.
    """
    operation = cfg.operation
    if not isinstance(operation, SplitConfig):
        raise ValueError("Operation config must be SplitConfig")

    split_spec = operation.splits
    if not split_spec:
        raise ValueError(
            "splits dict must be specified with split names as keys and fractions/episode lists as values"
        )

    if cfg.new_repo_id is not None:
        logging.warning(
            "split uses the original dataset identifier --repo_id to generate split names. The --new_repo_id parameter is ignored."
        )

    source = LeRobotDataset(cfg.repo_id, root=cfg.root)

    logging.info(f"Splitting dataset {cfg.repo_id} with splits: {split_spec}")
    results = split_dataset(source, splits=split_spec, output_dir=cfg.new_root)

    for name, subset in results.items():
        logging.info(f"{name}: {subset.meta.total_episodes} episodes, {subset.meta.total_frames} frames")

        if not cfg.push_to_hub:
            continue

        logging.info(f"Pushing {name} split to hub as {subset.repo_id}")
        LeRobotDataset(subset.repo_id, root=subset.root).push_to_hub()


def handle_merge(cfg: EditDatasetConfig) -> None:
    """Run the ``merge`` operation described by ``cfg``.

    Loads every dataset listed in ``operation.repo_ids`` (from ``operation.roots`` when
    provided), merges them into ``--new_repo_id`` at ``--new_root`` (or the default
    location under ``HF_LEROBOT_HOME``) and optionally pushes the result to the hub.
    ``--repo_id`` and ``--root`` are ignored, with a warning.

    Raises:
        ValueError: If ``cfg.operation`` is not a ``MergeConfig``, no repo ids were
            given, or ``roots`` and ``repo_ids`` differ in length.
    """
    operation = cfg.operation
    if not isinstance(operation, MergeConfig):
        raise ValueError("Operation config must be MergeConfig")

    source_repo_ids = operation.repo_ids
    if not source_repo_ids:
        raise ValueError("repo_ids must be specified for merge operation")

    if not (cfg.repo_id is None and cfg.root is None):
        logging.warning(
            "merge uses --new_repo_id and --new_root for the merged dataset. The --repo_id and --root parameters are ignored."
        )

    source_roots = operation.roots
    if not source_roots:
        logging.info(f"Loading {len(source_repo_ids)} datasets to merge")
        datasets = [LeRobotDataset(source_id) for source_id in source_repo_ids]
    else:
        if len(source_roots) != len(source_repo_ids):
            raise ValueError("repo_ids and roots must have the same length for merge operation")
        logging.info(f"Loading {len(source_roots)} datasets to merge")
        datasets = [
            LeRobotDataset(repo_id=source_id, root=source_root)
            for source_id, source_root in zip(source_repo_ids, source_roots, strict=True)
        ]

    merged_repo_id = cfg.new_repo_id
    output_dir = HF_LEROBOT_HOME / merged_repo_id if not cfg.new_root else Path(cfg.new_root)

    logging.info(f"Merging datasets into {merged_repo_id}")
    merged = merge_datasets(datasets, output_repo_id=merged_repo_id, output_dir=output_dir)

    logging.info(f"Merged dataset saved to {output_dir}")
    logging.info(f"Episodes: {merged.meta.total_episodes}, Frames: {merged.meta.total_frames}")

    if not cfg.push_to_hub:
        return

    logging.info(f"Pushing to hub as {merged_repo_id}")
    LeRobotDataset(merged.repo_id, root=output_dir).push_to_hub()


def handle_remove_feature(cfg: EditDatasetConfig) -> None:
    """Run the ``remove_feature`` operation described by ``cfg``.

    Loads the input dataset, resolves the output location (which backs up the input
    directory when editing in-place), removes the requested features and optionally
    pushes the result to the hub.

    Raises:
        ValueError: If ``cfg.operation`` is not a ``RemoveFeatureConfig`` or no
            feature names were given.
    """
    operation = cfg.operation
    if not isinstance(operation, RemoveFeatureConfig):
        raise ValueError("Operation config must be RemoveFeatureConfig")

    features_to_drop = operation.feature_names
    if not features_to_drop:
        raise ValueError("feature_names must be specified for remove_feature operation")

    # The dataset is loaded before get_output_path, which may move its directory.
    source = LeRobotDataset(cfg.repo_id, root=cfg.root)
    target_repo_id, target_dir = get_output_path(
        cfg.repo_id,
        new_repo_id=cfg.new_repo_id,
        root=cfg.root,
        new_root=cfg.new_root,
    )

    if target_dir == source.root:
        # In-place edit: the original data now lives in the "<name>_old" backup, so read from there.
        source.root = source.root.with_name(f"{source.root.name}_old")

    logging.info(f"Removing features {features_to_drop} from {cfg.repo_id}")
    result = remove_feature(
        source,
        feature_names=features_to_drop,
        output_dir=target_dir,
        repo_id=target_repo_id,
    )

    logging.info(f"Dataset saved to {target_dir}")
    logging.info(f"Remaining features: {list(result.meta.features.keys())}")

    if not cfg.push_to_hub:
        return

    logging.info(f"Pushing to hub as {target_repo_id}")
    LeRobotDataset(target_repo_id, root=target_dir).push_to_hub()


def handle_modify_tasks(cfg: EditDatasetConfig) -> None:
    """Run the ``modify_tasks`` operation described by ``cfg``.

    The dataset is modified in-place: ``--new_repo_id`` and ``--new_root`` are ignored
    (with a warning). Episode-specific task keys are converted from strings to ints
    before being handed to ``modify_tasks``. The result is optionally pushed to the hub.

    Raises:
        ValueError: If ``cfg.operation`` is not a ``ModifyTasksConfig`` or neither
            ``new_task`` nor ``episode_tasks`` was given.
    """
    operation = cfg.operation
    if not isinstance(operation, ModifyTasksConfig):
        raise ValueError("Operation config must be ModifyTasksConfig")

    default_task = operation.new_task
    raw_overrides = operation.episode_tasks

    if default_task is None and raw_overrides is None:
        raise ValueError("Must specify at least one of new_task or episode_tasks for modify_tasks operation")

    if not (cfg.new_repo_id is None and cfg.new_root is None):
        logging.warning(
            "modify_tasks modifies datasets in-place. The --new_repo_id and --new_root parameters are ignored."
        )

    dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
    logging.warning(f"Modifying dataset in-place at {dataset.root}. Original data will be overwritten.")

    # The CLI delivers episode indices as string keys; modify_tasks is given int keys.
    overrides: dict[int, str] | None = (
        None if raw_overrides is None else {int(index): task for index, task in raw_overrides.items()}
    )

    logging.info(f"Modifying tasks in {cfg.repo_id}")
    if default_task:
        logging.info(f"  Default task: '{default_task}'")
    if overrides:
        logging.info(f"  Episode-specific tasks: {overrides}")

    updated = modify_tasks(dataset, new_task=default_task, episode_tasks=overrides)

    logging.info(f"Dataset modified at {dataset.root}")
    logging.info(f"Tasks: {list(updated.meta.tasks.index)}")

    if not cfg.push_to_hub:
        return

    logging.info(f"Pushing to hub as {cfg.repo_id}")
    updated.push_to_hub()


def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None:
    """Run the ``convert_image_to_video`` operation described by ``cfg``.

    The output location is chosen with this priority: ``--new_root``, then
    ``--new_repo_id``, then the deprecated ``--operation.output_dir``, and finally an
    auto-generated ``<repo_id>_video`` dataset under ``HF_LEROBOT_HOME``. The converted
    dataset is optionally pushed to the hub.
    """
    # The parser may hand over any config type carrying the right fields, so the operation's
    # fields are read with getattr() instead of being guarded by an isinstance() check.
    operation = cfg.operation
    dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)

    legacy_output_dir = getattr(operation, "output_dir", None)
    if legacy_output_dir:
        logging.warning(
            "--operation.output_dir is deprecated and will be removed in future versions. "
            "Please use --new_root instead."
        )

    if cfg.new_root:
        output_repo_id = cfg.new_repo_id or f"{cfg.repo_id}_video"
        output_dir = Path(cfg.new_root)
        logging.info(f"Saving to new_root: {output_dir} as {output_repo_id}")
    elif cfg.new_repo_id:
        output_repo_id = cfg.new_repo_id
        output_dir = HF_LEROBOT_HOME / output_repo_id
        logging.info(f"Saving to new dataset: {cfg.new_repo_id} at {output_dir}")
    elif legacy_output_dir:
        output_dir = Path(legacy_output_dir)
        # With only a directory to go by, its last component doubles as the repo id.
        output_repo_id = output_dir.name
        logging.info(f"Saving to local directory: {output_dir} as {output_repo_id}")
    else:
        output_repo_id = f"{cfg.repo_id}_video"
        output_dir = HF_LEROBOT_HOME / output_repo_id
        logging.info(f"Saving to auto-generated location: {output_dir} as {output_repo_id}")

    logging.info(f"Converting dataset {cfg.repo_id} to video format")

    # Fallback values used when the operation config lacks the corresponding field.
    option_defaults = {
        "vcodec": "libsvtav1",
        "pix_fmt": "yuv420p",
        "g": 2,
        "crf": 30,
        "fast_decode": 0,
        "episode_indices": None,
        "num_workers": 4,
        "max_episodes_per_batch": None,
        "max_frames_per_batch": None,
    }
    conversion_options = {
        option: getattr(operation, option, fallback) for option, fallback in option_defaults.items()
    }

    video_dataset = convert_image_to_video_dataset(
        dataset=dataset,
        output_dir=output_dir,
        repo_id=output_repo_id,
        **conversion_options,
    )

    logging.info("Video dataset created successfully!")
    logging.info(f"Location: {output_dir}")
    logging.info(f"Episodes: {video_dataset.meta.total_episodes}")
    logging.info(f"Frames: {video_dataset.meta.total_frames}")

    if not cfg.push_to_hub:
        logging.info("Dataset saved locally (not pushed to hub)")
        return

    logging.info(f"Pushing to hub as {output_repo_id}...")
    video_dataset.push_to_hub()
    logging.info("✓ Successfully pushed to hub!")


def _get_dataset_size(repo_path):
    """Return the combined size in bytes of every file found under ``repo_path``.

    Sub-directories are descended into; entries that are neither files nor
    directories are ignored.
    """
    import os

    size_in_bytes = 0
    # Directories still to be scanned; replaces a recursive descent.
    pending = [repo_path]
    while pending:
        with os.scandir(pending.pop()) as entries:
            for item in entries:
                if item.is_file():
                    size_in_bytes += item.stat().st_size
                elif item.is_dir():
                    pending.append(item.path)
    return size_in_bytes


def handle_info(cfg: EditDatasetConfig) -> None:
    """Print a summary of the dataset described by ``cfg`` to standard output.

    The summary lists the repo id, episode/task/frame totals, per-episode averages,
    fps and the on-disk size of the dataset directory. When
    ``cfg.operation.show_features`` is set, the dataset's features are also printed
    as indented JSON.

    Raises:
        ValueError: If ``cfg.operation`` is not an ``InfoConfig``.
    """
    if not isinstance(cfg.operation, InfoConfig):
        raise ValueError("Operation config must be InfoConfig")

    dataset = LeRobotDataset(cfg.repo_id, root=cfg.root)
    meta = dataset.meta

    sys.stdout.write(f"======Info {meta.repo_id}\n")
    sys.stdout.write(f"Repository ID: {meta.repo_id} \n")
    sys.stdout.write(f"Total episode: {meta.total_episodes} \n")
    sys.stdout.write(f"Total task: {meta.total_tasks} \n")
    sys.stdout.write(f"Total frame(Actual Count): {meta.total_frames}({len(dataset)}) \n")

    # Guard both averages: a dataset without episodes (or with fps == 0) reports 0.0
    # instead of aborting the report with a ZeroDivisionError.
    frames_per_episode = meta.total_frames / meta.total_episodes if meta.total_episodes else 0.0
    seconds_per_episode = frames_per_episode / meta.fps if meta.fps else 0.0

    sys.stdout.write(f"Average frame per episode: {frames_per_episode:.1f}\n")
    sys.stdout.write(f"Average episode time(sec): {seconds_per_episode:.1f}\n")
    sys.stdout.write(f"FPS: {meta.fps}\n")

    total_file_size = _get_dataset_size(dataset.root)
    sys.stdout.write(f"Size: {total_file_size / (1024 * 1024):.1f} MB\n")

    if cfg.operation.show_features:
        import json

        feature_dump_str = json.dumps(
            meta.features, ensure_ascii=False, indent=4, sort_keys=True, separators=(",", ": ")
        )
        sys.stdout.write("Features:\n")
        sys.stdout.write(f"{feature_dump_str}\n")


def _validate_config(cfg: EditDatasetConfig) -> None:
    """Check that the dataset identifier required by the chosen operation is present.

    Merge needs ``--new_repo_id`` (the merged dataset identifier); every other
    operation needs ``--repo_id`` (the input dataset identifier).

    Raises:
        ValueError: If the required identifier is missing or empty.
    """
    is_merge = isinstance(cfg.operation, MergeConfig)

    if is_merge and not cfg.new_repo_id:
        raise ValueError("--new_repo_id is required for merge operation (the merged dataset identifier)")

    if not is_merge and not cfg.repo_id:
        raise ValueError(
            f"--repo_id is required for {cfg.operation.type} operation (the input dataset identifier)"
        )


@parser.wrap()
def edit_dataset(cfg: EditDatasetConfig) -> None:
    """Validate ``cfg`` and dispatch to the handler matching ``cfg.operation.type``.

    Raises:
        ValueError: If validation fails or the operation type has no handler.
    """
    _validate_config(cfg)

    # Operation name -> handler; replaces a chain of equality tests.
    handlers = {
        "delete_episodes": handle_delete_episodes,
        "split": handle_split,
        "merge": handle_merge,
        "remove_feature": handle_remove_feature,
        "modify_tasks": handle_modify_tasks,
        "convert_image_to_video": handle_convert_image_to_video,
        "info": handle_info,
    }

    operation_type = cfg.operation.type
    handler = handlers.get(operation_type)
    if handler is None:
        available = ", ".join(OperationConfig.get_known_choices())
        raise ValueError(f"Unknown operation: {operation_type}\nAvailable operations: {available}")

    handler(cfg)


def main() -> None:
    """Script entry point: initialize logging, then run ``edit_dataset``.

    ``edit_dataset`` is called without arguments; its ``parser.wrap()`` decorator is
    presumably what builds the config from the command line — confirm in lerobot.configs.parser.
    """
    init_logging()
    edit_dataset()


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()