diff --git a/src/lerobot/datasets/backward_compatibility.py b/src/lerobot/datasets/backward_compatibility.py index 2ba4c7c80..a087c8ce7 100644 --- a/src/lerobot/datasets/backward_compatibility.py +++ b/src/lerobot/datasets/backward_compatibility.py @@ -70,10 +70,8 @@ class CompatibilityError(Exception): ... class BackwardCompatibilityError(CompatibilityError): def __init__(self, repo_id: str, version: packaging.version.Version): - if version.major == 3: - message = V30_MESSAGE.format(repo_id=repo_id, version=version) - elif version.major == 2: - message = V2_MESSAGE.format(repo_id=repo_id, version=version) + if version.major == 2 and version.minor == 1: + message = V30_MESSAGE.format(repo_id=repo_id, version=version) else: raise NotImplementedError( "Contact the maintainer on [Discord](https://discord.com/invite/s3KuuzsPFb)." diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py index 2ed0fc46b..c78f16cd2 100644 --- a/src/lerobot/datasets/utils.py +++ b/src/lerobot/datasets/utils.py @@ -39,7 +39,7 @@ from torchvision import transforms from lerobot.configs.types import FeatureType, PolicyFeature from lerobot.datasets.backward_compatibility import ( - V21_MESSAGE, + FUTURE_MESSAGE, BackwardCompatibilityError, ForwardCompatibilityError, ) @@ -343,7 +343,7 @@ def check_version_compatibility( if v_check.major < v_current.major and enforce_breaking_major: raise BackwardCompatibilityError(repo_id, v_check) elif v_check.minor < v_current.minor: - logging.warning(V21_MESSAGE.format(repo_id=repo_id, version=v_check)) + logging.warning(FUTURE_MESSAGE.format(repo_id=repo_id, version=v_check)) def get_repo_versions(repo_id: str) -> list[packaging.version.Version]: diff --git a/src/lerobot/datasets/v21/_remove_language_instruction.py b/src/lerobot/datasets/v21/_remove_language_instruction.py deleted file mode 100644 index 1f1cb1855..000000000 --- a/src/lerobot/datasets/v21/_remove_language_instruction.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import traceback -from pathlib import Path - -from datasets import get_dataset_config_info -from huggingface_hub import HfApi - -from lerobot import available_datasets -from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata -from lerobot.datasets.utils import INFO_PATH, write_info -from lerobot.datasets.v21.convert_dataset_v20_to_v21 import V20, SuppressWarnings - -LOCAL_DIR = Path("data/") - -hub_api = HfApi() - - -def fix_dataset(repo_id: str) -> str: - if not hub_api.revision_exists(repo_id, V20, repo_type="dataset"): - return f"{repo_id}: skipped (not in {V20})." - - dataset_info = get_dataset_config_info(repo_id, "default") - with SuppressWarnings(): - lerobot_metadata = LeRobotDatasetMetadata(repo_id, revision=V20, force_cache_sync=True) - - meta_features = {key for key, ft in lerobot_metadata.features.items() if ft["dtype"] != "video"} - parquet_features = set(dataset_info.features) - - diff_parquet_meta = parquet_features - meta_features - diff_meta_parquet = meta_features - parquet_features - - if diff_parquet_meta: - raise ValueError(f"In parquet not in info.json: {parquet_features - meta_features}") - - if not diff_meta_parquet: - return f"{repo_id}: skipped (no diff)" - - if diff_meta_parquet: - logging.warning(f"In info.json not in parquet: {meta_features - parquet_features}") - assert diff_meta_parquet == {"language_instruction"} - lerobot_metadata.features.pop("language_instruction") - write_info(lerobot_metadata.info, lerobot_metadata.root) - commit_info = hub_api.upload_file( - path_or_fileobj=lerobot_metadata.root / INFO_PATH, - path_in_repo=INFO_PATH, - repo_id=repo_id, - repo_type="dataset", - revision=V20, - commit_message="Remove 'language_instruction'", - create_pr=True, - ) - return f"{repo_id}: success - PR: {commit_info.pr_url}" - - -def batch_fix(): - status = {} - LOCAL_DIR.mkdir(parents=True, exist_ok=True) - logfile = LOCAL_DIR / "fix_features_v20.txt" - for num, repo_id in enumerate(available_datasets): - print(f"\nConverting {repo_id} ({num}/{len(available_datasets)})") - print("---------------------------------------------------------") - try: - status = fix_dataset(repo_id) - except Exception: - status = f"{repo_id}: failed\n {traceback.format_exc()}" - - logging.info(status) - with open(logfile, "a") as file: - file.write(status + "\n") - - -if __name__ == "__main__": - batch_fix() diff --git a/src/lerobot/datasets/v21/batch_convert_dataset_v20_to_v21.py b/src/lerobot/datasets/v21/batch_convert_dataset_v20_to_v21.py deleted file mode 100644 index b4f1c36c4..000000000 --- a/src/lerobot/datasets/v21/batch_convert_dataset_v20_to_v21.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is for internal use to convert all datasets under the 'lerobot' hub user account to v2.1. -""" - -import traceback -from pathlib import Path - -from huggingface_hub import HfApi - -from lerobot import available_datasets -from lerobot.datasets.v21.convert_dataset_v20_to_v21 import V21, convert_dataset - -LOCAL_DIR = Path("data/") - - -def batch_convert(): - status = {} - LOCAL_DIR.mkdir(parents=True, exist_ok=True) - logfile = LOCAL_DIR / "conversion_log_v21.txt" - hub_api = HfApi() - for num, repo_id in enumerate(available_datasets): - print(f"\nConverting {repo_id} ({num}/{len(available_datasets)})") - print("---------------------------------------------------------") - try: - if hub_api.revision_exists(repo_id, V21, repo_type="dataset"): - status = f"{repo_id}: success (already in {V21})." - else: - convert_dataset(repo_id) - status = f"{repo_id}: success." - except Exception: - status = f"{repo_id}: failed\n {traceback.format_exc()}" - - with open(logfile, "a") as file: - file.write(status + "\n") - - -if __name__ == "__main__": - batch_convert() diff --git a/src/lerobot/datasets/v21/convert_dataset_v20_to_v21.py b/src/lerobot/datasets/v21/convert_dataset_v20_to_v21.py deleted file mode 100644 index cfcff6e45..000000000 --- a/src/lerobot/datasets/v21/convert_dataset_v20_to_v21.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script will help you convert any LeRobot dataset already pushed to the hub from codebase version 2.0 to -2.1. It will: - -- Generate per-episodes stats and writes them in `episodes_stats.jsonl` -- Check consistency between these new stats and the old ones. -- Remove the deprecated `stats.json`. -- Update codebase_version in `info.json`. -- Push this new version to the hub on the 'main' branch and tags it with "v2.1". - -Usage: - -```bash -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 \ - --repo-id=aliberts/koch_tutorial -``` - -""" - -import argparse -import logging - -from huggingface_hub import HfApi - -from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset -from lerobot.datasets.utils import STATS_PATH, load_stats, write_info -from lerobot.datasets.v21.convert_stats import check_aggregate_stats, convert_stats - -V20 = "v2.0" -V21 = "v2.1" - - -class SuppressWarnings: - def __enter__(self): - self.previous_level = logging.getLogger().getEffectiveLevel() - logging.getLogger().setLevel(logging.ERROR) - - def __exit__(self, exc_type, exc_val, exc_tb): - logging.getLogger().setLevel(self.previous_level) - - -def convert_dataset( - repo_id: str, - branch: str | None = None, - num_workers: int = 4, -): - with SuppressWarnings(): - dataset = LeRobotDataset(repo_id, revision=V20, force_cache_sync=True) - - convert_stats(dataset, num_workers=num_workers) - ref_stats = load_stats(dataset.root) - check_aggregate_stats(dataset, ref_stats) - - dataset.meta.info["codebase_version"] = CODEBASE_VERSION - write_info(dataset.meta.info, dataset.root) - - dataset.push_to_hub(branch=branch, tag_version=False, allow_patterns="meta/") - - # delete old stats.json file - if (dataset.root / STATS_PATH).is_file: - (dataset.root / STATS_PATH).unlink() - - hub_api = HfApi() - if hub_api.file_exists( - repo_id=dataset.repo_id, filename=STATS_PATH, revision=branch, repo_type="dataset" - ): - hub_api.delete_file( - path_in_repo=STATS_PATH, repo_id=dataset.repo_id, revision=branch, repo_type="dataset" - ) - - hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--repo-id", - type=str, - required=True, - help="Repository identifier on Hugging Face: a community or a user name `/` the name of the dataset " - "(e.g. `lerobot/pusht`, `cadene/aloha_sim_insertion_human`).", - ) - parser.add_argument( - "--branch", - type=str, - default=None, - help="Repo branch to push your dataset. Defaults to the main branch.", - ) - parser.add_argument( - "--num-workers", - type=int, - default=4, - help="Number of workers for parallelizing stats compute. Defaults to 4.", - ) - - args = parser.parse_args() - convert_dataset(**vars(args)) diff --git a/src/lerobot/datasets/v21/convert_stats.py b/src/lerobot/datasets/v21/convert_stats.py deleted file mode 100644 index 0c706d2a3..000000000 --- a/src/lerobot/datasets/v21/convert_stats.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from concurrent.futures import ThreadPoolExecutor, as_completed -from pathlib import Path - -import jsonlines -import numpy as np -from tqdm import tqdm - -from lerobot.datasets.compute_stats import aggregate_stats, get_feature_stats, sample_indices -from lerobot.datasets.lerobot_dataset import LeRobotDataset -from lerobot.datasets.utils import LEGACY_EPISODES_STATS_PATH, serialize_dict - - -def append_jsonlines(data: dict, fpath: Path) -> None: - fpath.parent.mkdir(exist_ok=True, parents=True) - with jsonlines.open(fpath, "a") as writer: - writer.write(data) - - -def legacy_write_episode_stats(episode_index: int, episode_stats: dict, local_dir: Path): - # We wrap episode_stats in a dictionary since `episode_stats["episode_index"]` - # is a dictionary of stats and not an integer. - episode_stats = {"episode_index": episode_index, "stats": serialize_dict(episode_stats)} - append_jsonlines(episode_stats, local_dir / LEGACY_EPISODES_STATS_PATH) - - -def sample_episode_video_frames(dataset: LeRobotDataset, episode_index: int, ft_key: str) -> np.ndarray: - ep_len = dataset.meta.episodes[episode_index]["length"] - sampled_indices = sample_indices(ep_len) - query_timestamps = dataset._get_query_timestamps(0.0, {ft_key: sampled_indices}) - video_frames = dataset._query_videos(query_timestamps, episode_index) - return video_frames[ft_key].numpy() - - -def convert_episode_stats(dataset: LeRobotDataset, ep_idx: int): - ep_start_idx = dataset.episode_data_index["from"][ep_idx] - ep_end_idx = dataset.episode_data_index["to"][ep_idx] - ep_data = dataset.hf_dataset.select(range(ep_start_idx, ep_end_idx)) - - ep_stats = {} - for key, ft in dataset.features.items(): - if ft["dtype"] == "video": - # We sample only for videos - ep_ft_data = sample_episode_video_frames(dataset, ep_idx, key) - else: - ep_ft_data = np.array(ep_data[key]) - - axes_to_reduce = (0, 2, 3) if ft["dtype"] in ["image", "video"] else 0 - keepdims = True if ft["dtype"] in ["image", "video"] else ep_ft_data.ndim == 1 - ep_stats[key] = get_feature_stats(ep_ft_data, axis=axes_to_reduce, keepdims=keepdims) - - if ft["dtype"] in ["image", "video"]: # remove batch dim - ep_stats[key] = { - k: v if k == "count" else np.squeeze(v, axis=0) for k, v in ep_stats[key].items() - } - - dataset.meta.episodes_stats[ep_idx] = ep_stats - - -def convert_stats(dataset: LeRobotDataset, num_workers: int = 0): - assert dataset.episodes is None - print("Computing episodes stats") - total_episodes = dataset.meta.total_episodes - if num_workers > 0: - with ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = { - executor.submit(convert_episode_stats, dataset, ep_idx): ep_idx - for ep_idx in range(total_episodes) - } - for future in tqdm(as_completed(futures), total=total_episodes): - future.result() - else: - for ep_idx in tqdm(range(total_episodes)): - convert_episode_stats(dataset, ep_idx) - - for ep_idx in tqdm(range(total_episodes)): - legacy_write_episode_stats(ep_idx, dataset.meta.episodes_stats[ep_idx], dataset.root) - - -def check_aggregate_stats( - dataset: LeRobotDataset, - reference_stats: dict[str, dict[str, np.ndarray]], - video_rtol_atol: tuple[float] = (1e-2, 1e-2), - default_rtol_atol: tuple[float] = (5e-6, 6e-5), -): - """Verifies that the aggregated stats from episodes_stats are close to reference stats.""" - agg_stats = aggregate_stats(list(dataset.meta.episodes_stats.values())) - for key, ft in dataset.features.items(): - # These values might need some fine-tuning - if ft["dtype"] == "video": - # to account for image sub-sampling - rtol, atol = video_rtol_atol - else: - rtol, atol = default_rtol_atol - - for stat, val in agg_stats[key].items(): - if key in reference_stats and stat in reference_stats[key]: - err_msg = f"feature='{key}' stats='{stat}'" - np.testing.assert_allclose( - val, reference_stats[key][stat], rtol=rtol, atol=atol, err_msg=err_msg - )