fix(dataset): use revision-safe Hub cache for downloaded datasets (#3233)

* refactor(dataset): enhance dataset root directory handling and introduce hub cache support - Updated DatasetConfig and LeRobotDatasetMetadata to clarify root directory behavior and introduce a dedicated hub cache for downloads. - Refactored LeRobotDataset and StreamingLeRobotDataset to utilize the new hub cache and improve directory management. - Added tests to ensure correct behavior when using the hub cache and handling different revisions without a specified root directory. * refactor(dataset): improve root directory handling in LeRobotDataset - Updated LeRobotDataset to store the requested root path separately from the actual root path. - Adjusted metadata loading to use the requested root, enhancing clarity and consistency in directory management. * refactor(dataset): minor improvements for hub cache support * chore(datasets): guard in resume + assertion test --------- Co-authored-by: AdilZouitine <adilzouitinegm@gmail.com> Co-authored-by: mickaelChen <mickael.chen.levinson@gmail.com>
2026-06-04 21:01:26 +00:00 · 2026-03-27 22:21:55 +01:00
parent 975d89b38d
commit 4e45acca52
8 changed files with 440 additions and 40 deletions
--- a/src/lerobot/datasets/utils.py
+++ b/src/lerobot/datasets/utils.py
@@ -18,6 +18,7 @@ import importlib.resources
 import json
 import logging
 from collections.abc import Iterator
+from pathlib import Path
 from typing import Any

 import datasets
@@ -101,6 +102,18 @@ DEFAULT_FEATURES = {
 }


+def has_legacy_hub_download_metadata(root: Path) -> bool:
+    """Return ``True`` when *root* looks like a legacy Hub ``local_dir`` mirror.
+
+    ``snapshot_download(local_dir=...)`` stores lightweight metadata under
+    ``<local_dir>/.cache/huggingface/download/``.  The presence of this
+    directory is a reliable indicator that the dataset was downloaded with
+    the old non-revision-safe ``local_dir`` mode and should be re-fetched
+    through the snapshot cache instead.
+    """
+    return (root / ".cache" / "huggingface" / "download").exists()
+
+
 def update_chunk_file_indices(chunk_idx: int, file_idx: int, chunks_size: int) -> tuple[int, int]:
    if file_idx == chunks_size - 1:
        file_idx = 0