mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-04 21:01:26 +00:00
fix(dataset): use revision-safe Hub cache for downloaded datasets (#3233)
* refactor(dataset): enhance dataset root directory handling and introduce hub cache support - Updated DatasetConfig and LeRobotDatasetMetadata to clarify root directory behavior and introduce a dedicated hub cache for downloads. - Refactored LeRobotDataset and StreamingLeRobotDataset to utilize the new hub cache and improve directory management. - Added tests to ensure correct behavior when using the hub cache and handling different revisions without a specified root directory. * refactor(dataset): improve root directory handling in LeRobotDataset - Updated LeRobotDataset to store the requested root path separately from the actual root path. - Adjusted metadata loading to use the requested root, enhancing clarity and consistency in directory management. * refactor(dataset): minor improvements for hub cache support * chore(datasets): guard in resume + assertion test --------- Co-authored-by: AdilZouitine <adilzouitinegm@gmail.com> Co-authored-by: mickaelChen <mickael.chen.levinson@gmail.com>
This commit is contained in:
@@ -18,6 +18,7 @@ import importlib.resources
|
||||
import json
|
||||
import logging
|
||||
from collections.abc import Iterator
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import datasets
|
||||
@@ -101,6 +102,18 @@ DEFAULT_FEATURES = {
|
||||
}
|
||||
|
||||
|
||||
def has_legacy_hub_download_metadata(root: Path) -> bool:
|
||||
"""Return ``True`` when *root* looks like a legacy Hub ``local_dir`` mirror.
|
||||
|
||||
``snapshot_download(local_dir=...)`` stores lightweight metadata under
|
||||
``<local_dir>/.cache/huggingface/download/``. The presence of this
|
||||
directory is a reliable indicator that the dataset was downloaded with
|
||||
the old non-revision-safe ``local_dir`` mode and should be re-fetched
|
||||
through the snapshot cache instead.
|
||||
"""
|
||||
return (root / ".cache" / "huggingface" / "download").exists()
|
||||
|
||||
|
||||
def update_chunk_file_indices(chunk_idx: int, file_idx: int, chunks_size: int) -> tuple[int, int]:
|
||||
if file_idx == chunks_size - 1:
|
||||
file_idx = 0
|
||||
|
||||
Reference in New Issue
Block a user