chore(installation): remove libero installation patch (#2416)

* chore(installation): remove libero installation patch
* fix(ci): exclude groot for unbound deps test
fix(dataset): fix data access bottleneck for faster training (#2408)
2026-06-02 20:01:25 +00:00 · 2025-11-10 11:51:52 +01:00 · 2025-11-07 21:54:44 +01:00 · 2025-11-04 15:56:41 +01:00
7 changed files with 40 additions and 34 deletions
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -83,11 +83,11 @@ jobs:
          fi

      - name: Remove Tags with Git dependencies
-        # TODO(Steven): Temporary patch to remove libero and pi from PyPi 0.4.0 release due to its reliance on git dependencies.
+        # TODO(Steven): Temporary patch to remove pi from PyPi 0.4.0 release due to its reliance on git dependencies.
        run: |
          echo "::info:: Checking for Git dependencies to remove from pyproject.toml..."
-          grep -E '@ git\+https|lerobot\[pi\]|lerobot\[libero\]' pyproject.toml | sed 's/^/::warning:: Removing line: /' || true
-          sed -E -i '/@ git\+https|lerobot\[pi\]|lerobot\[libero\]/d' pyproject.toml
+          grep -E '@ git\+https|lerobot\[pi\]' pyproject.toml | sed 's/^/::warning:: Removing line: /' || true
+          sed -E -i '/@ git\+https|lerobot\[pi\]/d' pyproject.toml
          echo "::info:: Git dependencies removed. Proceeding with build."

      - name: Install build dependencies
--- a/.github/workflows/unbound_deps_tests.yml
+++ b/.github/workflows/unbound_deps_tests.yml
@@ -70,7 +70,7 @@ jobs:
          echo "Dependencies unbound:" && cat pyproject.toml

      - name: Install lerobot with all extras
-        run: uv sync --all-extras
+        run: uv sync --all-extras --no-extra groot # TODO(Steven): Make flash-attn optional

      - name: Run pytest (all extras)
        run: uv run pytest tests -vv
--- a/README.md
+++ b/README.md
@@ -186,7 +186,7 @@ For a full list of optional dependencies, see:
 https://pypi.org/project/lerobot/

 > [!NOTE]
-> For lerobot 0.4.0, if you want to install libero or pi tags, you will have to do: `pip install "lerobot[pi,libero]@git+https://github.com/huggingface/lerobot.git"`.
+> For lerobot 0.4.0, if you want to install pi tags, you will have to do: `pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git"`.
 >
 > This will be solved in the next patch release

--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -82,7 +82,7 @@ For a full list of optional dependencies, see:
 https://pypi.org/project/lerobot/

 > [!NOTE]
-> For lerobot 0.4.0, if you want to install libero or pi, you will have to do: `pip install "lerobot[pi,libero]@git+https://github.com/huggingface/lerobot.git"`
+> For lerobot 0.4.0, if you want to install pi, you will have to do: `pip install "lerobot[pi]@git+https://github.com/huggingface/lerobot.git"`

 ### Troubleshooting

--- a/docs/source/libero.mdx
+++ b/docs/source/libero.mdx
@@ -28,11 +28,6 @@ LIBERO is now part of our **multi-eval supported simulation**, meaning you can b
 To Install LIBERO, after following LeRobot official instructions, just do:
 `pip install -e ".[libero]"`

-> [!NOTE]
-> For lerobot 0.4.0, if you want to install libero tag, you will have to do: `pip install "lerobot[libero]@git+https://github.com/huggingface/lerobot.git"`.
->
-> This will be solved in the next patch release
-
 ### Single-suite evaluation

 Evaluate a policy on one LIBERO suite:
--- a/src/lerobot/datasets/dataset_tools.py
+++ b/src/lerobot/datasets/dataset_tools.py
@@ -39,6 +39,7 @@ from lerobot.datasets.aggregate import aggregate_datasets
 from lerobot.datasets.compute_stats import aggregate_stats
 from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
 from lerobot.datasets.utils import (
+    DATA_DIR,
    DEFAULT_CHUNK_SIZE,
    DEFAULT_DATA_FILE_SIZE_IN_MB,
    DEFAULT_DATA_PATH,
@@ -962,28 +963,23 @@ def _copy_data_with_feature_changes(
    remove_features: list[str] | None = None,
 ) -> None:
    """Copy data while adding or removing features."""
-    if dataset.meta.episodes is None:
-        dataset.meta.episodes = load_episodes(dataset.meta.root)
+    data_dir = dataset.root / DATA_DIR
+    parquet_files = sorted(data_dir.glob("*/*.parquet"))

-    # Map file paths to episode indices to extract chunk/file indices
-    file_to_episodes: dict[Path, set[int]] = {}
-    for ep_idx in range(dataset.meta.total_episodes):
-        file_path = dataset.meta.get_data_file_path(ep_idx)
-        if file_path not in file_to_episodes:
-            file_to_episodes[file_path] = set()
-        file_to_episodes[file_path].add(ep_idx)
+    if not parquet_files:
+        raise ValueError(f"No parquet files found in {data_dir}")

    frame_idx = 0

-    for src_path in tqdm(sorted(file_to_episodes.keys()), desc="Processing data files"):
-        df = pd.read_parquet(dataset.root / src_path).reset_index(drop=True)
+    for src_path in tqdm(parquet_files, desc="Processing data files"):
+        df = pd.read_parquet(src_path).reset_index(drop=True)

-        # Get chunk_idx and file_idx from the source file's first episode
-        episodes_in_file = file_to_episodes[src_path]
-        first_ep_idx = min(episodes_in_file)
-        src_ep = dataset.meta.episodes[first_ep_idx]
-        chunk_idx = src_ep["data/chunk_index"]
-        file_idx = src_ep["data/file_index"]
+        relative_path = src_path.relative_to(dataset.root)
+        chunk_dir = relative_path.parts[1]
+        file_name = relative_path.parts[2]
+
+        chunk_idx = int(chunk_dir.split("-")[1])
+        file_idx = int(file_name.split("-")[1].split(".")[0])

        if remove_features:
            df = df.drop(columns=remove_features, errors="ignore")
@@ -1009,7 +1005,7 @@ def _copy_data_with_feature_changes(
                        df[feature_name] = feature_slice
            frame_idx = end_idx

-        # Write using the preserved chunk_idx and file_idx from source
+        # Write using the same chunk/file structure as source
        dst_path = new_meta.root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
        dst_path.parent.mkdir(parents=True, exist_ok=True)

--- a/src/lerobot/datasets/lerobot_dataset.py
+++ b/src/lerobot/datasets/lerobot_dataset.py
@@ -940,11 +940,26 @@ class LeRobotDataset(torch.utils.data.Dataset):
        return query_timestamps

    def _query_hf_dataset(self, query_indices: dict[str, list[int]]) -> dict:
-        return {
-            key: torch.stack(self.hf_dataset[q_idx][key])
-            for key, q_idx in query_indices.items()
-            if key not in self.meta.video_keys
-        }
+        """
+        Query dataset for indices across keys, skipping video keys.
+
+        Tries column-first [key][indices] for speed, falls back to row-first.
+
+        Args:
+            query_indices: Dict mapping keys to index lists to retrieve
+
+        Returns:
+            Dict with stacked tensors of queried data (video keys excluded)
+        """
+        result: dict = {}
+        for key, q_idx in query_indices.items():
+            if key in self.meta.video_keys:
+                continue
+            try:
+                result[key] = torch.stack(self.hf_dataset[key][q_idx])
+            except (KeyError, TypeError, IndexError):
+                result[key] = torch.stack(self.hf_dataset[q_idx][key])
+        return result

    def _query_videos(self, query_timestamps: dict[str, list[float]], ep_idx: int) -> dict[str, torch.Tensor]:
        """Note: When using data workers (e.g. DataLoader with num_workers>0), do not call this function