From aaf37070587581b3ffa8a28b6c134e846afe3a2e Mon Sep 17 00:00:00 2001
From: Caroline Pascal
Date: Wed, 18 Feb 2026 19:16:53 +0100
Subject: [PATCH] fix(filtering): fixing episodes filtering in load_nested_dataset to always use .from_parquet() (#2982)

---
 src/lerobot/datasets/utils.py | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py
index 321ecedd5..da186bf30 100644
--- a/src/lerobot/datasets/utils.py
+++ b/src/lerobot/datasets/utils.py
@@ -122,19 +122,9 @@ def load_nested_dataset(
         raise FileNotFoundError(f"Provided directory does not contain any parquet file: {pq_dir}")
 
     with SuppressProgressBars():
-        # When no filtering needed, Dataset uses memory-mapped loading for efficiency
-        # PyArrow loads the entire dataset into memory
-        if episodes is None:
-            return Dataset.from_parquet([str(path) for path in paths], features=features)
-
-        arrow_dataset = pa_ds.dataset(paths, format="parquet")
-        filter_expr = pa_ds.field("episode_index").isin(episodes)
-        table = arrow_dataset.to_table(filter=filter_expr)
-
-        if features is not None:
-            table = table.cast(features.arrow_schema)
-
-        return Dataset(table)
+        # We use .from_parquet() memory-mapped loading for efficiency
+        filters = pa_ds.field("episode_index").isin(episodes) if episodes is not None else None
+        return Dataset.from_parquet([str(path) for path in paths], filters=filters, features=features)
 
 
 def get_parquet_num_frames(parquet_path: str | Path) -> int: