From 0747afdba7e5638d77a5e5f99aef2b11ca78747c Mon Sep 17 00:00:00 2001
From: Michel Aractingi <michel.aractingi@huggingface.co>
Date: Fri, 5 Sep 2025 18:37:48 +0200
Subject: [PATCH] Optimize dataset updates by incrementally concatenating new
 data instead of reloading from disk, reducing memory usage and improving
 performance.

---
 src/lerobot/datasets/lerobot_dataset.py | 37 ++++++++++---------------
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py
index 101eb20e7..db2c4aaef 100644
--- a/src/lerobot/datasets/lerobot_dataset.py
+++ b/src/lerobot/datasets/lerobot_dataset.py
@@ -28,7 +28,7 @@ import pandas as pd
 import PIL.Image
 import torch
 import torch.utils
-from datasets import Dataset
+from datasets import Dataset, concatenate_datasets
 from huggingface_hub import HfApi, snapshot_download
 from huggingface_hub.constants import REPOCARD_NAME
 from huggingface_hub.errors import RevisionNotFoundError
@@ -316,17 +316,16 @@ class LeRobotDatasetMetadata:
         path.parent.mkdir(parents=True, exist_ok=True)
         df.to_parquet(path, index=False)
 
-        # Update the Hugging Face dataset by reloading it.
-        # This process should be fast because only the latest Parquet file has been modified.
-        # Therefore, only this file needs to be converted to PyArrow; the rest is loaded from the PyArrow memory-mapped cache.
+        # Update the Hugging Face dataset incrementally instead of reloading from disk
+        # This eliminates repeated load_episodes calls that cause cache bloat
+        if self.episodes is None:
+            self.episodes = load_episodes(self.root)
+            return
 
-        # Explicitly delete old dataset to free memory before reloading
-        if hasattr(self, "episodes") and self.episodes is not None:
-            del self.episodes
-            self.episodes = None
-            gc.collect()
-
-        self.episodes = load_episodes(self.root)
+        # Remove columns from df that start with 'stats/'
+        df = df.drop(columns=[col for col in df.columns if col.startswith("stats/")])
+        new_episode_dataset = Dataset.from_pandas(df)
+        self.episodes = concatenate_datasets([self.episodes, new_episode_dataset])
 
     def save_episode(
         self,
@@ -1064,17 +1063,8 @@ class LeRobotDataset(torch.utils.data.Dataset):
         else:
             df.to_parquet(path)
 
-        # Update the Hugging Face dataset by reloading it.
-        # This process should be fast because only the latest Parquet file has been modified.
-        # Therefore, only this file needs to be converted to PyArrow; the rest is loaded from the PyArrow memory-mapped cache.
-
-        # Explicitly delete old dataset to free memory before reloading
-        if hasattr(self, "hf_dataset") and self.hf_dataset is not None:
-            del self.hf_dataset
-            self.hf_dataset = None
-            gc.collect()
-
-        self.hf_dataset = self.load_hf_dataset()
+        new_hf_dataset = Dataset.from_pandas(df)
+        self.hf_dataset = concatenate_datasets([self.hf_dataset, new_hf_dataset])
 
         metadata = {
             "data/chunk_index": chunk_idx,
@@ -1093,7 +1083,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
         if self.meta.episodes is None:
             # Initialize indices for a new dataset made of the first episode data
             chunk_idx, file_idx = 0, 0
-            latest_duration_in_s = 0
+            latest_duration_in_s = 0.0
             new_path = self.root / self.meta.video_path.format(
                 video_key=video_key, chunk_index=chunk_idx, file_index=file_idx
             )
@@ -1119,6 +1109,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
                 )
                 new_path.parent.mkdir(parents=True, exist_ok=True)
                 shutil.move(str(ep_path), str(new_path))
+                latest_duration_in_s = 0.0
             else:
                 # Update latest video file
                 concat_video_files([latest_path, ep_path], self.root, video_key, chunk_idx, file_idx)