fix(hw-dataset): adding missing support for audio in hw-to-dataset functions

2026-06-05 05:11:25 +00:00 · 2025-08-06 20:34:06 +02:00
parent e126d35249
commit 2726b4e865
1 changed files with 16 additions and 1 deletions
--- a/src/lerobot/datasets/utils.py
+++ b/src/lerobot/datasets/utils.py
@@ -657,7 +657,12 @@ def hw_to_dataset_features(
        for key, ftype in hw_features.items()
        if ftype is float or (isinstance(ftype, PolicyFeature) and ftype.type != FeatureType.VISUAL)
    }
-    cam_fts = {key: shape for key, shape in hw_features.items() if isinstance(shape, tuple)}
+    cam_fts = {
+        key: shape for key, shape in hw_features.items() if isinstance(shape, tuple) and len(shape) == 3
+    }
+    mic_fts = {
+        key: shape for key, shape in hw_features.items() if isinstance(shape, tuple) and len(shape) == 2
+    }

    if joint_fts and prefix == ACTION:
        features[prefix] = {
@@ -680,6 +685,14 @@ def hw_to_dataset_features(
            "names": ["height", "width", "channels"],
        }

+    for key, mic_spec in mic_fts.items():  # mic_spec is (sample_rate, channels); must not shadow `features`
+        features[f"{prefix}.audio.{key}"] = {
+            "dtype": "audio",
+            "shape": (mic_spec[1],),
+            "names": ["channels"],
+            "sample_rate": mic_spec[0],
+        }
+
    _validate_feature_names(features)
    return features

@@ -709,6 +722,8 @@ def build_dataset_frame(
            frame[key] = np.array([values[name] for name in ft["names"]], dtype=np.float32)
        elif ft["dtype"] in ["image", "video"]:
            frame[key] = values[key.removeprefix(f"{prefix}.images.")]
+        elif ft["dtype"] == "audio":
+            frame[key] = values[key.removeprefix(f"{prefix}.audio.")]

    return frame