From 0d6e4f3baddb4b8cd1dc2a1782cf66c42da72d50 Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Mon, 27 Apr 2026 17:29:11 +0200 Subject: [PATCH] feat(features): route 2D camera shapes to observation.depth. hw_to_dataset_features now treats a camera entry whose shape has length 2 as a single-channel depth feature: it emits the feature as "{prefix}.depth." with names=["height", "width"] and an info={"video.is_depth_map": True} marker so the depth-encoder branch in LeRobotDataset is engaged. The "_depth" hardware-side suffix (if present) is stripped so a paired RGB + depth camera ends up as "observation.images." + "observation.depth.". build_dataset_frame mirrors the routing: depth feature keys read their value from "_depth" in the raw observation dict, with fallback to the bare name for producers that already emit dataset-style keys. Tests: add tests/utils/test_feature_utils.py covering the routing of 2D vs 3D camera shapes, the paired RGB+depth case, and the build_dataset_frame value routing. Made-with: Cursor --- src/lerobot/utils/feature_utils.py | 32 ++++++++++--- tests/utils/test_feature_utils.py | 77 ++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 6 deletions(-) create mode 100644 tests/utils/test_feature_utils.py diff --git a/src/lerobot/utils/feature_utils.py b/src/lerobot/utils/feature_utils.py index 2a4886234..a27e3bc0d 100644 --- a/src/lerobot/utils/feature_utils.py +++ b/src/lerobot/utils/feature_utils.py @@ -86,11 +86,24 @@ def hw_to_dataset_features( } for key, shape in cam_fts.items(): - features[f"{prefix}.images.{key}"] = { - "dtype": "video" if use_video else "image", - "shape": shape, - "names": ["height", "width", "channels"], - } + if len(shape) == 2: + # Single-channel feature (e.g. depth map). The hardware-side key is + # expected to use a "_depth" suffix to disambiguate from its color + # counterpart; we strip it so the dataset feature is published as + # ``{prefix}.depth.`` and aligned with ``observation.images.``. + bare = key.removesuffix("_depth") if key.endswith("_depth") else key + features[f"{prefix}.depth.{bare}"] = { + "dtype": "video" if use_video else "image", + "shape": shape, + "names": ["height", "width"], + "info": {"video.is_depth_map": True}, + } + else: + features[f"{prefix}.images.{key}"] = { + "dtype": "video" if use_video else "image", + "shape": shape, + "names": ["height", "width", "channels"], + } _validate_feature_names(features) return features @@ -120,7 +133,14 @@ def build_dataset_frame( elif ft["dtype"] == "float32" and len(ft["shape"]) == 1: frame[key] = np.array([values[name] for name in ft["names"]], dtype=np.float32) elif ft["dtype"] in ["image", "video"]: - frame[key] = values[key.removeprefix(f"{prefix}.images.")] + if key.startswith(f"{prefix}.depth."): + bare = key.removeprefix(f"{prefix}.depth.") + # Hardware emits depth values under "_depth" to disambiguate + # from the color stream stored at "" — fall back to the bare + # name when the producer already uses dataset-style keys. + frame[key] = values.get(f"{bare}_depth", values.get(bare)) + else: + frame[key] = values[key.removeprefix(f"{prefix}.images.")] return frame diff --git a/tests/utils/test_feature_utils.py b/tests/utils/test_feature_utils.py new file mode 100644 index 000000000..112f71e8a --- /dev/null +++ b/tests/utils/test_feature_utils.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python + +# Copyright 2026 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for ``lerobot.utils.feature_utils``.""" + +import numpy as np + +from lerobot.utils.constants import OBS_STR +from lerobot.utils.feature_utils import build_dataset_frame, hw_to_dataset_features + + +def test_hw_to_dataset_features_routes_3d_shape_to_images(): + hw = {"front": (480, 640, 3)} + out = hw_to_dataset_features(hw, OBS_STR, use_video=True) + + assert "observation.images.front" in out + assert out["observation.images.front"]["dtype"] == "video" + assert out["observation.images.front"]["shape"] == (480, 640, 3) + assert out["observation.images.front"]["names"] == ["height", "width", "channels"] + assert "info" not in out["observation.images.front"] + + +def test_hw_to_dataset_features_routes_2d_shape_to_depth(): + hw = {"front_depth": (480, 640)} + out = hw_to_dataset_features(hw, OBS_STR, use_video=True) + + assert "observation.depth.front" in out, out + feat = out["observation.depth.front"] + assert feat["dtype"] == "video" + assert feat["shape"] == (480, 640) + assert feat["names"] == ["height", "width"] + assert feat["info"] == {"video.is_depth_map": True} + + +def test_hw_to_dataset_features_handles_paired_color_and_depth(): + """A camera with use_depth=True is expected to emit both keys.""" + hw = {"front": (480, 640, 3), "front_depth": (480, 640)} + out = hw_to_dataset_features(hw, OBS_STR, use_video=True) + + assert set(out) == {"observation.images.front", "observation.depth.front"} + assert out["observation.images.front"]["shape"] == (480, 640, 3) + assert out["observation.depth.front"]["shape"] == (480, 640) + + +def test_hw_to_dataset_features_keeps_bare_2d_key_when_no_suffix(): + """If the producer didn't use a "_depth" suffix, the bare name flows through.""" + hw = {"top": (240, 320)} + out = hw_to_dataset_features(hw, OBS_STR, use_video=True) + + assert "observation.depth.top" in out + + +def test_build_dataset_frame_routes_depth_values(): + ds_features = hw_to_dataset_features( + {"front": (4, 6, 3), "front_depth": (4, 6)}, + OBS_STR, + use_video=True, + ) + rgb = np.zeros((4, 6, 3), dtype=np.uint8) + depth = np.full((4, 6), 0.5, dtype=np.float32) + values = {"front": rgb, "front_depth": depth} + + frame = build_dataset_frame(ds_features, values, OBS_STR) + assert frame["observation.images.front"] is rgb + assert frame["observation.depth.front"] is depth