fix(annotate): default frame provider to video keys, not image keys

VideoFrameProvider derived its default camera and camera list from meta.camera_keys, which mixes image- and video-stored cameras. The clip/decode paths read videos/<key>/from_timestamp, which only exists for video keys, so an image-stored camera sorted first (e.g. observation.images.wrist) crashed the plan phase with a KeyError. Restrict the list and default to meta.video_keys. Add a regression test and point the example job at the dataset's actual video camera. Skip bandit B607 (ffmpeg/git are intentionally resolved via PATH). Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-04 12:51:27 +00:00 · 2026-06-02 12:08:15 +02:00
parent 5dbf0fac5f
commit 98a519e7f2
4 changed files with 68 additions and 23 deletions
--- a/src/lerobot/annotations/steerable_pipeline/frames.py
+++ b/src/lerobot/annotations/steerable_pipeline/frames.py
@@ -151,13 +151,15 @@ class VideoFrameProvider:
        from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata  # noqa: PLC0415

        self._meta = LeRobotDatasetMetadata(repo_id="local", root=self.root)
-        # ``camera_keys`` covers both image- and video-stored cameras and is
-        # always defined on the metadata (``[]`` in the worst case), so it is
-        # the single source we need here.
-        keys = list(self._meta.camera_keys)
-        # Last-resort fallback: if metadata didn't surface anything but the
-        # caller explicitly named a camera (``--vlm.camera_key=...``), trust
-        # them — the key is by definition known to exist on the dataset.
+        # Only ``video_keys`` are decodable here: the clip/decode paths read
+        # ``videos/<key>/from_timestamp`` from episode metadata, which exists
+        # only for video-stored cameras. Image-stored cameras (also in
+        # ``camera_keys``) would KeyError, so restrict the list — and the
+        # default — to video keys.
+        keys = list(self._meta.video_keys)
+        # Last-resort fallback: if metadata didn't surface any video keys but
+        # the caller explicitly named a camera (``--vlm.camera_key=...``),
+        # trust them — the key is by definition known to exist on the dataset.
        if not keys and self.camera_key:
            keys = [self.camera_key]
        self._camera_keys = keys
@@ -338,8 +340,7 @@ class VideoFrameProvider:
                self._warned_decode_fail = True
        if not already_warned:
            logger.warning(
-                "VideoFrameProvider._decode failed for episode=%s camera=%s "
-                "video_path=%s backends=%s: %s",
+                "VideoFrameProvider._decode failed for episode=%s camera=%s video_path=%s backends=%s: %s",
                episode_index,
                camera_key,
                video_path,
@@ -383,11 +384,21 @@ def _decode_frames_ffmpeg(video_path: Path, timestamps: list[float]) -> list[Any
    for ts in timestamps:
        proc = subprocess.run(
            [
-                "ffmpeg", "-nostdin", "-loglevel", "error",
-                "-ss", f"{max(ts, 0.0):.3f}",
-                "-i", str(video_path),
-                "-frames:v", "1",
-                "-f", "image2pipe", "-vcodec", "png", "pipe:1",
+                "ffmpeg",
+                "-nostdin",
+                "-loglevel",
+                "error",
+                "-ss",
+                f"{max(ts, 0.0):.3f}",
+                "-i",
+                str(video_path),
+                "-frames:v",
+                "1",
+                "-f",
+                "image2pipe",
+                "-vcodec",
+                "png",
+                "pipe:1",
            ],
            capture_output=True,
            check=True,