From 98a519e7f266acd9e19bdf420b201a696785186e Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Tue, 2 Jun 2026 12:08:15 +0200
Subject: [PATCH] fix(annotate): default frame provider to video keys, not
 image keys

VideoFrameProvider derived its default camera and camera list from
meta.camera_keys, which mixes image- and video-stored cameras. The
clip/decode paths read videos/<key>/from_timestamp, which only exists
for video keys, so an image-stored camera sorted first (e.g.
observation.images.wrist) crashed the plan phase with a KeyError.

Restrict the list and default to meta.video_keys. Add a regression test
and point the example job at the dataset's actual video camera. Skip
bandit B607 (ffmpeg/git are intentionally resolved via PATH).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 examples/annotations/run_hf_job.py            | 11 +++---
 pyproject.toml                                |  2 +-
 .../annotations/steerable_pipeline/frames.py  | 39 ++++++++++++-------
 tests/annotations/test_frames.py              | 39 +++++++++++++++++--
 4 files changed, 68 insertions(+), 23 deletions(-)
diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py
index dcc9435ce..01ef58f4d 100644
--- a/examples/annotations/run_hf_job.py
+++ b/examples/annotations/run_hf_job.py
@@ -36,8 +36,8 @@ CMD = (
     "export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 && "
     "export VLLM_VIDEO_BACKEND=pyav && "
     "lerobot-annotate "
-    "--repo_id=imstevenpmwork/super_poulain_draft "
-    "--dest_repo_id=pepijn223/super_poulain_vocab "
+    "--repo_id=pepijn223/robocasa_smoke_2atomic_v3 "
+    "--dest_repo_id=pepijn223/robocasa_smoke_2atomic_v3_ann "
     "--push_to_hub=true "
     "--vlm.backend=openai "
     "--vlm.model_id=Qwen/Qwen3.6-35B-A3B-FP8 "
@@ -52,17 +52,18 @@ CMD = (
     "--vlm.temperature=0.7 "
     "--executor.episode_parallelism=16 "
     "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
-    "--vlm.camera_key=observation.images.wrist "
+    "--vlm.camera_key=observation.images.robot0_agentview_right "
     # Phase 1 — plan module (subtasks + plan + memory + task_aug).
     "--plan.frames_per_second=1.0 "
     "--plan.use_video_url=true "
     "--plan.use_video_url_fps=1.0 "
     "--plan.derive_task_from_video=always "
-    "--plan.n_task_rephrasings=30 "
+    "--plan.task_aug_axes.enabled=true "
+    "--plan.action_records.enabled=true "
     # Phase 2 — interjections + speech.
     "--interjections.max_interjections_per_episode=6 "
     # Phase 4 — general VQA.
-    "--vqa.K=3 "
+    "--vqa.K=1 "
     "--vqa.vqa_emission_hz=1.0"
 )
 
diff --git a/pyproject.toml b/pyproject.toml
index d29800a3b..5a329e56d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -401,7 +401,7 @@ exclude_dirs = [
     "benchmarks",
     "src/lerobot/datasets/push_dataset_to_hub",
 ]
-skips = ["B101", "B311", "B404", "B603", "B615"]
+skips = ["B101", "B311", "B404", "B603", "B607", "B615"]
 
 [tool.typos]
 default.extend-ignore-re = [
diff --git a/src/lerobot/annotations/steerable_pipeline/frames.py b/src/lerobot/annotations/steerable_pipeline/frames.py
index 112f50ce6..804dae109 100644
--- a/src/lerobot/annotations/steerable_pipeline/frames.py
+++ b/src/lerobot/annotations/steerable_pipeline/frames.py
@@ -151,13 +151,15 @@ class VideoFrameProvider:
         from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata  # noqa: PLC0415
 
         self._meta = LeRobotDatasetMetadata(repo_id="local", root=self.root)
-        # ``camera_keys`` covers both image- and video-stored cameras and is
-        # always defined on the metadata (``[]`` in the worst case), so it is
-        # the single source we need here.
-        keys = list(self._meta.camera_keys)
-        # Last-resort fallback: if metadata didn't surface anything but the
-        # caller explicitly named a camera (``--vlm.camera_key=...``), trust
-        # them — the key is by definition known to exist on the dataset.
+        # Only ``video_keys`` are decodable here: the clip/decode paths read
+        # ``videos/<key>/from_timestamp`` from episode metadata, which exists
+        # only for video-stored cameras. Image-stored cameras (also in
+        # ``camera_keys``) would KeyError, so restrict the list — and the
+        # default — to video keys.
+        keys = list(self._meta.video_keys)
+        # Last-resort fallback: if metadata didn't surface any video keys but
+        # the caller explicitly named a camera (``--vlm.camera_key=...``),
+        # trust them — the key is by definition known to exist on the dataset.
         if not keys and self.camera_key:
             keys = [self.camera_key]
         self._camera_keys = keys
@@ -338,8 +340,7 @@ class VideoFrameProvider:
                 self._warned_decode_fail = True
         if not already_warned:
             logger.warning(
-                "VideoFrameProvider._decode failed for episode=%s camera=%s "
-                "video_path=%s backends=%s: %s",
+                "VideoFrameProvider._decode failed for episode=%s camera=%s video_path=%s backends=%s: %s",
                 episode_index,
                 camera_key,
                 video_path,
@@ -383,11 +384,21 @@ def _decode_frames_ffmpeg(video_path: Path, timestamps: list[float]) -> list[Any
     for ts in timestamps:
         proc = subprocess.run(
             [
-                "ffmpeg", "-nostdin", "-loglevel", "error",
-                "-ss", f"{max(ts, 0.0):.3f}",
-                "-i", str(video_path),
-                "-frames:v", "1",
-                "-f", "image2pipe", "-vcodec", "png", "pipe:1",
+                "ffmpeg",
+                "-nostdin",
+                "-loglevel",
+                "error",
+                "-ss",
+                f"{max(ts, 0.0):.3f}",
+                "-i",
+                str(video_path),
+                "-frames:v",
+                "1",
+                "-f",
+                "image2pipe",
+                "-vcodec",
+                "png",
+                "pipe:1",
             ],
             capture_output=True,
             check=True,
diff --git a/tests/annotations/test_frames.py b/tests/annotations/test_frames.py
index 07b8b2c33..c8ed51ed5 100644
--- a/tests/annotations/test_frames.py
+++ b/tests/annotations/test_frames.py
@@ -45,6 +45,33 @@ from lerobot.annotations.steerable_pipeline.frames import (  # noqa: E402
 )
 
 
+class _FakeMeta:
+    """Minimal metadata stub exposing ``video_keys`` / ``camera_keys``."""
+
+    def __init__(self, video_keys: list[str], image_keys: list[str]) -> None:
+        self.video_keys = video_keys
+        self.camera_keys = [*video_keys, *image_keys]
+
+
+def test_default_camera_key_skips_image_only_cameras(tmp_path: Path, monkeypatch) -> None:
+    """The default camera must be a *video* key — image-stored cameras have no
+    ``videos/<key>/from_timestamp`` and would KeyError in the clip/decode path.
+
+    Regression: a dataset whose first ``camera_keys`` entry was an image-stored
+    camera (e.g. ``observation.images.wrist``) crashed at clip extraction.
+    """
+    fake = _FakeMeta(
+        video_keys=["observation.images.robot0_agentview_right"],
+        image_keys=["observation.images.wrist"],
+    )
+    import lerobot.datasets.dataset_metadata as meta_mod
+
+    monkeypatch.setattr(meta_mod, "LeRobotDatasetMetadata", lambda *a, **k: fake, raising=True)
+    provider = VideoFrameProvider(root=tmp_path)
+    assert provider.camera_key == "observation.images.robot0_agentview_right"
+    assert "observation.images.wrist" not in provider.camera_keys
+
+
 def test_video_for_episode_is_a_method_of_videoframeprovider():
     """``video_for_episode`` must be a bound method, not nested dead code."""
     assert callable(getattr(VideoFrameProvider, "video_for_episode", None))
@@ -81,9 +108,15 @@ def sample_video(tmp_path: Path) -> Path:
     out = tmp_path / "sample.mp4"
     subprocess.run(
         [
-            "ffmpeg", "-y", "-f", "lavfi",
-            "-i", "testsrc=duration=3:size=160x120:rate=10",
-            "-pix_fmt", "yuv420p", str(out),
+            "ffmpeg",
+            "-y",
+            "-f",
+            "lavfi",
+            "-i",
+            "testsrc=duration=3:size=160x120:rate=10",
+            "-pix_fmt",
+            "yuv420p",
+            str(out),
         ],
         check=True,
         capture_output=True,