From 98a519e7f266acd9e19bdf420b201a696785186e Mon Sep 17 00:00:00 2001 From: Pepijn Date: Tue, 2 Jun 2026 12:08:15 +0200 Subject: [PATCH] fix(annotate): default frame provider to video keys, not image keys VideoFrameProvider derived its default camera and camera list from meta.camera_keys, which mixes image- and video-stored cameras. The clip/decode paths read videos//from_timestamp, which only exists for video keys, so an image-stored camera sorted first (e.g. observation.images.wrist) crashed the plan phase with a KeyError. Restrict the list and default to meta.video_keys. Add a regression test and point the example job at the dataset's actual video camera. Skip bandit B607 (ffmpeg/git are intentionally resolved via PATH). Co-authored-by: Cursor --- examples/annotations/run_hf_job.py | 11 +++--- pyproject.toml | 2 +- .../annotations/steerable_pipeline/frames.py | 39 ++++++++++++------- tests/annotations/test_frames.py | 39 +++++++++++++++++-- 4 files changed, 68 insertions(+), 23 deletions(-) diff --git a/examples/annotations/run_hf_job.py b/examples/annotations/run_hf_job.py index dcc9435ce..01ef58f4d 100644 --- a/examples/annotations/run_hf_job.py +++ b/examples/annotations/run_hf_job.py @@ -36,8 +36,8 @@ CMD = ( "export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 && " "export VLLM_VIDEO_BACKEND=pyav && " "lerobot-annotate " - "--repo_id=imstevenpmwork/super_poulain_draft " - "--dest_repo_id=pepijn223/super_poulain_vocab " + "--repo_id=pepijn223/robocasa_smoke_2atomic_v3 " + "--dest_repo_id=pepijn223/robocasa_smoke_2atomic_v3_ann " "--push_to_hub=true " "--vlm.backend=openai " "--vlm.model_id=Qwen/Qwen3.6-35B-A3B-FP8 " @@ -52,17 +52,18 @@ CMD = ( "--vlm.temperature=0.7 " "--executor.episode_parallelism=16 " "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' " - "--vlm.camera_key=observation.images.wrist " + "--vlm.camera_key=observation.images.robot0_agentview_right " # Phase 1 — plan module (subtasks + plan + memory + task_aug). "--plan.frames_per_second=1.0 " "--plan.use_video_url=true " "--plan.use_video_url_fps=1.0 " "--plan.derive_task_from_video=always " - "--plan.n_task_rephrasings=30 " + "--plan.task_aug_axes.enabled=true " + "--plan.action_records.enabled=true " # Phase 2 — interjections + speech. "--interjections.max_interjections_per_episode=6 " # Phase 4 — general VQA. - "--vqa.K=3 " + "--vqa.K=1 " "--vqa.vqa_emission_hz=1.0" ) diff --git a/pyproject.toml b/pyproject.toml index d29800a3b..5a329e56d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -401,7 +401,7 @@ exclude_dirs = [ "benchmarks", "src/lerobot/datasets/push_dataset_to_hub", ] -skips = ["B101", "B311", "B404", "B603", "B615"] +skips = ["B101", "B311", "B404", "B603", "B607", "B615"] [tool.typos] default.extend-ignore-re = [ diff --git a/src/lerobot/annotations/steerable_pipeline/frames.py b/src/lerobot/annotations/steerable_pipeline/frames.py index 112f50ce6..804dae109 100644 --- a/src/lerobot/annotations/steerable_pipeline/frames.py +++ b/src/lerobot/annotations/steerable_pipeline/frames.py @@ -151,13 +151,15 @@ class VideoFrameProvider: from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata # noqa: PLC0415 self._meta = LeRobotDatasetMetadata(repo_id="local", root=self.root) - # ``camera_keys`` covers both image- and video-stored cameras and is - # always defined on the metadata (``[]`` in the worst case), so it is - # the single source we need here. - keys = list(self._meta.camera_keys) - # Last-resort fallback: if metadata didn't surface anything but the - # caller explicitly named a camera (``--vlm.camera_key=...``), trust - # them — the key is by definition known to exist on the dataset. + # Only ``video_keys`` are decodable here: the clip/decode paths read + # ``videos//from_timestamp`` from episode metadata, which exists + # only for video-stored cameras. Image-stored cameras (also in + # ``camera_keys``) would KeyError, so restrict the list — and the + # default — to video keys. + keys = list(self._meta.video_keys) + # Last-resort fallback: if metadata didn't surface any video keys but + # the caller explicitly named a camera (``--vlm.camera_key=...``), + # trust them — the key is by definition known to exist on the dataset. if not keys and self.camera_key: keys = [self.camera_key] self._camera_keys = keys @@ -338,8 +340,7 @@ class VideoFrameProvider: self._warned_decode_fail = True if not already_warned: logger.warning( - "VideoFrameProvider._decode failed for episode=%s camera=%s " - "video_path=%s backends=%s: %s", + "VideoFrameProvider._decode failed for episode=%s camera=%s video_path=%s backends=%s: %s", episode_index, camera_key, video_path, @@ -383,11 +384,21 @@ def _decode_frames_ffmpeg(video_path: Path, timestamps: list[float]) -> list[Any for ts in timestamps: proc = subprocess.run( [ - "ffmpeg", "-nostdin", "-loglevel", "error", - "-ss", f"{max(ts, 0.0):.3f}", - "-i", str(video_path), - "-frames:v", "1", - "-f", "image2pipe", "-vcodec", "png", "pipe:1", + "ffmpeg", + "-nostdin", + "-loglevel", + "error", + "-ss", + f"{max(ts, 0.0):.3f}", + "-i", + str(video_path), + "-frames:v", + "1", + "-f", + "image2pipe", + "-vcodec", + "png", + "pipe:1", ], capture_output=True, check=True, diff --git a/tests/annotations/test_frames.py b/tests/annotations/test_frames.py index 07b8b2c33..c8ed51ed5 100644 --- a/tests/annotations/test_frames.py +++ b/tests/annotations/test_frames.py @@ -45,6 +45,33 @@ from lerobot.annotations.steerable_pipeline.frames import ( # noqa: E402 ) +class _FakeMeta: + """Minimal metadata stub exposing ``video_keys`` / ``camera_keys``.""" + + def __init__(self, video_keys: list[str], image_keys: list[str]) -> None: + self.video_keys = video_keys + self.camera_keys = [*video_keys, *image_keys] + + +def test_default_camera_key_skips_image_only_cameras(tmp_path: Path, monkeypatch) -> None: + """The default camera must be a *video* key — image-stored cameras have no + ``videos//from_timestamp`` and would KeyError in the clip/decode path. + + Regression: a dataset whose first ``camera_keys`` entry was an image-stored + camera (e.g. ``observation.images.wrist``) crashed at clip extraction. + """ + fake = _FakeMeta( + video_keys=["observation.images.robot0_agentview_right"], + image_keys=["observation.images.wrist"], + ) + import lerobot.datasets.dataset_metadata as meta_mod + + monkeypatch.setattr(meta_mod, "LeRobotDatasetMetadata", lambda *a, **k: fake, raising=True) + provider = VideoFrameProvider(root=tmp_path) + assert provider.camera_key == "observation.images.robot0_agentview_right" + assert "observation.images.wrist" not in provider.camera_keys + + def test_video_for_episode_is_a_method_of_videoframeprovider(): """``video_for_episode`` must be a bound method, not nested dead code.""" assert callable(getattr(VideoFrameProvider, "video_for_episode", None)) @@ -81,9 +108,15 @@ def sample_video(tmp_path: Path) -> Path: out = tmp_path / "sample.mp4" subprocess.run( [ - "ffmpeg", "-y", "-f", "lavfi", - "-i", "testsrc=duration=3:size=160x120:rate=10", - "-pix_fmt", "yuv420p", str(out), + "ffmpeg", + "-y", + "-f", + "lavfi", + "-i", + "testsrc=duration=3:size=160x120:rate=10", + "-pix_fmt", + "yuv420p", + str(out), ], check=True, capture_output=True,