annotate(config): further compact field comments

Tighten the remaining multi-line comment blocks in config.py (derive_task, frames/window, describe_first, action-record/vqa/vlm fields, video_backend, repo ids, executor) to 1-3 lines each. Also drop a stale path reference ('examples/annotation/run_hf_job.py') from the ExecutorConfig docstring, which now just says Hugging Face Jobs. Comments only — no field or behavior change. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-04 12:51:27 +00:00 · 2026-06-04 14:36:02 +02:00
parent 973318ef65
commit 99baae012f
1 changed files with 52 additions and 91 deletions
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -36,38 +36,30 @@ class PlanConfig:
    # ``${task}`` binding rotates among them per ``sample_idx``. ``0`` disables.
    n_task_rephrasings: int = 10

-    # When to derive the task from the video instead of using
-    # ``record.episode_task``: ``off``, ``if_short`` (short / placeholder /
-    # missing canonical task), or ``always``. The derived task replaces the
-    # canonical one for every ``plan``-module prompt; ``meta/tasks.parquet``
-    # is never modified.
+    # Derive the task from video instead of ``record.episode_task``: ``off``,
+    # ``if_short`` (canonical task short/placeholder/missing), or ``always``.
+    # Affects prompts only; ``meta/tasks.parquet`` is untouched.
    derive_task_from_video: str = "if_short"
    derive_task_min_words: int = 3

-    # Frames are sampled uniformly across the episode, capped at
-    # ``max_video_frames`` (a HARD context-budget cap, not an annotation
-    # knob). Each embedded frame is ~250-320 vision tokens, so 32 frames
-    # (~8-10k tokens) fit a 32k-context VLM; 128 would overflow it. Lower
-    # this if you hit "Input length exceeds maximum context length".
+    # Frames sampled uniformly, capped at ``max_video_frames`` — a HARD context
+    # cap (~250-320 tokens/frame, so 32 fit a 32k VLM; 128 overflow). Lower it
+    # if you hit "Input length exceeds maximum context length".
    frames_per_second: float = 1.0
    max_video_frames: int = 32

-    # Windowed subtask generation for CONSTANT temporal density. When > 0
-    # and the episode is longer than this, the plan module processes it in
-    # consecutive windows of this length (each sampled at
-    # ``frames_per_second``) instead of subsampling the whole episode to a
-    # sparse ``max_video_frames``. The describe -> segment chain runs per
-    # window; spans are merged + stitched. Set to ~max_video_frames /
-    # frames_per_second (e.g. 32s at 1 fps). 0 disables.
+    # Windowed subtask generation for constant temporal density: when > 0 and
+    # the episode is longer, process it in windows of this length (each at
+    # ``frames_per_second``) instead of subsampling the whole episode; spans are
+    # merged + stitched. Set to ~max_video_frames / frames_per_second; 0 disables.
    subtask_window_seconds: float = 0.0

    min_subtask_seconds: float = 1.5
    plan_max_steps: int = 8

-    # Run a grounding pass that narrates ONLY what's visible (no subtask
-    # JSON yet), then feed that into the segmentation prompt — the strongest
-    # lever against subtasks invented from the task text. ON by default;
-    # +1 VLM call/episode. False trades quality for fewer calls.
+    # Grounding pass that narrates ONLY what's visible before segmenting — the
+    # strongest lever against subtasks invented from the task text. ON by
+    # default (+1 VLM call/episode); False trades quality for fewer calls.
    subtask_describe_first: bool = True

    # Emit ``style="plan"`` rows (the numbered still-todo list, re-emitted at
@@ -76,12 +68,10 @@ class PlanConfig:
    emit_plan: bool = True

    # NOTE: subtask spans are ALWAYS stitched into a contiguous full-episode
-    # cover (see ``_stitch_full_coverage``) — not configurable, since a
-    # sparse / gap-ridden timeline is never useful for conditioning.
+    # cover (see ``_stitch_full_coverage``) — not configurable.

-    # When True (with a backend that supports it, e.g. ``openai``), send a
-    # ``video_url`` block pointing at a per-episode mp4 subclip and let the
-    # server sample frames at ``use_video_url_fps``.
+    # When True, send a server-side ``video_url`` clip (sampled at
+    # ``use_video_url_fps``) instead of embedded frames.
    use_video_url: bool = False
    use_video_url_fps: float = 1.0

@@ -124,19 +114,15 @@ class ActionRecordsConfig:

    enabled: bool = False

-    # When True (default), emit a separate row with ``style="action_record"``
-    # and ``content=json.dumps(record)`` at the subtask's start timestamp.
-    # This is the only output of the feature — set ``enabled=False`` to
-    # skip the extra VLM calls entirely.
+    # Emit the ``style="action_record"`` row (JSON content) at the subtask
+    # start — the feature's only output. ``enabled=False`` skips the VLM calls.
    emit_record_row: bool = True

-    # Frame sampling for the per-subtask VLM call (similar to the
-    # interjection module's window). Anchored to the subtask span.
+    # Frames sampled from the subtask span for the per-subtask VLM call.
    frames_per_subtask: int = 4

-    # Closed verb vocabulary. The prompt instructs the VLM to pick
-    # exactly one. Override per-dataset (e.g. ``["pick", "place", "open",
-    # "close"]`` for door-only manipulation) for tighter constraint.
+    # Closed verb vocabulary; the VLM is told to pick exactly one. Override
+    # per-dataset (e.g. door-only manipulation) for a tighter constraint.
    verb_vocabulary: tuple[str, ...] = (
        "pick",
        "place",
@@ -157,9 +143,8 @@ class ActionRecordsConfig:
        "dump",
    )

-    # Closed grasp-type vocabulary. ``null`` is always allowed (no
-    # contact / unclear). Adjust per-hardware (e.g. drop ``hook`` /
-    # ``key`` for parallel-jaw grippers).
+    # Closed grasp-type vocabulary (``null`` always allowed). Adjust
+    # per-hardware (e.g. drop ``hook`` / ``key`` for parallel-jaw grippers).
    grasp_vocabulary: tuple[str, ...] = (
        "pinch",
        "wrap",
@@ -199,15 +184,13 @@ class InterjectionsConfig:

    enabled: bool = True

-    # Each interjection emits a paired ``(interjection, speech)`` event row
-    # and triggers a ``plan`` refresh at the same timestamp via the
-    # ``plan`` module.
+    # Each interjection emits a paired (interjection, speech) event row and
+    # triggers a ``plan`` refresh at the same timestamp.
    max_interjections_per_episode: int = 3
    interjection_min_t: float = 2.0

-    # Visual context attached to the interjection prompt: a short window
-    # of frames centered on the chosen timestamp so the VLM sees the
-    # ongoing motion rather than a single frozen frame.
+    # A short frame window centered on the timestamp so the VLM sees the
+    # motion, not one frozen frame.
    interjection_window_seconds: float = 2.0
    interjection_window_frames: int = 4

@@ -219,22 +202,14 @@ class VqaConfig:
    enabled: bool = True
    vqa_emission_hz: float = 1.0
    K: int = 1
-    """How many *consecutive* frames each emission tick anchors a VQA pair
-    to. The VLM grounds its answer (bbox / keypoint coordinates, count, …)
-    against the *first* anchored frame's image, so anchoring K>1 frames
-    copies that same answer onto later frames where the scene has already
-    moved — stale labels. Default ``1``: a VQA pair lands on exactly its
-    emission frame, no temporal smear. Raise it only to trade label
-    precision for more (noisier) VQA frames."""
+    """Consecutive frames each emission tick anchors a VQA pair to. The VLM
+    grounds its answer on the FIRST anchored frame, so K>1 copies that answer
+    onto later (moved) frames — stale labels. Default 1 (no smear)."""
    question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")

-    # Camera restriction. By default VQA iterates EVERY camera the
-    # dataset declares (one VQA pair per camera per emission tick). Set
-    # ``restrict_to_default_camera=True`` to ground VQA on only the
-    # single ``--vlm.camera_key`` stream — the same camera the plan /
-    # interjection modules use — so the whole pipeline focuses on one
-    # view. Use this when you want every annotation grounded on, e.g.,
-    # ``observation.images.base`` and nothing else.
+    # By default VQA iterates every camera (one pair per camera per tick). Set
+    # True to ground VQA only on ``--vlm.camera_key`` — the single view the
+    # plan / interjection modules use.
    restrict_to_default_camera: bool = False


@@ -242,11 +217,9 @@ class VqaConfig:
 class VlmConfig:
    """Shared Qwen-VL client configuration."""

-    # Only ``openai`` is supported for now (the in-process ``vllm`` /
-    # ``transformers`` local backends were removed — the shipped workflow
-    # is Hugging Face Jobs). ``openai`` talks to an OpenAI-compatible vLLM
-    # server; the CLI auto-spawns one in-job when ``auto_serve=True``.
-    # ``stub`` is for unit tests (construct ``StubVlmClient`` directly).
+    # Only ``openai`` is supported (in-process vllm/transformers were removed;
+    # the shipped workflow is HF Jobs). Talks to an OpenAI-compatible vLLM
+    # server, auto-spawned in-job when ``auto_serve=True``. ``stub`` is for tests.
    backend: str = "openai"
    model_id: str = "Qwen/Qwen3.6-27B"

@@ -263,9 +236,8 @@ class VlmConfig:
    # when ``parallel_servers > 1``.
    serve_command: str | None = None

-    # Run multiple independent inference servers for round-robin client
-    # routing (each pinned to a GPU via ``CUDA_VISIBLE_DEVICES`` and bound
-    # to ``serve_port + i``). ``num_gpus=0`` means one GPU per replica.
+    # Independent servers for round-robin routing (each pinned to a GPU,
+    # bound to ``serve_port + i``). ``num_gpus=0`` = one GPU per replica.
    parallel_servers: int = 1
    num_gpus: int = 0
    client_concurrency: int = 16
@@ -289,16 +261,12 @@ class VlmConfig:

@dataclass
 class ExecutorConfig:
-    """Executor settings.
+    """Executor settings — intra-process episode concurrency only
+    (distributed execution is delegated to Hugging Face Jobs)."""

-    Distributed execution is provided by Hugging Face Jobs (see
-    ``examples/annotation/run_hf_job.py``); this config only controls
-    intra-process episode concurrency.
-    """
-
-    # Episodes processed concurrently within each module phase. Each
-    # in-flight episode dispatches 3-5 dependent VLM calls, so this is the
-    # main knob for saturating ``parallel_servers`` and ``client_concurrency``.
+    # Episodes processed concurrently per module phase. Each dispatches 3-5 VLM
+    # calls, so this is the main knob for saturating ``parallel_servers`` /
+    # ``client_concurrency``.
    episode_parallelism: int = 16


@@ -310,15 +278,12 @@ class AnnotationPipelineConfig:
    revisions of the same dataset live in separate copies.
    """

-    # Hub dataset id. Used as the download source when ``root`` is unset,
-    # and as the destination repo when ``push_to_hub`` is enabled and
-    # ``new_repo_id`` is unset.
+    # Hub dataset id: download source when ``root`` is unset, and push target
+    # when ``push_to_hub`` is on and ``new_repo_id`` is unset.
    repo_id: str | None = None

-    # Optional separate Hub dataset id to push the annotated result to (named
-    # ``new_repo_id`` to match the LeRobot dataset edit tools). When unset,
-    # ``push_to_hub`` uploads back to ``repo_id`` (annotate in place); when
-    # set, the source ``repo_id`` is left untouched.
+    # Optional separate push target (named to match the LeRobot dataset edit
+    # tools). Unset → push back to ``repo_id`` in place; set → source untouched.
    new_repo_id: str | None = None

    root: Path | None = None
@@ -338,17 +303,13 @@ class AnnotationPipelineConfig:
    skip_validation: bool = False
    only_episodes: tuple[int, ...] | None = None

-    # Keyframe decode backend. When unset, the pipeline decodes with the
-    # ffmpeg CLI: it decodes AV1 and runs each decode as an isolated child
-    # process, which is both crash-safe and safe under the concurrent
-    # decode the executor performs (torchcodec is not thread-safe and
-    # SIGSEGVs there). Set to ``"torchcodec"`` or ``"pyav"`` to pin an
-    # in-process decoder when its build is known thread-safe.
+    # Keyframe decode backend. Unset → ffmpeg CLI: decodes AV1 in an isolated
+    # child process, so it's crash- and thread-safe under concurrent decode
+    # (torchcodec SIGSEGVs there). Set ``"torchcodec"`` / ``"pyav"`` to pin one.
    video_backend: str | None = None

-    # When True, upload the annotated dataset to the Hugging Face Hub:
-    # to ``new_repo_id`` if set, otherwise back to ``repo_id``. One of
-    # the two must be set for this to take effect.
+    # Upload the annotated dataset to the Hub (to ``new_repo_id`` if set, else
+    # back to ``repo_id`` — one of the two must be set).
    push_to_hub: bool = False
    push_private: bool = False
    push_commit_message: str | None = None