annotate(config): tighten field comments to one line each

Collapse the remaining multi-line field comments / docstrings in config.py to single lines (or two where a knob genuinely needs it), keeping the essential rationale. Comments only — no field or behavior change. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-04 21:01:26 +00:00 · 2026-06-04 15:12:31 +02:00
parent cd59c8b312
commit cdd94a703f
1 changed files with 41 additions and 91 deletions
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -23,79 +23,53 @@ from typing import Any

@dataclass
 class PlanConfig:
-    """``plan`` module: plan + subtasks + memory + task augmentation.
-
-    The ``plan`` module attaches the whole episode as one Qwen-VL video
-    block; ``max_video_frames`` only caps the frames packed in (a
-    model-capacity bound, not an annotation-logic knob).
-    """
+    """``plan`` module: subtasks + plan + memory + task augmentation."""

    enabled: bool = True

-    # Number of ``task_aug`` rephrasings emitted at ``t=0``. The renderer's
-    # ``${task}`` binding rotates among them per ``sample_idx``. ``0`` disables.
+    # ``task_aug`` rephrasings at t=0 (renderer rotates ${task} among them); 0 disables.
    n_task_rephrasings: int = 10

-    # Derive the task from video instead of ``record.episode_task``: ``off``,
-    # ``if_short`` (canonical task short/placeholder/missing), or ``always``.
+    # Derive the task from video instead of episode_task: off / if_short / always.
    # Affects prompts only; ``meta/tasks.parquet`` is untouched.
    derive_task_from_video: str = "if_short"
    derive_task_min_words: int = 3

-    # Frames sampled uniformly, capped at ``max_video_frames`` — a HARD context
-    # cap (~250-320 tokens/frame, so 32 fit a 32k VLM; 128 overflow). Lower it
-    # if you hit "Input length exceeds maximum context length".
+    # Frames sampled uniformly, capped at max_video_frames — a hard context cap
+    # (~300 tokens/frame, so 32 fit a 32k VLM; 128 overflow).
    frames_per_second: float = 1.0
    max_video_frames: int = 32

-    # Windowed subtask generation for constant temporal density: when > 0 and
-    # the episode is longer, process it in windows of this length (each at
-    # ``frames_per_second``) instead of subsampling the whole episode; spans are
-    # merged + stitched. ~max_video_frames / frames_per_second. 0 disables.
+    # >0: split long episodes into windows of this length (constant fps density)
+    # instead of subsampling the whole episode; spans merged + stitched. 0 disables.
    subtask_window_seconds: float = 0.0

    min_subtask_seconds: float = 1.5
    plan_max_steps: int = 8

-    # Grounding pass that narrates ONLY what's visible before segmenting — the
-    # strongest lever against subtasks invented from the task text. ON by
-    # default (+1 VLM call/episode); False trades quality for fewer calls.
+    # Narrate-only grounding pass before segmenting — best defense against subtasks
+    # invented from the task text (+1 VLM call/episode).
    subtask_describe_first: bool = True

-    # Emit ``style="plan"`` rows (the numbered still-todo list, re-emitted at
-    # every subtask boundary). False keeps only subtasks + memory and skips
-    # the per-boundary ``_generate_plan`` call.
+    # Emit ``style="plan"`` rows at each boundary; False = subtasks + memory only.
    emit_plan: bool = True

-    # NOTE: subtask spans are ALWAYS stitched into a contiguous full-episode
-    # cover (see ``_stitch_full_coverage``) — not configurable.
+    # (subtask spans are always stitched to a contiguous full-episode cover; not configurable.)

-    # When True, send a server-side ``video_url`` clip (sampled at
-    # ``use_video_url_fps``) instead of embedded frames.
+    # Send a server-side ``video_url`` clip (at use_video_url_fps) instead of embedded frames.
    use_video_url: bool = False
    use_video_url_fps: float = 1.0

-    # Optional 5-axis task-augmentation taxonomy for the t=0 variants
-    # (EgoMimic-style: synonym / omit_arm / omit_orientation /
-    # omit_grasp_method / combined). Replaces the free-form
-    # ``n_task_rephrasings`` flow when enabled; see ``TaskAugAxesConfig``.
+    # Optional EgoMimic-style 5-axis task augmentation; replaces n_task_rephrasings.
    task_aug_axes: TaskAugAxesConfig = field(default_factory=lambda: TaskAugAxesConfig())


@dataclass
 class TaskAugAxesConfig:
-    """Structured 5-axis augmentation taxonomy for t=0 task variants.
-
-    When ``enabled=True``, replaces the free-form ``n_task_rephrasings``
-    flow with variants along five named axes (EgoMimic-style):
-    ``synonym_paraphrase`` (reword, keep all info), ``omit_arm``,
-    ``omit_orientation``, ``omit_grasp_method``, and ``combined_omissions``
-    (drop two at once).
-
-    Default counts (3+3+2+2+2 = 12) match EgoMimic. Axes with nothing to
-    omit emit fewer entries rather than pad. Each variant becomes a
-    ``task_aug`` row at ``t=0``, identical in style to the free-form ones.
-    """
+    """5-axis t=0 task augmentation (EgoMimic-style): synonym / omit_arm /
+    omit_orientation / omit_grasp_method / combined. Replaces n_task_rephrasings
+    when enabled; each variant becomes a ``task_aug`` row. Axes with nothing to
+    omit emit fewer entries. Defaults (3+3+2+2+2) match EgoMimic."""

    enabled: bool = False

@@ -112,13 +86,11 @@ class InterjectionsConfig:

    enabled: bool = True

-    # Each interjection emits a paired (interjection, speech) event row and
-    # triggers a ``plan`` refresh at the same timestamp.
+    # Each emits a paired (interjection, speech) row + a plan refresh at that ts.
    max_interjections_per_episode: int = 3
    interjection_min_t: float = 2.0

-    # A short frame window centered on the timestamp so the VLM sees the
-    # motion, not one frozen frame.
+    # Frame window centered on the timestamp so the VLM sees motion, not one frame.
    interjection_window_seconds: float = 2.0
    interjection_window_frames: int = 4

@@ -130,14 +102,11 @@ class VqaConfig:
    enabled: bool = True
    vqa_emission_hz: float = 1.0
    K: int = 1
-    """Consecutive frames each emission tick anchors a VQA pair to. The VLM
-    grounds its answer on the FIRST anchored frame, so K>1 copies that answer
-    onto later (moved) frames — stale labels. Default 1 (no smear)."""
+    """Consecutive frames per emission tick. The VLM grounds on the FIRST frame,
+    so K>1 smears stale labels onto moved frames. Default 1 (no smear)."""
    question_types: tuple[str, ...] = ("bbox", "keypoint", "count", "attribute", "spatial")

-    # By default VQA iterates every camera (one pair per camera per tick). Set
-    # True to ground VQA only on ``--vlm.camera_key`` — the single view the
-    # plan / interjection modules use.
+    # True: ground VQA only on --vlm.camera_key (default: every camera).
    restrict_to_default_camera: bool = False


@@ -145,27 +114,22 @@ class VqaConfig:
 class VlmConfig:
    """Shared Qwen-VL client configuration."""

-    # Only ``openai`` is supported (in-process vllm/transformers were removed;
-    # the shipped workflow is HF Jobs). Talks to an OpenAI-compatible vLLM
-    # server, auto-spawned in-job when ``auto_serve=True``. ``stub`` is for tests.
+    # Only ``openai`` (OpenAI-compatible vLLM server, auto-spawned when
+    # auto_serve=True); ``stub`` is for tests.
    backend: str = "openai"
    model_id: str = "Qwen/Qwen3.6-27B"

-    # OpenAI-compatible server endpoint; ``EMPTY`` works for local servers.
+    # OpenAI-compatible endpoint; ``EMPTY`` key works for local servers.
    api_base: str = "http://localhost:8000/v1"
    api_key: str = "EMPTY"

-    # When True with ``backend=openai``, the CLI probes ``api_base`` and
-    # spawns a server if none answers (default: ``transformers serve``).
-    # Set to False to fail fast when pointing at a remote endpoint.
+    # Spawn a server if none answers api_base; False = fail fast on a remote.
    auto_serve: bool = True
    serve_port: int = 8000
-    # Override the auto-serve command. ``{port}`` is substituted per replica
-    # when ``parallel_servers > 1``.
+    # Override the auto-serve command; ``{port}`` substituted per replica.
    serve_command: str | None = None

-    # Independent servers for round-robin routing (each pinned to a GPU,
-    # bound to ``serve_port + i``). ``num_gpus=0`` = one GPU per replica.
+    # Independent servers for round-robin routing (one per GPU). num_gpus=0 = one each.
    parallel_servers: int = 1
    num_gpus: int = 0
    client_concurrency: int = 16
@@ -174,49 +138,37 @@ class VlmConfig:
    max_new_tokens: int = 512
    temperature: float = 0.2

-    # Context length for the auto-spawned vLLM server (None → 32768). vLLM
-    # tuning flags (tensor-parallel size, GPU memory fraction, ...) go in
-    # ``serve_command`` directly, not here.
+    # Auto-serve context length (None → 32768); other vLLM flags go in serve_command.
    max_model_len: int | None = None

-    # Override the camera stream used for keyframe attachment. None picks
-    # the first ``observation.images.*`` key the dataset declares.
+    # Camera for keyframes; None → first ``observation.images.*`` key.
    camera_key: str | None = None
-    # Forwarded as ``extra_body.chat_template_kwargs`` on every chat call;
-    # use to pass model-specific flags such as ``{"enable_thinking": false}``.
+    # Forwarded as extra_body.chat_template_kwargs (e.g. {"enable_thinking": false}).
    chat_template_kwargs: dict[str, Any] | None = None


@dataclass
 class ExecutorConfig:
-    """Executor settings — intra-process episode concurrency only
-    (distributed execution is delegated to Hugging Face Jobs)."""
+    """Executor settings (intra-process episode concurrency; distribution via HF Jobs)."""

-    # Episodes processed concurrently per module phase. Each dispatches 3-5 VLM
-    # calls, so this is the main knob for saturating ``parallel_servers`` /
-    # ``client_concurrency``.
+    # Episodes processed concurrently per phase; main knob for saturating the servers.
    episode_parallelism: int = 16


@dataclass
 class AnnotationPipelineConfig:
-    """Top-level config for ``lerobot-annotate``.
+    """Top-level config for ``lerobot-annotate`` (rewrites data shards in place)."""

-    The writer rewrites ``data/chunk-*/file-*.parquet`` in place. Multiple
-    revisions of the same dataset live in separate copies.
-    """
-
-    # Hub dataset id: download source when ``root`` is unset, and push target
-    # when ``push_to_hub`` is on and ``new_repo_id`` is unset.
+    # Hub dataset: download source when ``root`` unset; push target when push_to_hub
+    # is on and ``new_repo_id`` unset.
    repo_id: str | None = None

-    # Optional separate push target (named to match the LeRobot dataset edit
-    # tools). Unset → push back to ``repo_id`` in place; set → source untouched.
+    # Separate push target (matches the LeRobot edit tools). Unset → push in place.
    new_repo_id: str | None = None

    root: Path | None = None

-    # Defaults to ``<root>/.annotate_staging/`` when unset.
+    # Defaults to ``<root>/.annotate_staging/``.
    staging_dir: Path | None = None

    seed: int = 1729
@@ -231,13 +183,11 @@ class AnnotationPipelineConfig:
    skip_validation: bool = False
    only_episodes: tuple[int, ...] | None = None

-    # Keyframe decode backend. Unset → ffmpeg CLI: decodes AV1 in an isolated
-    # child process, so it's crash- and thread-safe under concurrent decode
-    # (torchcodec SIGSEGVs there). Set ``"torchcodec"`` / ``"pyav"`` to pin one.
+    # Keyframe decode backend. None → ffmpeg CLI (crash-/thread-safe; torchcodec
+    # SIGSEGVs under concurrent decode). Or ``"torchcodec"`` / ``"pyav"``.
    video_backend: str | None = None

-    # Upload the annotated dataset to the Hub (to ``new_repo_id`` if set, else
-    # back to ``repo_id`` — one of the two must be set).
+    # Upload to the Hub (new_repo_id if set, else repo_id; one must be set).
    push_to_hub: bool = False
    push_private: bool = False
    push_commit_message: str | None = None