From b9a01873354c6cc483843d36e935b3a3bb71055d Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 3 Jun 2026 16:28:40 +0200
Subject: [PATCH] =?UTF-8?q?annotate:=20drop=20local=20in-process=20VLM=20b?=
 =?UTF-8?q?ackends=20=E2=80=94=20HF=20Jobs=20(openai)=20only=20for=20now?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The shipped workflow is Hugging Face Jobs
(examples/annotations/run_hf_job.py): it serves the model with vLLM in
the vllm/vllm-openai image, and the pipeline talks to it over the
OpenAI-compatible API. The in-process vllm / transformers local backends
added support surface (and the vllm one pinned an old torch) without
being part of that path, so they're removed for now.

  * vlm_client.make_vlm_client: keep only backend='openai' ('stub' is
    still rejected with the usual guidance). Requesting 'vllm' or
    'transformers' now raises a clear 'not supported for now — use the
    HF Jobs flow' error. Removed _make_vllm_client and
    _make_transformers_client.
  * config: backend docstring updated (openai-only); default model_id
    bumped to Qwen/Qwen3.6-27B to match run_hf_job.
  * docs/annotation_pipeline.mdx: remove the '## Running locally'
    section; the launcher description now says one vLLM server per GPU
    over the OpenAI API, and the 'One Qwen-VL pass' note drops the
    'vLLM/transformers fallback' wording.

Tests are unaffected (they construct StubVlmClient directly; nothing
referenced the removed backends).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/source/annotation_pipeline.mdx           |  43 +-----
 .../annotations/steerable_pipeline/config.py  |  10 +-
 .../steerable_pipeline/vlm_client.py          | 137 +++---------------
 3 files changed, 32 insertions(+), 158 deletions(-)
diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx
index 05e4d103d..7fd27b1f2 100644
--- a/docs/source/annotation_pipeline.mdx
+++ b/docs/source/annotation_pipeline.mdx
@@ -48,38 +48,6 @@ anything already there. Implementations of those tools live under
 `src/lerobot/tools/`; one file per tool, registered via
 `TOOL_REGISTRY`. See the [Tools](./tools) doc for the authoring guide.
 
-## Running locally
-
-Install the extra and invoke the console script. Episode-level
-concurrency comes from `--executor.episode_parallelism` (default 16);
-that is the only knob the in-process executor exposes.
-
-```bash
-uv sync --extra annotations
-uv run lerobot-annotate \
-  --root=/path/to/dataset \
-  --vlm.model_id=Qwen/Qwen2.5-VL-7B-Instruct
-```
-
-The pipeline attaches actual camera footage to every `plan` /
-`interjections` / `vqa` prompt by default, decoded from the dataset's
-first `observation.images.*` stream. Override with
-`--vlm.camera_key=observation.images.<name>` to pin a specific
-viewpoint. Datasets with no video tracks fall back to text-only prompts
-automatically.
-
-**The `plan` module sees the whole episode as one video block.** Subtask
-decomposition gets a `{"type":"video", "video":[<frames>]}` block
-covering the entire demonstration; Qwen-VL pools temporally on its own
-and decides where to cut. There is no keyframe stride or count knob —
-`--plan.max_video_frames` (default 128) only caps the frames packed
-into the video block as a model-capacity bound. The `interjections`
-module attaches a short window of frames straddling the interjection
-timestamp. The `vqa` module grounds each VQA pair on a single frame —
-its `--vqa.K` knob sets how many consecutive frames each emission tick
-anchors, and every anchored frame gets its own VQA pair on that one
-frame (there is no per-pair frame window).
-
 ## Running on Hugging Face Jobs
 
 Distributed annotation is delegated to
@@ -91,10 +59,11 @@ HF_TOKEN=hf_... uv run python examples/annotations/run_hf_job.py
 ```
 
 [`examples/annotations/run_hf_job.py`](https://github.com/huggingface/lerobot/blob/main/examples/annotations/run_hf_job.py)
-spawns one `h200x2` job that:
+spawns a multi-GPU `h200` job that:
 
 1. installs the branch under test plus the annotation extras,
-2. boots two vllm servers (one per GPU) for the chosen model,
+2. boots one vLLM server per GPU (in the `vllm/vllm-openai` image) for the
+   chosen model, which the pipeline drives over the OpenAI-compatible API,
 3. runs the `plan` / `interjections` / `vqa` modules across the dataset
    via `lerobot-annotate`,
 4. uploads the annotated dataset to `--push_to_hub`.
@@ -126,9 +95,9 @@ Two things drive the scope:
    speech) only appear on the exact frame whose timestamp matches the
    emission. The pipeline writes timestamps taken straight from the
    source parquet — no floating-point recomputation.
-2. **One Qwen-VL pass.** All three modules share a single VLM client
-   (vLLM if available, transformers fallback) so the cost is one model
-   load per dataset, not three.
+2. **One Qwen-VL pass.** All three modules share a single VLM client (the
+   OpenAI-compatible client talking to the job's vLLM server) so the cost
+   is one model load per dataset, not three.
 
 ## Module independence and staged reruns
 
diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py
index cdcf38072..470dccfc1 100644
--- a/src/lerobot/annotations/steerable_pipeline/config.py
+++ b/src/lerobot/annotations/steerable_pipeline/config.py
@@ -295,11 +295,13 @@ class VqaConfig:
 class VlmConfig:
     """Shared Qwen-VL client configuration."""
 
-    # One of ``vllm``, ``transformers``, ``openai``, or ``stub`` (tests).
-    # ``openai`` talks to a local OpenAI-compatible server; the CLI
-    # auto-spawns one when ``auto_serve=True``.
+    # Only ``openai`` is supported for now (the in-process ``vllm`` /
+    # ``transformers`` local backends were removed — the shipped workflow
+    # is Hugging Face Jobs). ``openai`` talks to an OpenAI-compatible vLLM
+    # server; the CLI auto-spawns one in-job when ``auto_serve=True``.
+    # ``stub`` is for unit tests (construct ``StubVlmClient`` directly).
     backend: str = "openai"
-    model_id: str = "Qwen/Qwen3.6-35B-A3B-FP8"
+    model_id: str = "Qwen/Qwen3.6-27B"
 
     # OpenAI-compatible server endpoint; ``EMPTY`` works for local servers.
     api_base: str = "http://localhost:8000/v1"
diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
index 8aa7d01c6..d0d9e56a9 100644
--- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py
+++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py
@@ -192,132 +192,35 @@ class _GenericTextClient:
 
 
 def make_vlm_client(config: VlmConfig) -> VlmClient:
-    """Build the shared VLM client per the configured backend.
+    """Build the shared VLM client.
 
-    For ``stub``, callers should construct :class:`StubVlmClient` directly with
-    a responder callable. ``stub`` here is rejected to make accidental misuse
-    obvious.
+    Only the ``openai`` backend is supported for now. The shipped workflow
+    is Hugging Face Jobs (``examples/annotations/run_hf_job.py``): it boots
+    a vLLM server inside the ``vllm/vllm-openai`` image and the pipeline
+    talks to it over the OpenAI-compatible API (``--vlm.backend=openai``,
+    optionally auto-spawning the server via ``auto_serve`` /
+    ``serve_command``). The former in-process ``vllm`` / ``transformers``
+    backends were removed to keep the support surface to the HF Jobs path.
+
+    For ``stub``, construct :class:`StubVlmClient` directly with a responder
+    callable; it is rejected here to make accidental misuse obvious.
     """
+    if config.backend == "openai":
+        return _make_openai_client(config)
     if config.backend == "stub":
         raise ValueError(
             "Use StubVlmClient(...) directly for the stub backend; make_vlm_client builds real clients."
         )
-    if config.backend == "vllm":
-        return _make_vllm_client(config)
-    if config.backend == "transformers":
-        return _make_transformers_client(config)
-    if config.backend == "openai":
-        return _make_openai_client(config)
+    if config.backend in {"vllm", "transformers"}:
+        raise ValueError(
+            f"backend={config.backend!r} (in-process local model) is not supported for now — "
+            "only backend='openai' (the Hugging Face Jobs flow) is. Run the pipeline via "
+            "examples/annotations/run_hf_job.py, which serves the model with vLLM in the "
+            "vllm/vllm-openai image and talks to it over the OpenAI-compatible API."
+        )
     raise ValueError(f"Unknown VLM backend: {config.backend!r}")
 
 
-def _make_vllm_client(config: VlmConfig) -> VlmClient:
-    try:
-        from vllm import LLM, SamplingParams  # type: ignore[import-not-found]
-    except ImportError as exc:
-        raise ImportError(
-            "vllm is required for backend='vllm'. Install it separately with "
-            "`pip install vllm` (it is not a hard dependency of the "
-            "``annotations`` extra because it pins an older torch). The HF "
-            "Jobs launcher uses the vllm/vllm-openai image + backend='openai' "
-            "instead."
-        ) from exc
-    # Workaround for cuDNN 9.x + torch 2.8 conv3d regression that surfaces
-    # as CUDNN_STATUS_NOT_INITIALIZED in Qwen-VL vision-tower patch
-    # embedders. Setting LEROBOT_DISABLE_CUDNN=1 forces native PyTorch
-    # convolution kernels — slower but functional.
-    if os.environ.get("LEROBOT_DISABLE_CUDNN", "").lower() in {"1", "true", "yes"}:
-        import torch as _torch  # noqa: PLC0415  - optional GPU dep, deferred
-
-        _torch.backends.cudnn.enabled = False
-    llm_kwargs: dict[str, Any] = {
-        "model": config.model_id,
-        "tensor_parallel_size": config.tensor_parallel_size,
-        "gpu_memory_utilization": config.gpu_memory_utilization,
-        "trust_remote_code": config.trust_remote_code,
-    }
-    if config.max_model_len is not None:
-        llm_kwargs["max_model_len"] = config.max_model_len
-    llm = LLM(**llm_kwargs)
-
-    def _gen(batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float) -> list[str]:
-        # ``guided_decoding`` would speed up parsing but its API differs across
-        # vllm releases (dict vs GuidedDecodingParams). The _GenericTextClient
-        # wrapper already has a one-retry JSON-recovery path, so we skip it.
-        params = SamplingParams(max_tokens=max_tok, temperature=temp)
-        # ``llm.chat`` handles chat-template application + multimodal input
-        # extraction (image/video blocks) internally, which ``llm.generate``
-        # does not.
-        outputs = llm.chat([list(m) for m in batch], params)
-        return [o.outputs[0].text for o in outputs]
-
-    return _GenericTextClient(_gen, config)
-
-
-def _make_transformers_client(config: VlmConfig) -> VlmClient:
-    try:
-        import torch  # type: ignore[import-not-found]
-        import transformers  # type: ignore[import-not-found]
-        from transformers import AutoProcessor  # type: ignore[import-not-found]
-    except ImportError as exc:
-        raise ImportError("transformers + torch are required for backend='transformers'.") from exc
-    auto_cls = getattr(transformers, "AutoModelForImageTextToText", None) or getattr(
-        transformers, "AutoModelForVision2Seq", None
-    )
-    if auto_cls is None:
-        raise ImportError(
-            "Neither AutoModelForImageTextToText nor AutoModelForVision2Seq is available in this "
-            "transformers version. Install transformers>=4.45 (which has AutoModelForImageTextToText) "
-            "for VL models."
-        )
-    processor = AutoProcessor.from_pretrained(config.model_id, trust_remote_code=config.trust_remote_code)
-    use_accelerate = os.environ.get("LEROBOT_TRANSFORMERS_DEVICE_MAP", "manual") != "manual"
-    # ``device_map='auto'`` triggers a known std::bad_alloc on the Qwen3-VL
-    # post-load dispatch path (the alloc fails in accelerate's hook setup
-    # even with TBs of host RAM). Default to manual: load on CPU with
-    # ``low_cpu_mem_usage=True``, then ``.to("cuda")``. Set
-    # ``LEROBOT_TRANSFORMERS_DEVICE_MAP=auto`` to opt back into the old path.
-    if use_accelerate:
-        model = auto_cls.from_pretrained(
-            config.model_id,
-            torch_dtype="auto",
-            device_map="auto",
-            low_cpu_mem_usage=True,
-            trust_remote_code=config.trust_remote_code,
-        )
-    else:
-        import torch as _torch  # noqa: PLC0415  - optional GPU dep, deferred
-
-        model = auto_cls.from_pretrained(
-            config.model_id,
-            torch_dtype=_torch.bfloat16,
-            low_cpu_mem_usage=True,
-            trust_remote_code=config.trust_remote_code,
-        )
-        model = model.to("cuda")
-    model.eval()
-
-    def _gen(batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float) -> list[str]:
-        outs: list[str] = []
-        for messages in batch:
-            text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-            inputs = processor(text=[text], return_tensors="pt").to(model.device)
-            with torch.no_grad():
-                gen = model.generate(
-                    **inputs,
-                    max_new_tokens=max_tok,
-                    temperature=temp,
-                    do_sample=temp > 0.0,
-                )
-            decoded = processor.batch_decode(
-                gen[:, inputs["input_ids"].shape[-1] :], skip_special_tokens=True
-            )[0]
-            outs.append(decoded)
-        return outs
-
-    return _GenericTextClient(_gen, config)
-
-
 def _make_openai_client(config: VlmConfig) -> VlmClient:
     """Backend that talks to any OpenAI-compatible server.