From b9a01873354c6cc483843d36e935b3a3bb71055d Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 3 Jun 2026 16:28:40 +0200 Subject: [PATCH] =?UTF-8?q?annotate:=20drop=20local=20in-process=20VLM=20b?= =?UTF-8?q?ackends=20=E2=80=94=20HF=20Jobs=20(openai)=20only=20for=20now?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The shipped workflow is Hugging Face Jobs (examples/annotations/run_hf_job.py): it serves the model with vLLM in the vllm/vllm-openai image and the pipeline talks to it over the OpenAI-compatible API. The in-process vllm / transformers local backends added surface (and the vllm one pinned an old torch) without being part of that path, so they're removed for now. * vlm_client.make_vlm_client: keep only backend='openai' (+ 'stub' rejected with the usual guidance). Requesting 'vllm'/'transformers' now raises a clear 'not supported for now — use the HF Jobs flow' error. Removed _make_vllm_client and _make_transformers_client. * config: backend docstring updated (openai-only); default model_id bumped to Qwen/Qwen3.6-27B to match run_hf_job. * docs/annotation_pipeline.mdx: remove the '## Running locally' section; the launcher description now says one vLLM server per GPU over the OpenAI API, and the 'One Qwen-VL pass' note drops the 'vLLM/transformers fallback' wording. Tests are unaffected (they construct StubVlmClient directly; nothing referenced the removed backends). Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/source/annotation_pipeline.mdx | 43 +----- .../annotations/steerable_pipeline/config.py | 10 +- .../steerable_pipeline/vlm_client.py | 137 +++--------------- 3 files changed, 32 insertions(+), 158 deletions(-) diff --git a/docs/source/annotation_pipeline.mdx b/docs/source/annotation_pipeline.mdx index 05e4d103d..7fd27b1f2 100644 --- a/docs/source/annotation_pipeline.mdx +++ b/docs/source/annotation_pipeline.mdx @@ -48,38 +48,6 @@ anything already there. 
Implementations of those tools live under `src/lerobot/tools/`; one file per tool, registered via `TOOL_REGISTRY`. See the [Tools](./tools) doc for the authoring guide. -## Running locally - -Install the extra and invoke the console script. Episode-level -concurrency comes from `--executor.episode_parallelism` (default 16); -that is the only knob the in-process executor exposes. - -```bash -uv sync --extra annotations -uv run lerobot-annotate \ - --root=/path/to/dataset \ - --vlm.model_id=Qwen/Qwen2.5-VL-7B-Instruct -``` - -The pipeline attaches actual camera footage to every `plan` / -`interjections` / `vqa` prompt by default, decoded from the dataset's -first `observation.images.*` stream. Override with -`--vlm.camera_key=observation.images.` to pin a specific -viewpoint. Datasets with no video tracks fall back to text-only prompts -automatically. - -**The `plan` module sees the whole episode as one video block.** Subtask -decomposition gets a `{"type":"video", "video":[]}` block -covering the entire demonstration; Qwen-VL pools temporally on its own -and decides where to cut. There is no keyframe stride or count knob — -`--plan.max_video_frames` (default 128) only caps the frames packed -into the video block as a model-capacity bound. The `interjections` -module attaches a short window of frames straddling the interjection -timestamp. The `vqa` module grounds each VQA pair on a single frame — -its `--vqa.K` knob sets how many consecutive frames each emission tick -anchors, and every anchored frame gets its own VQA pair on that one -frame (there is no per-pair frame window). - ## Running on Hugging Face Jobs Distributed annotation is delegated to @@ -91,10 +59,11 @@ HF_TOKEN=hf_... uv run python examples/annotations/run_hf_job.py ``` [`examples/annotations/run_hf_job.py`](https://github.com/huggingface/lerobot/blob/main/examples/annotations/run_hf_job.py) -spawns one `h200x2` job that: +spawns a multi-GPU `h200` job that: 1. 
installs the branch under test plus the annotation extras, -2. boots two vllm servers (one per GPU) for the chosen model, +2. boots one vLLM server per GPU (in the `vllm/vllm-openai` image) for the + chosen model, which the pipeline drives over the OpenAI-compatible API, 3. runs the `plan` / `interjections` / `vqa` modules across the dataset via `lerobot-annotate`, 4. uploads the annotated dataset to `--push_to_hub`. @@ -126,9 +95,9 @@ Two things drive the scope: speech) only appear on the exact frame whose timestamp matches the emission. The pipeline writes timestamps taken straight from the source parquet — no floating-point recomputation. -2. **One Qwen-VL pass.** All three modules share a single VLM client - (vLLM if available, transformers fallback) so the cost is one model - load per dataset, not three. +2. **One Qwen-VL pass.** All three modules share a single VLM client (the + OpenAI-compatible client talking to the job's vLLM server) so the cost + is one model load per dataset, not three. ## Module independence and staged reruns diff --git a/src/lerobot/annotations/steerable_pipeline/config.py b/src/lerobot/annotations/steerable_pipeline/config.py index cdcf38072..470dccfc1 100644 --- a/src/lerobot/annotations/steerable_pipeline/config.py +++ b/src/lerobot/annotations/steerable_pipeline/config.py @@ -295,11 +295,13 @@ class VqaConfig: class VlmConfig: """Shared Qwen-VL client configuration.""" - # One of ``vllm``, ``transformers``, ``openai``, or ``stub`` (tests). - # ``openai`` talks to a local OpenAI-compatible server; the CLI - # auto-spawns one when ``auto_serve=True``. + # Only ``openai`` is supported for now (the in-process ``vllm`` / + # ``transformers`` local backends were removed — the shipped workflow + # is Hugging Face Jobs). ``openai`` talks to an OpenAI-compatible vLLM + # server; the CLI auto-spawns one in-job when ``auto_serve=True``. + # ``stub`` is for unit tests (construct ``StubVlmClient`` directly). 
backend: str = "openai" - model_id: str = "Qwen/Qwen3.6-35B-A3B-FP8" + model_id: str = "Qwen/Qwen3.6-27B" # OpenAI-compatible server endpoint; ``EMPTY`` works for local servers. api_base: str = "http://localhost:8000/v1" diff --git a/src/lerobot/annotations/steerable_pipeline/vlm_client.py b/src/lerobot/annotations/steerable_pipeline/vlm_client.py index 8aa7d01c6..d0d9e56a9 100644 --- a/src/lerobot/annotations/steerable_pipeline/vlm_client.py +++ b/src/lerobot/annotations/steerable_pipeline/vlm_client.py @@ -192,132 +192,35 @@ class _GenericTextClient: def make_vlm_client(config: VlmConfig) -> VlmClient: - """Build the shared VLM client per the configured backend. + """Build the shared VLM client. - For ``stub``, callers should construct :class:`StubVlmClient` directly with - a responder callable. ``stub`` here is rejected to make accidental misuse - obvious. + Only the ``openai`` backend is supported for now. The shipped workflow + is Hugging Face Jobs (``examples/annotations/run_hf_job.py``): it boots + a vLLM server inside the ``vllm/vllm-openai`` image and the pipeline + talks to it over the OpenAI-compatible API (``--vlm.backend=openai``, + optionally auto-spawning the server via ``auto_serve`` / + ``serve_command``). The former in-process ``vllm`` / ``transformers`` + backends were removed to keep the support surface to the HF Jobs path. + + For ``stub``, construct :class:`StubVlmClient` directly with a responder + callable; it is rejected here to make accidental misuse obvious. """ + if config.backend == "openai": + return _make_openai_client(config) if config.backend == "stub": raise ValueError( "Use StubVlmClient(...) directly for the stub backend; make_vlm_client builds real clients." 
) - if config.backend == "vllm": - return _make_vllm_client(config) - if config.backend == "transformers": - return _make_transformers_client(config) - if config.backend == "openai": - return _make_openai_client(config) + if config.backend in {"vllm", "transformers"}: + raise ValueError( + f"backend={config.backend!r} (in-process local model) is not supported for now — " + "only backend='openai' (the Hugging Face Jobs flow) is. Run the pipeline via " + "examples/annotations/run_hf_job.py, which serves the model with vLLM in the " + "vllm/vllm-openai image and talks to it over the OpenAI-compatible API." + ) raise ValueError(f"Unknown VLM backend: {config.backend!r}") -def _make_vllm_client(config: VlmConfig) -> VlmClient: - try: - from vllm import LLM, SamplingParams # type: ignore[import-not-found] - except ImportError as exc: - raise ImportError( - "vllm is required for backend='vllm'. Install it separately with " - "`pip install vllm` (it is not a hard dependency of the " - "``annotations`` extra because it pins an older torch). The HF " - "Jobs launcher uses the vllm/vllm-openai image + backend='openai' " - "instead." - ) from exc - # Workaround for cuDNN 9.x + torch 2.8 conv3d regression that surfaces - # as CUDNN_STATUS_NOT_INITIALIZED in Qwen-VL vision-tower patch - # embedders. Setting LEROBOT_DISABLE_CUDNN=1 forces native PyTorch - # convolution kernels — slower but functional. 
- if os.environ.get("LEROBOT_DISABLE_CUDNN", "").lower() in {"1", "true", "yes"}: - import torch as _torch # noqa: PLC0415 - optional GPU dep, deferred - - _torch.backends.cudnn.enabled = False - llm_kwargs: dict[str, Any] = { - "model": config.model_id, - "tensor_parallel_size": config.tensor_parallel_size, - "gpu_memory_utilization": config.gpu_memory_utilization, - "trust_remote_code": config.trust_remote_code, - } - if config.max_model_len is not None: - llm_kwargs["max_model_len"] = config.max_model_len - llm = LLM(**llm_kwargs) - - def _gen(batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float) -> list[str]: - # ``guided_decoding`` would speed up parsing but its API differs across - # vllm releases (dict vs GuidedDecodingParams). The _GenericTextClient - # wrapper already has a one-retry JSON-recovery path, so we skip it. - params = SamplingParams(max_tokens=max_tok, temperature=temp) - # ``llm.chat`` handles chat-template application + multimodal input - # extraction (image/video blocks) internally, which ``llm.generate`` - # does not. - outputs = llm.chat([list(m) for m in batch], params) - return [o.outputs[0].text for o in outputs] - - return _GenericTextClient(_gen, config) - - -def _make_transformers_client(config: VlmConfig) -> VlmClient: - try: - import torch # type: ignore[import-not-found] - import transformers # type: ignore[import-not-found] - from transformers import AutoProcessor # type: ignore[import-not-found] - except ImportError as exc: - raise ImportError("transformers + torch are required for backend='transformers'.") from exc - auto_cls = getattr(transformers, "AutoModelForImageTextToText", None) or getattr( - transformers, "AutoModelForVision2Seq", None - ) - if auto_cls is None: - raise ImportError( - "Neither AutoModelForImageTextToText nor AutoModelForVision2Seq is available in this " - "transformers version. Install transformers>=4.45 (which has AutoModelForImageTextToText) " - "for VL models." 
- ) - processor = AutoProcessor.from_pretrained(config.model_id, trust_remote_code=config.trust_remote_code) - use_accelerate = os.environ.get("LEROBOT_TRANSFORMERS_DEVICE_MAP", "manual") != "manual" - # ``device_map='auto'`` triggers a known std::bad_alloc on the Qwen3-VL - # post-load dispatch path (the alloc fails in accelerate's hook setup - # even with TBs of host RAM). Default to manual: load on CPU with - # ``low_cpu_mem_usage=True``, then ``.to("cuda")``. Set - # ``LEROBOT_TRANSFORMERS_DEVICE_MAP=auto`` to opt back into the old path. - if use_accelerate: - model = auto_cls.from_pretrained( - config.model_id, - torch_dtype="auto", - device_map="auto", - low_cpu_mem_usage=True, - trust_remote_code=config.trust_remote_code, - ) - else: - import torch as _torch # noqa: PLC0415 - optional GPU dep, deferred - - model = auto_cls.from_pretrained( - config.model_id, - torch_dtype=_torch.bfloat16, - low_cpu_mem_usage=True, - trust_remote_code=config.trust_remote_code, - ) - model = model.to("cuda") - model.eval() - - def _gen(batch: Sequence[Sequence[dict[str, Any]]], max_tok: int, temp: float) -> list[str]: - outs: list[str] = [] - for messages in batch: - text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) - inputs = processor(text=[text], return_tensors="pt").to(model.device) - with torch.no_grad(): - gen = model.generate( - **inputs, - max_new_tokens=max_tok, - temperature=temp, - do_sample=temp > 0.0, - ) - decoded = processor.batch_decode( - gen[:, inputs["input_ids"].shape[-1] :], skip_special_tokens=True - )[0] - outs.append(decoded) - return outs - - return _GenericTextClient(_gen, config) - - def _make_openai_client(config: VlmConfig) -> VlmClient: """Backend that talks to any OpenAI-compatible server.