Files
lerobot-clone/tests/annotations/test_vocabulary.py

413 lines
16 KiB
Python
Raw Normal View History

feat(annotate): phase 0 — derive canonical vocabulary from sample episodes The pipeline previously emitted near-unique subtask + memory phrasings per episode (free-form LLM rephrasing). On the downstream low-level policy that collapses the action expert's conditioning to noise: every episode pairs a different paraphrase with similar motions, so the expert learns a flat scene-prior that ignores the subtask string — then at inference the high-level head invents *yet another* paraphrase and the expert produces tiny "uncertain hover" chunks. Add a vocabulary-discovery phase (phase 0) that runs once per dataset: - watches the first ``vocabulary.sample_episodes`` (default 3) episode videos as one Qwen-VL prompt, - asks the VLM to derive ~``n_subtask_target`` canonical imperative subtask labels and ~``n_memory_target`` first-person past-tense memory milestones that recur across the demos, - persists them to ``meta/canonical_vocabulary.json`` (human- inspectable, hand-editable), and - wires the resulting ``Vocabulary`` into the ``plan`` module so every per-episode subtask + memory call is constrained to those exact strings (both as prompt-side instructions *and* post-VLM validation: paraphrases snap to the closest canonical entry via token-set overlap; below a 0.5 Jaccard floor the subtask is dropped rather than warped into something semantically wrong). Operator workflow: - first run discovers the vocabulary, writes the JSON, and runs the ``plan`` module against it, - subsequent runs reuse the on-disk file (``reuse_existing=True`` default) so hand-edits stick, - set ``--vocabulary.enabled=False`` to fall back to free-form generation (the original behaviour). The discovery prompt forbids gerunds / third-person / adverbs and caps the lists to the requested counts, matching the Hi-Robot / π0.6-MEM convention of small per-environment vocabularies. 
The ``plan`` module's subtask + memory prompts grow a conditional ``{vocabulary_block}`` slot rendered only when a vocabulary is present; without one the templates collapse to their previous free-form form. Tests: 11 new unit tests under tests/annotations/test_vocabulary.py cover the on-disk round-trip, discovery against the fixture dataset, ``reuse_existing`` short-circuit, paraphrase canonicalisation, off-vocab subtask dropping, and the no-vocabulary pass-through path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-22 11:40:05 +00:00
#!/usr/bin/env python
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Vocabulary-discovery phase (phase 0) tests."""
from __future__ import annotations
import json
from pathlib import Path
from lerobot.annotations.steerable_pipeline.config import (
PlanConfig,
VocabularyConfig,
)
from lerobot.annotations.steerable_pipeline.modules import PlanSubtasksMemoryModule
from lerobot.annotations.steerable_pipeline.reader import iter_episodes
from lerobot.annotations.steerable_pipeline.staging import EpisodeStaging
from lerobot.annotations.steerable_pipeline.vocabulary import (
Vocabulary,
VocabularyDiscoveryModule,
load_vocabulary,
save_vocabulary,
vocabulary_path,
)
from ._helpers import make_canned_responder
# Canonical subtask labels shared by the tests in this module.
_CANONICAL_SUBTASKS = ("grasp blue cube", "place blue cube in box", "retract arm")
# Canonical memory milestones shared by the tests in this module.
_CANONICAL_MEMORY = ("I picked up the blue cube.", "I placed the blue cube in the box.")
# ---------------------------------------------------------------------------
# Vocabulary dataclass + on-disk round-trip
# ---------------------------------------------------------------------------
def test_vocabulary_roundtrip(tmp_path: Path) -> None:
    """Saving a vocabulary and loading it back yields the same entries."""
    original = Vocabulary(
        subtasks=_CANONICAL_SUBTASKS,
        memory_milestones=_CANONICAL_MEMORY,
    )

    written_to = save_vocabulary(tmp_path, original)
    # The returned path must be the one ``vocabulary_path`` reports, and
    # the file must actually be on disk.
    assert written_to == vocabulary_path(tmp_path)
    assert written_to.exists()

    restored = load_vocabulary(tmp_path)
    assert restored is not None
    assert restored.subtasks == _CANONICAL_SUBTASKS
    assert restored.memory_milestones == _CANONICAL_MEMORY
def test_vocabulary_load_missing_returns_none(tmp_path: Path) -> None:
    """Loading from a root where nothing was saved returns ``None``."""
    loaded = load_vocabulary(tmp_path)
    assert loaded is None
def test_vocabulary_load_malformed_returns_none(tmp_path: Path) -> None:
    """A vocabulary file that is not valid JSON loads as ``None``."""
    target = vocabulary_path(tmp_path)
    # Create the parent directory first so the write below cannot fail.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("{ not valid json", encoding="utf-8")

    loaded = load_vocabulary(tmp_path)
    assert loaded is None
def test_vocabulary_load_empty_payload_returns_none(tmp_path: Path) -> None:
    """A well-formed file whose two lists are both empty loads as ``None``."""
    target = vocabulary_path(tmp_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Valid JSON with the expected keys, but no entries under either one.
    empty_payload = {"subtasks": [], "memory_milestones": []}
    target.write_text(json.dumps(empty_payload), encoding="utf-8")

    loaded = load_vocabulary(tmp_path)
    assert loaded is None
# ---------------------------------------------------------------------------
# Discovery module
# ---------------------------------------------------------------------------
def test_vocabulary_discovery_calls_vlm_and_returns_vocab(
    fixture_dataset_root: Path,
) -> None:
    """Discovery returns the subtasks and milestones from the canned VLM reply."""
    canned_reply = {
        "subtasks": list(_CANONICAL_SUBTASKS),
        "memory_milestones": list(_CANONICAL_MEMORY),
    }
    # NOTE(review): the "canonical vocabulary" key is presumably matched
    # against the discovery prompt by the helper — confirm in ``_helpers``.
    vlm = make_canned_responder({"canonical vocabulary": canned_reply})
    config = VocabularyConfig(sample_episodes=2)
    module = VocabularyDiscoveryModule(vlm=vlm, config=config)

    episodes = list(iter_episodes(fixture_dataset_root))
    discovered = module.discover(episodes)

    assert discovered is not None
    assert discovered.subtasks == _CANONICAL_SUBTASKS
    assert discovered.memory_milestones == _CANONICAL_MEMORY
def test_vocabulary_discovery_reuses_existing(fixture_dataset_root: Path) -> None:
    """``reuse_existing=True`` short-circuits the VLM call entirely."""
    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient

    def _fail_if_called(_messages):  # pragma: no cover - must not be called
        raise AssertionError("VLM should not be invoked when reusing existing vocabulary")

    config = VocabularyConfig(reuse_existing=True)
    module = VocabularyDiscoveryModule(
        vlm=StubVlmClient(responder=_fail_if_called),
        config=config,
    )
    episodes = list(iter_episodes(fixture_dataset_root))
    existing = Vocabulary(subtasks=("a", "b"), memory_milestones=("I a.",))

    result = module.discover(episodes, existing=existing)

    # The very same object must come back, not an equal copy.
    assert result is existing
def test_vocabulary_discovery_empty_payload_returns_none(
    fixture_dataset_root: Path,
) -> None:
    """A canned VLM reply with two empty lists makes discovery return ``None``."""
    empty_reply = {"subtasks": [], "memory_milestones": []}
    vlm = make_canned_responder({"canonical vocabulary": empty_reply})
    module = VocabularyDiscoveryModule(vlm=vlm, config=VocabularyConfig())

    episodes = list(iter_episodes(fixture_dataset_root))
    discovered = module.discover(episodes)

    assert discovered is None
# ---------------------------------------------------------------------------
# PlanSubtasksMemoryModule consumes the vocabulary
# ---------------------------------------------------------------------------
def test_plan_module_inlines_vocab_into_subtask_prompt(
    fixture_dataset_root: Path, tmp_path: Path
) -> None:
    """Prompts sent to the VLM contain the canonical-label header and a label."""
    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient

    seen_prompts: list[str] = []

    def responder(messages):
        # Stash the text of every text block in every message so the
        # assertions below can inspect what the module sent.
        for message in messages:
            content = message.get("content")
            if not isinstance(content, list):
                continue
            seen_prompts.extend(
                block.get("text", "")
                for block in content
                if isinstance(block, dict) and block.get("type") == "text"
            )
        # Reply with two spans whose text is taken verbatim from the
        # canonical subtask list so the validator accepts them.
        return {
            "subtasks": [
                {"text": "grasp blue cube", "start": 0.0, "end": 0.4},
                {"text": "place blue cube in box", "start": 0.4, "end": 0.9},
            ]
        }

    vocab = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY)
    module = PlanSubtasksMemoryModule(
        vlm=StubVlmClient(responder=responder),
        config=PlanConfig(n_task_rephrasings=0),
        vocabulary=vocab,
    )
    record = next(iter_episodes(fixture_dataset_root))
    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)

    module.run_episode(record, staging)

    # At least one captured prompt must carry the canonical list header,
    # and at least one must mention a canonical label.
    assert any("Canonical subtask labels:" in text for text in seen_prompts)
    assert any("grasp blue cube" in text for text in seen_prompts)
fix(annotate): replace fuzzy subtask snapping with strict match + one-shot retry The Jaccard-overlap snap was warping VLM output into wrong canonical labels — e.g. an off-vocab "consult the wizard" span would silently become "grasp blue cube" if that scored highest. Even with a higher floor the operator can't tell which subtasks were paraphrases vs genuine mislabels in the resulting dataset. Replace with strict exact-match validation + a single targeted retry: 1. Generate subtasks as before. 2. If any returned subtask's normalised form (lowercased, articles stripped, whitespace collapsed) isn't in the canonical vocab, fire one retry call naming the offending strings and re-sending the full canonical list. The retry prompt requires byte-identical output from the vocab. 3. After the retry, validate again. Spans still off-vocab are dropped — no fuzzy snapping ever produces a different canonical label than the VLM actually emitted. 4. If every span ends up off-vocab even after the retry, warn loudly so the operator extends ``meta/canonical_vocabulary.json`` to cover the missing phase. The episode is left with empty subtasks rather than silently fabricated ones — visibility > sweep-under- the-rug. Promote ``_NORMALIZE_STRIP_TOKENS`` to a class constant and split the normalisation helper out so the retry-validation and the final canonicalisation share one source of truth. Tests: - test_plan_module_accepts_article_only_difference: "grasp the blue cube" still maps to canonical "grasp blue cube" (article-tolerant). - test_plan_module_retries_when_subtask_off_vocab: paraphrase triggers the retry which the VLM corrects in pass 2. - test_plan_module_drops_off_vocab_subtask_after_retry: VLM that refuses to correct → bad span dropped, in-vocab span kept. - test_plan_module_empty_when_all_off_vocab_after_retry: every span off-vocab → episode left empty (no warping). All 13 vocabulary tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-23 09:57:27 +00:00
def test_plan_module_accepts_article_only_difference(
    fixture_dataset_root: Path, tmp_path: Path
) -> None:
    """Articles like 'the'/'a'/'an' are stripped during validation."""
    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient

    def responder(_messages):
        # Same canonical phrase modulo "the" — should be accepted.
        span = {"text": "grasp the blue cube", "start": 0.0, "end": 0.4}
        return {"subtasks": [span]}

    vocab = Vocabulary(subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY)
    module = PlanSubtasksMemoryModule(
        vlm=StubVlmClient(responder=responder),
        config=PlanConfig(n_task_rephrasings=0),
        vocabulary=vocab,
    )
    record = next(iter_episodes(fixture_dataset_root))
    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)

    module.run_episode(record, staging)

    # Only rows staged with the "subtask" style are of interest here.
    subtask_texts = []
    for row in staging.read("plan"):
        if row["style"] == "subtask":
            subtask_texts.append(row["content"])

    # The stored text is the canonical form, without the article.
    assert subtask_texts == ["grasp blue cube"]
feat(annotate): phase 0 — derive canonical vocabulary from sample episodes The pipeline previously emitted near-unique subtask + memory phrasings per episode (free-form LLM rephrasing). On the downstream low-level policy that collapses the action expert's conditioning to noise: every episode pairs a different paraphrase with similar motions, so the expert learns a flat scene-prior that ignores the subtask string — then at inference the high-level head invents *yet another* paraphrase and the expert produces tiny "uncertain hover" chunks. Add a vocabulary-discovery phase (phase 0) that runs once per dataset: - watches the first ``vocabulary.sample_episodes`` (default 3) episode videos as one Qwen-VL prompt, - asks the VLM to derive ~``n_subtask_target`` canonical imperative subtask labels and ~``n_memory_target`` first-person past-tense memory milestones that recur across the demos, - persists them to ``meta/canonical_vocabulary.json`` (human- inspectable, hand-editable), and - wires the resulting ``Vocabulary`` into the ``plan`` module so every per-episode subtask + memory call is constrained to those exact strings (both as prompt-side instructions *and* post-VLM validation: paraphrases snap to the closest canonical entry via token-set overlap; below a 0.5 Jaccard floor the subtask is dropped rather than warped into something semantically wrong). Operator workflow: - first run discovers the vocabulary, writes the JSON, and runs the ``plan`` module against it, - subsequent runs reuse the on-disk file (``reuse_existing=True`` default) so hand-edits stick, - set ``--vocabulary.enabled=False`` to fall back to free-form generation (the original behaviour). The discovery prompt forbids gerunds / third-person / adverbs and caps the lists to the requested counts, matching the Hi-Robot / π0.6-MEM convention of small per-environment vocabularies. 
The ``plan`` module's subtask + memory prompts grow a conditional ``{vocabulary_block}`` slot rendered only when a vocabulary is present; without one the templates collapse to their previous free-form form. Tests: 11 new unit tests under tests/annotations/test_vocabulary.py cover the on-disk round-trip, discovery against the fixture dataset, ``reuse_existing`` short-circuit, paraphrase canonicalisation, off- vocab subtask dropping, and the no-vocabulary pass-through path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-22 11:40:05 +00:00
fix(annotate): replace fuzzy subtask snapping with strict match + one-shot retry The Jaccard-overlap snap was warping VLM output into wrong canonical labels — e.g. an off-vocab "consult the wizard" span would silently become "grasp blue cube" if that scored highest. Even with a higher floor the operator can't tell which subtasks were paraphrases vs genuine mislabels in the resulting dataset. Replace with strict exact-match validation + a single targeted retry: 1. Generate subtasks as before. 2. If any returned subtask's normalised form (lowercased, articles stripped, whitespace collapsed) isn't in the canonical vocab, fire one retry call naming the offending strings and re-sending the full canonical list. The retry prompt requires byte-identical output from the vocab. 3. After the retry, validate again. Spans still off-vocab are dropped — no fuzzy snapping ever produces a different canonical label than the VLM actually emitted. 4. If every span ends up off-vocab even after the retry, warn loudly so the operator extends ``meta/canonical_vocabulary.json`` to cover the missing phase. The episode is left with empty subtasks rather than silently fabricated ones — visibility > sweep-under- the-rug. Promote ``_NORMALIZE_STRIP_TOKENS`` to a class constant and split the normalisation helper out so the retry-validation and the final canonicalisation share one source of truth. Tests: - test_plan_module_accepts_article_only_difference: "grasp the blue cube" still maps to canonical "grasp blue cube" (article-tolerant). - test_plan_module_retries_when_subtask_off_vocab: paraphrase triggers the retry which the VLM corrects in pass 2. - test_plan_module_drops_off_vocab_subtask_after_retry: VLM that refuses to correct → bad span dropped, in-vocab span kept. - test_plan_module_empty_when_all_off_vocab_after_retry: every span off-vocab → episode left empty (no warping). All 13 vocabulary tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-23 09:57:27 +00:00
def test_plan_module_retries_when_subtask_off_vocab(
    fixture_dataset_root: Path, tmp_path: Path
) -> None:
    """An off-vocab paraphrase is corrected to the canonical label by one retry.

    The stub VLM answers its first call with a paraphrase that is not in the
    canonical vocabulary and its second call with the exact canonical
    phrase. The test then checks that only the canonical subtask row was
    staged and that the responder was invoked exactly twice.
    """
    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient

    calls = {"n": 0}

    def _latest_text(messages):
        # Walk every message and remember the most recent piece of text,
        # whether the content is a plain string or a list of typed blocks.
        latest = ""
        for message in messages:
            content = message.get("content")
            if isinstance(content, str):
                latest = content
                continue
            if not isinstance(content, list):
                continue
            for block in content:
                if isinstance(block, dict) and block.get("type") == "text":
                    latest = block.get("text", "")
        return latest

    def responder(messages):
        calls["n"] += 1
        if calls["n"] != 1:
            # Any call after the first is the retry: it should carry the
            # correction prompt, and we answer with the canonical phrase.
            assert "NOT in the canonical vocabulary" in _latest_text(messages)
            return {
                "subtasks": [
                    {"text": "grasp blue cube", "start": 0.0, "end": 0.4},
                ]
            }
        # First call: a paraphrase that is not in the vocabulary.
        return {
            "subtasks": [
                {"text": "pick up blue cube", "start": 0.0, "end": 0.4},
            ]
        }

    module = PlanSubtasksMemoryModule(
        vlm=StubVlmClient(responder=responder),
        config=PlanConfig(n_task_rephrasings=0),
        vocabulary=Vocabulary(
            subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY
        ),
    )
    record = next(iter_episodes(fixture_dataset_root))
    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
    module.run_episode(record, staging)

    subtask_texts = [
        row["content"] for row in staging.read("plan") if row["style"] == "subtask"
    ]
    assert subtask_texts == ["grasp blue cube"]
    # The retry must have fired exactly once.
    assert calls["n"] == 2
def test_plan_module_drops_off_vocab_subtask_after_retry(
    fixture_dataset_root: Path, tmp_path: Path
) -> None:
    """A span that is still off-vocab after the retry is dropped; the rest survive.

    The stub VLM returns the same two spans on every call: one off-vocab
    phrase and one canonical phrase. The test checks that the responder was
    invoked twice and that only the canonical subtask row was staged.
    """
    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient

    calls = {"n": 0}

    def responder(_messages):
        calls["n"] += 1
        # Every call yields the same answer: the model cannot be corrected.
        # The in-vocab span keeps the episode non-empty, so the test can
        # show the off-vocab span is dropped without touching the good one.
        # A fresh dict is built per call so no state is shared between the
        # first pass and the retry.
        return {
            "subtasks": [
                {"text": "perform a fancy macarena dance", "start": 0.0, "end": 0.4},
                {"text": "grasp blue cube", "start": 0.4, "end": 0.9},
            ]
        }

    module = PlanSubtasksMemoryModule(
        vlm=StubVlmClient(responder=responder),
        config=PlanConfig(n_task_rephrasings=0),
        vocabulary=Vocabulary(
            subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY
        ),
    )
    record = next(iter_episodes(fixture_dataset_root))
    staging = EpisodeStaging(tmp_path / "stage", record.episode_index)
    module.run_episode(record, staging)

    subtask_texts = [
        row["content"] for row in staging.read("plan") if row["style"] == "subtask"
    ]
    # Retry fired exactly once; bad span dropped, good span kept.
    assert calls["n"] == 2
    assert subtask_texts == ["grasp blue cube"]
fix(annotate): bump same-frame subtasks onto distinct frames If two consecutive VLM-emitted subtask spans have ``start`` timestamps that round to the same source frame after ``snap_to_frame`` (e.g. on short episodes the VLM sometimes nominates two ~adjacent action boundaries within one 30 Hz step), the writer emits two ``style=subtask`` rows at the identical persistent timestamp. The training-time renderer's default binding ``subtask: active_at(t, style=subtask)`` then raises: ValueError: Ambiguous resolver for style='subtask'; add role=..., tool_name=..., or camera=... to disambiguate. … and the whole training run dies on the first batch. Observed concretely on ``pepijn223/super_poulain_vocab2`` (job 22159979): episodes 3 and 30 each had two subtask rows at the same timestamp (``release yellow cube`` + ``retract arm`` snapping to the same frame). Add ``_dedupe_starts_to_distinct_frames`` to walk the cleaned span list and, whenever a snapped start collides with one already used, push the later span onto the next free frame timestamp. Both subtasks survive on distinct timestamps; the renderer can now disambiguate. If the episode genuinely has no later free frame (extremely unlikely — would require a same-timestamp collision on the very last frame of the episode), the later span is dropped with a warning rather than left to poison the render. New test ``test_plan_module_bumps_collocated_subtasks_to_distinct_frames`` locks in the contract; full vocabulary suite is 14/14 green. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-23 19:31:44 +00:00
def test_plan_module_bumps_collocated_subtasks_to_distinct_frames(
    fixture_dataset_root: Path, tmp_path: Path
) -> None:
    """Subtask spans that collapse onto one frame end up on separate timestamps.

    The stub VLM nominates two canonical labels whose ``start`` values are
    only 0.01 s apart. If both were written at the identical persistent
    timestamp, the training-time renderer's ``active_at(t, style=subtask)``
    binding would raise an ambiguity error, so the staged ``plan`` rows must
    keep both subtasks, in order, on two different timestamps.
    """
    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient

    def collocated_spans(_messages):
        # Starts within one frame of each other: the dedupe pass is expected
        # to push the later span onto the next frame.
        spans = [
            {"text": "grasp blue cube", "start": 0.40, "end": 0.42},
            {"text": "place blue cube in box", "start": 0.41, "end": 0.50},
        ]
        return {"subtasks": spans}

    stub_client = StubVlmClient(responder=collocated_spans)
    canonical = Vocabulary(
        subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY
    )
    plan_module = PlanSubtasksMemoryModule(
        vlm=stub_client,
        config=PlanConfig(n_task_rephrasings=0),
        vocabulary=canonical,
    )

    episode = next(iter_episodes(fixture_dataset_root))
    stage = EpisodeStaging(tmp_path / "stage", episode.episode_index)
    plan_module.run_episode(episode, stage)

    subtask_rows = [row for row in stage.read("plan") if row["style"] == "subtask"]

    # Neither span may be lost, and they must not share a timestamp.
    assert len(subtask_rows) == 2
    earlier, later = subtask_rows
    timestamps = [earlier["timestamp"], later["timestamp"]]
    assert len(set(timestamps)) == 2, f"subtask timestamps collide: {timestamps}"

    # Chronological order survives: the first span keeps the earlier frame
    # and the bumped span sits strictly after it.
    assert earlier["content"] == "grasp blue cube"
    assert later["content"] == "place blue cube in box"
    assert later["timestamp"] > earlier["timestamp"]
fix(annotate): replace fuzzy subtask snapping with strict match + one-shot retry The Jaccard-overlap snap was warping VLM output into wrong canonical labels — e.g. an off-vocab "consult the wizard" span would silently become "grasp blue cube" if that scored highest. Even with a higher floor the operator can't tell which subtasks were paraphrases vs genuine mislabels in the resulting dataset. Replace with strict exact-match validation + a single targeted retry: 1. Generate subtasks as before. 2. If any returned subtask's normalised form (lowercased, articles stripped, whitespace collapsed) isn't in the canonical vocab, fire one retry call naming the offending strings and re-sending the full canonical list. The retry prompt requires byte-identical output from the vocab. 3. After the retry, validate again. Spans still off-vocab are dropped — no fuzzy snapping ever produces a different canonical label than the VLM actually emitted. 4. If every span ends up off-vocab even after the retry, warn loudly so the operator extends ``meta/canonical_vocabulary.json`` to cover the missing phase. The episode is left with empty subtasks rather than silently fabricated ones — visibility > sweep-under-the-rug. Promote ``_NORMALIZE_STRIP_TOKENS`` to a class constant and split the normalisation helper out so the retry-validation and the final canonicalisation share one source of truth. Tests: - test_plan_module_accepts_article_only_difference: "grasp the blue cube" still maps to canonical "grasp blue cube" (article-tolerant). - test_plan_module_retries_when_subtask_off_vocab: paraphrase triggers the retry which the VLM corrects in pass 2. - test_plan_module_drops_off_vocab_subtask_after_retry: VLM that refuses to correct → bad span dropped, in-vocab span kept. - test_plan_module_empty_when_all_off_vocab_after_retry: every span off-vocab → episode left empty (no warping). All 13 vocabulary tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-23 09:57:27 +00:00
def test_plan_module_empty_when_all_off_vocab_after_retry(
    fixture_dataset_root: Path, tmp_path: Path
) -> None:
    """Every span off-vocabulary → no subtask rows at all (nothing is snapped)."""
    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient

    def stubborn_off_vocab(_messages):
        # The stub ignores its input, so each call yields the same
        # off-vocabulary spans.
        spans = [
            {"text": "make a smoothie", "start": 0.0, "end": 0.4},
            {"text": "consult the wizard", "start": 0.4, "end": 0.9},
        ]
        return {"subtasks": spans}

    stub_client = StubVlmClient(responder=stubborn_off_vocab)
    canonical = Vocabulary(
        subtasks=_CANONICAL_SUBTASKS, memory_milestones=_CANONICAL_MEMORY
    )
    plan_module = PlanSubtasksMemoryModule(
        vlm=stub_client,
        config=PlanConfig(n_task_rephrasings=0),
        vocabulary=canonical,
    )

    episode = next(iter_episodes(fixture_dataset_root))
    stage = EpisodeStaging(tmp_path / "stage", episode.episode_index)
    plan_module.run_episode(episode, stage)

    subtask_texts = [
        row["content"] for row in stage.read("plan") if row["style"] == "subtask"
    ]

    # An empty episode makes the vocabulary gap visible to the operator;
    # a fabricated canonical label would hide it.
    assert subtask_texts == []
fix(annotate): never leave an episode with zero canonical subtasks When the canonical vocabulary is enabled and the VLM produces spans that don't overlap any canonical label, the previous Jaccard-floor (0.5) dropped them and the episode came out with no subtasks at all — invisible to the downstream policy. Observed on ``pepijn223/super_poulain_vocab``: some episodes had empty subtask columns because every VLM-emitted phrase scored below 0.5 against the discovered vocabulary. Two-pass canonicalisation: - First pass keeps the Jaccard floor (lowered from 0.5 → 0.25, to let mild paraphrases through) and drops everything below. - If that first pass leaves the episode with **zero** subtasks, fall back to a second pass that always snaps each VLM span to its nearest canonical label by Jaccard (no floor). The episode ends up with subtasks even when the vocabulary missed a phase — a slightly-wrong canonical label is still closer to the right motion than nothing at all. - Log loudly when the fallback fires so the operator can spot coverage gaps in ``meta/canonical_vocabulary.json``. - Log a per-episode count at INFO when some (but not all) spans were dropped so it's visible without spamming the run output. Promote the Jaccard floor + ignore-tokens to class constants so they're a single edit point. Add ``force=True`` parameter to ``_canonicalize_subtask`` for the no-floor fallback path. New test ``test_plan_module_snaps_when_all_off_vocab`` covers the fallback; existing ``test_plan_module_drops_off_vocab_subtask`` is adjusted to keep at least one in-vocab span so the floor path can still fire and is exercised. All 12 vocabulary tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-22 12:44:03 +00:00
feat(annotate): phase 0 — derive canonical vocabulary from sample episodes The pipeline previously emitted near-unique subtask + memory phrasings per episode (free-form LLM rephrasing). On the downstream low-level policy that collapses the action expert's conditioning to noise: every episode pairs a different paraphrase with similar motions, so the expert learns a flat scene-prior that ignores the subtask string — then at inference the high-level head invents *yet another* paraphrase and the expert produces tiny "uncertain hover" chunks. Add a vocabulary-discovery phase (phase 0) that runs once per dataset: - watches the first ``vocabulary.sample_episodes`` (default 3) episode videos as one Qwen-VL prompt, - asks the VLM to derive ~``n_subtask_target`` canonical imperative subtask labels and ~``n_memory_target`` first-person past-tense memory milestones that recur across the demos, - persists them to ``meta/canonical_vocabulary.json`` (human- inspectable, hand-editable), and - wires the resulting ``Vocabulary`` into the ``plan`` module so every per-episode subtask + memory call is constrained to those exact strings (both as prompt-side instructions *and* post-VLM validation: paraphrases snap to the closest canonical entry via token-set overlap; below a 0.5 Jaccard floor the subtask is dropped rather than warped into something semantically wrong). Operator workflow: - first run discovers the vocabulary, writes the JSON, and runs the ``plan`` module against it, - subsequent runs reuse the on-disk file (``reuse_existing=True`` default) so hand-edits stick, - set ``--vocabulary.enabled=False`` to fall back to free-form generation (the original behaviour). The discovery prompt forbids gerunds / third-person / adverbs and caps the lists to the requested counts, matching the Hi-Robot / π0.6-MEM convention of small per-environment vocabularies. 
The ``plan`` module's subtask + memory prompts grow a conditional ``{vocabulary_block}`` slot rendered only when a vocabulary is present; without one the templates collapse to their previous free-form form. Tests: 11 new unit tests under tests/annotations/test_vocabulary.py cover the on-disk round-trip, discovery against the fixture dataset, ``reuse_existing`` short-circuit, paraphrase canonicalisation, off-vocab subtask dropping, and the no-vocabulary pass-through path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-22 11:40:05 +00:00
def test_plan_module_without_vocab_passes_through(
    fixture_dataset_root: Path, tmp_path: Path
) -> None:
    """With no vocabulary passed to the module, the VLM's subtask text is kept verbatim."""
    from lerobot.annotations.steerable_pipeline.vlm_client import StubVlmClient

    def free_form_span(_messages):
        span = {"text": "any free-form text the VLM wants", "start": 0.0, "end": 1.0}
        return {"subtasks": [span]}

    # No ``vocabulary=`` argument: the module is built without one.
    plan_module = PlanSubtasksMemoryModule(
        vlm=StubVlmClient(responder=free_form_span),
        config=PlanConfig(n_task_rephrasings=0),
    )

    episode = next(iter_episodes(fixture_dataset_root))
    stage = EpisodeStaging(tmp_path / "stage", episode.episode_index)
    plan_module.run_episode(episode, stage)

    subtask_texts = [
        row["content"] for row in stage.read("plan") if row["style"] == "subtask"
    ]
    assert subtask_texts == ["any free-form text the VLM wants"]