From a18d969753fc702f590c1af3b84534fbfa2ca418 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 3 Jun 2026 16:21:17 +0200 Subject: [PATCH] tests(annotations): fix stale canned-VLM markers + action_record style assertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The annotation tests had never actually run in CI (collection failed on the missing 'datasets' extra); now that they do, three stale assertions surfaced against the evolved pipeline: * test_module1_plan_memory_subtask_smoke: the memory canned-responder marker 'Update the memory' no longer appears in module_1_memory.txt (now 'compressed semantic memory'), so the stub returned no memory row and the {subtask,plan,memory} subset check failed. Marker updated to match the current prompt. * test_module2_mid_episode_emits_paired_interjection_and_speech: the interjection marker 'Write ONE interjection' is now 'Write ONE compact interjection' in module_2_interjection.txt, so 0 interjections were emitted. Marker updated. * tests/datasets/test_language.py::test_style_registry_routes_columns: PERSISTENT_STYLES gained 'action_record' in this PR; add it to the expected set. These are test/prompt-marker syncs — no production behavior change. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/annotations/test_modules.py | 12 +++++------- tests/datasets/test_language.py | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/annotations/test_modules.py b/tests/annotations/test_modules.py index 189481169..021cd207f 100644 --- a/tests/annotations/test_modules.py +++ b/tests/annotations/test_modules.py @@ -88,7 +88,7 @@ def test_module1_plan_memory_subtask_smoke(fixture_dataset_root: Path, tmp_path: {"text": "place the sponge into the sink", "start": 0.8, "end": 1.1}, ] }, - "Update the memory": {"memory": "wiped the counter once"}, + "compressed semantic memory": {"memory": "wiped the counter once"}, }, ) module = PlanSubtasksMemoryModule(vlm=vlm, config=PlanConfig()) @@ -151,12 +151,10 @@ def test_module2_mid_episode_emits_paired_interjection_and_speech( { "acknowledgement the robot": {"text": "OK."}, # Marker matches the distinctive line of - # ``module_2_interjection.txt``. The old marker - # ("ONE realistic interruption") came from a previous prompt - # version that asked for counterfactual interjections; the - # current design anchors on subtask boundaries instead, so - # the prompt and its marker changed. - "Write ONE interjection": { + # ``module_2_interjection.txt`` ("Write ONE compact + # interjection ..."). Keep this in sync with that prompt's + # wording — the canned responder matches on substring. + "Write ONE compact interjection": { "interjection": "now wipe the counter please", "speech": "On it.", }, diff --git a/tests/datasets/test_language.py b/tests/datasets/test_language.py index 52c7b3708..2846dab1d 100644 --- a/tests/datasets/test_language.py +++ b/tests/datasets/test_language.py @@ -64,7 +64,7 @@ def test_validate_feature_language_warns_only_on_non_empty_value(caplog): def test_style_registry_routes_columns(): - assert {"subtask", "plan", "memory", "motion", "task_aug"} == PERSISTENT_STYLES + assert {"subtask", "plan", "memory", "motion", "task_aug", "action_record"} == PERSISTENT_STYLES assert {"interjection", "vqa", "trace"} == EVENT_ONLY_STYLES assert PERSISTENT_STYLES | EVENT_ONLY_STYLES <= STYLE_REGISTRY