From a18d969753fc702f590c1af3b84534fbfa2ca418 Mon Sep 17 00:00:00 2001
From: Pepijn <pepijn@huggingface.co>
Date: Wed, 3 Jun 2026 16:21:17 +0200
Subject: [PATCH] tests(annotations): fix stale canned-VLM markers +
 action_record style assertion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The annotation tests had never actually run in CI (collection failed on
the missing 'datasets' extra); now that they do run, three stale test
expectations surfaced against the evolved pipeline:

  * test_module1_plan_memory_subtask_smoke: the memory canned-responder
    marker 'Update the memory' no longer appears in module_1_memory.txt
    (now 'compressed semantic memory'), so the stub returned no memory
    row and the {subtask,plan,memory} subset check failed. Marker
    updated to match the current prompt.
  * test_module2_mid_episode_emits_paired_interjection_and_speech: the
    interjection marker 'Write ONE interjection' is now 'Write ONE
    compact interjection' in module_2_interjection.txt, so 0 interjections
    were emitted. Marker updated.
  * tests/datasets/test_language.py::test_style_registry_routes_columns:
    PERSISTENT_STYLES gained 'action_record' in this PR; add it to the
    expected set.

These are test/prompt-marker syncs — no production behavior change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/annotations/test_modules.py | 12 +++++-------
 tests/datasets/test_language.py   |  2 +-
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/tests/annotations/test_modules.py b/tests/annotations/test_modules.py
index 189481169..021cd207f 100644
--- a/tests/annotations/test_modules.py
+++ b/tests/annotations/test_modules.py
@@ -88,7 +88,7 @@ def test_module1_plan_memory_subtask_smoke(fixture_dataset_root: Path, tmp_path:
                     {"text": "place the sponge into the sink", "start": 0.8, "end": 1.1},
                 ]
             },
-            "Update the memory": {"memory": "wiped the counter once"},
+            "compressed semantic memory": {"memory": "wiped the counter once"},
         },
     )
     module = PlanSubtasksMemoryModule(vlm=vlm, config=PlanConfig())
@@ -151,12 +151,10 @@ def test_module2_mid_episode_emits_paired_interjection_and_speech(
         {
             "acknowledgement the robot": {"text": "OK."},
             # Marker matches the distinctive line of
-            # ``module_2_interjection.txt``. The old marker
-            # ("ONE realistic interruption") came from a previous prompt
-            # version that asked for counterfactual interjections; the
-            # current design anchors on subtask boundaries instead, so
-            # the prompt and its marker changed.
-            "Write ONE interjection": {
+            # ``module_2_interjection.txt`` ("Write ONE compact
+            # interjection ..."). Keep this in sync with that prompt's
+            # wording — the canned responder matches on substring.
+            "Write ONE compact interjection": {
                 "interjection": "now wipe the counter please",
                 "speech": "On it.",
             },
diff --git a/tests/datasets/test_language.py b/tests/datasets/test_language.py
index 52c7b3708..2846dab1d 100644
--- a/tests/datasets/test_language.py
+++ b/tests/datasets/test_language.py
@@ -64,7 +64,7 @@ def test_validate_feature_language_warns_only_on_non_empty_value(caplog):
 
 
 def test_style_registry_routes_columns():
-    assert {"subtask", "plan", "memory", "motion", "task_aug"} == PERSISTENT_STYLES
+    assert {"subtask", "plan", "memory", "motion", "task_aug", "action_record"} == PERSISTENT_STYLES
     assert {"interjection", "vqa", "trace"} == EVENT_ONLY_STYLES
     assert PERSISTENT_STYLES | EVENT_ONLY_STYLES <= STYLE_REGISTRY