tests/annotations/conftest.py

#!/usr/bin/env python

# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared fixtures for annotation-pipeline tests.

Builds a minimal LeRobot-shaped dataset on disk so writer/validator tests
can exercise real parquet reads and writes without needing a checked-in
LFS dataset.
"""

from __future__ import annotations

import json
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq
import pytest


def _make_episode_table(
    episode_index: int,
    num_frames: int,
    *,
    fps: int = 10,
    task_index: int = 0,
) -> pa.Table:
    """Build the per-frame table for a single synthetic episode.

    Each of the ``num_frames`` rows repeats ``episode_index`` and
    ``task_index``. ``frame_index`` counts up from 0 and ``timestamp`` is
    ``frame_index / fps`` rounded to 6 decimal places.
    """
    frames = range(num_frames)
    columns = {
        "episode_index": [episode_index for _ in frames],
        "frame_index": [*frames],
        "timestamp": [round(frame / fps, 6) for frame in frames],
        "task_index": [task_index for _ in frames],
        # Legacy column the writer must drop.
        "subtask_index": [0 for _ in frames],
    }
    return pa.Table.from_pydict(columns)


def _build_dataset(root: Path, episode_specs: list[tuple[int, int, str]], *, fps: int = 10) -> Path:
    """Materialise a small fixture dataset tree under ``root``.

    ``episode_specs`` holds ``(episode_index, num_frames, task_text)`` tuples.
    Every episode is written to its own shard,
    ``data/chunk-000/file-{ep:03d}.parquet``, so the writer's per-shard
    rewrite path is exercised. Distinct task texts are numbered in order of
    first appearance and written to ``meta/tasks.parquet``;
    ``meta/info.json`` records the codebase version, ``fps`` and the number
    of episode specs.

    Returns ``root`` unchanged.
    """
    shard_dir = root / "data" / "chunk-000"
    shard_dir.mkdir(parents=True, exist_ok=True)

    # Maps task text -> task index; a new text gets the next free index.
    index_by_task: dict[str, int] = {}
    for episode_index, num_frames, task_text in episode_specs:
        task_index = index_by_task.setdefault(task_text, len(index_by_task))
        episode_table = _make_episode_table(episode_index, num_frames, fps=fps, task_index=task_index)
        pq.write_table(episode_table, shard_dir / f"file-{episode_index:03d}.parquet")

    meta_dir = root / "meta"
    meta_dir.mkdir(parents=True, exist_ok=True)
    pq.write_table(
        pa.Table.from_pydict(
            {
                "task_index": list(index_by_task.values()),
                "task": list(index_by_task.keys()),
            }
        ),
        meta_dir / "tasks.parquet",
    )

    info_payload = json.dumps(
        {
            "codebase_version": "v3.1",
            "fps": fps,
            "total_episodes": len(episode_specs),
        },
        indent=2,
    )
    (meta_dir / "info.json").write_text(info_payload)

    return root


@pytest.fixture
def fixture_dataset_root(tmp_path: Path) -> Path:
    """Two-episode dataset (12 frames per episode, 10 fps) under ``tmp_path / "ds"``.

    The episodes use differently worded task strings, so each one gets its
    own task index.
    """
    two_episodes = [
        (0, 12, "Could you tidy the kitchen please?"),
        (1, 12, "Please clean up the kitchen"),
    ]
    return _build_dataset(tmp_path / "ds", episode_specs=two_episodes, fps=10)


@pytest.fixture
def single_episode_root(tmp_path: Path) -> Path:
    """One-episode dataset (30 frames, 10 fps) under ``tmp_path / "ds_one"``."""
    only_episode = (0, 30, "Pour water from the bottle into the cup.")
    return _build_dataset(tmp_path / "ds_one", episode_specs=[only_episode], fps=10)
feat: language annotation pipeline (PR 2/3) Adds the steerable annotation pipeline (`lerobot-annotate`) that populates the `language_persistent` and `language_events` columns introduced in PR 1 directly into `data/chunk-*/file-*.parquet`. No flavor namespace, no sidecar tree. Modules produced: - Module 1 (plan_subtasks_memory): Pi0.7-style subtasks, plan (init + refresh on interjection), MEM-style memory at subtask boundaries. - Module 2 (interjections_and_speech): t=0 speech-only acknowledgement, mid-episode paired interjection + speech tool-call atom. - Module 3 (general_vqa): bbox/keypoint/count/attribute/spatial pairs at configurable cadence with one-retry JSON validation. Writer enforces: per-episode persistent identity, exact-frame event timestamps, column routing per `column_for_style`, dataset-level `tools` column with the `say` schema, drops legacy `subtask_index`. Validator runs against staged JSONL artifacts before the writer rewrites parquet. Adds `lerobot-annotate` console script, `annotations` extra (datatrove + optional vllm), `make annotation-e2e` opt-in smoke target, and `docs/source/annotation_pipeline.mdx`. Branched from PR 1 (`feat/language-columns`). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-27 16:22:51 +02:00			`#!/usr/bin/env python`

			`# Copyright 2026 The HuggingFace Inc. team. All rights reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`"""Shared fixtures for annotation-pipeline tests.`

			`Builds a minimal LeRobot-shaped dataset on disk so writer/validator tests`
			`can exercise real parquet reads and writes without needing a checked-in`
			`LFS dataset.`
			`"""`

			`from __future__ import annotations`

			`import json`
			`from pathlib import Path`

			`import pyarrow as pa`
			`import pyarrow.parquet as pq`
			`import pytest`


			`def _make_episode_table(`
			`episode_index: int,`
			`num_frames: int,`
			`*,`
			`fps: int = 10,`
			`task_index: int = 0,`
			`) -> pa.Table:`
			`timestamps = [round(i / fps, 6) for i in range(num_frames)]`
			`frame_indices = list(range(num_frames))`
			`return pa.Table.from_pydict(`
			`{`
			`"episode_index": [episode_index] * num_frames,`
			`"frame_index": frame_indices,`
			`"timestamp": timestamps,`
			`"task_index": [task_index] * num_frames,`
			`"subtask_index": [0] * num_frames, # legacy column the writer must drop`
			`}`
			`)`


			`def _build_dataset(root: Path, episode_specs: list[tuple[int, int, str]], *, fps: int = 10) -> Path:`
			"""Create a fixture dataset under ``root``.

			``episode_specs`` is a list of ``(episode_index, num_frames, task_text)``.
			Each episode goes into its own ``data/chunk-000/file-{ep:03d}.parquet``
			`so the writer's per-shard rewrite path is exercised.`
			`"""`
			`data_dir = root / "data" / "chunk-000"`
			`data_dir.mkdir(parents=True, exist_ok=True)`
			`tasks = {}`
			`for episode_index, num_frames, task_text in episode_specs:`
			`task_index = len(tasks)`
			`if task_text not in tasks.values():`
			`tasks[task_index] = task_text`
			`else:`
			`task_index = next(k for k, v in tasks.items() if v == task_text)`
			`table = _make_episode_table(episode_index, num_frames, fps=fps, task_index=task_index)`
			`path = data_dir / f"file-{episode_index:03d}.parquet"`
			`pq.write_table(table, path)`

			`meta_dir = root / "meta"`
			`meta_dir.mkdir(parents=True, exist_ok=True)`
			`tasks_table = pa.Table.from_pydict(`
			`{`
			`"task_index": list(tasks.keys()),`
			`"task": list(tasks.values()),`
			`}`
			`)`
			`pq.write_table(tasks_table, meta_dir / "tasks.parquet")`

			`info = {`
			`"codebase_version": "v3.1",`
			`"fps": fps,`
			`"total_episodes": len(episode_specs),`
			`}`
			`(meta_dir / "info.json").write_text(json.dumps(info, indent=2))`

			`return root`


			`@pytest.fixture`
			`def fixture_dataset_root(tmp_path: Path) -> Path:`
			`"""A tiny dataset with two episodes, 12 frames each at 10 fps."""`
			`return _build_dataset(`
			`tmp_path / "ds",`
			`episode_specs=[`
			`(0, 12, "Could you tidy the kitchen please?"),`
			`(1, 12, "Please clean up the kitchen"),`
			`],`
			`fps=10,`
			`)`


			`@pytest.fixture`
			`def single_episode_root(tmp_path: Path) -> Path:`
			`return _build_dataset(`
			`tmp_path / "ds_one",`
			`episode_specs=[(0, 30, "Pour water from the bottle into the cup.")],`
			`fps=10,`
			`)`