mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-31 10:51:35 +00:00
feat(language): per-camera tagging on view-dependent styles
Adds a nullable `camera` field to the language row struct (both persistent
and event variants) so view-dependent styles like `vqa` can carry which
`observation.images.*` view they were grounded against. Without this,
multi-camera datasets ended up with multiple `(vqa, role)` rows at the
same timestamp that the resolver could not disambiguate.
- `language.py`: add `camera` to PERSISTENT_ROW_FIELDS / EVENT_ROW_FIELDS,
to both Arrow struct types and the HF datasets feature mappings;
introduce VIEW_DEPENDENT_STYLES = {vqa, motion, trace} plus
`is_view_dependent_style` and `validate_camera_field` helpers (camera
required iff style is view-dependent).
- `language_render.py`: thread an optional `camera=` kwarg through every
resolver (`active_at`, `emitted_at`, `nth_prev`, `nth_next`) and through
`_matching_rows` / `_select_*`, so recipes can disambiguate per-camera
VQA with `emitted_at(t, style=vqa, role=assistant, camera=...)`.
Without a `camera` filter, multi-row matches keep raising the existing
ambiguity error — which is the desired behaviour on multi-camera data.
- `recipes/pi05_hirobot.yaml`: replace the single `ask_vqa` branch with
`ask_vqa_top` and `ask_vqa_wrist` per-camera sub-recipes (each carrying
the matching image block), keeping the original 0.20 budget and
documenting the customization point for datasets with different cameras.
- Tests: schema test asserts the new field order; new tests cover
`is_view_dependent_style`, `validate_camera_field` (both required and
forbidden directions), per-camera `emitted_at` filtering, and the
ambiguity error when two cameras emit `(vqa, assistant)` at the same
timestamp without a `camera=` filter. RenderMessagesStep + dataset
passthrough fixtures updated to include the new field.
- `docs/source/language_and_recipes.mdx`: document the `camera` field,
the per-camera resolver pattern, and the canonical recipe convention.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -46,18 +46,23 @@ def active_at(
|
||||
style: str | None = None,
|
||||
role: str | None = None,
|
||||
tool_name: str | None = None,
|
||||
camera: str | None = None,
|
||||
) -> LanguageRow | None:
|
||||
"""Return the persistent row of ``style`` that is active at time ``t``.
|
||||
|
||||
A persistent row is "active" at ``t`` when its own ``timestamp`` is the
|
||||
most recent one ``<= t`` for the given ``style``/``role``/``tool_name``
|
||||
selector. ``events`` is accepted for resolver-signature uniformity but is
|
||||
not consulted: only persistent styles are valid here.
|
||||
most recent one ``<= t`` for the given ``style``/``role``/``tool_name``/
|
||||
``camera`` selector. ``events`` is accepted for resolver-signature
|
||||
uniformity but is not consulted: only persistent styles are valid here.
|
||||
"""
|
||||
_validate_persistent_resolver("active_at", style)
|
||||
matches = _matching_rows(persistent, style=style, role=role, tool_name=tool_name)
|
||||
matches = _matching_rows(
|
||||
persistent, style=style, role=role, tool_name=tool_name, camera=camera
|
||||
)
|
||||
matches = [row for row in matches if _timestamp(row) <= t]
|
||||
return _select_latest(matches, style=style, role=role, tool_name=tool_name)
|
||||
return _select_latest(
|
||||
matches, style=style, role=role, tool_name=tool_name, camera=camera
|
||||
)
|
||||
|
||||
|
||||
def emitted_at(
|
||||
@@ -68,26 +73,45 @@ def emitted_at(
|
||||
style: str | None = None,
|
||||
role: str | None = None,
|
||||
tool_name: str | None = None,
|
||||
camera: str | None = None,
|
||||
) -> LanguageRow | None:
|
||||
"""Return the row of ``style`` emitted at exactly time ``t``.
|
||||
|
||||
For persistent styles, this matches persistent rows whose own ``timestamp``
|
||||
equals ``t``. For event styles, the ``events`` list is assumed to come from
|
||||
the dataset row at frame ``t`` (event rows carry no timestamp of their own),
|
||||
so all matching event rows are considered emitted at ``t``.
|
||||
so all matching event rows are considered emitted at ``t``. ``camera``
|
||||
filters by the row's ``camera`` field — required to disambiguate when
|
||||
multiple view-dependent rows share ``(t, role)`` across cameras.
|
||||
"""
|
||||
column = column_for_style(style)
|
||||
if column == LANGUAGE_PERSISTENT:
|
||||
matches = [
|
||||
row
|
||||
for row in _matching_rows(persistent, style=style, role=role, tool_name=tool_name)
|
||||
for row in _matching_rows(
|
||||
persistent, style=style, role=role, tool_name=tool_name, camera=camera
|
||||
)
|
||||
if _timestamp(row) == t
|
||||
]
|
||||
return _select_one(
|
||||
matches, style=style, role=role, tool_name=tool_name, sort_key=_persistent_sort_key
|
||||
matches,
|
||||
style=style,
|
||||
role=role,
|
||||
tool_name=tool_name,
|
||||
camera=camera,
|
||||
sort_key=_persistent_sort_key,
|
||||
)
|
||||
matches = _matching_rows(events, style=style, role=role, tool_name=tool_name)
|
||||
return _select_one(matches, style=style, role=role, tool_name=tool_name, sort_key=_event_sort_key)
|
||||
matches = _matching_rows(
|
||||
events, style=style, role=role, tool_name=tool_name, camera=camera
|
||||
)
|
||||
return _select_one(
|
||||
matches,
|
||||
style=style,
|
||||
role=role,
|
||||
tool_name=tool_name,
|
||||
camera=camera,
|
||||
sort_key=_event_sort_key,
|
||||
)
|
||||
|
||||
|
||||
def nth_prev(
|
||||
@@ -99,12 +123,14 @@ def nth_prev(
|
||||
offset: int = 1,
|
||||
role: str | None = None,
|
||||
tool_name: str | None = None,
|
||||
camera: str | None = None,
|
||||
) -> LanguageRow | None:
|
||||
"""Return the persistent row that was active ``offset`` steps before ``t``.
|
||||
|
||||
Walks back through chronologically sorted persistent rows of ``style``
|
||||
(filtered by optional ``role``/``tool_name``) and returns the one ``offset``
|
||||
positions before the row active at ``t``. Only valid for persistent styles.
|
||||
(filtered by optional ``role``/``tool_name``/``camera``) and returns the
|
||||
one ``offset`` positions before the row active at ``t``. Only valid for
|
||||
persistent styles.
|
||||
"""
|
||||
return _nth_relative(
|
||||
t,
|
||||
@@ -113,6 +139,7 @@ def nth_prev(
|
||||
offset=-offset,
|
||||
role=role,
|
||||
tool_name=tool_name,
|
||||
camera=camera,
|
||||
resolver_name="nth_prev",
|
||||
)
|
||||
|
||||
@@ -126,12 +153,14 @@ def nth_next(
|
||||
offset: int = 1,
|
||||
role: str | None = None,
|
||||
tool_name: str | None = None,
|
||||
camera: str | None = None,
|
||||
) -> LanguageRow | None:
|
||||
"""Return the persistent row that becomes active ``offset`` steps after ``t``.
|
||||
|
||||
Walks forward through chronologically sorted persistent rows of ``style``
|
||||
(filtered by optional ``role``/``tool_name``) and returns the one ``offset``
|
||||
positions after the row active at ``t``. Only valid for persistent styles.
|
||||
(filtered by optional ``role``/``tool_name``/``camera``) and returns the
|
||||
one ``offset`` positions after the row active at ``t``. Only valid for
|
||||
persistent styles.
|
||||
"""
|
||||
return _nth_relative(
|
||||
t,
|
||||
@@ -140,6 +169,7 @@ def nth_next(
|
||||
offset=offset,
|
||||
role=role,
|
||||
tool_name=tool_name,
|
||||
camera=camera,
|
||||
resolver_name="nth_next",
|
||||
)
|
||||
|
||||
@@ -376,6 +406,7 @@ def _nth_relative(
|
||||
offset: int,
|
||||
role: str | None,
|
||||
tool_name: str | None,
|
||||
camera: str | None,
|
||||
resolver_name: str,
|
||||
) -> LanguageRow | None:
|
||||
"""Shared body for ``nth_prev`` / ``nth_next`` with signed ``offset``."""
|
||||
@@ -384,7 +415,7 @@ def _nth_relative(
|
||||
raise ValueError(f"{resolver_name} offset must be non-zero.")
|
||||
|
||||
rows = sorted(
|
||||
_matching_rows(persistent, style=style, role=role, tool_name=tool_name),
|
||||
_matching_rows(persistent, style=style, role=role, tool_name=tool_name, camera=camera),
|
||||
key=_persistent_sort_key,
|
||||
)
|
||||
if not rows:
|
||||
@@ -420,14 +451,16 @@ def _matching_rows(
|
||||
style: str | None,
|
||||
role: str | None,
|
||||
tool_name: str | None,
|
||||
camera: str | None,
|
||||
) -> list[LanguageRow]:
|
||||
"""Return ``rows`` filtered by optional ``style``/``role``/``tool_name`` selectors."""
|
||||
"""Return ``rows`` filtered by optional ``style``/``role``/``tool_name``/``camera`` selectors."""
|
||||
return [
|
||||
row
|
||||
for row in rows
|
||||
if (style is None or row.get("style") == style)
|
||||
and (role is None or row.get("role") == role)
|
||||
and (tool_name is None or _row_has_tool_name(row, tool_name))
|
||||
and (camera is None or row.get("camera") == camera)
|
||||
]
|
||||
|
||||
|
||||
@@ -437,6 +470,7 @@ def _select_latest(
|
||||
style: str | None,
|
||||
role: str | None,
|
||||
tool_name: str | None,
|
||||
camera: str | None,
|
||||
) -> LanguageRow | None:
|
||||
"""Return the row tied for the latest ``timestamp`` (disambiguated by selectors)."""
|
||||
if not rows:
|
||||
@@ -448,6 +482,7 @@ def _select_latest(
|
||||
style=style,
|
||||
role=role,
|
||||
tool_name=tool_name,
|
||||
camera=camera,
|
||||
sort_key=_persistent_sort_key,
|
||||
)
|
||||
|
||||
@@ -458,14 +493,16 @@ def _select_one(
|
||||
style: str | None,
|
||||
role: str | None,
|
||||
tool_name: str | None,
|
||||
camera: str | None,
|
||||
sort_key: Any,
|
||||
) -> LanguageRow | None:
|
||||
"""Return the single matching row, or raise if the selectors are ambiguous."""
|
||||
if not rows:
|
||||
return None
|
||||
if len(rows) > 1 and role is None and tool_name is None:
|
||||
if len(rows) > 1 and role is None and tool_name is None and camera is None:
|
||||
raise ValueError(
|
||||
f"Ambiguous resolver for style={style!r}; add role=... or tool_name=... to disambiguate."
|
||||
f"Ambiguous resolver for style={style!r}; add role=..., tool_name=..., "
|
||||
f"or camera=... to disambiguate."
|
||||
)
|
||||
return sorted(rows, key=sort_key)[0]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user