diff --git a/src/lerobot/policies/sarm/configuration_sarm.py b/src/lerobot/policies/sarm/configuration_sarm.py index 258531166..437d4bb59 100644 --- a/src/lerobot/policies/sarm/configuration_sarm.py +++ b/src/lerobot/policies/sarm/configuration_sarm.py @@ -83,11 +83,9 @@ class SARMConfig(PreTrainedConfig): encode_on_the_fly: bool = True # Encode images/text during training use_dataset_task: bool = True # Use task descriptions from dataset use_subtask_annotations: bool = True # Use subtask annotations for stage-aware training if available - - # Features (required by PreTrainedPolicy) + + # Video_features and text_features are generated by the processor from raw images/text, we don't declare them as VISUAL/LANGUAGE here to avoid validation errors input_features: dict = field(default_factory=lambda: { - "video_features": PolicyFeature(shape=(9, 512), type=FeatureType.VISUAL), - "text_features": PolicyFeature(shape=(384,), type=FeatureType.LANGUAGE), "state_features": PolicyFeature(shape=(9, 14), type=FeatureType.STATE) # Example: 7 DOF × 2 arms }) output_features: dict = field(default_factory=lambda: {