merge branch main

2026-06-01 11:21:27 +00:00 · 2026-02-09 08:55:11 +01:00
parent 6629b454b2 97e7e0f9ed
commit 6aa0cc267f
86 changed files with 9097 additions and 886 deletions
--- a/src/lerobot/data_processing/annotations/annotate_libero.sh
+++ b/src/lerobot/data_processing/annotations/annotate_libero.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Example script to run synthetic data generation with Qwen VLM
+# This generates user prompts and robot utterances for hierarchical policy training
+
+# Configuration (REPO_ID, MODEL and TEMPERATURE are only read by the commented-out commands below)
+REPO_ID="lerobot/libero_10"
+MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct"
+# or: MODEL="Qwen/Qwen2-VL-7B-Instruct"
+
+
+OUTPUT_DIR="/fsx/jade_choghari/outputs/libero-10-annotate-high"
+
+BATCH_SIZE=16
+TEMPERATURE=0.9
+SAMPLE_INTERVAL=5.0  # sample interval in seconds, passed to --sample-interval below
+
+# Run subtask annotation (disabled; kept for reference)
+# python /admin/home/jade_choghari/lerobot/src/lerobot/policies/pi05_full/annotate/subtask_annotate.py \
+#     --repo-id "$REPO_ID" \
+#     --video-key observation.images.image \
+#     --output-dir "$OUTPUT_DIR" \
+#     --skip-existing \
+#     --output-repo-id "jadechoghari/libero10-annotate" \
+#     --batch-size "$BATCH_SIZE" \
+# run synthetic data generation (all episodes processed; disabled, kept for reference)
+# python examples/dataset/annotate_pgen.py \
+#     --repo-id "$REPO_ID" \
+#     --model "$MODEL" \
+#     --output-dir "$OUTPUT_DIR" \
+#     --temperature "$TEMPERATURE" \
+#     --batch-size "$BATCH_SIZE" \
+#     --sample-interval "$SAMPLE_INTERVAL" \
+#     --image-key observation.images.base \
+#     --num-image-views-per-sample 1
+
+# for faster testing, increase sample interval:
+# --sample-interval 5.0  # Samples every 5 seconds (much faster)
+
+# to push to hub after generation:
+# add --push-to-hub flag
+
+# video-mode batch processing: batch size and sample interval come from the configuration above
+python src/lerobot/data_processing/annotations/high_level_annotate.py \
+    --data-dir "/fsx/jade_choghari/outputs/libero-10-annotate" \
+    --output-dir "$OUTPUT_DIR" \
+    --video-mode \
+    --video-key observation.images.image \
+    --video-batch-size "$BATCH_SIZE" \
+    --sample-interval "$SAMPLE_INTERVAL"
--- a/src/lerobot/data_processing/annotations/high_level_annotate.py
+++ b/src/lerobot/data_processing/annotations/high_level_annotate.py
--- a/src/lerobot/data_processing/annotations/load_lerobot_high.py
+++ b/src/lerobot/data_processing/annotations/load_lerobot_high.py
@@ -0,0 +1,52 @@
+import torch
+from huggingface_hub import HfApi
+
+import lerobot
+from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
+from lerobot.policies.factory import make_pre_post_processors
+from lerobot.configs.policies import PreTrainedConfig
+
+# NOTE(review): author's path, presumably a local copy of the dataset — confirm: /fsx/jade_choghari/data/libero_10_subtasks_kw_converted
+dataset = LeRobotDataset(repo_id="lerobot/libero_10_image_subtask")
+# Draw shuffled batches of two samples in the main process (num_workers=0) for inspection.
+dataloader = torch.utils.data.DataLoader(
+        dataset,
+        num_workers=0,
+        batch_size=2,
+        shuffle=True,
+)
+# Load the policy config from a local checkpoint directory and override its dtype.
+cfg = PreTrainedConfig.from_pretrained(
+    pretrained_name_or_path="/fsx/jade_choghari/models/pi05-base",
+)
+cfg.dtype = "bfloat16"
+# Build the pre/post processors from the same checkpoint path as the config.
+pre_processor, post_processor = make_pre_post_processors(
+    policy_cfg=cfg,
+    pretrained_path="/fsx/jade_choghari/models/pi05-base",
+)
+batch = next(iter(dataloader))  # first batch only
+breakpoint()  # NOTE(review): interactive debugging stop (inspect the raw batch) — remove before non-interactive use
+batch1 = pre_processor(batch)
+breakpoint()  # NOTE(review): interactive debugging stop (inspect batch1) — remove before non-interactive use
+print(batch.keys())  # prints the keys of the raw batch, not of the pre-processed batch1
+# print(batch['task_index_high_level'].shape)
+# print(batch['task_index_high_level'])
+# print(batch['user_prompt'][0])
+# print(batch['robot_utterance'][0])
+# print(batch['task'][0])
+# Index the dataset once per entry in dataset.meta.episodes and read its "subtask_index" field.
+valid_episode_list = []
+for episode_idx in range(len(dataset.meta.episodes)):
+        subtask_index = dataset[episode_idx]["subtask_index"]  # value is never used; NOTE(review): dataset[i] presumably returns sample i, not episode i — confirm
+        valid_episode_list.append(episode_idx)  # appended unconditionally: every index that does not raise above is kept
+
+print(len(valid_episode_list))
+
+# read this parquet /fsx/jade_choghari/outputs/pgen_annotations1/meta/tasks.parquet
+# import pandas as pd
+# tasks_df = pd.read_parquet('/fsx/jade_choghari/outputs/pgen_annotations1/meta/tasks.parquet')
+
+# # print all
+# print(tasks_df.columns)
+# breakpoint()
--- a/src/lerobot/data_processing/annotations/run_pgen.sh
+++ b/src/lerobot/data_processing/annotations/run_pgen.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Example script to run synthetic data generation with Qwen VLM
+# This generates user prompts and robot utterances for hierarchical policy training
+
+# Configuration (the active command below reads only REPO_ID and OUTPUT_DIR; the rest feed the commented-out commands)
+REPO_ID="jadechoghari/collect-data"
+MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct"
+# or: MODEL="Qwen/Qwen2-VL-7B-Instruct"
+
+
+OUTPUT_DIR="/fsx/jade_choghari/outputs/collect-data-pgen_new"
+
+BATCH_SIZE=32
+TEMPERATURE=0.9
+SAMPLE_INTERVAL=5.0  # sample interval in seconds; only referenced by the commented-out annotate_pgen.py command
+
+# Run subtask annotation (NOTE(review): absolute, user-specific script path — confirm it points at the intended subtask_annotate.py)
+python /admin/home/jade_choghari/lerobot/src/lerobot/policies/pi05_full/annotate/subtask_annotate.py \
+    --repo-id "$REPO_ID" \
+    --video-key observation.images.base \
+    --output-dir "$OUTPUT_DIR" \
+    --output-repo-id "jadechoghari/collect-data-with-subtasks"
+# run synthetic data generation (all episodes processed; disabled, kept for reference)
+# python examples/dataset/annotate_pgen.py \
+#     --repo-id "$REPO_ID" \
+#     --model "$MODEL" \
+#     --output-dir "$OUTPUT_DIR" \
+#     --temperature "$TEMPERATURE" \
+#     --batch-size "$BATCH_SIZE" \
+#     --sample-interval "$SAMPLE_INTERVAL" \
+#     --image-key observation.images.base \
+#     --num-image-views-per-sample 1
+
+# for faster testing, increase sample interval:
+# --sample-interval 5.0  # Samples every 5 seconds (much faster)
+
+# to push to hub after generation:
+# add --push-to-hub flag
+
+# video-mode batch processing (disabled): batch size taken from BATCH_SIZE, sampling every 1 second
+# python examples/dataset/annotate_pgen.py \
+#     --repo-id "$REPO_ID" \
+#     --model "$MODEL" \
+#     --output-dir "$OUTPUT_DIR" \
+#     --video-mode \
+#     --video-key observation.images.up \
+#     --video-batch-size "$BATCH_SIZE" \
+#     --sample-interval 1.0
--- a/src/lerobot/data_processing/annotations/subtask_annotate.py
+++ b/src/lerobot/data_processing/annotations/subtask_annotate.py