lerobot-clone/src/lerobot/data_processing/annotations/run_pgen.sh

#!/bin/bash

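# Example script for annotating a dataset with a Qwen VLM.
# The active command below runs subtask annotation; the synthetic data generation
# commands (user prompts and robot utterances for hierarchical policy training)
# are kept further down, commented out.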

# ---------------------------------------------------------------------------
# Settings shared by the annotation commands in this script.
# ---------------------------------------------------------------------------

# Repository id handed to the tools as --repo-id.
REPO_ID='jadechoghari/piper-demo-20260205_103303'

# Directory handed to the tools as --output-dir.
OUTPUT_DIR='/fsx/jade_choghari/outputs/collect-data-pgen_new'

# Model handed to the tools as --model. Alternatives (swap in as needed):
#   MODEL='Qwen/Qwen3-VL-30B-A3B-Thinking'
#   MODEL='Qwen/Qwen2-VL-7B-Instruct'
MODEL='Qwen/Qwen3.5-27B'

# Generation knobs. TEMPERATURE and SAMPLE_INTERVAL are only referenced by the
# commented-out annotate_pgen.py invocations further down.
BATCH_SIZE='2'
TEMPERATURE='0.9'
SAMPLE_INTERVAL='5.0'  # seconds between generated dialogue samples (all episodes processed)

# Run subtask annotation on the observation.images.top video stream.
#
# --subtask-labels supplies a closed vocabulary of subtask names. The flag is
# optional: to annotate without a fixed label set, delete that line (every line
# except the last must still end with a backslash).
# --batch-size follows BATCH_SIZE from the configuration section rather than a
# separate hard-coded number, so there is a single place to tune it.
# NOTE(review): the script path is absolute and machine-specific — confirm it
# matches the checkout on the host running this script.
python /home/lerobot/src/lerobot/data_processing/annotations/subtask_annotate.py \
    --repo-id "$REPO_ID" \
    --video-key observation.images.top \
    --output-dir "$OUTPUT_DIR" \
    --output-repo-id "jadechoghari/piper-demo-annotated1" \
    --push-to-hub \
    --no-timer-overlay \
    --model "$MODEL" \
    --subtask-labels "pick_up_yellow_nut_bar" "pick_up_cake" "pick_up_biscuit_pack" "pick_up_soda_can" \
    --batch-size "$BATCH_SIZE"

# Run subtask annotation (image-window: frames as images for better accuracy)
# python /admin/home/jade_choghari/lerobot/src/lerobot/data_processing/annotations/subtask_annotate_image.py \
#     --repo-id "$REPO_ID" \
#     --camera-key observation.images.wrist \
#     --output-dir "$OUTPUT_DIR" \
#     --output-repo-id "jadechoghari/piper-demo-annotated1-image" \
#     --push-to-hub \
#     --model "$MODEL" \
#     --window-size 184 \
#     --max-frames-per-window 16 \
#     --subtask-labels "pick_up_yellow_nut_bar" "pick_up_cake" "pick_up_biscuit_pack" "pick_up_soda_can" \
#     --batch-size 2


# run synthetic data generation (all episodes processed)
# python examples/dataset/annotate_pgen.py \
#     --repo-id "$REPO_ID" \
#     --model "$MODEL" \
#     --output-dir "$OUTPUT_DIR" \
#     --temperature "$TEMPERATURE" \
#     --batch-size "$BATCH_SIZE" \
#     --sample-interval "$SAMPLE_INTERVAL" \
#     --image-key observation.images.base \
#     --num-image-views-per-sample 1

# for faster testing, increase sample interval:
# --sample-interval 5.0  # Samples every 5 seconds (much faster)

# to push to hub after generation:
# add --push-to-hub flag

# efficient batch processing: BATCH_SIZE episodes at once (video mode)
# python examples/dataset/annotate_pgen.py \
#     --repo-id "$REPO_ID" \
#     --model "$MODEL" \
#     --output-dir "$OUTPUT_DIR" \
#     --video-mode \
#     --video-key observation.images.up \
#     --video-batch-size "$BATCH_SIZE" \
#     --sample-interval 1.0