diff --git a/examples/annotation/run_hf_job.py b/examples/annotation/run_hf_job.py index e8cba5c36..672d9f368 100644 --- a/examples/annotation/run_hf_job.py +++ b/examples/annotation/run_hf_job.py @@ -21,9 +21,7 @@ from huggingface_hub import get_token, run_job token = os.environ.get("HF_TOKEN") or get_token() if not token: - raise RuntimeError( - "No HF token. Run `huggingface-cli login` or `export HF_TOKEN=hf_...`" - ) + raise RuntimeError("No HF token. Run `huggingface-cli login` or `export HF_TOKEN=hf_...`") CMD = ( "apt-get update -qq && apt-get install -y -qq git ffmpeg && " @@ -46,13 +44,16 @@ CMD = ( "--vlm.client_concurrency=256 " "--vlm.max_new_tokens=512 " "--executor.episode_parallelism=32 " - "--vlm.chat_template_kwargs='{enable_thinking: false}' " + "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' " "--vlm.camera_key=observation.images.wrist " "--module_1.frames_per_second=1.0 " "--module_1.use_video_url=true " "--module_1.use_video_url_fps=1.0 " - "--module_3.K=1 --module_3.vqa_emission_hz=0.2 " - "--push_to_hub=pepijn223/super_poulain_qwen36moe-3" + "--module_1.derive_task_from_video=always " + "--module_1.n_task_rephrasings=10 " + "--module_3.K=1 " + "--module_3.vqa_emission_hz=1.0 " + "--push_to_hub=pepijn223/super_poulain_full_tool2" ) job = run_job( diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_memory.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_memory.txt index 6a89ecefa..e9b2ee136 100644 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_memory.txt +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_memory.txt @@ -18,8 +18,14 @@ Previous memory: {prior_memory} Just-completed subtask: "{completed_subtask}" Remaining subtasks (for relevance judgement only): {remaining_subtasks} -Update the memory. Drop irrelevant detail. Compress completed steps. -Keep WHAT happened, drop HOW. Shorter is better. +Update the memory as a compact state note. + +Rules: +- Keep only facts needed later. +- Keep WHAT changed; drop HOW it was done. +- Use fragments when clear. +- Prefer: "bowl in box; lid still open" +- Avoid: "The robot placed the bowl into the box and the lid remains open." Output strictly valid JSON: - {{ "memory": "" }} + {{ "memory": "" }} diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt index b0121c977..528c6f0c9 100644 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_plan.txt @@ -1,18 +1,18 @@ You are the high-level planner for a robot demonstrating: "{episode_task}". -Given the subtask decomposition below, write a concise hierarchical PLAN -the robot should follow. Format the plan as a numbered list, one line per -high-level step. The plan describes the full task; subtasks are the atomic -skills used to execute it. +Given the subtask decomposition below, write a compact hierarchical PLAN. +Use short imperative fragments, like pi0.7 context prompts. Subtasks for context: {subtasks_text} Authoring rules: - 3 to {plan_max_steps} steps. -- Each step describes one logical chunk of the task, not one motion. +- Each step is one logical chunk, not one motion. - Steps must be in execution order. -- Plain prose, no JSON, no markdown headers. +- Brief commands, not full sentences. +- Prefer: "open air fryer"; avoid: "The robot should open the air fryer." +- Plain text, no markdown headers. Output strictly valid JSON: {{ "plan": "1. ...\n2. ...\n3. ..." }} diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt index 5d7c9cc8d..0229dbc4c 100644 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_subtasks.txt @@ -4,17 +4,18 @@ The user originally asked: "{episode_task}" You are shown the entire demonstration as a single video. Watch the whole clip, then segment it into a list of consecutive atomic subtasks -the robot performs. +the robot performs. Write compact action labels, not prose. Authoring rules — based on Hi Robot (Shi 2025) atom granularity and -Pi0.7 (Physical Intelligence 2025) "how, not what" detail: +pi0.7 (Physical Intelligence 2025) compact context prompts: - Each subtask is one atomic skill the low-level policy can execute, e.g. "pick up one piece of lettuce", "place the bowl into the box", "move the right arm to the left". -- Capture HOW the subtask is performed, not only WHAT — e.g. prefer +- Capture HOW when useful, but keep it brief — e.g. prefer "grasp the handle of the sponge with the left hand" to "pick up the sponge". +- Use verb phrases, not full sentences. - Subtasks are non-overlapping and cover the full episode in order. Choose the cut points yourself based on what you see in the video (gripper open/close events, contact, regrasps, transitions). diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_rephrasings.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_rephrasings.txt index d03a6bf8b..602892bd3 100644 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_rephrasings.txt +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_1_task_rephrasings.txt @@ -9,7 +9,7 @@ Original task: Generate exactly {n} alternative phrasings of the same task. Vary: - formality (casual / polite / curt) -- verbosity (short imperative vs longer polite request) +- verbosity (mostly short imperative; occasional polite request) - word choice (synonyms, different verbs) - sentence structure (imperative / question / suggestion) @@ -17,7 +17,7 @@ Hard rules: - Each phrasing MUST preserve the exact meaning of the original task. Do not change which object is involved, the destination, or the action. Do not add extra steps. Do not invent new objects. -- Each phrasing must be a single short sentence, plain prose, no +- Each phrasing must be a short phrase or sentence, plain prose, no markdown, no quotes, no list numbers. - Phrasings must be distinct — no near-duplicates. - Output exactly {n} entries. diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_initial_speech.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_2_initial_speech.txt index 6058b1f5c..625ce920c 100644 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_initial_speech.txt +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_2_initial_speech.txt @@ -1,10 +1,12 @@ The user just asked the robot: "{episode_task}". Generate a short verbal acknowledgement the robot would speak back before -beginning the task. Style: confident, friendly, single short sentence. +beginning the task. Style: compact, confident, friendly. Examples (Hi Robot, Shi 2025): "Sure, I won't put cheese on it.", "OK, starting with the sponge.", "Got it.". +Prefer very short replies: "Got it.", "On it.", "OK." + Output strictly valid JSON: {{ "text": "" }} diff --git a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt b/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt index d6f77883f..4a4719f54 100644 --- a/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt +++ b/src/lerobot/annotations/steerable_pipeline/prompts/module_2_interjection.txt @@ -14,12 +14,10 @@ subtask boundary in the demonstration: - Subtask the robot is about to start: "{next_subtask}" - Time into episode: {timestamp:.2f}s -Write ONE interjection the user would naturally say at this moment to -prompt / confirm / encourage the robot to do "{next_subtask}". Phrase it -like a real human mid-task remark — conversational, varied, sometimes -just a nudge, sometimes a clarification, sometimes a small constraint -that the upcoming motion happens to satisfy. Plus the robot's verbal -acknowledgement. +Write ONE compact interjection the user would naturally say at this +moment to prompt / confirm / encourage the robot to do "{next_subtask}". +Keep it like a mid-task coaching cue, not a full instruction paragraph. +Also write the robot's compact verbal acknowledgement. Hard rules: @@ -29,7 +27,9 @@ Hard rules: instead", DO NOT — those would contradict the demonstration. - The interjection must reference an object, location, or action that is plausible given the visible scene and the next subtask text. -- One sentence each. Conversational, not robotic. +- One short phrase or sentence each. Conversational, not robotic. +- Prefer direct cues: "{next_subtask}, please."; "Now {next_subtask}." +- Keep robot speech very short: "OK.", "On it.", "Doing that." Style examples (vary the phrasing — don't reuse these verbatim): - "Now go ahead and {next_subtask}." @@ -41,6 +41,6 @@ Style examples (vary the phrasing — don't reuse these verbatim): Output strictly valid JSON: {{ - "interjection": "", - "speech": "" + "interjection": "", + "speech": "" }}