add some comments, license and readme

This commit is contained in:
Pepijn
2025-09-11 10:36:38 +02:00
parent 494aa576b2
commit f613a37cd2
8 changed files with 205 additions and 17 deletions

View File

@@ -0,0 +1,92 @@
# π₀.₅ (pi05)
This repository contains the Hugging Face port of **π₀.₅**, adapted from [OpenPI](https://github.com/Physical-Intelligence/openpi) by Physical Intelligence.
It is designed as a **Vision-Language-Action model with open-world generalization**.
---
### ⚠️ WARNING ⚠️
This project requires **patching the Hugging Face `transformers` library**.
1. Make sure you have the exact version installed:
```bash
pip show transformers
```
It must be version **4.53.2**.
2. Apply the custom patches by copying the modified files into your environment:
```bash
cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
$(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
```
These patches overwrite parts of `transformers` to:
- Support **AdaRMS** (adaptive RMS normalization),
- Correctly control the precision of activations,
- Allow the KV cache to be used without updates.
**Important:**
- This permanently modifies your `transformers` installation.
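- The patched files stay in place until you reinstall a clean copy of `transformers` (see below) or recreate the environment. If your package manager hard-links packages from a shared cache (e.g. `uv`), the cached copy may be patched as well and must be cleared before reinstalling.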
To undo and restore a clean state:
```bash
pip uninstall transformers
pip install transformers==4.53.2
```
---
## Model Overview
| Feature | π₀ | π₀.₅ |
| -------------------- | ------------------------------------------------------ | ----------------------------------------- |
| State Embedding | Uses `state_proj` layer | No state embedding |
| Time Conditioning | Concatenates time with actions via `action_time_mlp_*` | Uses `time_mlp_*` for AdaRMS conditioning |
| AdaRMS | Not used | Used in action expert |
| Tokenizer Length | 48 tokens | 200 tokens |
| Discrete State Input | False | True |
| Parameter Count | Higher (includes state embedding) | Lower (no state embedding) |
---
## Citation
If you use this work, please cite both **OpenPI** and the π₀.₅ paper:
```bibtex
@misc{openpi2024,
author = {Physical Intelligence Lab},
title = {OpenPI: PyTorch Implementation of π0 and π0.5 Policies},
year = {2024},
publisher = {GitHub},
howpublished = {\url{https://github.com/Physical-Intelligence/openpi}},
license = {Apache-2.0}
}
@misc{intelligence2025pi05visionlanguageactionmodelopenworld,
title = {π₀.₅: a Vision-Language-Action Model with Open-World Generalization},
author = {Physical Intelligence and Kevin Black and Noah Brown and James Darpinian and Karan Dhabalia and Danny Driess and Adnan Esmail and Michael Equi and Chelsea Finn and Niccolo Fusai and Manuel Y. Galliker and Dibya Ghosh and Lachy Groom and Karol Hausman and Brian Ichter and Szymon Jakubczak and Tim Jones and Liyiming Ke and Devin LeBlanc and Sergey Levine and Adrian Li-Bell and Mohith Mothukuri and Suraj Nair and Karl Pertsch and Allen Z. Ren and Lucy Xiaoyang Shi and Laura Smith and Jost Tobias Springenberg and Kyle Stachowicz and James Tanner and Quan Vuong and Homer Walke and Anna Walling and Haohuan Wang and Lili Yu and Ury Zhilinsky},
year = {2025},
eprint = {2504.16054},
archivePrefix= {arXiv},
primaryClass = {cs.LG},
url = {https://arxiv.org/abs/2504.16054},
}
```
---
## License
This port follows the **Apache 2.0 License**, consistent with the original [OpenPI repository](https://github.com/Physical-Intelligence/openpi).

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
# Copyright 2024 Tony Z. Zhao and The HuggingFace Inc. team. All rights reserved.
# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
# Copyright 2024 Tony Z. Zhao and The HuggingFace Inc. team. All rights reserved.
# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
# Copyright 2024 Tony Z. Zhao and The HuggingFace Inc. team. All rights reserved.
# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -35,7 +35,7 @@ from lerobot.policies.pretrained import PreTrainedPolicy
# Helper functions
def get_safe_dtype(target_dtype, device_type): # see openpi `get_safe_dtype`
def get_safe_dtype(target_dtype, device_type): # see openpi `get_safe_dtype` (exact copy)
"""Get a safe dtype for the given device type."""
if device_type == "cpu":
# CPU doesn't support bfloat16, use float32 instead
@@ -46,7 +46,7 @@ def get_safe_dtype(target_dtype, device_type): # see openpi `get_safe_dtype`
return target_dtype
def create_sinusoidal_pos_embedding( # see openpi `create_sinusoidal_pos_embedding`
def create_sinusoidal_pos_embedding( # see openpi `create_sinusoidal_pos_embedding` (exact copy)
time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu"
) -> Tensor:
"""Computes sine-cosine positional embedding vectors for scalar positions."""
@@ -66,14 +66,14 @@ def create_sinusoidal_pos_embedding( # see openpi `create_sinusoidal_pos_embedd
return torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
def sample_beta(alpha, beta, bsize, device): # see openpi `sample_beta`
def sample_beta(alpha, beta, bsize, device): # see openpi `sample_beta` (exact copy)
alpha_t = torch.as_tensor(alpha, dtype=torch.float32, device=device)
beta_t = torch.as_tensor(beta, dtype=torch.float32, device=device)
dist = torch.distributions.Beta(alpha_t, beta_t)
return dist.sample((bsize,))
def make_att_2d_masks(pad_masks, att_masks): # see openpi `make_att_2d_masks`
def make_att_2d_masks(pad_masks, att_masks): # see openpi `make_att_2d_masks` (exact copy)
"""Copied from big_vision.
Tokens can attend to valid inputs tokens which have a cumulative mask_ar
@@ -105,7 +105,7 @@ def make_att_2d_masks(pad_masks, att_masks): # see openpi `make_att_2d_masks`
return att_2d_masks & pad_2d_masks
def resize_with_pad_torch( # see openpi `resize_with_pad_torch`
def resize_with_pad_torch( # see openpi `resize_with_pad_torch` (exact copy)
images: torch.Tensor,
height: int,
width: int,
@@ -217,7 +217,9 @@ def get_gemma_config(variant: str) -> GemmaConfig: # see openpi `gemma.py: get_
raise ValueError(f"Unknown variant: {variant}")
class PaliGemmaWithExpertModel(nn.Module): # see openpi `gemma_pytorch.py: PaliGemmaWithExpertModel`
class PaliGemmaWithExpertModel(
nn.Module
): # see openpi `gemma_pytorch.py: PaliGemmaWithExpertModel`; this class is an almost exact copy of PaliGemmaWithExpertModel in openpi
"""PaliGemma model with action expert for PI05."""
def __init__(

View File

@@ -0,0 +1,92 @@
# π₀ (pi0)
This repository contains the Hugging Face port of **π₀**, adapted from [OpenPI](https://github.com/Physical-Intelligence/openpi) by Physical Intelligence.
It is designed as a **Vision-Language-Action flow model for general robot control**.
---
### ⚠️ WARNING ⚠️
This project requires **patching the Hugging Face `transformers` library**.
1. Make sure you have the exact version installed:
```bash
pip show transformers
```
It must be version **4.53.2**.
2. Apply the custom patches by copying the modified files into your environment:
```bash
cp -r ./src/lerobot/policies/pi0_openpi/transformers_replace/* \
$(python -c "import transformers, os; print(os.path.dirname(transformers.__file__))")
```
These patches overwrite parts of `transformers` to:
- Support **AdaRMS** (adaptive RMS normalization),
- Correctly control the precision of activations,
- Allow the KV cache to be used without updates.
**Important:**
- This permanently modifies your `transformers` installation.
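- The patched files stay in place until you reinstall a clean copy of `transformers` (see below) or recreate the environment. If your package manager hard-links packages from a shared cache (e.g. `uv`), the cached copy may be patched as well and must be cleared before reinstalling.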
To undo and restore a clean state:
```bash
pip uninstall transformers
pip install transformers==4.53.2
```
---
## Model Overview
| Feature | π₀ | π₀.₅ |
| -------------------- | ------------------------------------------------------ | ----------------------------------------- |
| State Embedding | Uses `state_proj` layer | No state embedding |
| Time Conditioning | Concatenates time with actions via `action_time_mlp_*` | Uses `time_mlp_*` for AdaRMS conditioning |
| AdaRMS | Not used | Used in action expert |
| Tokenizer Length | 48 tokens | 200 tokens |
| Discrete State Input | False | True |
| Parameter Count | Higher (includes state embedding) | Lower (no state embedding) |
---
## Citation
If you use this work, please cite both **OpenPI** and the π₀ paper:
```bibtex
@misc{openpi2024,
author = {Physical Intelligence Lab},
title = {OpenPI: PyTorch Implementation of π0 and π0.5 Policies},
year = {2024},
publisher = {GitHub},
howpublished = {\url{https://github.com/Physical-Intelligence/openpi}},
license = {Apache-2.0}
}
@misc{black2024pi0visionlanguageactionflowmodel,
title = {π₀: A Vision-Language-Action Flow Model for General Robot Control},
author = {Kevin Black and Noah Brown and Danny Driess and Adnan Esmail and Michael Equi and Chelsea Finn and Niccolo Fusai and Lachy Groom and Karol Hausman and Brian Ichter and Szymon Jakubczak and Tim Jones and Liyiming Ke and Sergey Levine and Adrian Li-Bell and Mohith Mothukuri and Suraj Nair and Karl Pertsch and Lucy Xiaoyang Shi and James Tanner and Quan Vuong and Anna Walling and Haohuan Wang and Ury Zhilinsky},
year = {2024},
eprint = {2410.24164},
archivePrefix= {arXiv},
primaryClass = {cs.LG},
url = {https://arxiv.org/abs/2410.24164},
}
```
---
## License
This port follows the **Apache 2.0 License**, consistent with the original [OpenPI repository](https://github.com/Physical-Intelligence/openpi).

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
# Copyright 2024 Tony Z. Zhao and The HuggingFace Inc. team. All rights reserved.
# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
# Copyright 2024 Tony Z. Zhao and The HuggingFace Inc. team. All rights reserved.
# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

View File

@@ -35,7 +35,7 @@ from lerobot.policies.pretrained import PreTrainedPolicy
# Helper functions
def get_safe_dtype(target_dtype, device_type): # see openpi `get_safe_dtype`
def get_safe_dtype(target_dtype, device_type): # see openpi `get_safe_dtype` (exact copy)
"""Get a safe dtype for the given device type."""
if device_type == "cpu":
# CPU doesn't support bfloat16, use float32 instead
@@ -46,7 +46,7 @@ def get_safe_dtype(target_dtype, device_type): # see openpi `get_safe_dtype`
return target_dtype
def create_sinusoidal_pos_embedding( # see openpi `create_sinusoidal_pos_embedding`
def create_sinusoidal_pos_embedding( # see openpi `create_sinusoidal_pos_embedding` (exact copy)
time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu"
) -> Tensor:
"""Computes sine-cosine positional embedding vectors for scalar positions."""
@@ -66,14 +66,14 @@ def create_sinusoidal_pos_embedding( # see openpi `create_sinusoidal_pos_embedd
return torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
def sample_beta(alpha, beta, bsize, device): # see openpi `sample_beta`
def sample_beta(alpha, beta, bsize, device): # see openpi `sample_beta` (exact copy)
alpha_t = torch.as_tensor(alpha, dtype=torch.float32, device=device)
beta_t = torch.as_tensor(beta, dtype=torch.float32, device=device)
dist = torch.distributions.Beta(alpha_t, beta_t)
return dist.sample((bsize,))
def make_att_2d_masks(pad_masks, att_masks): # see openpi `make_att_2d_masks`
def make_att_2d_masks(pad_masks, att_masks): # see openpi `make_att_2d_masks` (exact copy)
"""Copied from big_vision.
Tokens can attend to valid inputs tokens which have a cumulative mask_ar
@@ -105,7 +105,7 @@ def make_att_2d_masks(pad_masks, att_masks): # see openpi `make_att_2d_masks`
return att_2d_masks & pad_2d_masks
def resize_with_pad_torch( # see openpi `resize_with_pad_torch`
def resize_with_pad_torch( # see openpi `resize_with_pad_torch` (exact copy)
images: torch.Tensor,
height: int,
width: int,
@@ -217,7 +217,9 @@ def get_gemma_config(variant: str) -> GemmaConfig: # see openpi `gemma.py: get_
raise ValueError(f"Unknown variant: {variant}")
class PaliGemmaWithExpertModel(nn.Module): # see openpi `gemma_pytorch.py: PaliGemmaWithExpertModel`
class PaliGemmaWithExpertModel(
nn.Module
): # see openpi `gemma_pytorch.py: PaliGemmaWithExpertModel`; this class is an almost exact copy of PaliGemmaWithExpertModel in openpi
"""PaliGemma model with action expert for PI0."""
def __init__(