fix: single level loop

add: inference benchmark
update
2026-05-31 19:01:28 +00:00 · 2025-09-24 01:06:13 +02:00 · 2025-09-23 22:34:52 +02:00 · 2025-09-23 21:52:15 +02:00 · 2025-09-23 21:52:14 +02:00 · 2025-09-23 18:47:36 +02:00
9 changed files with 399 additions and 15 deletions
--- a/README.md
+++ b/README.md
@@ -202,7 +202,7 @@ Check out [example 1](https://github.com/huggingface/lerobot/blob/main/examples/
 You can also locally visualize episodes from a dataset on the hub by executing our script from the command line:

 ```bash
-python -m lerobot.scripts.visualize_dataset \
+lerobot-dataset-viz \
    --repo-id lerobot/pusht \
    --episode-index 0
 ```
@@ -210,7 +210,7 @@ python -m lerobot.scripts.visualize_dataset \
 or from a dataset in a local folder with the `root` option and the `--local-files-only` (in the following case the dataset will be searched for in `./my_local_data_dir/lerobot/pusht`)

 ```bash
-python -m lerobot.scripts.visualize_dataset \
+lerobot-dataset-viz \
    --repo-id lerobot/pusht \
    --root ./my_local_data_dir \
    --local-files-only 1 \
@@ -221,7 +221,7 @@ It will open `rerun.io` and display the camera streams, robot states and actions

 https://github-production-user-asset-6210df.s3.amazonaws.com/4681518/328035972-fd46b787-b532-47e2-bb6f-fd536a55a7ed.mov?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240505%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240505T172924Z&X-Amz-Expires=300&X-Amz-Signature=d680b26c532eeaf80740f08af3320d22ad0b8a4e4da1bcc4f33142c15b509eda&X-Amz-SignedHeaders=host&actor_id=24889239&key_id=0&repo_id=748713144

-Our script can also visualize datasets stored on a distant server. See `python -m lerobot.scripts.visualize_dataset --help` for more instructions.
+Our script can also visualize datasets stored on a distant server. See `lerobot-dataset-viz --help` for more instructions.

 ### The `LeRobotDataset` format

--- a/benchmarks/policies/inference.py
+++ b/benchmarks/policies/inference.py
@@ -0,0 +1,378 @@
+"""
+Benchmark memory footprint and inference latency of a policy on arbitrary devices.
+
+This script loads a pretrained policy directly (similar to the async inference server)
+and generates dummy input data based on the policy's input_features to perform
+accurate benchmarking without requiring datasets.
+"""
+
+import argparse
+import os
+import signal
+import statistics
+from contextlib import contextmanager
+from datetime import datetime
+from pathlib import Path
+
+import psutil
+import torch
+from tqdm import tqdm
+
+from lerobot.configs.types import FeatureType
+from lerobot.policies.factory import get_policy_class
+from lerobot.policies.pretrained import PreTrainedPolicy
+
+
+class TimeoutException(Exception):
+    """Raised by the ``timeout`` context manager when its time budget is exceeded.
+
+    Deriving from ``Exception`` is required: Python refuses to ``raise`` or
+    ``except`` a class that does not inherit from ``BaseException``.
+    """
+
+
+@contextmanager
+def timeout(seconds):
+    """Context manager that raises ``TimeoutException`` if its body runs too long.
+
+    A real-time interval timer (``ITIMER_REAL``) is armed for ``seconds``
+    (fractional values allowed) and a temporary ``SIGALRM`` handler raises
+    ``TimeoutException`` when it fires. On exit the timer is cancelled and the
+    previous handler is restored. Where ``signal.SIGALRM`` does not exist
+    (e.g. Windows) the body simply runs without any time limit.
+    """
+    if hasattr(signal, "SIGALRM"):
+
+        def _on_alarm(signum, frame):
+            raise TimeoutException(f"Timed out after {seconds} seconds")
+
+        previous_handler = signal.signal(signal.SIGALRM, _on_alarm)
+        try:
+            # setitimer (unlike signal.alarm) accepts float seconds
+            signal.setitimer(signal.ITIMER_REAL, seconds)
+            yield
+        finally:
+            # Disarm first so the alarm cannot fire once the old handler is back
+            signal.setitimer(signal.ITIMER_REAL, 0)
+            signal.signal(signal.SIGALRM, previous_handler)
+    else:
+        # No SIGALRM on this platform: run the body unguarded
+        yield
+
+
+def bytes_to_human(n: int) -> str:
+    """Format a byte count as a human-readable string using 1024-based units.
+
+    Args:
+        n: Number of bytes. May be negative (e.g. a memory delta that shrank);
+            the magnitude is scaled and the sign is kept.
+
+    Returns:
+        The value with two decimals and a unit from B to PB, e.g. ``"1.50 KB"``
+        or ``"-5.00 MB"``.
+    """
+    sign = "-" if n < 0 else ""
+    # Scale on the magnitude: a negative value is always "< 1024" and would
+    # otherwise be reported in raw bytes regardless of its size.
+    size = float(abs(n))
+    for unit in ("B", "KB", "MB", "GB", "TB"):
+        if size < 1024:
+            return f"{sign}{size:.2f} {unit}"
+        size /= 1024
+    return f"{sign}{size:.2f} PB"
+
+
+def percentile(values: list[float], p: float) -> float:
+    """Return the ``p``-th percentile of ``values`` by linear interpolation.
+
+    ``values`` is indexed as given, so it must already be sorted in ascending
+    order for the result to be a percentile. ``p`` is expressed on a 0-100
+    scale. An empty list yields ``nan``.
+    """
+    count = len(values)
+    if count == 0:
+        return float("nan")
+
+    # Fractional rank of the requested percentile within the list
+    rank = (count - 1) * (p / 100.0)
+    lower = int(rank)
+    upper = min(lower + 1, count - 1)
+    if lower == upper:
+        return values[lower]
+
+    # Interpolate between the two neighbouring samples
+    return values[lower] + (values[upper] - values[lower]) * (rank - lower)
+
+
+def generate_dummy_observation(input_features: dict, device: str = "cpu") -> dict:
+    """Build a random, batched observation matching the policy's input features.
+
+    For every entry of ``input_features`` a float32 tensor of the feature's
+    ``shape`` is created on ``device`` and given a leading batch dimension of 1.
+    Visual features are sampled uniformly from [0, 1); every other feature type
+    is sampled from a standard normal. An empty ``"task"`` string is added for
+    language-conditioned policies.
+    """
+    observation = {}
+
+    for name, spec in input_features.items():
+        # Images mimic already-normalized pixels; state/action/env and any
+        # unknown feature type all use the same standard-normal sampling.
+        sampler = torch.rand if spec.type == FeatureType.VISUAL else torch.randn
+        sample = sampler(spec.shape, dtype=torch.float32, device=device)
+        # Add batch dimension
+        observation[name] = sample.unsqueeze(0)
+
+    # Add task string for language-conditioned policies
+    observation["task"] = ""
+    return observation
+
+
+def main():
+    """Benchmark inference latency and memory footprint of a pretrained policy.
+
+    Parses the command-line arguments, loads the policy with ``from_pretrained``,
+    builds one dummy observation from the policy's input features and calls
+    ``select_action`` on it for ``--warmup`` untimed and ``--num-samples`` timed
+    passes. A summary is printed and a text report is written to ``--output-dir``.
+
+    Timing uses CUDA events on CUDA and ``time.perf_counter`` on CPU/MPS (with a
+    device synchronization inside the timed region). A pass that exceeds
+    ``--timeout`` is counted and recorded with the timeout value as its latency.
+    """
+    parser = argparse.ArgumentParser(description="Policy inference benchmark")
+    parser.add_argument(
+        "--policy-id", type=str, required=True, help="Model ID or local path to pretrained policy"
+    )
+    parser.add_argument(
+        "--policy-type", type=str, required=True, help="Type of policy (smolvla, act, diffusion, etc.)"
+    )
+    parser.add_argument(
+        "--device", type=str, default="mps", choices=["cuda", "cpu", "mps"], help="Device to run on"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    parser.add_argument(
+        "--num-samples", type=int, default=100, help="Number of inference samples to benchmark"
+    )
+    parser.add_argument("--warmup", type=int, default=10, help="Number of warmup samples (not timed)")
+    parser.add_argument(
+        "--output-dir", type=str, default="outputs/benchmarks", help="Directory to save benchmark results"
+    )
+    parser.add_argument(
+        "--timeout",
+        type=float,
+        default=0.3,
+        help="Timeout for each inference pass in seconds (default: 0.3s = 300ms)",
+    )
+    args = parser.parse_args()
+
+    # Seed & deterministic-ish setup
+    torch.manual_seed(args.seed)
+    if args.device == "cuda":
+        torch.cuda.manual_seed_all(args.seed)
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cudnn.deterministic = False  # leave False to avoid perf cliffs
+
+    # Resolve device availability
+    device = args.device.lower()
+    if device == "cuda" and not torch.cuda.is_available():
+        print("[!] CUDA requested but unavailable. Falling back to CPU.")
+        device = "cpu"
+    elif device == "mps" and not (hasattr(torch.backends, "mps") and torch.backends.mps.is_available()):
+        print("[!] MPS requested but unavailable. Falling back to CPU.")
+        device = "cpu"
+
+    use_cuda = device == "cuda"
+
+    # Create output directory and log file
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    # Path separators in the policy id would otherwise create sub-directories
+    policy_name = args.policy_id.replace("/", "_").replace("\\", "_")
+    log_file = output_dir / f"benchmark_{args.policy_type}_{policy_name}_{device}_{timestamp}.txt"
+
+    # Load policy directly from pretrained (similar to async inference server)
+    print(f"Loading policy {args.policy_type} from {args.policy_id}...")
+    policy_class = get_policy_class(args.policy_type)
+    policy: PreTrainedPolicy = policy_class.from_pretrained(args.policy_id)
+    policy.eval()
+    policy.to(device)
+
+    print(f"Policy loaded on {device}")
+    print(f"Input features: {list(policy.config.input_features.keys())}")
+    print(f"Output features: {list(policy.config.output_features.keys())}")
+
+    # Generate dummy observation based on policy input features
+    # (the empty "task" entry is already added by generate_dummy_observation)
+    dummy_observation = generate_dummy_observation(policy.config.input_features, device)
+
+    # Helper to sync for fair timings (no-op on CPU)
+    def _sync(dev_=device):
+        if dev_ == "cuda" and torch.cuda.is_available():
+            torch.cuda.synchronize()
+        elif dev_ == "mps" and hasattr(torch, "mps"):
+            try:
+                torch.mps.synchronize()
+            except AttributeError:
+                pass  # MPS sync not available in this PyTorch version
+
+    # Warmup (to stabilize kernels/caches)
+    print("Warming up...")
+    with torch.no_grad():
+        policy.reset()
+        for _ in range(args.warmup):
+            _ = policy.select_action(dummy_observation)
+        _sync()
+
+    # Memory footprint before timing
+    process = psutil.Process(os.getpid())
+    rss_before = process.memory_info().rss
+    if use_cuda:
+        torch.cuda.reset_peak_memory_stats()
+
+    # PyTorch timing with Event objects for more accurate GPU timing
+    print(f"Running benchmark: {args.num_samples} samples...")
+
+    if use_cuda:
+        # Use CUDA Events for precise GPU timing
+        start_events = []
+        end_events = []
+        timeout_count = 0
+
+        with torch.no_grad():
+            for forward in tqdm(range(args.num_samples), desc="Trials"):
+                start_event = torch.cuda.Event(enable_timing=True)
+                end_event = torch.cuda.Event(enable_timing=True)
+                try:
+                    with timeout(args.timeout):
+                        start_event.record()
+                        _ = policy.select_action(dummy_observation)
+                        end_event.record()
+
+                    start_events.append(start_event)
+                    end_events.append(end_event)
+                except TimeoutException:
+                    timeout_count += 1
+                    # Add placeholder for timeout
+                    start_events.append(None)
+                    end_events.append(None)
+                    print(f"\n[!] Timeout on forward {forward + 1}")
+                    continue
+
+        # Synchronize and collect timing results; elapsed_time is only valid
+        # once both events have completed on the device
+        torch.cuda.synchronize()
+        per_forward_ms = []
+        for start_event, end_event in zip(start_events, end_events, strict=True):
+            if start_event is None:
+                # Timed-out pass: record the timeout itself as the latency
+                per_forward_ms.append(args.timeout * 1000)
+            else:
+                per_forward_ms.append(start_event.elapsed_time(end_event))
+
+        if timeout_count > 0:
+            print(f"[!] {timeout_count} inference passes timed out (>{args.timeout * 1000:.1f}ms)")
+
+    else:
+        # Use simple time.perf_counter for CPU/MPS timing with timeout
+        import time
+
+        per_forward_ms = []
+        timeout_count = 0
+
+        with torch.no_grad():
+            for sample in tqdm(range(args.num_samples), desc="Samples"):
+                try:
+                    with timeout(args.timeout):
+                        start_time = time.perf_counter()
+                        _ = policy.select_action(dummy_observation)
+                        # Wait for queued MPS work before reading the clock so the
+                        # measurement covers execution, not just dispatch (no-op on CPU)
+                        _sync()
+                        end_time = time.perf_counter()
+
+                    per_forward_ms.append((end_time - start_time) * 1000)  # Convert to ms
+                except TimeoutException:
+                    timeout_count += 1
+                    # Timed-out pass: record the timeout itself as the latency
+                    per_forward_ms.append(args.timeout * 1000)
+                    print(f"\n[!] Timeout on sample {sample + 1}")
+                    continue
+
+        if timeout_count > 0:
+            print(f"[!] {timeout_count} inference passes timed out (>{args.timeout * 1000:.1f}ms)")
+
+    # Memory footprint after timing
+    rss_after = process.memory_info().rss
+    rss_delta = rss_after - rss_before
+    cuda_peak = torch.cuda.max_memory_allocated() if use_cuda else 0
+
+    # Sort timing results for percentile calculations
+    per_forward_ms_sorted = sorted(per_forward_ms)
+
+    mean_ms = statistics.fmean(per_forward_ms) if per_forward_ms else float("nan")
+    std_ms = statistics.pstdev(per_forward_ms) if len(per_forward_ms) > 1 else 0.0
+    min_ms = per_forward_ms_sorted[0] if per_forward_ms_sorted else float("nan")
+    max_ms = per_forward_ms_sorted[-1] if per_forward_ms_sorted else float("nan")
+    p50_ms = percentile(per_forward_ms_sorted, 50)
+    p95_ms = percentile(per_forward_ms_sorted, 95)
+
+    # Model size
+    num_params = sum(p.numel() for p in policy.parameters())
+
+    # Prepare results for logging
+    results = {
+        "timestamp": datetime.now().isoformat(),
+        "policy_type": args.policy_type,
+        "policy_id": args.policy_id,
+        "device": device,
+        "num_trials": args.num_samples,
+        "forwards_per_trial": 1,
+        "warmup": args.warmup,
+        "timeout_ms": args.timeout * 1000,
+        "seed": args.seed,
+        "num_params": num_params,
+        "timeout_count": timeout_count,
+        "latency_mean_ms": mean_ms,
+        "latency_std_ms": std_ms,
+        "latency_min_ms": min_ms,
+        "latency_max_ms": max_ms,
+        "latency_p50_ms": p50_ms,
+        "latency_p95_ms": p95_ms,
+        "cpu_rss_before": rss_before,
+        "cpu_rss_after": rss_after,
+        "cpu_rss_delta": rss_delta,
+        "cuda_peak_alloc": cuda_peak,
+        "input_features": list(policy.config.input_features.keys()),
+        "output_features": list(policy.config.output_features.keys()),
+    }
+
+    # Format and write results to log file
+    log_content = f"""
+=== LeRobot Policy Inference Benchmark ===
+Timestamp: {results["timestamp"]}
+Policy: {results["policy_type"]} ({results["policy_id"]})
+Device: {results["device"]}
+Seed: {results["seed"]}
+
+=== Model Information ===
+Parameters: {results["num_params"]:,}
+Input Features: {", ".join(results["input_features"])}
+Output Features: {", ".join(results["output_features"])}
+
+=== Benchmark Configuration ===
+Samples: {results["num_trials"]}
+Warmup: {results["warmup"]}
+Total Measurements: {len(per_forward_ms)}
+Timeout: {results["timeout_ms"]:.1f}ms
+Timeouts: {results["timeout_count"]} / {results["num_trials"]}
+
+=== Latency Results (ms) ===
+Mean:     {results["latency_mean_ms"]:.3f}
+Std Dev:  {results["latency_std_ms"]:.3f}
+Min:      {results["latency_min_ms"]:.3f}
+Max:      {results["latency_max_ms"]:.3f}
+P50:      {results["latency_p50_ms"]:.3f}
+P95:      {results["latency_p95_ms"]:.3f}
+
+=== Memory Footprint ===
+CPU RSS Before: {bytes_to_human(results["cpu_rss_before"])}
+CPU RSS After:  {bytes_to_human(results["cpu_rss_after"])} (Δ {bytes_to_human(results["cpu_rss_delta"])})
+"""
+
+    if use_cuda:
+        log_content += f"CUDA Peak:      {bytes_to_human(results['cuda_peak_alloc'])} (reset before timing)\n"
+
+    log_content += f"""
+=== Raw Timing Data (first 20 measurements, ms) ===
+{", ".join(f"{t:.3f}" for t in per_forward_ms[:20])}
+{"..." if len(per_forward_ms) > 20 else ""}
+
+=== Summary Statistics ===
+Timing Method: {"CUDA Events" if use_cuda else "time.perf_counter"}
+Device Available: {torch.cuda.is_available() if device == "cuda" else torch.backends.mps.is_available() if device == "mps" else True}
+PyTorch Version: {torch.__version__}
+
+Benchmark completed successfully at {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
+"""
+
+    # Write to log file; the report contains non-ASCII text ("Δ"), so the
+    # encoding is pinned instead of relying on the platform default
+    with open(log_file, "w", encoding="utf-8") as f:
+        f.write(log_content)
+
+    # Print to console (shorter version)
+    print("\n=== Inference Benchmark Results ===")
+    print(f"Policy: {args.policy_type} ({args.policy_id})")
+    print(f"Device: {device}")
+    print(f"Samples: {args.num_samples} | Warmup: {args.warmup}")
+    print(f"Model params: {num_params:,}")
+
+    print("\nLatency per forward (ms):")
+    print(f"  mean: {mean_ms:.3f}  std: {std_ms:.3f}")
+    print(f"  min:  {min_ms:.3f}   max: {max_ms:.3f}")
+    print(f"  p50:  {p50_ms:.3f}   p95: {p95_ms:.3f}")
+
+    print("\nMemory footprint:")
+    print(f"  CPU RSS before: {bytes_to_human(rss_before)}")
+    print(f"  CPU RSS after : {bytes_to_human(rss_after)}  (Δ {bytes_to_human(rss_delta)})")
+    if use_cuda:
+        print(
+            f"  CUDA peak allocated: {bytes_to_human(cuda_peak)} "
+            f"(reset by reset_peak_memory_stats before timing)"
+        )
+
+    print(f"\nResults saved to: {log_file}")
+    print("Benchmark completed successfully!")
+
+
+if __name__ == "__main__":
+    main()
--- a/docs/source/lerobot-dataset-v3.mdx
+++ b/docs/source/lerobot-dataset-v3.mdx
@@ -246,7 +246,7 @@ You can also use any `torchvision.transforms.v2` transform by passing it directl
 Use the visualization script to preview how transforms affect your data:

 ```bash
-python -m lerobot.scripts.visualize_image_transforms \
+lerobot-imgtransform-viz \
  --repo-id=your-username/your-dataset \
  --output-dir=./transform_examples \
  --n-examples=5
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -171,7 +171,9 @@ lerobot-setup-motors="lerobot.setup_motors:main"
 lerobot-teleoperate="lerobot.teleoperate:main"
 lerobot-eval="lerobot.scripts.eval:main"
 lerobot-train="lerobot.scripts.train:main"
+lerobot-dataset-viz="lerobot.scripts.lerobot_dataset_viz:main"
 lerobot-info="lerobot.scripts.lerobot_info:main"
+lerobot-imgtransform-viz="lerobot.scripts.lerobot_imgtransform_viz:main"

 # ---------------- Tool Configurations ----------------
 [tool.setuptools.packages.find]
--- a/src/lerobot/robots/viperx/README.md
+++ b/src/lerobot/robots/viperx/README.md
@@ -118,7 +118,7 @@ echo ${HF_USER}/aloha_test
 If you didn't upload with `--control.push_to_hub=false`, you can also visualize it locally with [Rerun](https://github.com/rerun-io/rerun):

 ```bash
-python -m lerobot.scripts.visualize_dataset \
+lerobot-dataset-viz \
  --repo-id ${HF_USER}/aloha_test --episode 0
 ```

--- a/src/lerobot/scripts/lerobot_dataset_viz.py
+++ b/src/lerobot/scripts/lerobot_dataset_viz.py
@@ -29,14 +29,14 @@ Examples:

 - Visualize data stored on a local machine:
 ```
-local$ python -m lerobot.scripts.visualize_dataset \
+local$ lerobot-dataset-viz \
    --repo-id lerobot/pusht \
    --episode-index 0
 ```

 - Visualize data stored on a distant machine with a local viewer:
 ```
-distant$ python -m lerobot.scripts.visualize_dataset \
+distant$ lerobot-dataset-viz \
    --repo-id lerobot/pusht \
    --episode-index 0 \
    --save 1 \
@@ -50,7 +50,7 @@ local$ rerun lerobot_pusht_episode_0.rrd
 (You need to forward the websocket port to the distant machine, with
 `ssh -L 9087:localhost:9087 username@remote-host`)
 ```
-distant$ python -m lerobot.scripts.visualize_dataset \
+distant$ lerobot-dataset-viz \
    --repo-id lerobot/pusht \
    --episode-index 0 \
    --mode distant \
--- a/src/lerobot/scripts/visualize_image_transforms.py
+++ b/src/lerobot/scripts/visualize_image_transforms.py
@@ -20,10 +20,10 @@ Additionally, each individual transform can be visualized separately as well as

 Example:
 ```bash
-python -m lerobot.scripts.visualize_image_transforms \
-    --repo_id=lerobot/pusht \
-    --episodes='[0]' \
-    --image_transforms.enable=True
+lerobot-imgtransform-viz \
+  --repo_id=lerobot/pusht \
+  --episodes='[0]' \
+  --image_transforms.enable=True
 ```
 """

@@ -126,5 +126,9 @@ def visualize_image_transforms(cfg: DatasetConfig, output_dir: Path = OUTPUT_DIR
    save_each_transform(cfg.image_transforms, original_frame, output_dir, n_examples)


-if __name__ == "__main__":
+def main():
    visualize_image_transforms()
+
+
+if __name__ == "__main__":
+    main()
--- a/tests/datasets/test_image_transforms.py
+++ b/tests/datasets/test_image_transforms.py
@@ -29,7 +29,7 @@ from lerobot.datasets.transforms import (
    SharpnessJitter,
    make_transform_from_config,
 )
-from lerobot.scripts.visualize_image_transforms import (
+from lerobot.scripts.lerobot_imgtransform_viz import (
    save_all_transforms,
    save_each_transform,
 )
--- a/tests/datasets/test_visualize_dataset.py
+++ b/tests/datasets/test_visualize_dataset.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 import pytest

-from lerobot.scripts.visualize_dataset import visualize_dataset
+from lerobot.scripts.lerobot_dataset_viz import visualize_dataset


@pytest.mark.skip("TODO: add dummy videos")
Author	SHA1	Message	Date
Francesco Capuano	6eaf6a861a	fix: single level loop	2025-09-24 01:06:13 +02:00
Francesco Capuano	cdd6cb606c	add: inference benchmark	2025-09-23 22:34:52 +02:00
Jade Choghari	f6cd24be17	update Signed-off-by: Jade Choghari <chogharijade@gmail.com>	2025-09-23 21:52:15 +02:00
Jade Choghari	54c6b8ae52	add file Signed-off-by: Jade Choghari <chogharijade@gmail.com>	2025-09-23 21:52:14 +02:00
Steven Palma	c9787bd98a	feat(script): add entry point for image transform viz (#2007 ) * feat(Scripts): add entry point for img transform viz * chore(style): pre-commit style	2025-09-23 18:47:36 +02:00
Steven Palma	c435d3cebc	feat(script): add entry point for dataset viz (#2006 ) * chore(scripts): rename script dataset viz * feat(scripts): add entry point for dataset-viz --------- Signed-off-by: Steven Palma <imstevenpmwork@ieee.org>	2025-09-23 18:46:27 +02:00