From 5ab3dfd7627f7cd5bec41f4cd7fa46facfb2db2b Mon Sep 17 00:00:00 2001
From: Jade Choghari
Date: Sun, 25 Jan 2026 15:51:50 +0100
Subject: [PATCH] add videoprism example

---
 .../policies/videovla/videoprism/test.py      | 65 +++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 src/lerobot/policies/videovla/videoprism/test.py

diff --git a/src/lerobot/policies/videovla/videoprism/test.py b/src/lerobot/policies/videovla/videoprism/test.py
new file mode 100644
index 000000000..7a188bd89
--- /dev/null
+++ b/src/lerobot/policies/videovla/videoprism/test.py
@@ -0,0 +1,65 @@
+"""Run VideoPrism on a sample clip and time its forward pass."""
+
+import time
+
+import numpy as np
+import torch
+from torchcodec.decoders import VideoDecoder
+
+from lerobot.policies.videovla.videoprism import (
+    VideoPrismVideoProcessor,
+    VideoPrismVisionModel,
+)
+
+MODEL_ID = "MHRDYN7/videoprism-base-f16r288"
+
+processor = VideoPrismVideoProcessor.from_pretrained(MODEL_ID)
+
+model = VideoPrismVisionModel.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float16,
+    device_map="auto",
+    attn_implementation="sdpa",
+)
+
+video_url = "https://huggingface.co/datasets/nateraw/kinetics-mini/resolve/main/val/archery/-Qz25rXdMjE_000014_000024.mp4"
+
+vr = VideoDecoder(video_url)
+frame_idx = np.arange(0, 64)
+video = vr.get_frames_at(indices=frame_idx).data  # T x C x H x W
+
+video = processor(video, return_tensors="pt")
+video = {k: v.to(model.device, model.dtype) for k, v in video.items()}
+
+
+def _sync():
+    """Wait for queued CUDA kernels; a no-op when the model is not on a GPU."""
+    if model.device.type == "cuda":
+        torch.cuda.synchronize(model.device)
+
+
+# Inference only: skip autograd bookkeeping so it neither skews the timings
+# nor keeps activations alive between iterations.
+with torch.inference_mode():
+    outputs = model(**video)
+    encoder_outputs = outputs.last_hidden_state
+    print(encoder_outputs.shape)
+
+    # warmup
+    for _ in range(10):
+        _ = model(**video)
+
+    times = []
+    for _ in range(50):
+        _sync()
+        t0 = time.perf_counter()
+
+        _ = model(**video)
+
+        _sync()
+        t1 = time.perf_counter()
+        times.append(t1 - t0)
+
+print(f"Mean: {1000*sum(times)/len(times):.2f} ms")
+print(f"Min : {1000*min(times):.2f} ms")
+print(f"Max : {1000*max(times):.2f} ms")