"""Asynchronous benchmarking script for an HTTP API.

Runs several request suites (completions at three context sizes, OCR, TTS,
ASR and file conversion) against a base URL, records per-request timing
metrics and writes a Markdown report.
"""

import argparse
import asyncio
import base64
import json
import logging
import os
import statistics
import sys
import time
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import httpx
from pydantic import BaseModel

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger("api_benchmarker")

# Constants
DEFAULT_BASE_URL = "http://localhost:8001"
DEFAULT_API_KEY = "your-secret-key-here"
CHARS_PER_TOKEN = 4  # rough characters-per-token ratio used for all estimates

# Task types that share a concurrency limit stored under a different key in
# ``ApiBenchmarker.semaphores``.  Without this mapping "tts" and "asr" would
# miss the "tts-asr" entry and silently use the completions limit instead.
_SEMAPHORE_KEY_BY_TASK = {"tts": "tts-asr", "asr": "tts-asr"}


# Data Generators
def get_dummy_base64_image() -> str:
    """Return a small fixed base64 string used as the OCR image payload."""
    return "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="


def get_dummy_base64_audio() -> str:
    """Return a fixed base64 string used as the ASR audio payload."""
    # A bit longer dummy audio to pass validation (44 bytes header + some data)
    return "UklGRigAAABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YQQAAAAAAA" + "A" * 100 + "=="


def generate_context_text(tokens: int) -> str:
    """Generate synthetic text of approximately 'tokens' tokens.

    The text is a repeated pangram cut to exactly
    ``tokens * CHARS_PER_TOKEN`` characters.
    """
    base_phrase = "The quick brown fox jumps over the lazy dog. "
    target_chars = tokens * CHARS_PER_TOKEN
    # One extra repetition guarantees the string is long enough to slice.
    repeat_count = target_chars // len(base_phrase) + 1
    return (base_phrase * repeat_count)[:target_chars]


def _positive_int(value: str) -> int:
    """argparse ``type`` callable accepting only integers >= 1.

    A concurrency of 0 would create ``asyncio.Semaphore(0)``, which can never
    be acquired, so the benchmark would hang; negative values make the
    semaphore constructor raise.  Rejecting both at the CLI gives a clear
    usage error instead.
    """
    number = int(value)  # a ValueError here is reported by argparse
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {value!r}"
        )
    return number


# Metric Models
class RequestMetric(BaseModel):
    """Outcome of a single benchmarked request."""

    task_name: str
    endpoint: str
    status_code: int  # 0 when no HTTP response was obtained
    ttfb_ms: float
    total_ms: float
    success: bool
    tokens: int  # estimated input + output tokens (see CHARS_PER_TOKEN)
    error: Optional[str] = None


class BenchStats:
    """Collects the metrics of one named suite and summarises them."""

    def __init__(self, name: str):
        self.name = name
        self.metrics: List[RequestMetric] = []
        # Both timestamps are time.perf_counter() readings set by the runner.
        self.start_time = 0.0
        self.end_time = 0.0

    def add(self, m: RequestMetric):
        """Record one request metric."""
        self.metrics.append(m)

    def get_summary(self) -> Dict[str, Any]:
        """Return aggregate statistics, or ``{}`` when nothing was recorded.

        Latency figures and token throughput are computed over successful
        requests only; when there are none they are reported as 0.
        ``rps`` counts every request, successful or not.
        """
        if not self.metrics:
            return {}

        total = len(self.metrics)
        successes = [m for m in self.metrics if m.success]

        # A single 0 keeps mean/median defined when every request failed.
        total_latencies = [m.total_ms for m in successes] or [0]
        ttfb_latencies = [m.ttfb_ms for m in successes] or [0]

        duration = self.end_time - self.start_time
        total_tokens = sum(m.tokens for m in successes)

        # P95 is the element at index int(n * 0.95) of the sorted sample;
        # that index is always < n for n >= 1.
        ordered = sorted(total_latencies)
        p95_latency = ordered[int(len(ordered) * 0.95)]

        return {
            "name": self.name,
            "total_requests": total,
            "success_rate": (len(successes) / total) * 100,
            "avg_latency": statistics.mean(total_latencies),
            "p50_latency": statistics.median(total_latencies),
            "p95_latency": p95_latency,
            "avg_ttfb": statistics.mean(ttfb_latencies),
            "tps": total_tokens / duration if duration > 0 else 0,
            "rps": total / duration if duration > 0 else 0,
            "duration": duration,
        }


# Benchmarking Engine
class ApiBenchmarker:
    """Runs request suites against the API and renders a Markdown report."""

    def __init__(self, base_url: str, api_key: str):
        self.base_url = base_url
        self.api_key = api_key
        self.headers = {"X-API-Key": api_key}
        # Per-endpoint-family concurrency limits; callers may replace entries.
        self.semaphores = {
            "completions": asyncio.Semaphore(5),
            "ocr": asyncio.Semaphore(2),
            "convert": asyncio.Semaphore(2),
            "tts-asr": asyncio.Semaphore(3),
        }
        self.results: Dict[str, BenchStats] = {}

    @staticmethod
    def _estimate_output_tokens(body: bytes) -> int:
        """Estimate output tokens from a JSON response body.

        Uses the first non-empty of the "content", "text" and "markdown"
        fields.  Returns 0 when the body is not a JSON object or the field is
        not a string; estimation is best-effort and must never fail a request.
        """
        try:
            parsed = json.loads(body)
        except (ValueError, RecursionError):
            # ValueError covers both invalid JSON and undecodable bytes.
            return 0
        if not isinstance(parsed, dict):
            return 0
        content = (
            parsed.get("content", "")
            or parsed.get("text", "")
            or parsed.get("markdown", "")
        )
        if not isinstance(content, str):
            return 0
        return len(content) // CHARS_PER_TOKEN

    @staticmethod
    def _request_spec(task_type: str, name: str) -> Tuple[str, str, Dict[str, Any]]:
        """Return ``(method, path, request kwargs)`` for one task type.

        Unknown task types fall back to a GET of the TTS/ASR status endpoint.
        """
        if task_type == "completions":
            # Stability Test Variation: the suite name selects the context size.
            prefix_len = 100
            if "Normal" in name:
                prefix_len = 1000
            if "Long" in name:
                prefix_len = 4000
            return "POST", "/v1/completions", {
                "json": {
                    "prefix": generate_context_text(prefix_len),
                    "suffix": "End of document.",
                    "model_thinking": "low",
                }
            }
        if task_type == "ocr":
            return "POST", "/v1/ocr", {
                "json": {
                    "image": get_dummy_base64_image(),
                    "filename": "bench.png",
                }
            }
        if task_type == "convert":
            return "POST", "/v1/convert", {
                "json": {
                    "file": base64.b64encode(b"Performance test data").decode(),
                    "filename": "bench.txt",
                }
            }
        if task_type == "tts":
            return "POST", "/v1/tts-asr/tts", {
                "json": {
                    "text": "This is a performance benchmark for the text to speech engine.",
                    "voice": "v2/en_speaker_6",
                    "format": "wav",
                }
            }
        if task_type == "asr":
            return "POST", "/v1/tts-asr/asr", {
                "json": {
                    "audio_base64": get_dummy_base64_audio(),
                    "language": "en",
                }
            }
        return "GET", "/v1/tts-asr/status", {}

    async def _execute_request(
        self,
        client: httpx.AsyncClient,
        name: str,
        method: str,
        path: str,
        **kwargs,
    ) -> RequestMetric:
        """Send one request and return its timing metric.

        Never raises for request failures: any exception raised while sending
        or reading is converted into a failed metric with ``status_code`` 0.

        Args:
            client: Shared HTTP client used to send the request.
            name: Suite name stored in the metric.
            method: HTTP method.
            path: Path appended to the base URL.
            **kwargs: Passed through to ``client.stream``; a ``json`` dict, if
                present, is also used to estimate input tokens.
        """
        # Avoid "//" in the URL when the base URL ends with a slash.
        url = f"{self.base_url.rstrip('/')}{path}"
        ttfb = 0.0
        tokens_count = 0

        # Estimate input tokens from the text-bearing request fields.
        payload = kwargs.get("json")
        if isinstance(payload, dict):
            input_text = str(payload.get("prefix", "")) + str(payload.get("text", ""))
            tokens_count += len(input_text) // CHARS_PER_TOKEN

        start = time.perf_counter()
        try:
            async with client.stream(method, url, **kwargs) as response:
                # TTFB is taken when the streamed response object becomes
                # available, before the body has been read.
                ttfb = (time.perf_counter() - start) * 1000
                body = await response.aread()
                total_ms = (time.perf_counter() - start) * 1000

                success = 200 <= response.status_code < 300
                error_msg = None
                if success:
                    tokens_count += self._estimate_output_tokens(body)
                else:
                    # Keep only a short excerpt of the error body.
                    error_msg = body.decode(errors="ignore")[:200]

                return RequestMetric(
                    task_name=name,
                    endpoint=path,
                    status_code=response.status_code,
                    ttfb_ms=ttfb,
                    total_ms=total_ms,
                    success=success,
                    tokens=tokens_count,
                    error=error_msg,
                )
        except Exception as exc:
            total_ms = (time.perf_counter() - start) * 1000
            return RequestMetric(
                task_name=name,
                endpoint=path,
                status_code=0,
                # If the failure happened before a response arrived there is
                # no TTFB reading, so the total time is used instead.
                ttfb_ms=ttfb or total_ms,
                total_ms=total_ms,
                success=False,
                tokens=tokens_count,
                # Include the type: str() of some exceptions is empty.
                error=f"{type(exc).__name__}: {exc}",
            )

    async def run_task(
        self,
        client: httpx.AsyncClient,
        task_type: str,
        name: str,
        iterations: int,
    ):
        """Run ``iterations`` requests of one task type and record the metrics.

        Requests are started together and throttled by the semaphore that
        belongs to the task type ("tts" and "asr" share the "tts-asr" limit;
        unknown types use the completions limit).  Results are stored in
        ``self.results[name]``.
        """
        if name not in self.results:
            self.results[name] = BenchStats(name)
        stats = self.results[name]

        sem_key = _SEMAPHORE_KEY_BY_TASK.get(task_type, task_type)
        sem = self.semaphores.get(sem_key, self.semaphores["completions"])

        # The payload is identical for every request of a suite, so build it
        # once instead of once per request.
        method, path, request_kwargs = self._request_spec(task_type, name)

        async def worker():
            async with sem:
                metric = await self._execute_request(
                    client, name, method, path, **request_kwargs
                )
            stats.add(metric)

        stats.start_time = time.perf_counter()
        await asyncio.gather(*(worker() for _ in range(iterations)))
        stats.end_time = time.perf_counter()

    def generate_report(self, output_file: str):
        """Write a Markdown report of all recorded suites to ``output_file``.

        Suites without any recorded metric are omitted.
        """
        # Compute each summary once; it is used by both report sections.
        summaries = []
        for name, stats in self.results.items():
            summary = stats.get_summary()
            if summary:
                summaries.append((name, stats, summary))

        report = [
            f"# API Benchmarking Report ({datetime.now().strftime('%Y-%m-%d %H:%M:%S')})",
            f"\n**Base URL:** `{self.base_url}`",
            # Summary Table
            "\n## Executive Summary",
            "| Task | Success Rate | Avg TTFB | Avg Latency | P95 Latency | TPS | RPS |",
            "| :--- | :--- | :--- | :--- | :--- | :--- | :--- |",
        ]
        for _, _, s in summaries:
            report.append(
                f"| {s['name']} | {s['success_rate']:.1f}% | {s['avg_ttfb']:.1f}ms | "
                f"{s['avg_latency']:.1f}ms | {s['p95_latency']:.1f}ms | "
                f"{s['tps']:.1f} | {s['rps']:.2f} |"
            )

        # Stability Analysis
        report.append("\n## Stability & Context Analysis")
        report.append(
            "Detailed analysis of how context length affects TTFB and overall performance."
        )

        # Details per category
        for name, stats, s in summaries:
            report.append(f"\n### {name} Details")
            report.append(f"- **Total Samples:** {s['total_requests']}")
            report.append(f"- **Duration:** {s['duration']:.2f}s")

            failures = [m for m in stats.metrics if not m.success]
            if failures:
                report.append("- **Top Errors:**")
                for failure in failures[:3]:
                    # Two-space indent so the entries render as children of
                    # the "Top Errors" item rather than as siblings.
                    report.append(f"  - `[{failure.status_code}]` {failure.error}")

        with open(output_file, "w", encoding="utf-8") as handle:
            handle.write("\n".join(report))
        logger.info("Report generated: %s", output_file)


async def main():
    """Parse CLI options, run every benchmark suite and write the report."""
    parser = argparse.ArgumentParser(description="Advanced LLM API Benchmarker")
    parser.add_argument("--url", default=DEFAULT_BASE_URL, help="Base URL")
    parser.add_argument("--key", default=DEFAULT_API_KEY, help="API Key")
    parser.add_argument("--c-comp", type=_positive_int, default=5, help="Completion Concurrency")
    parser.add_argument("--c-ocr", type=_positive_int, default=2, help="OCR Concurrency")
    parser.add_argument("--c-audio", type=_positive_int, default=2, help="TTS/ASR Concurrency")
    parser.add_argument("--c-convert", type=_positive_int, default=2, help="Convert Concurrency")
    parser.add_argument("--iters", type=int, default=10, help="Iterations per test suite")
    parser.add_argument("--output", default="api_performance_report.md", help="Output report file")
    args = parser.parse_args()

    bench = ApiBenchmarker(args.url, args.key)
    bench.semaphores["completions"] = asyncio.Semaphore(args.c_comp)
    bench.semaphores["ocr"] = asyncio.Semaphore(args.c_ocr)
    bench.semaphores["tts-asr"] = asyncio.Semaphore(args.c_audio)
    bench.semaphores["convert"] = asyncio.Semaphore(args.c_convert)

    # (log message, task type, suite name), executed strictly in this order.
    suites = [
        # Suite 1: Stability - Completion Contexts
        ("Running Stability Suite (Short Context)...", "completions", "Completion-Short"),
        ("Running Stability Suite (Normal Context)...", "completions", "Completion-Normal"),
        ("Running Stability Suite (Long Context)...", "completions", "Completion-Long"),
        # Suite 2: Functional Concurrency
        ("Running OCR Concurrency Suite...", "ocr", "OCR-Concurrent"),
        ("Running TTS Concurrency Suite...", "tts", "TTS-Concurrent"),
        ("Running ASR Concurrency Suite...", "asr", "ASR-Concurrent"),
        ("Running File Transformation Suite...", "convert", "Convert-Concurrent"),
    ]

    async with httpx.AsyncClient(headers=bench.headers, timeout=120.0) as client:
        logger.info("Starting Benchmark Suites...")
        for message, task_type, suite_name in suites:
            logger.info(message)
            await bench.run_task(client, task_type, suite_name, args.iters)

    bench.generate_report(args.output)
    print(f"\nBenchmark Complete! View the report at: {args.output}")


if __name__ == "__main__":
    asyncio.run(main())