Files
llm-in-text/backend/test_api_performance.py

296 lines
12 KiB
Python
Raw Normal View History

import asyncio
import base64
import json
import logging
import time
import argparse
import uuid
import sys
import statistics
import os
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple
import httpx
from pydantic import BaseModel
# Defaults for the command-line interface, plus the rough characters-per-token
# ratio used when estimating token counts from text length.
CHARS_PER_TOKEN = 4
DEFAULT_API_KEY = "your-secret-key-here"
DEFAULT_BASE_URL = "http://localhost:8001"

# Send INFO-and-above records to stdout using a timestamped line format.
logging.basicConfig(
    stream=sys.stdout,
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("api_benchmarker")
# Data Generators
def get_dummy_base64_image():
    """Return the fixed base64 string sent as the image payload of the OCR suite.

    The value is a constant (presumably a minimal PNG, given that it is
    uploaded as ``bench.png``); no image is generated at call time.
    """
    encoded_image = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="
    return encoded_image
def get_dummy_base64_audio():
    """Return the fixed base64 string sent as the audio payload of the ASR suite.

    The string is a short header-like prefix followed by 100 ``A`` filler
    characters and ``==`` padding; the filler makes the payload a bit longer
    so that it passes the server's validation.
    """
    header_part = "UklGRigAAABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YQQAAAAAAA"
    filler_part = "A" * 100
    return "".join((header_part, filler_part, "=="))
def generate_context_text(tokens: int) -> str:
    """Build filler text that is exactly ``tokens * CHARS_PER_TOKEN`` characters long.

    A fixed pangram sentence is repeated and then trimmed to the target
    length, so the result approximates ``tokens`` tokens under the module's
    characters-per-token estimate.
    """
    sentence = "The quick brown fox jumps over the lazy dog. "
    target_chars = tokens * CHARS_PER_TOKEN
    # One repetition more than the floor quotient guarantees the repeated
    # text is at least as long as the target before it is trimmed.
    copies = target_chars // len(sentence) + 1
    repeated = sentence * copies
    return repeated[:target_chars]
# Metric Models
class RequestMetric(BaseModel):
    """Outcome and timing of a single benchmark request."""

    # Name of the benchmark suite the request belongs to.
    task_name: str
    # Request path that was called (without the base URL).
    endpoint: str
    # HTTP status code of the response; the benchmarker stores 0 when the
    # request raised before a response was available.
    status_code: int
    # Milliseconds from request start until the response stream was opened.
    ttfb_ms: float
    # Milliseconds from request start until the full body was read (or until
    # the request failed).
    total_ms: float
    # True when the status code was in the 2xx range.
    success: bool
    # Estimated token count derived from input/output text lengths.
    tokens: int
    # Truncated response body or exception text for failed requests.
    error: Optional[str] = None
class BenchStats:
    """Collect the request metrics of one benchmark suite and summarize them.

    ``start_time`` and ``end_time`` are written by the caller around the
    suite run (the benchmarker stores ``time.perf_counter()`` readings); the
    difference is used as the suite duration in seconds.
    """

    def __init__(self, name: str):
        """Create an empty collector labelled ``name``."""
        self.name = name
        self.metrics: List["RequestMetric"] = []
        self.start_time = 0.0
        self.end_time = 0.0

    def add(self, m: "RequestMetric"):
        """Record one finished request."""
        self.metrics.append(m)

    def get_summary(self) -> Dict[str, Any]:
        """Return aggregate statistics for the recorded requests.

        Returns:
            An empty dict when nothing was recorded. Otherwise a dict with
            ``name``, ``total_requests``, ``success_rate`` (percent),
            ``avg_latency`` / ``p50_latency`` / ``p95_latency`` / ``avg_ttfb``
            (milliseconds, computed over successful requests only and
            reported as 0 when none succeeded), ``tps`` (tokens of successful
            requests per second), ``rps`` (all requests per second) and
            ``duration`` (seconds).
        """
        if not self.metrics:
            return {}

        total = len(self.metrics)
        successes = [m for m in self.metrics if m.success]

        # With no successful request there is nothing to average; a single
        # zero keeps the statistics calls from failing on empty input.
        total_latencies = sorted(m.total_ms for m in successes) or [0.0]
        ttfb_latencies = [m.ttfb_ms for m in successes] or [0.0]

        # Nearest-rank percentile: rank = ceil(0.95 * n), computed with
        # integer arithmetic. The previous int(n * 0.95) index overshot by
        # one whenever 0.95 * n was an integer (e.g. n == 20 gave the max).
        rank = -(-len(total_latencies) * 95 // 100)
        p95_latency = total_latencies[max(rank - 1, 0)]

        duration = self.end_time - self.start_time
        total_tokens = sum(m.tokens for m in successes)

        return {
            "name": self.name,
            "total_requests": total,
            "success_rate": (len(successes) / total) * 100,
            "avg_latency": statistics.mean(total_latencies),
            "p50_latency": statistics.median(total_latencies),
            "p95_latency": p95_latency,
            "avg_ttfb": statistics.mean(ttfb_latencies),
            "tps": total_tokens / duration if duration > 0 else 0,
            "rps": total / duration if duration > 0 else 0,
            "duration": duration,
        }
# Benchmarking Engine
class ApiBenchmarker:
    """Run concurrent benchmark suites against the API and write a report.

    Each suite fires a number of identical requests, limited by a per-task
    semaphore, and stores one ``RequestMetric`` per request in a
    ``BenchStats`` collector keyed by suite name.
    """

    # Task types whose concurrency limit is stored under a different key in
    # ``self.semaphores``. Without this mapping "tts" and "asr" silently fell
    # back to the completions semaphore and ignored the audio limit.
    _SEMAPHORE_ALIASES = {"tts": "tts-asr", "asr": "tts-asr"}

    def __init__(self, base_url: str, api_key: str):
        """Store connection settings and create the default concurrency limits.

        Args:
            base_url: URL prefix that request paths are appended to.
            api_key: Value sent in the ``X-API-Key`` header.
        """
        self.base_url = base_url
        self.api_key = api_key
        self.headers = {"X-API-Key": api_key}
        self.semaphores = {
            "completions": asyncio.Semaphore(5),
            "ocr": asyncio.Semaphore(2),
            "convert": asyncio.Semaphore(2),
            "tts-asr": asyncio.Semaphore(3),
        }
        self.results: Dict[str, "BenchStats"] = {}

    def _semaphore_for(self, task_type: str) -> asyncio.Semaphore:
        """Return the semaphore limiting ``task_type``; default to completions."""
        key = self._SEMAPHORE_ALIASES.get(task_type, task_type)
        return self.semaphores.get(key, self.semaphores["completions"])

    def _build_request(self, task_type: str, name: str) -> Tuple[str, str, Dict[str, Any]]:
        """Return ``(method, path, request kwargs)`` for one request of ``task_type``.

        For completions the prompt length depends on the suite name: 100
        tokens by default, 1000 when the name contains "Normal" and 4000 when
        it contains "Long". Unknown task types map to a status GET.
        """
        if task_type == "completions":
            prefix_len = 100
            if "Normal" in name:
                prefix_len = 1000
            if "Long" in name:
                prefix_len = 4000
            return "POST", "/v1/completions", {"json": {
                "prefix": generate_context_text(prefix_len),
                "suffix": "End of document.",
                "model_thinking": "low",
            }}
        if task_type == "ocr":
            return "POST", "/v1/ocr", {"json": {
                "image": get_dummy_base64_image(),
                "filename": "bench.png",
            }}
        if task_type == "convert":
            return "POST", "/v1/convert", {"json": {
                "file": base64.b64encode(b"Performance test data").decode(),
                "filename": "bench.txt",
            }}
        if task_type == "tts":
            return "POST", "/v1/tts-asr/tts", {"json": {
                "text": "This is a performance benchmark for the text to speech engine.",
                "voice": "v2/en_speaker_6",
                "format": "wav",
            }}
        if task_type == "asr":
            return "POST", "/v1/tts-asr/asr", {"json": {
                "audio_base64": get_dummy_base64_audio(),
                "language": "en",
            }}
        return "GET", "/v1/tts-asr/status", {}

    async def _execute_request(self, client: "httpx.AsyncClient", name: str, method: str, path: str, **kwargs) -> "RequestMetric":
        """Send one request and measure it.

        Args:
            client: Shared HTTP client used to open the streamed request.
            name: Suite name recorded on the metric.
            method: HTTP method.
            path: Request path appended to ``self.base_url``.
            **kwargs: Passed through to ``client.stream``; a ``json`` payload
                is also inspected to estimate input tokens.

        Returns:
            A ``RequestMetric``. Any exception raised while sending or
            reading is caught and reported as a failed metric with status
            code 0 rather than propagated.
        """
        url = f"{self.base_url}{path}"
        start = time.perf_counter()
        ttfb = 0.0
        tokens_count = 0

        # Input tokens are estimated from the text fields of the payload.
        if "json" in kwargs:
            payload = kwargs["json"]
            input_text = str(payload.get("prefix", "")) + str(payload.get("text", ""))
            tokens_count += len(input_text) // CHARS_PER_TOKEN

        try:
            async with client.stream(method, url, **kwargs) as response:
                # The stream context is entered before the body is read, so
                # this timestamp approximates time to first byte.
                ttfb = (time.perf_counter() - start) * 1000
                body = await response.aread()
                total_ms = (time.perf_counter() - start) * 1000

                success = 200 <= response.status_code < 300
                error_msg = None
                if not success:
                    error_msg = body.decode(errors="ignore")[:200]
                else:
                    # Output tokens are estimated from the textual field of a
                    # JSON response. Best effort: bodies that are not JSON
                    # objects with sized content contribute no tokens.
                    try:
                        resp_json = json.loads(body)
                        content = resp_json.get("content", "") or resp_json.get("text", "") or resp_json.get("markdown", "")
                        tokens_count += len(content) // CHARS_PER_TOKEN
                    except (ValueError, AttributeError, TypeError):
                        pass

                return RequestMetric(
                    task_name=name,
                    endpoint=path,
                    status_code=response.status_code,
                    ttfb_ms=ttfb,
                    total_ms=total_ms,
                    success=success,
                    tokens=tokens_count,
                    error=error_msg,
                )
        except Exception as e:
            total_ms = (time.perf_counter() - start) * 1000
            return RequestMetric(
                task_name=name,
                endpoint=path,
                status_code=0,
                # If the failure happened before the stream opened there is
                # no TTFB reading, so the total time is reported instead.
                ttfb_ms=ttfb or total_ms,
                total_ms=total_ms,
                success=False,
                tokens=tokens_count,
                # Some exceptions carry an empty message; fall back to the
                # repr so the report still names the failure.
                error=str(e) or repr(e),
            )

    async def run_task(self, client: "httpx.AsyncClient", task_type: str, name: str, iterations: int):
        """Run ``iterations`` requests of ``task_type`` and record them under ``name``.

        All requests are scheduled at once; the semaphore for the task type
        bounds how many are in flight. The suite's start and end times are
        stored on its ``BenchStats`` entry.
        """
        if name not in self.results:
            self.results[name] = BenchStats(name)
        stats = self.results[name]
        stats.start_time = time.perf_counter()
        sem = self._semaphore_for(task_type)

        async def worker():
            async with sem:
                method, path, request_kwargs = self._build_request(task_type, name)
                metric = await self._execute_request(client, name, method, path, **request_kwargs)
                stats.add(metric)

        await asyncio.gather(*(worker() for _ in range(iterations)))
        stats.end_time = time.perf_counter()

    def generate_report(self, output_file: str):
        """Write a Markdown report of all recorded suites to ``output_file``.

        The report has a summary table, then a per-suite section listing
        sample count, duration and up to three errors. Suites without any
        recorded metric are skipped.
        """
        # Summaries are computed once and reused by both report sections.
        summarized = []
        for name, stats in self.results.items():
            summary = stats.get_summary()
            if summary:
                summarized.append((name, stats, summary))

        report = []
        report.append(f"# API Benchmarking Report ({datetime.now().strftime('%Y-%m-%d %H:%M:%S')})")
        report.append(f"\n**Base URL:** `{self.base_url}`")

        # Summary table
        report.append("\n## Executive Summary")
        report.append("| Task | Success Rate | Avg TTFB | Avg Latency | P95 Latency | TPS | RPS |")
        report.append("| :--- | :--- | :--- | :--- | :--- | :--- | :--- |")
        for _, _, s in summarized:
            report.append(f"| {s['name']} | {s['success_rate']:.1f}% | {s['avg_ttfb']:.1f}ms | {s['avg_latency']:.1f}ms | {s['p95_latency']:.1f}ms | {s['tps']:.1f} | {s['rps']:.2f} |")

        # Stability analysis
        report.append("\n## Stability & Context Analysis")
        report.append("Detailed analysis of how context length affects TTFB and overall performance.")

        # Details per suite
        for name, stats, s in summarized:
            report.append(f"\n### {name} Details")
            report.append(f"- **Total Samples:** {s['total_requests']}")
            report.append(f"- **Duration:** {s['duration']:.2f}s")
            failures = [m for m in stats.metrics if not m.success]
            if failures:
                report.append("- **Top Errors:**")
                for failure in failures[:3]:
                    report.append(f" - `[{failure.status_code}]` {failure.error}")

        with open(output_file, "w", encoding="utf-8") as report_file:
            report_file.write("\n".join(report))
        logger.info("Report generated: %s", output_file)
async def main():
    """Parse command-line options, run every benchmark suite, and write the report."""
    parser = argparse.ArgumentParser(description="Advanced LLM API Benchmarker")
    option_table = (
        ("--url", {"default": DEFAULT_BASE_URL, "help": "Base URL"}),
        ("--key", {"default": DEFAULT_API_KEY, "help": "API Key"}),
        ("--c-comp", {"type": int, "default": 5, "help": "Completion Concurrency"}),
        ("--c-ocr", {"type": int, "default": 2, "help": "OCR Concurrency"}),
        ("--c-audio", {"type": int, "default": 2, "help": "TTS/ASR Concurrency"}),
        ("--iters", {"type": int, "default": 10, "help": "Iterations per test suite"}),
        ("--output", {"default": "api_performance_report.md", "help": "Output report file"}),
    )
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    bench = ApiBenchmarker(args.url, args.key)
    # Replace the default concurrency limits with the requested ones.
    concurrency_overrides = (
        ("completions", args.c_comp),
        ("ocr", args.c_ocr),
        ("tts-asr", args.c_audio),
    )
    for semaphore_key, limit in concurrency_overrides:
        bench.semaphores[semaphore_key] = asyncio.Semaphore(limit)

    # (log banner, task type, suite name), executed strictly in this order.
    suites = (
        # Suite 1: stability across completion context lengths
        ("Running Stability Suite (Short Context)...", "completions", "Completion-Short"),
        ("Running Stability Suite (Normal Context)...", "completions", "Completion-Normal"),
        ("Running Stability Suite (Long Context)...", "completions", "Completion-Long"),
        # Suite 2: functional concurrency
        ("Running OCR Concurrency Suite...", "ocr", "OCR-Concurrent"),
        ("Running TTS Concurrency Suite...", "tts", "TTS-Concurrent"),
        ("Running ASR Concurrency Suite...", "asr", "ASR-Concurrent"),
        ("Running File Transformation Suite...", "convert", "Convert-Concurrent"),
    )

    async with httpx.AsyncClient(headers=bench.headers, timeout=120.0) as client:
        logger.info("Starting Benchmark Suites...")
        for banner, task_type, suite_name in suites:
            logger.info(banner)
            await bench.run_task(client, task_type, suite_name, args.iters)

    bench.generate_report(args.output)
    print(f"\nBenchmark Complete! View the report at: {args.output}")


if __name__ == "__main__":
    asyncio.run(main())