feat(tts): add api endpoints and optimization for apple silicon
Introduce a comprehensive TTS/ASR module that:
- Adds /v1/tts-asr/config, /status, /warmup, /tts, /asr endpoints with detailed JSON responses
- Implements Apple‑Silicon detection, device selection (MPS/CUDA/CPU), and memory limiting logic
- Supports selectable model size, quantization, and offline mode via environment variables
- Adds robust audio validation and multi‑path resampling fallback
- Provides new README sections for API usage, device detection, and performance benchmarking
- Includes a full testing suite: unit tests, integration tests, macOS simulation and performance reports
- Updates backend dependencies and CI scripts
- Adds new front‑end views and components for Univer editor integration
All changes are backward compatible; new features are exposed through environment variables and new API routes.
2026-04-06 11:14:09 +08:00
|
|
|
import asyncio
|
|
|
|
|
import base64
|
|
|
|
|
import json
|
|
|
|
|
import logging
|
|
|
|
|
import time
|
|
|
|
|
import argparse
|
|
|
|
|
import uuid
|
|
|
|
|
import sys
|
|
|
|
|
import statistics
|
|
|
|
|
import os
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
|
|
|
import httpx
|
|
|
|
|
from pydantic import BaseModel
|
|
|
|
|
|
|
|
|
|
# Configure logging
# Send INFO-and-above records to stdout with a timestamp and level prefix.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Module-wide logger used by the benchmarker for progress messages.
logger = logging.getLogger("api_benchmarker")
|
|
|
|
|
|
|
|
|
|
# Constants
# Defaults for the --url and --key command-line options.
DEFAULT_BASE_URL = "http://localhost:8001"
DEFAULT_API_KEY = "your-secret-key-here"

# Rough characters-per-token ratio used to size synthetic prompts and to
# estimate token counts from request/response text.
CHARS_PER_TOKEN = 4
|
|
|
|
|
|
|
|
|
|
# Data Generators
|
|
|
|
|
def get_dummy_base64_image():
    """Return a base64-encoded 1x1 PNG used as a minimal image payload."""
    encoded_png = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="
    return encoded_png
|
|
|
|
|
|
|
|
|
|
def get_dummy_base64_audio():
    """Return base64 text for a dummy WAV clip used as an ASR payload.

    The text starts with an encoded RIFF/WAVE header and is padded with
    filler so the clip is a bit longer than a bare 44-byte header, which is
    intended to get it past the server's audio validation.
    """
    wav_header = "UklGRigAAABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YQQAAAAAAA"
    filler = "A" * 100
    return "".join((wav_header, filler, "=="))
|
feat(tts): add api endpoints and optimization for apple silicon
Introduce a comprehensive TTS/ASR module that:
- Adds /v1/tts-asr/config, /status, /warmup, /tts, /asr endpoints with detailed JSON responses
- Implements Apple‑Silicon detection, device selection (MPS/CUDA/CPU), and memory limiting logic
- Supports selectable model size, quantization, and offline mode via environment variables
- Adds robust audio validation and multi‑path resampling fallback
- Provides new README sections for API usage, device detection, and performance benchmarking
- Includes a full testing suite: unit tests, integration tests, macOS simulation and performance reports
- Updates backend dependencies and CI scripts
- Adds new front‑end views and components for Univer editor integration
All changes are backward compatible; new features are exposed through environment variables and new API routes.
2026-04-06 11:14:09 +08:00
|
|
|
|
|
|
|
|
def generate_context_text(tokens: int) -> str:
    """Build filler text that is roughly ``tokens`` tokens long.

    The result is exactly ``tokens * CHARS_PER_TOKEN`` characters: a fixed
    pangram sentence repeated often enough to cover that length and then
    truncated to size.
    """
    pangram = "The quick brown fox jumps over the lazy dog. "
    target_chars = tokens * CHARS_PER_TOKEN
    # One extra copy guarantees the repeated text is at least target_chars long.
    copies = target_chars // len(pangram) + 1
    repeated = pangram * copies
    return repeated[:target_chars]
|
|
|
|
|
|
|
|
|
|
# Metric Models
|
|
|
|
|
class RequestMetric(BaseModel):
    """Outcome and timing of a single benchmark request."""

    # Suite label the request belongs to, and the request path that was hit.
    task_name: str
    endpoint: str

    # HTTP status code; 0 is recorded when the request raised an exception.
    status_code: int

    # Milliseconds until the response stream opened, and until the body was read.
    ttfb_ms: float
    total_ms: float

    # True for 2xx responses.
    success: bool

    # Estimated token count (request text plus response text).
    tokens: int

    # Truncated response body or exception text for failed requests.
    error: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
class BenchStats:
    """Collect the request metrics of one named suite and summarize them."""

    def __init__(self, name: str):
        """Create an empty collector labelled ``name``."""
        self.name = name
        self.metrics: List["RequestMetric"] = []
        # perf_counter() readings bracketing the suite; set by the caller.
        self.start_time = 0.0
        self.end_time = 0.0

    def add(self, m: "RequestMetric"):
        """Record one request metric."""
        self.metrics.append(m)

    def get_summary(self) -> Dict[str, Any]:
        """Return aggregate statistics for the recorded metrics.

        Returns:
            An empty dict when nothing was recorded. Otherwise a dict with
            ``name``, ``total_requests``, ``success_rate`` (percent),
            ``avg_latency``, ``p50_latency``, ``p95_latency``, ``avg_ttfb``
            (all in milliseconds, computed over successful requests only),
            ``tps``, ``rps`` and ``duration`` (seconds).
        """
        if not self.metrics:
            return {}

        total = len(self.metrics)
        successes = [m for m in self.metrics if m.success]
        success_count = len(successes)

        # With no successful request the latency figures are reported as 0
        # rather than raising on an empty sample.
        total_latencies = [m.total_ms for m in successes] if successes else [0]
        ttfb_latencies = [m.ttfb_ms for m in successes] if successes else [0]

        duration = self.end_time - self.start_time
        total_tokens = sum(m.tokens for m in successes)

        # Nearest-rank percentile: the smallest sample with at least 95% of
        # the data at or below it, i.e. index ceil(0.95 * n) - 1. Integer
        # arithmetic avoids float rounding; the previous int(n * 0.95) index
        # was one too high whenever 0.95 * n is a whole number (n = 20 gave
        # the maximum).
        ordered = sorted(total_latencies)
        p95_index = (95 * len(ordered) + 99) // 100 - 1

        return {
            "name": self.name,
            "total_requests": total,
            "success_rate": (success_count / total) * 100 if total > 0 else 0,
            "avg_latency": statistics.mean(total_latencies),
            "p50_latency": statistics.median(total_latencies),
            "p95_latency": ordered[p95_index],
            "avg_ttfb": statistics.mean(ttfb_latencies),
            "tps": total_tokens / duration if duration > 0 else 0,
            "rps": total / duration if duration > 0 else 0,
            "duration": duration,
        }
|
|
|
|
|
|
|
|
|
|
# Benchmarking Engine
|
|
|
|
|
class ApiBenchmarker:
    """Fire concurrent benchmark requests at the API and report the results.

    Each suite is run through ``run_task``, which records one
    ``RequestMetric`` per request in a ``BenchStats`` keyed by suite name.
    ``generate_report`` then renders every suite into a Markdown file.
    """

    # Task types that share a concurrency limit under a different key. Both
    # audio task types are throttled by the "tts-asr" semaphore; without this
    # mapping they fell through to the "completions" limit.
    _SEMAPHORE_ALIASES = {"tts": "tts-asr", "asr": "tts-asr"}

    def __init__(self, base_url: str, api_key: str):
        """Store connection settings and create the default concurrency limits.

        Args:
            base_url: Scheme/host/port prefix prepended to every request path.
            api_key: Value sent in the ``X-API-Key`` header.
        """
        self.base_url = base_url
        self.api_key = api_key
        self.headers = {"X-API-Key": api_key}
        # Entries may be replaced by the caller before suites are run.
        self.semaphores = {
            "completions": asyncio.Semaphore(5),
            "ocr": asyncio.Semaphore(2),
            "convert": asyncio.Semaphore(2),
            "tts-asr": asyncio.Semaphore(3),
        }
        self.results: Dict[str, "BenchStats"] = {}

    def _semaphore_for(self, task_type: str) -> asyncio.Semaphore:
        """Return the semaphore limiting ``task_type``.

        Resolved on every call so replaced ``self.semaphores`` entries are
        honored. Unknown task types use the "completions" limit.
        """
        key = self._SEMAPHORE_ALIASES.get(task_type, task_type)
        return self.semaphores.get(key, self.semaphores["completions"])

    @staticmethod
    def _estimate_output_tokens(body: bytes) -> int:
        """Estimate output tokens from a JSON response body.

        Reads the first non-empty of the ``content``, ``text`` and
        ``markdown`` fields. Returns 0 when the body is not JSON, is not a
        JSON object, or the field has no length.
        """
        try:
            resp_json = json.loads(body)
            content = resp_json.get("content", "") or resp_json.get("text", "") or resp_json.get("markdown", "")
            return len(content) // CHARS_PER_TOKEN
        except (ValueError, AttributeError, TypeError):
            # Best-effort estimate only: an unexpected response shape must not
            # turn a successful request into a failure.
            return 0

    @staticmethod
    def _request_spec(task_type: str, name: str):
        """Return ``(method, path, request_kwargs)`` for one request.

        Completion prompts are sized from the suite name: 100 tokens by
        default, 1000 if it contains "Normal", 4000 if it contains "Long".
        Unrecognized task types fall back to a status GET.
        """
        if task_type == "completions":
            # Stability Test Variation
            prefix_len = 100
            if "Normal" in name:
                prefix_len = 1000
            if "Long" in name:
                prefix_len = 4000
            return "POST", "/v1/completions", {"json": {
                "prefix": generate_context_text(prefix_len),
                "suffix": "End of document.",
                "model_thinking": "low",
            }}
        if task_type == "ocr":
            return "POST", "/v1/ocr", {"json": {
                "image": get_dummy_base64_image(),
                "filename": "bench.png",
            }}
        if task_type == "convert":
            return "POST", "/v1/convert", {"json": {
                "file": base64.b64encode(b"Performance test data").decode(),
                "filename": "bench.txt",
            }}
        if task_type == "tts":
            return "POST", "/v1/tts-asr/tts", {"json": {
                "text": "This is a performance benchmark for the text to speech engine.",
                "voice": "v2/en_speaker_6",
                "format": "wav",
            }}
        if task_type == "asr":
            return "POST", "/v1/tts-asr/asr", {"json": {
                "audio_base64": get_dummy_base64_audio(),
                "language": "en",
            }}
        return "GET", "/v1/tts-asr/status", {}

    async def _execute_request(self, client: "httpx.AsyncClient", name: str, method: str, path: str, **kwargs) -> "RequestMetric":
        """Send one request and measure it.

        Args:
            client: HTTP client used to stream the request.
            name: Suite label stored on the metric.
            method: HTTP method.
            path: Request path appended to ``self.base_url``.
            **kwargs: Passed through to ``client.stream``; a ``json`` payload
                also feeds the input-token estimate.

        Returns:
            A ``RequestMetric``. Exceptions are not raised: they are recorded
            as a failed metric with ``status_code`` 0.
        """
        url = f"{self.base_url}{path}"
        start = time.perf_counter()
        ttfb = 0.0
        tokens_count = 0

        # Estimate input tokens from the text-bearing payload fields.
        if "json" in kwargs:
            payload = kwargs["json"]
            input_text = str(payload.get("prefix", "")) + str(payload.get("text", ""))
            tokens_count += len(input_text) // CHARS_PER_TOKEN

        try:
            async with client.stream(method, url, **kwargs) as response:
                # The stream opens once the response has started arriving.
                ttfb = (time.perf_counter() - start) * 1000

                body = await response.aread()
                total_ms = (time.perf_counter() - start) * 1000
                success = 200 <= response.status_code < 300

                error_msg = None
                if success:
                    tokens_count += self._estimate_output_tokens(body)
                else:
                    error_msg = body.decode(errors="ignore")[:200]

                return RequestMetric(
                    task_name=name,
                    endpoint=path,
                    status_code=response.status_code,
                    ttfb_ms=ttfb,
                    total_ms=total_ms,
                    success=success,
                    tokens=tokens_count,
                    error=error_msg,
                )
        except Exception as e:
            total_ms = (time.perf_counter() - start) * 1000
            return RequestMetric(
                task_name=name,
                endpoint=path,
                status_code=0,
                # No response started: report the full elapsed time instead.
                ttfb_ms=ttfb or total_ms,
                total_ms=total_ms,
                success=False,
                tokens=tokens_count,
                # Some exceptions stringify to "", which would leave the
                # report's error entry blank; fall back to the class name.
                error=str(e) or type(e).__name__,
            )

    async def run_task(self, client: "httpx.AsyncClient", task_type: str, name: str, iterations: int):
        """Run ``iterations`` requests of ``task_type`` and record them under ``name``.

        Requests are launched together and throttled by the semaphore for the
        task type. The suite's start and end times are stored on its
        ``BenchStats``.
        """
        if name not in self.results:
            self.results[name] = BenchStats(name)

        stats = self.results[name]
        stats.start_time = time.perf_counter()

        sem = self._semaphore_for(task_type)

        async def worker():
            async with sem:
                method, path, request_kwargs = self._request_spec(task_type, name)
                metric = await self._execute_request(client, name, method, path, **request_kwargs)
                stats.add(metric)

        tasks = [worker() for _ in range(iterations)]
        await asyncio.gather(*tasks)
        stats.end_time = time.perf_counter()

    def generate_report(self, output_file: str):
        """Write a Markdown report of all recorded suites to ``output_file``.

        Suites with no recorded metrics are skipped. At most three failures
        are listed per suite.
        """
        report = []
        report.append(f"# API Benchmarking Report ({datetime.now().strftime('%Y-%m-%d %H:%M:%S')})")
        report.append(f"\n**Base URL:** `{self.base_url}`")

        # Summary Table
        report.append("\n## Executive Summary")
        report.append("| Task | Success Rate | Avg TTFB | Avg Latency | P95 Latency | TPS | RPS |")
        report.append("| :--- | :--- | :--- | :--- | :--- | :--- | :--- |")

        for name, stats in self.results.items():
            s = stats.get_summary()
            if not s:
                continue
            report.append(f"| {s['name']} | {s['success_rate']:.1f}% | {s['avg_ttfb']:.1f}ms | {s['avg_latency']:.1f}ms | {s['p95_latency']:.1f}ms | {s['tps']:.1f} | {s['rps']:.2f} |")

        # Stability Analysis
        report.append("\n## Stability & Context Analysis")
        report.append("Detailed analysis of how context length affects TTFB and overall performance.")

        # Details per category
        for name, stats in self.results.items():
            s = stats.get_summary()
            if not s:
                continue
            report.append(f"\n### {name} Details")
            report.append(f"- **Total Samples:** {s['total_requests']}")
            report.append(f"- **Duration:** {s['duration']:.2f}s")
            failures = [m for m in stats.metrics if not m.success]
            if failures:
                report.append("- **Top Errors:**")
                for failure in failures[:3]:
                    report.append(f" - `[{failure.status_code}]` {failure.error}")

        with open(output_file, "w", encoding="utf-8") as out:
            out.write("\n".join(report))
        logger.info("Report generated: %s", output_file)
|
|
|
|
|
|
|
|
|
|
async def main():
    """Parse the command line, run every benchmark suite, and write the report.

    Suites run one after another against a shared HTTP client: three
    completion context sizes, then OCR, TTS, ASR and file conversion, each
    for ``--iters`` requests.
    """
    cli = argparse.ArgumentParser(description="Advanced LLM API Benchmarker")
    option_table = (
        ("--url", {"default": DEFAULT_BASE_URL, "help": "Base URL"}),
        ("--key", {"default": DEFAULT_API_KEY, "help": "API Key"}),
        ("--c-comp", {"type": int, "default": 5, "help": "Completion Concurrency"}),
        ("--c-ocr", {"type": int, "default": 2, "help": "OCR Concurrency"}),
        ("--c-audio", {"type": int, "default": 2, "help": "TTS/ASR Concurrency"}),
        ("--iters", {"type": int, "default": 10, "help": "Iterations per test suite"}),
        ("--output", {"default": "api_performance_report.md", "help": "Output report file"}),
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    args = cli.parse_args()

    bench = ApiBenchmarker(args.url, args.key)
    # Replace the default concurrency limits with the requested ones.
    bench.semaphores.update({
        "completions": asyncio.Semaphore(args.c_comp),
        "ocr": asyncio.Semaphore(args.c_ocr),
        "tts-asr": asyncio.Semaphore(args.c_audio),
    })

    # (log banner, task type, suite name), in execution order.
    suites = (
        # Suite 1: Stability - Completion Contexts
        ("Running Stability Suite (Short Context)...", "completions", "Completion-Short"),
        ("Running Stability Suite (Normal Context)...", "completions", "Completion-Normal"),
        ("Running Stability Suite (Long Context)...", "completions", "Completion-Long"),
        # Suite 2: Functional Concurrency
        ("Running OCR Concurrency Suite...", "ocr", "OCR-Concurrent"),
        ("Running TTS Concurrency Suite...", "tts", "TTS-Concurrent"),
        ("Running ASR Concurrency Suite...", "asr", "ASR-Concurrent"),
        ("Running File Transformation Suite...", "convert", "Convert-Concurrent"),
    )

    async with httpx.AsyncClient(headers=bench.headers, timeout=120.0) as client:
        logger.info("Starting Benchmark Suites...")
        for banner, task_type, suite_name in suites:
            logger.info(banner)
            await bench.run_task(client, task_type, suite_name, args.iters)

    bench.generate_report(args.output)
    print(f"\nBenchmark Complete! View the report at: {args.output}")
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the async benchmark driver to completion.
    asyncio.run(main())
|