import json
import os
import time
import traceback
from typing import AsyncGenerator

from openai import AsyncOpenAI

# Connection settings are read from the environment; each falls back to the
# hard-coded default given here when the variable is unset.
api_key = os.getenv('OPENAI_API_KEY', 'ollama')
base_url = os.getenv('OLLAMA_BASE_URL', 'http://192.168.0.120:11434/v1/')
model = os.getenv('OLLAMA_MODEL', 'gpt-oss:120b')

print(f"[LLM] API key configured: {'Yes' if api_key else 'No'}")
print(f"[LLM] Base URL: {base_url}")
print(f"[LLM] Model: {model}")

# Shared async client used by every call to stream_openai().
client = AsyncOpenAI(api_key=api_key, base_url=base_url)


async def stream_openai(prompt: str) -> AsyncGenerator[str, None]:
    """Stream a chat completion for *prompt* from the configured endpoint.

    Sends ``prompt`` as a single user message (``max_tokens=128``,
    ``temperature=0.2``, ``stream=True``) and yields one JSON string per
    chunk that carries text.

    Args:
        prompt: The user message to send to the model.

    Yields:
        ``'{"content": "<text>"}'`` for every chunk with non-empty content.
        If any exception is raised while creating or reading the stream, a
        single ``'{"error": "<message>", "type": "<ExceptionName>"}'`` item
        is yielded instead of propagating the exception.
    """
    # perf_counter is monotonic, so latency figures cannot go negative or
    # jump if the system clock is adjusted mid-request.
    start_time = time.perf_counter()
    print("[LLM] ========== API Call Start ==========")
    print(f"[LLM] Prompt length: {len(prompt)}")
    print(f"[LLM] Model: {model}")

    try:
        print("[LLM] Creating streaming chat completion...")
        stream = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            stream=True,
            max_tokens=128,
            temperature=0.2,
        )

        print("[LLM] Stream created successfully, iterating...")
        chunk_count = 0
        first_chunk_time = None  # stays None if the stream yields no chunks

        async for chunk in stream:
            current_time = time.perf_counter()
            if first_chunk_time is None:
                first_chunk_time = current_time - start_time

            chunk_count += 1
            choice = chunk.choices[0] if chunk.choices else None
            # Tolerate a choice whose delta is missing instead of failing
            # the whole stream with an AttributeError.
            delta = getattr(choice, 'delta', None)
            content = delta.content if delta is not None else None

            if content:
                print(f"[LLM] Chunk {chunk_count}: '{content}' (latency: {current_time - start_time:.3f}s)")
                yield json.dumps({"content": content})
            elif choice is not None and hasattr(choice, 'finish_reason'):
                finish_reason = choice.finish_reason
                print(f"[LLM] Chunk {chunk_count}: finish_reason={finish_reason}")
                if finish_reason:
                    # The model signalled completion; stop reading.
                    break
            else:
                print(f"[LLM] Chunk {chunk_count}: empty or no content")

        total_time = time.perf_counter() - start_time
        # An empty stream leaves first_chunk_time as None; formatting None
        # with ':.3f' would raise TypeError and be misreported as an API
        # error by the handler below.
        if first_chunk_time is None:
            first_chunk_latency = "n/a"
        else:
            first_chunk_latency = f"{first_chunk_time:.3f}s"
        print(f"[LLM] Stream complete - chunks: {chunk_count}, first chunk latency: {first_chunk_latency}, total time: {total_time:.3f}s")
        print("[LLM] ========== API Call End ==========")

    except Exception as e:
        # Top-level boundary: log the failure and report it to the consumer
        # as a JSON error event rather than raising out of the generator.
        error_msg = f"Error: {str(e)}"
        print(f"[LLM] Error: {error_msg}")
        traceback.print_exc()
        yield json.dumps({"error": str(e), "type": type(e).__name__})