45 lines
1.5 KiB
Python
45 lines
1.5 KiB
Python
import os
|
|
from typing import AsyncGenerator
|
|
from openai import AsyncOpenAI
|
|
import json
|
|
|
|
# Connection settings come from the environment. Each lookup has a fallback,
# so the module can be imported without any variables being set.
api_key = os.environ.get("OPENAI_API_KEY", "ollama")
base_url = os.environ.get("OLLAMA_BASE_URL", "http://192.168.0.120:11434/v1/")
model = os.environ.get("OLLAMA_MODEL", "gpt-oss:120b")

# Print the effective configuration once at import time. Only whether a key
# is present is reported, never the key itself.
print("[LLM] API key configured: " + ("Yes" if api_key else "No"))
print("[LLM] Base URL:", base_url)
print("[LLM] Model:", model)

# Shared async client used by the request helpers in this module.
client = AsyncOpenAI(base_url=base_url, api_key=api_key)
|
|
|
|
async def stream_openai(prompt: str) -> AsyncGenerator[str, None]:
    """Stream a chat completion for *prompt* as JSON-encoded fragments.

    Calls the OpenAI/Ollama chat-completions API through the module-level
    ``client`` and ``model`` with ``stream=True``, ``max_tokens=128`` and
    ``temperature=0.2``, sending ``prompt`` as a single user message.
    Follows the streaming logic of completions-sample-code.

    Args:
        prompt: Text sent as the content of the user message.

    Yields:
        ``json.dumps({"content": text})`` for every streamed chunk whose
        delta carries non-empty content. If the request or the stream
        raises, one final ``json.dumps({"error": message})`` is yielded
        instead of propagating the exception.
    """
    print(f"[LLM] Calling API with prompt length: {len(prompt)}")

    # Deliberately broad: this generator is the boundary to the consumer, so
    # any failure is reported in-band as an "error" payload.
    try:
        stream = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            stream=True,
            max_tokens=128,
            temperature=0.2,
        )

        chunk_count = 0
        async for chunk in stream:
            # A chunk can arrive with an empty ``choices`` list (for example
            # a usage-only chunk from some OpenAI-compatible servers).
            # Indexing [0] on it would raise IndexError and wrongly end the
            # stream with an error payload, so skip such chunks.
            if not chunk.choices:
                continue

            content = chunk.choices[0].delta.content
            # Role-only or finish chunks carry no text; nothing to forward.
            if not content:
                continue

            chunk_count += 1
            print(f"[LLM] Chunk {chunk_count}: {content}")
            yield json.dumps({"content": content})

        print(f"[LLM] Stream complete, total chunks: {chunk_count}")
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        print(f"[LLM] Error: {error_msg}")
        yield json.dumps({"error": str(e)})
|