Files
llm-in-text/backend/llm.py

45 lines
1.5 KiB
Python

import os
from typing import AsyncGenerator
from openai import AsyncOpenAI
import json
# Connection settings come from the environment; each has a built-in fallback.
api_key = os.environ.get('OPENAI_API_KEY', 'ollama')
base_url = os.environ.get('OLLAMA_BASE_URL', 'http://192.168.0.120:11434/v1/')
model = os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b')

# Report the effective configuration at import time. Only whether a key is
# present is printed, never the key itself.
print(
    f"[LLM] API key configured: {'Yes' if api_key else 'No'}",
    f"[LLM] Base URL: {base_url}",
    f"[LLM] Model: {model}",
    sep="\n",
)

# Shared async client used by stream_openai() below.
client = AsyncOpenAI(base_url=base_url, api_key=api_key)
async def stream_openai(prompt: str) -> AsyncGenerator[str, None]:
    """Call the OpenAI/Ollama chat API and stream the completion back.

    Follows the streaming logic of completions-sample-code.

    The prompt is sent as a single user message using the module-level
    ``client`` and ``model``, with ``max_tokens=128`` and ``temperature=0.2``.

    Args:
        prompt: Text sent as the content of the single user message.

    Yields:
        JSON-encoded strings. Each non-empty content delta is emitted as
        ``{"content": <text>}``. If any exception is raised while creating
        or consuming the stream, one ``{"error": <message>}`` item is
        emitted and the generator ends.
    """
    print(f"[LLM] Calling API with prompt length: {len(prompt)}")
    try:
        stream = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            stream=True,
            max_tokens=128,
            temperature=0.2,
        )
        chunk_count = 0
        async for chunk in stream:
            # Some chunks carry no choices at all (e.g. a trailing usage
            # chunk or backend-specific metadata chunks). Indexing [0] on
            # them would raise IndexError and abort an otherwise healthy
            # stream, so skip them.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            # Role-only or finish-reason chunks have empty/None content.
            if not content:
                continue
            chunk_count += 1
            print(f"[LLM] Chunk {chunk_count}: {content}")
            yield json.dumps({"content": content})
        print(f"[LLM] Stream complete, total chunks: {chunk_count}")
    except Exception as e:
        # Boundary handler: surface the failure to the consumer as a JSON
        # error event instead of propagating out of the generator.
        error_msg = f"Error: {str(e)}"
        print(f"[LLM] Error: {error_msg}")
        yield json.dumps({"error": str(e)})