# llm-in-text/backend/llm.py

import os
from typing import AsyncGenerator
from openai import AsyncOpenAI
import json

# Connection settings are read from the environment once, at import time.
# When a variable is unset, the hard-coded fallback on the right is used.
api_key = os.environ.get("OPENAI_API_KEY", "ollama")
base_url = os.environ.get("OLLAMA_BASE_URL", "http://192.168.0.120:11434/v1/")
model = os.environ.get("OLLAMA_MODEL", "gpt-oss:120b")

# Report the effective configuration on startup. Only the presence of the API
# key is printed, never its value.
print(
    f"[LLM] API key configured: {'Yes' if api_key else 'No'}",
    f"[LLM] Base URL: {base_url}",
    f"[LLM] Model: {model}",
    sep="\n",
)

# Module-wide async client, built from the settings above.
client = AsyncOpenAI(api_key=api_key, base_url=base_url)

async def stream_openai(prompt: str) -> AsyncGenerator[str, None]:
    """Stream a chat completion for *prompt* as JSON-encoded fragments.

    Calls the OpenAI/Ollama chat-completions API through the module-level
    ``client`` and ``model``, sending ``prompt`` as a single user message with
    ``stream=True``, ``max_tokens=128`` and ``temperature=0.2``. Modeled on
    the streaming logic of completions-sample-code.

    Args:
        prompt: Text sent as the content of the user message.

    Yields:
        One JSON string per non-empty text delta, shaped
        ``{"content": "<text>"}``. If the request or the stream raises an
        ``Exception``, a single ``{"error": "<message>"}`` string is yielded
        and the generator ends; the exception is not re-raised.
    """
    print(f"[LLM] Calling API with prompt length: {len(prompt)}")

    try:
        stream = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            stream=True,
            max_tokens=128,
            temperature=0.2,
        )

        chunk_count = 0
        async for chunk in stream:
            # A chunk may arrive with an empty ``choices`` list; indexing it
            # blindly would raise IndexError and abort an otherwise healthy
            # stream via the except clause below.
            if not chunk.choices:
                continue

            delta = chunk.choices[0].delta
            content = delta.content if delta is not None else None
            # Skip deltas that carry no text (None or empty string).
            if not content:
                continue

            chunk_count += 1
            print(f"[LLM] Chunk {chunk_count}: {content}")
            yield json.dumps({"content": content})

        print(f"[LLM] Stream complete, total chunks: {chunk_count}")
    except Exception as e:
        # Best-effort boundary: surface the failure to the consumer as a JSON
        # error payload instead of propagating the exception.
        print(f"[LLM] Error: {e}")
        yield json.dumps({"error": str(e)})