# llm-in-text/backend/llm.py

import os
import json
import ollama
from typing import AsyncGenerator

# Server address and model tag are read from the environment, falling back
# to the defaults below when the variables are unset.
OLLAMA_HOST = os.environ.get('OLLAMA_HOST', 'http://192.168.0.120:11434')
OLLAMA_MODEL = os.environ.get('OLLAMA_MODEL', 'gpt-oss:20b')

# Drop a trailing /v1 or /v1/ (if present), because the Ollama Python package
# uses the native API.
if OLLAMA_HOST.endswith(('/v1/', '/v1')):
    # The last '/v1' occurrence is exactly where the matched suffix begins.
    OLLAMA_HOST = OLLAMA_HOST[:OLLAMA_HOST.rindex('/v1')]

# Write the normalised address back into the process environment.
os.environ['OLLAMA_HOST'] = OLLAMA_HOST

print(f"[LLM] Ollama host: {OLLAMA_HOST}")
print(f"[LLM] Model: {OLLAMA_MODEL}")

# Shared async client used by the streaming helper in this module.
client = ollama.AsyncClient(host=OLLAMA_HOST)

async def stream_openai(prompt: str) -> AsyncGenerator[str, None]:
    """Stream a chat completion for *prompt* as JSON-encoded strings.

    Sends ``prompt`` as a single user message to the module-level Ollama
    ``client`` using ``OLLAMA_MODEL`` and yields one JSON string per chunk
    that carries non-empty content, shaped ``{"content": "<text>"}``.

    Errors deriving from ``Exception`` are not propagated to the caller:
    the message is printed, the traceback is dumped, and a final
    ``{"error": "<message>"}`` item is yielded instead.

    Args:
        prompt: Text sent as the sole user message.

    Yields:
        JSON strings holding either a ``content`` key or an ``error`` key.
    """
    print(f"[LLM] Calling Ollama API with prompt length: {len(prompt)}")

    stream = None
    try:
        print("[LLM] Awaiting client.chat...")
        stream = await client.chat(
            model=OLLAMA_MODEL,
            messages=[{'role': 'user', 'content': prompt}],
            stream=True,
            options={
                'num_predict': 8192,
                'temperature': 0.2,
            },
        )
        print("[LLM] Got stream object, starting iteration...")

        chunk_count = 0
        async for chunk in stream:
            message = chunk['message']
            content = message['content'] if message else None
            if not content:
                # Chunks without text are skipped and not counted.
                continue
            chunk_count += 1
            print(f"[LLM] Chunk {chunk_count}: {content}")
            yield json.dumps({"content": content})

        print(f"[LLM] Stream complete, total chunks: {chunk_count}")
    except Exception as e:
        # Boundary handler: report the failure in-band so the consumer
        # always receives a well-formed JSON item.
        print(f"[LLM] Error: {e}")
        import traceback
        traceback.print_exc()
        yield json.dumps({"error": str(e)})
    finally:
        # Close the upstream stream when the consumer stops early or an
        # error interrupts iteration.
        # NOTE(review): assumes client.chat(stream=True) returns an async
        # generator; the getattr guard makes this a no-op otherwise.
        close_stream = getattr(stream, 'aclose', None)
        if close_stream is not None:
            await close_stream()