# backend/llm.py

import os
from typing import AsyncGenerator
from openai import AsyncOpenAI
import json
import time

# Connection settings come from the environment; each one falls back to the
# literal default given here when the variable is unset.
api_key = os.environ.get('OPENAI_API_KEY', 'ollama')
base_url = os.environ.get('OLLAMA_BASE_URL', 'http://192.168.0.120:11434/v1/')
model = os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b')

# Report the effective configuration once, at import time. Only whether a key
# is present is printed, never the key itself.
print(
    f"[LLM] API key configured: {'Yes' if api_key else 'No'}",
    f"[LLM] Base URL: {base_url}",
    f"[LLM] Model: {model}",
    sep="\n",
)

# Shared async client used by the streaming helper below.
client = AsyncOpenAI(base_url=base_url, api_key=api_key)

async def stream_openai(prompt: str) -> AsyncGenerator[str, None]:
    """
    Call the OpenAI/Ollama chat-completions API and stream the completion back.

    Follows the streaming logic of completions-sample-code.

    Args:
        prompt: Text sent as the single user message of the request.

    Yields:
        JSON-encoded strings. Each non-empty content delta is emitted as
        ``{"content": "<text>"}``. If anything raises while creating or
        consuming the stream, a single ``{"error": "<message>", "type":
        "<exception class name>"}`` object is emitted instead of propagating
        the exception.
    """
    # perf_counter is monotonic, so latency figures cannot be skewed by
    # wall-clock adjustments while the request is in flight.
    start_time = time.perf_counter()
    print("[LLM] ========== API Call Start ==========")
    print(f"[LLM] Prompt length: {len(prompt)}")
    print(f"[LLM] Model: {model}")

    try:
        print("[LLM] Creating streaming chat completion...")
        stream = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            stream=True,
            max_tokens=128,
            temperature=0.2,
        )

        print("[LLM] Stream created successfully, iterating...")
        chunk_count = 0
        first_chunk_time = None  # stays None if the stream yields nothing

        async for chunk in stream:
            elapsed = time.perf_counter() - start_time
            if first_chunk_time is None:
                first_chunk_time = elapsed

            chunk_count += 1
            choice = chunk.choices[0] if chunk.choices else None
            if choice is None:
                print(f"[LLM] Chunk {chunk_count}: empty or no content")
                continue

            # Guard against a choice that carries no delta object at all.
            delta = getattr(choice, 'delta', None)
            content = delta.content if delta is not None else None

            if content:
                print(f"[LLM] Chunk {chunk_count}: '{content}' (latency: {elapsed:.3f}s)")
                yield json.dumps({"content": content})
            elif hasattr(choice, 'finish_reason'):
                finish_reason = choice.finish_reason
                print(f"[LLM] Chunk {chunk_count}: finish_reason={finish_reason}")
                if finish_reason:
                    # The model signalled completion; stop consuming the stream.
                    break
            else:
                print(f"[LLM] Chunk {chunk_count}: empty or no content")

        total_time = time.perf_counter() - start_time
        # An empty stream has no first-chunk latency; formatting None with
        # ':.3f' would raise and turn a clean finish into an error payload.
        if first_chunk_time is None:
            first_latency = "n/a"
        else:
            first_latency = f"{first_chunk_time:.3f}s"
        print(f"[LLM] Stream complete - chunks: {chunk_count}, first chunk latency: {first_latency}, total time: {total_time:.3f}s")
        print("[LLM] ========== API Call End ==========")
    except Exception as e:
        # Boundary handler: log the failure and report it to the consumer as
        # a JSON error object rather than letting the generator raise.
        print(f"[LLM] Error: {e}")
        import traceback
        traceback.print_exc()
        yield json.dumps({"error": str(e), "type": type(e).__name__})
feat: implement inline autocomplete suggestions with FastAPI backend and Milkdown editor integration 2026-01-18 19:42:58 +08:00			`import os`
			`from typing import AsyncGenerator`
			`from openai import AsyncOpenAI`
			`import json`
feat: enhance logging and error handling in backend and editor components 2026-01-25 13:29:11 +08:00			`import time`
feat: implement inline autocomplete suggestions with FastAPI backend and Milkdown editor integration 2026-01-18 19:42:58 +08:00
			`api_key = os.getenv('OPENAI_API_KEY', 'ollama')`
			`base_url = os.getenv('OLLAMA_BASE_URL', 'http://192.168.0.120:11434/v1/')`
			`model = os.getenv('OLLAMA_MODEL', 'gpt-oss:120b')`

			`print(f"[LLM] API key configured: {'Yes' if api_key else 'No'}")`
			`print(f"[LLM] Base URL: {base_url}")`
			`print(f"[LLM] Model: {model}")`

			`client = AsyncOpenAI(api_key=api_key, base_url=base_url)`

			`async def stream_openai(prompt: str) -> AsyncGenerator[str, None]:`
			`"""`
			`调用 OpenAI/Ollama API 并流式返回补全内容。`
			`参考 completions-sample-code 的 streaming 逻辑。`
			`"""`
feat: enhance logging and error handling in backend and editor components 2026-01-25 13:29:11 +08:00			`start_time = time.time()`
			`print(f"[LLM] ========== API Call Start ==========")`
			`print(f"[LLM] Prompt length: {len(prompt)}")`
			`print(f"[LLM] Model: {model}")`
feat: implement inline autocomplete suggestions with FastAPI backend and Milkdown editor integration 2026-01-18 19:42:58 +08:00
			`try:`
feat: enhance logging and error handling in backend and editor components 2026-01-25 13:29:11 +08:00			`print(f"[LLM] Creating streaming chat completion...")`
feat: implement inline autocomplete suggestions with FastAPI backend and Milkdown editor integration 2026-01-18 19:42:58 +08:00			`stream = await client.chat.completions.create(`
			`model=model,`
			`messages=[{"role": "user", "content": prompt}],`
			`stream=True,`
			`max_tokens=128,`
			`temperature=0.2,`
			`)`

feat: enhance logging and error handling in backend and editor components 2026-01-25 13:29:11 +08:00			`print(f"[LLM] Stream created successfully, iterating...")`
feat: implement inline autocomplete suggestions with FastAPI backend and Milkdown editor integration 2026-01-18 19:42:58 +08:00			`chunk_count = 0`
feat: enhance logging and error handling in backend and editor components 2026-01-25 13:29:11 +08:00			`first_chunk_time = None`

feat: implement inline autocomplete suggestions with FastAPI backend and Milkdown editor integration 2026-01-18 19:42:58 +08:00			`async for chunk in stream:`
feat: enhance logging and error handling in backend and editor components 2026-01-25 13:29:11 +08:00			`current_time = time.time()`
			`if first_chunk_time is None:`
			`first_chunk_time = current_time - start_time`

			`chunk_count += 1`
			`choice = chunk.choices[0] if chunk.choices else None`

			`if choice and choice.delta.content:`
			`content = choice.delta.content`
			`print(f"[LLM] Chunk {chunk_count}: '{content}' (latency: {current_time - start_time:.3f}s)")`
feat: implement inline autocomplete suggestions with FastAPI backend and Milkdown editor integration 2026-01-18 19:42:58 +08:00			`yield json.dumps({"content": content})`
feat: enhance logging and error handling in backend and editor components 2026-01-25 13:29:11 +08:00			`elif chunk.choices and hasattr(chunk.choices[0], 'finish_reason'):`
			`finish_reason = chunk.choices[0].finish_reason`
			`print(f"[LLM] Chunk {chunk_count}: finish_reason={finish_reason}")`
			`if finish_reason:`
			`break`
			`else:`
			`print(f"[LLM] Chunk {chunk_count}: empty or no content")`
feat: implement inline autocomplete suggestions with FastAPI backend and Milkdown editor integration 2026-01-18 19:42:58 +08:00
feat: enhance logging and error handling in backend and editor components 2026-01-25 13:29:11 +08:00			`total_time = time.time() - start_time`
			`print(f"[LLM] Stream complete - chunks: {chunk_count}, first chunk latency: {first_chunk_time:.3f}s, total time: {total_time:.3f}s")`
			`print(f"[LLM] ========== API Call End ==========")`
feat: implement inline autocomplete suggestions with FastAPI backend and Milkdown editor integration 2026-01-18 19:42:58 +08:00			`except Exception as e:`
			`error_msg = f"Error: {str(e)}"`
			`print(f"[LLM] Error: {error_msg}")`
feat: enhance logging and error handling in backend and editor components 2026-01-25 13:29:11 +08:00			`import traceback`
			`traceback.print_exc()`
			`yield json.dumps({"error": str(e), "type": type(e).__name__})`