import os
from typing import AsyncGenerator
from openai import AsyncOpenAI
import json
import time

# Connection settings come from the environment; the fallbacks are used when
# the corresponding variable is unset.
api_key = os.environ.get("OPENAI_API_KEY", "ollama")
base_url = os.environ.get("OLLAMA_BASE_URL", "http://192.168.0.120:11434/v1/")
model = os.environ.get("OLLAMA_MODEL", "gpt-oss:120b")

# Report the effective configuration at import time. Only the presence of the
# API key is shown, never its value.
print("[LLM] API key configured:", "Yes" if api_key else "No")
print("[LLM] Base URL:", base_url)
print("[LLM] Model:", model)

# Shared async client used by the request helpers in this module.
client = AsyncOpenAI(api_key=api_key, base_url=base_url)

async def _close_stream_quietly(stream) -> None:
    """Release the streaming response's connection if the stream supports it.

    Cleanup is best-effort: a failure here is logged instead of raised so it
    cannot mask the outcome of the request that was being streamed.
    """
    close = getattr(stream, "close", None)
    if close is None:
        return
    try:
        await close()
    except Exception as close_error:
        print(f"[LLM] Failed to close stream: {close_error}")


async def stream_openai(prompt: str) -> AsyncGenerator[str, None]:
    """Stream a chat completion for ``prompt`` from the OpenAI/Ollama API.

    The prompt is sent as a single user message through the module-level
    ``client`` using the module-level ``model``. Progress and timing
    information is printed to stdout.

    Args:
        prompt: Text sent as the content of the user message.

    Yields:
        JSON-encoded strings. Every non-empty content delta is yielded as
        ``{"content": <text>}``. If creating or iterating the stream fails,
        a single ``{"error": <message>, "type": <exception class name>}``
        object is yielded instead of the exception being raised.
    """
    start_time = time.time()
    print("[LLM] ========== API Call Start ==========")
    print(f"[LLM] Prompt length: {len(prompt)}")
    print(f"[LLM] Model: {model}")

    try:
        print("[LLM] Creating streaming chat completion...")
        stream = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            stream=True,
            max_tokens=128,
            temperature=0.2,
        )

        print("[LLM] Stream created successfully, iterating...")
        chunk_count = 0
        # Stays None when the stream ends without producing any chunk.
        first_chunk_latency = None

        try:
            async for chunk in stream:
                current_time = time.time()
                if first_chunk_latency is None:
                    first_chunk_latency = current_time - start_time

                chunk_count += 1
                choice = chunk.choices[0] if chunk.choices else None

                if choice is not None and choice.delta.content:
                    content = choice.delta.content
                    print(f"[LLM] Chunk {chunk_count}: '{content}' (latency: {current_time - start_time:.3f}s)")
                    yield json.dumps({"content": content})
                elif choice is not None and hasattr(choice, 'finish_reason'):
                    finish_reason = choice.finish_reason
                    print(f"[LLM] Chunk {chunk_count}: finish_reason={finish_reason}")
                    if finish_reason:
                        # The model is done; stop without draining the rest.
                        break
                else:
                    print(f"[LLM] Chunk {chunk_count}: empty or no content")
        finally:
            # Leaving the loop early (break, error, or the consumer dropping
            # this generator) would otherwise keep the HTTP response open
            # until garbage collection.
            await _close_stream_quietly(stream)

        total_time = time.time() - start_time
        # An empty stream has no first-chunk latency; formatting None with
        # ':.3f' would raise and be reported to the caller as an API error.
        if first_chunk_latency is None:
            first_latency_text = "n/a"
        else:
            first_latency_text = f"{first_chunk_latency:.3f}s"
        print(f"[LLM] Stream complete - chunks: {chunk_count}, first chunk latency: {first_latency_text}, total time: {total_time:.3f}s")
        print("[LLM] ========== API Call End ==========")
    except Exception as e:
        # Boundary handler: report the failure to the consumer as a JSON
        # item rather than propagating it out of the generator.
        print(f"[LLM] Error: {e}")
        import traceback
        traceback.print_exc()
        yield json.dumps({"error": str(e), "type": type(e).__name__})