import asyncio
import logging
import os
import time
from datetime import datetime
from typing import Optional

import ollama
from dotenv import load_dotenv

load_dotenv()

OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'gpt-oss:20b')
OLLAMA_HOST = os.getenv('OLLAMA_HOST', 'http://localhost:11434')
VLM_MODEL = os.getenv('VLM_MODEL', 'qwen3-vl:30b')

# Timeouts in seconds
COMPLETION_TIMEOUT = 30
OCR_TIMEOUT = 60
CONVERT_TIMEOUT = 30

client = ollama.AsyncClient(host=OLLAMA_HOST)
logger = logging.getLogger("llm")

# NOTE(review): a few tokens in this prompt look like they were lost upstream
# (e.g. "Never output ." and the empty TEXT:/LANGUAGE:/SUMMARY: slots were
# presumably followed by angle-bracket placeholders or comment delimiters).
# The wording is reproduced exactly as found -- confirm against the intended
# prompt before relying on it.
VLM_OCR_CONTEXT_PROMPT = """You are an OCR and visual-context extractor for markdown writing assistance.
Your output will be embedded inside an HTML comment as hidden context for a text-completion model.

Requirements:
- Keep output compact: maximum 120 words.
- Use plain text only (no markdown code fences).
- Never output .
- Do not invent unreadable text; mark uncertain characters with ?.
- Preserve original script for recognized text (do not forcibly translate).

Return exactly this format:
TEXT:
KEY_DETAILS:
- <3-5 short factual bullets about relevant objects/layout>
LANGUAGE:
SUMMARY:
"""


def _extract_message(response) -> tuple[str, str]:
    """Pull the reply text and the reasoning text out of a chat response.

    Accepts either an object exposing a ``message`` attribute or a plain
    ``dict`` with a ``'message'`` key; the message itself may likewise be an
    object or a dict.

    Returns:
        A ``(content, thinking)`` pair. Missing, ``None`` or empty fields are
        normalised to ``""`` so callers can always call ``str`` methods.
    """
    message = getattr(response, 'message', None)
    if not message and isinstance(response, dict):
        # ``.get('message', {})`` would still yield None for an explicit
        # ``{'message': None}``, so the falsy check below handles that case.
        message = response.get('message')
    if not message:
        return "", ""

    if isinstance(message, dict):
        content = message.get('content')
        thinking = message.get('thinking')
    else:
        content = getattr(message, 'content', None)
        thinking = getattr(message, 'thinking', None)
    return content or "", thinking or ""


def _finish_call(label: str, start: float, start_dt: datetime) -> float:
    """Log the wall-clock window of a call and return its duration in ms.

    Args:
        label: Log prefix such as ``"[LLM][default]"`` or ``"[VLM][ocr]"``.
        start: ``time.perf_counter()`` reading taken when the call began.
        start_dt: Wall-clock time taken when the call began.
    """
    elapsed_ms = (time.perf_counter() - start) * 1000
    end_dt = datetime.now()
    logger.info(
        "%s call_time [%s --> %s]",
        label,
        start_dt.strftime("%H:%M:%S"),
        end_dt.strftime("%H:%M:%S"),
    )
    return elapsed_ms


async def call_ollama(
    prompt: str,
    *,
    system_prompt: Optional[str] = None,
    tag: str = "default",
    temperature: float = 0.7,
    thinking: Optional[str] = None,
) -> dict:
    """Call the Ollama chat API and return the reply content and thinking.

    Args:
        prompt: User message sent to the model.
        system_prompt: Optional system message; skipped when empty or
            whitespace-only.
        tag: Short label included in every log line for this call.
        temperature: Sampling temperature passed in the request options.
        thinking: When truthy, forwarded to the client as the ``think``
            argument; omitted from the request otherwise.

    Returns:
        ``{"content": <reply text>, "think": <reasoning text>}``; both values
        are strings and may be empty.

    Raises:
        asyncio.TimeoutError: The request exceeded ``COMPLETION_TIMEOUT``.
        asyncio.CancelledError: The surrounding task was cancelled.
        Exception: Any client error is logged and re-raised unchanged.
    """
    label = f"[LLM][{tag}]"
    start = time.perf_counter()
    start_dt = datetime.now()
    logger.info(
        "[LLM][%s] request model=%s host=%s prompt_chars=%d system_chars=%d temp=%.2f thinking=%s",
        tag,
        OLLAMA_MODEL,
        OLLAMA_HOST,
        len(prompt),
        len(system_prompt or ""),
        temperature,
        thinking,
    )

    try:
        messages = []
        if system_prompt and system_prompt.strip():
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        kwargs = {
            "model": OLLAMA_MODEL,
            "messages": messages,
            "stream": False,
            "options": {
                'temperature': temperature,
                'repeat_penalty': 1.1,
            },
        }
        if thinking:
            kwargs["think"] = thinking

        response = await asyncio.wait_for(
            client.chat(**kwargs),
            timeout=COMPLETION_TIMEOUT,
        )
    except asyncio.CancelledError:
        # Cancellation is expected control flow, so it is logged as a warning
        # without a traceback and then propagated.
        elapsed_ms = _finish_call(label, start, start_dt)
        logger.warning("[LLM][%s] request cancelled after %.1fms", tag, elapsed_ms)
        raise
    except Exception:
        elapsed_ms = _finish_call(label, start, start_dt)
        logger.exception("[LLM][%s] request failed after %.1fms", tag, elapsed_ms)
        raise

    # Use a distinct name so the ``thinking`` request parameter is not
    # silently rebound to the model's reasoning text.
    content, thinking_text = _extract_message(response)
    elapsed_ms = _finish_call(label, start, start_dt)
    logger.info(
        "[LLM][%s] response in %.1fms response_type=%s content_chars=%d thinking_chars=%d",
        tag,
        elapsed_ms,
        type(response).__name__,
        len(content),
        len(thinking_text),
    )
    if not content.strip():
        logger.warning("[LLM][%s] empty content returned by model", tag)

    return {"content": content, "think": thinking_text}


async def call_vlm_ocr(image_bytes: bytes, language: str = 'auto') -> str:
    """Send an image to the vision model with the OCR/context prompt.

    Args:
        image_bytes: Raw image data attached to the user message.
        language: Only written to the request log line; it is not included in
            the prompt sent to the model.

    Returns:
        The model's reply text (possibly empty).

    Raises:
        asyncio.TimeoutError: The request exceeded ``OCR_TIMEOUT``.
        Exception: Any client error is logged and re-raised unchanged.
    """
    label = "[VLM][ocr]"
    start = time.perf_counter()
    start_dt = datetime.now()
    logger.info(
        "[VLM][ocr] request model=%s host=%s image_bytes=%d language=%s",
        VLM_MODEL,
        OLLAMA_HOST,
        len(image_bytes),
        language,
    )

    try:
        response = await asyncio.wait_for(
            client.chat(
                model=VLM_MODEL,
                messages=[{
                    'role': 'user',
                    'content': VLM_OCR_CONTEXT_PROMPT,
                    'images': [image_bytes],
                }],
                stream=False,
                options={'temperature': 0.3},
            ),
            timeout=OCR_TIMEOUT,
        )
    except Exception:
        elapsed_ms = _finish_call(label, start, start_dt)
        logger.exception("[VLM][ocr] request failed after %.1fms", elapsed_ms)
        raise

    content, thinking_text = _extract_message(response)
    elapsed_ms = _finish_call(label, start, start_dt)
    logger.info(
        "[VLM][ocr] response in %.1fms response_type=%s content_chars=%d thinking_chars=%d",
        elapsed_ms,
        type(response).__name__,
        len(content),
        len(thinking_text),
    )
    if not content.strip():
        logger.warning("[VLM][ocr] empty content returned by model")

    return content