"""Async helpers for calling Ollama text and vision models with timing logs."""

import os
import time
import logging
from datetime import datetime

import ollama
from dotenv import load_dotenv

# Load .env first so the os.getenv lookups below can see values defined there.
load_dotenv()

OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'gpt-oss:20b')
OLLAMA_HOST = os.getenv('OLLAMA_HOST', 'http://192.168.0.120:11434')
VLM_MODEL = os.getenv('VLM_MODEL', 'qwen3-vl:30b')

# One shared async client for both the text model and the vision model.
client = ollama.AsyncClient(host=OLLAMA_HOST)
logger = logging.getLogger("llm")

# NOTE(review): the "Never output ." bullet and the empty TEXT: / LANGUAGE: /
# SUMMARY: slots read as if angle-bracketed placeholders were stripped at some
# point -- confirm against the intended prompt before relying on this text.
VLM_OCR_CONTEXT_PROMPT = """You are an OCR and visual-context extractor for markdown writing assistance.
Your output will be embedded inside an HTML comment as hidden context for a text-completion model.
Requirements:
- Keep output compact: maximum 120 words.
- Use plain text only (no markdown code fences).
- Never output .
- Do not invent unreadable text; mark uncertain characters with ?.
- Preserve original script for recognized text (do not forcibly translate).
Return exactly this format:
TEXT:
KEY_DETAILS:
- <3-5 short factual bullets about relevant objects/layout>
LANGUAGE:
SUMMARY:
"""


def _extract_message(response) -> tuple[str, str]:
    """Pull the ``content`` and ``thinking`` text out of a chat response.

    Accepts either an object exposing a ``message`` attribute or a plain dict
    with a ``'message'`` key; the message itself may likewise be an object or
    a dict.

    Returns:
        ``(content, thinking)``. Each element is ``""`` when the response has
        no message or the corresponding field is missing or empty.
    """
    message = getattr(response, 'message', None)
    if not message and isinstance(response, dict):
        # ``.get`` without a default: a present-but-None value must not be
        # dereferenced, so it is treated the same as a missing key.
        message = response.get('message')
    if not message:
        return "", ""

    if isinstance(message, dict):
        content = message.get('content')
        thinking = message.get('thinking')
    else:
        content = getattr(message, 'content', None)
        thinking = getattr(message, 'thinking', None)
    return content or "", thinking or ""


def _log_call_window(label: str, start_dt: datetime) -> None:
    """Log the wall-clock window from *start_dt* to now for the call *label*."""
    end_dt = datetime.now()
    logger.info(
        "%s call_time [%s --> %s]",
        label,
        start_dt.strftime("%H:%M:%S"),
        end_dt.strftime("%H:%M:%S"),
    )


async def _chat_logged(
    label: str,
    *,
    model: str,
    messages: list[dict],
    options: dict,
) -> tuple[str, str]:
    """Run one non-streaming ``client.chat`` request with timing logs.

    Args:
        label: Prefix placed at the start of every log line for this call.
        model: Model name forwarded to ``client.chat``.
        messages: Chat messages forwarded to ``client.chat``.
        options: Generation options forwarded to ``client.chat``.

    Returns:
        ``(content, thinking)`` as produced by ``_extract_message``.

    Raises:
        Exception: Whatever ``client.chat`` raises; it is logged with a
            traceback and re-raised unchanged.
    """
    start = time.perf_counter()
    start_dt = datetime.now()
    try:
        response = await client.chat(
            model=model,
            messages=messages,
            stream=False,
            options=options,
        )
    except Exception:
        elapsed_ms = (time.perf_counter() - start) * 1000
        _log_call_window(label, start_dt)
        logger.exception("%s request failed after %.1fms", label, elapsed_ms)
        raise

    content, thinking = _extract_message(response)
    elapsed_ms = (time.perf_counter() - start) * 1000
    _log_call_window(label, start_dt)
    logger.info(
        "%s response in %.1fms response_type=%s content_chars=%d thinking_chars=%d",
        label,
        elapsed_ms,
        type(response).__name__,
        len(content),
        len(thinking),
    )
    if not content.strip():
        # An empty answer is not an error here; callers receive "" and decide.
        logger.warning("%s empty content returned by model", label)
    return content, thinking


async def call_ollama(prompt: str, *, tag: str = "default", temperature: float = 0.7) -> dict:
    """Send *prompt* to the Ollama text model and return its content and thinking.

    The prompt is sent as a single user message to ``OLLAMA_MODEL``.

    Args:
        prompt: Text of the user message.
        tag: Label embedded in the log lines for this call.
        temperature: Sampling temperature passed in the request options.

    Returns:
        A dict with the keys ``"content"`` and ``"thinking"``; either value
        may be an empty string.

    Raises:
        Exception: Any error raised by the chat request, after it is logged.
    """
    logger.info(
        "[LLM][%s] request model=%s host=%s prompt_chars=%d temp=%.2f",
        tag,
        OLLAMA_MODEL,
        OLLAMA_HOST,
        len(prompt),
        temperature,
    )
    content, thinking = await _chat_logged(
        f"[LLM][{tag}]",
        model=OLLAMA_MODEL,
        messages=[{'role': 'user', 'content': prompt}],
        options={
            'temperature': temperature,
            'repeat_penalty': 1.1,
        },
    )
    return {"content": content, "thinking": thinking}


async def call_vlm_ocr(image_bytes: bytes, language: str = 'auto') -> str:
    """Send an image to the vision model with the OCR prompt and return its reply.

    Args:
        image_bytes: Image data attached to the user message.
        language: Only written to the request log line; it is not included in
            the prompt or options sent to the model.

    Returns:
        The message content returned by ``VLM_MODEL``; may be an empty string.

    Raises:
        Exception: Any error raised by the chat request, after it is logged.
    """
    logger.info(
        "[VLM][ocr] request model=%s host=%s image_bytes=%d language=%s",
        VLM_MODEL,
        OLLAMA_HOST,
        len(image_bytes),
        language,
    )
    content, _thinking = await _chat_logged(
        "[VLM][ocr]",
        model=VLM_MODEL,
        messages=[{
            'role': 'user',
            'content': VLM_OCR_CONTEXT_PROMPT,
            'images': [image_bytes],
        }],
        options={'temperature': 0.3},
    )
    return content