208 lines
6.0 KiB
Python
208 lines
6.0 KiB
Python
import os
|
|
import time
|
|
import logging
|
|
import asyncio
|
|
from datetime import datetime
|
|
import ollama
|
|
from dotenv import load_dotenv
|
|
|
|
# Read a .env file (python-dotenv) before the settings below are resolved,
# so that values defined there are visible to os.getenv.
load_dotenv()
|
|
|
|
# Model and endpoint settings. Each one is taken from the environment
# variable of the same name and falls back to the literal default given here.
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'gpt-oss:20b')
|
|
OLLAMA_HOST = os.getenv('OLLAMA_HOST', 'http://localhost:11434')
|
|
VLM_MODEL = os.getenv('VLM_MODEL', 'qwen3-vl:30b')
|
|
|
|
# Timeouts in seconds
# COMPLETION_TIMEOUT bounds the chat call in call_ollama and OCR_TIMEOUT
# bounds the chat call in call_vlm_ocr. CONVERT_TIMEOUT is not referenced by
# the functions visible in this part of the file.
|
|
COMPLETION_TIMEOUT = 30
|
|
OCR_TIMEOUT = 60
|
|
CONVERT_TIMEOUT = 30
|
|
|
|
# One async client, bound to OLLAMA_HOST, shared by call_ollama and call_vlm_ocr.
client = ollama.AsyncClient(host=OLLAMA_HOST)
|
|
# All request/response diagnostics below are emitted on the "llm" logger.
logger = logging.getLogger("llm")
|
|
|
|
# Instruction text sent as the user message (together with the image) by
# call_vlm_ocr. It asks the vision model for a compact, plain-text report with
# TEXT / KEY_DETAILS / LANGUAGE / SUMMARY sections and forbids HTML comment
# delimiters, because the result is meant to be embedded in an HTML comment.
VLM_OCR_CONTEXT_PROMPT = """You are an OCR and visual-context extractor for markdown writing assistance.
|
|
|
|
Your output will be embedded inside an HTML comment as hidden context for a text-completion model.
|
|
|
|
Requirements:
|
|
- Keep output compact: maximum 120 words.
|
|
- Use plain text only (no markdown code fences).
|
|
- Never output <!-- or -->.
|
|
- Do not invent unreadable text; mark uncertain characters with ?.
|
|
- Preserve original script for recognized text (do not forcibly translate).
|
|
|
|
Return exactly this format:
|
|
|
|
TEXT:
|
|
<exact transcription of visible text; use " | " for line breaks; write "(none)" if no readable text>
|
|
|
|
KEY_DETAILS:
|
|
- <3-5 short factual bullets about relevant objects/layout>
|
|
|
|
LANGUAGE:
|
|
<dominant language(s) in visible text, e.g. English / Chinese / Mixed>
|
|
|
|
SUMMARY:
|
|
<one short sentence, <= 20 words>"""
|
|
|
|
def _extract_message(response) -> tuple[str, str]:
|
|
content = ""
|
|
thinking = ""
|
|
|
|
if hasattr(response, 'message') and response.message:
|
|
content = response.message.content or ""
|
|
thinking = getattr(response.message, 'thinking', '') or ""
|
|
elif isinstance(response, dict):
|
|
msg = response.get('message', {})
|
|
content = msg.get('content', '') or ""
|
|
thinking = msg.get('thinking', '') or ""
|
|
|
|
return content, thinking
|
|
|
|
|
|
async def call_ollama(
    prompt: str,
    *,
    system_prompt: str = None,
    tag: str = "default",
    temperature: float = 0.7,
    thinking: str = None,
) -> dict:
    """
    Call the Ollama chat API and return the reply content and thinking text.

    Args:
        prompt: Text sent as the user message.
        system_prompt: Optional system message; skipped when empty or
            whitespace-only.
        tag: Label inserted into every log line emitted for this call.
        temperature: Sampling temperature placed in the request options.
        thinking: When truthy, forwarded to the client as the ``think``
            argument; otherwise the argument is omitted.

    Returns:
        ``{"content": <reply text>, "think": <thinking text>}``. Both values
        are strings; an empty reply is logged as a warning, not raised.

    Raises:
        asyncio.CancelledError: Re-raised after the cancellation is logged.
        Exception: Any failure while building or awaiting the request
            (including the COMPLETION_TIMEOUT expiry) is logged with its
            traceback and re-raised.
    """
    t0 = time.perf_counter()
    began_at = datetime.now()

    def _log_call_window() -> float:
        """Log the start/end wall-clock times and return elapsed milliseconds."""
        took_ms = (time.perf_counter() - t0) * 1000
        ended_at = datetime.now()
        logger.info(
            "[LLM][%s] call_time [%s --> %s]",
            tag,
            began_at.strftime("%H:%M:%S"),
            ended_at.strftime("%H:%M:%S"),
        )
        return took_ms

    logger.info(
        "[LLM][%s] request model=%s host=%s prompt_chars=%d system_chars=%d temp=%.2f thinking=%s",
        tag,
        OLLAMA_MODEL,
        OLLAMA_HOST,
        len(prompt),
        len(system_prompt or ""),
        temperature,
        thinking,
    )

    try:
        # Request assembly stays inside the try so that a malformed argument
        # is logged and re-raised the same way as a transport failure.
        has_system = system_prompt and system_prompt.strip()
        chat_messages = (
            [{"role": "system", "content": system_prompt}] if has_system else []
        )
        chat_messages.append({"role": "user", "content": prompt})

        sampling = {'temperature': temperature, 'repeat_penalty': 1.1}
        request = {
            "model": OLLAMA_MODEL,
            "messages": chat_messages,
            "stream": False,
            "options": sampling,
        }
        if thinking:
            request["think"] = thinking

        response = await asyncio.wait_for(
            client.chat(**request),
            timeout=COMPLETION_TIMEOUT,
        )
    except asyncio.CancelledError:
        took_ms = _log_call_window()
        logger.warning("[LLM][%s] request cancelled after %.1fms", tag, took_ms)
        raise
    except Exception:
        took_ms = _log_call_window()
        logger.exception("[LLM][%s] request failed after %.1fms", tag, took_ms)
        raise

    reply_text, reasoning_text = _extract_message(response)
    took_ms = _log_call_window()
    logger.info(
        "[LLM][%s] response in %.1fms response_type=%s content_chars=%d thinking_chars=%d",
        tag,
        took_ms,
        type(response).__name__,
        len(reply_text),
        len(reasoning_text),
    )

    if not reply_text.strip():
        logger.warning("[LLM][%s] empty content returned by model", tag)

    return {"content": reply_text, "think": reasoning_text}
|
|
|
|
async def call_vlm_ocr(image_bytes: bytes, language: str = 'auto') -> str:
    """Send an image to the vision model and return its OCR/context report.

    The image is submitted with VLM_OCR_CONTEXT_PROMPT as a single user
    message to VLM_MODEL, and the await is bounded by OCR_TIMEOUT seconds.

    Args:
        image_bytes: Image data, forwarded in the message's ``images`` list.
        language: Recorded in the request log line only; it is not sent to
            the model and does not alter the prompt.

    Returns:
        The message content extracted from the response. An empty reply is
        returned as-is after a warning is logged.

    Raises:
        asyncio.CancelledError: Re-raised after the cancellation is logged.
        Exception: Any failure of the chat call (including the OCR_TIMEOUT
            expiry) is logged with its traceback and re-raised.
    """
    start = time.perf_counter()
    start_dt = datetime.now()

    def _log_call_window() -> float:
        """Log the start/end wall-clock times and return elapsed milliseconds."""
        elapsed = (time.perf_counter() - start) * 1000
        end_dt = datetime.now()
        logger.info(
            "[VLM][ocr] call_time [%s --> %s]",
            start_dt.strftime("%H:%M:%S"),
            end_dt.strftime("%H:%M:%S"),
        )
        return elapsed

    logger.info(
        "[VLM][ocr] request model=%s host=%s image_bytes=%d language=%s",
        VLM_MODEL,
        OLLAMA_HOST,
        len(image_bytes),
        language,
    )

    try:
        response = await asyncio.wait_for(
            client.chat(
                model=VLM_MODEL,
                messages=[{
                    'role': 'user',
                    'content': VLM_OCR_CONTEXT_PROMPT,
                    'images': [image_bytes],
                }],
                stream=False,
                options={'temperature': 0.3},
            ),
            timeout=OCR_TIMEOUT,
        )
    except asyncio.CancelledError:
        # CancelledError derives from BaseException on Python 3.8+, so the
        # generic handler below never sees it; log it explicitly, mirroring
        # call_ollama, and let the cancellation propagate.
        elapsed_ms = _log_call_window()
        logger.warning("[VLM][ocr] request cancelled after %.1fms", elapsed_ms)
        raise
    except Exception:
        elapsed_ms = _log_call_window()
        logger.exception("[VLM][ocr] request failed after %.1fms", elapsed_ms)
        raise

    content, thinking = _extract_message(response)
    elapsed_ms = _log_call_window()
    logger.info(
        "[VLM][ocr] response in %.1fms response_type=%s content_chars=%d thinking_chars=%d",
        elapsed_ms,
        type(response).__name__,
        len(content),
        len(thinking),
    )

    if not content.strip():
        logger.warning("[VLM][ocr] empty content returned by model")

    return content
|