Add optional thinking parameter to the call_ollama function and pass it from the request. Also enhance timezone handling in prompt generation to support configurable timezone preferences.
175 lines
5.1 KiB
Python
175 lines
5.1 KiB
Python
import os
|
|
import time
|
|
import logging
|
|
from datetime import datetime
|
|
import ollama
|
|
from dotenv import load_dotenv
|
|
|
|
# Run python-dotenv first so that the environment lookups below can see any
# values it loads.
load_dotenv()

# Model names and server endpoint. Each one is taken from the environment and
# falls back to the literal default when the variable is not set.
OLLAMA_MODEL = os.environ.get('OLLAMA_MODEL', 'gpt-oss:20b')
OLLAMA_HOST = os.environ.get('OLLAMA_HOST', 'http://192.168.0.120:11434')
VLM_MODEL = os.environ.get('VLM_MODEL', 'qwen3-vl:30b')

# Module-wide logger and the single async client shared by every call in this
# file; the client is bound to OLLAMA_HOST.
logger = logging.getLogger("llm")
client = ollama.AsyncClient(host=OLLAMA_HOST)

# Instruction text sent (together with the image) as the user message of the
# OCR request. It is runtime text: edit it only when the prompt itself should
# change.
VLM_OCR_CONTEXT_PROMPT = """You are an OCR and visual-context extractor for markdown writing assistance.

Your output will be embedded inside an HTML comment as hidden context for a text-completion model.

Requirements:
- Keep output compact: maximum 120 words.
- Use plain text only (no markdown code fences).
- Never output <!-- or -->.
- Do not invent unreadable text; mark uncertain characters with ?.
- Preserve original script for recognized text (do not forcibly translate).

Return exactly this format:

TEXT:
<exact transcription of visible text; use " | " for line breaks; write "(none)" if no readable text>

KEY_DETAILS:
- <3-5 short factual bullets about relevant objects/layout>

LANGUAGE:
<dominant language(s) in visible text, e.g. English / Chinese / Mixed>

SUMMARY:
<one short sentence, <= 20 words>"""
|
|
|
|
def _extract_message(response) -> tuple[str, str]:
|
|
content = ""
|
|
thinking = ""
|
|
|
|
if hasattr(response, 'message') and response.message:
|
|
content = response.message.content or ""
|
|
thinking = getattr(response.message, 'thinking', '') or ""
|
|
elif isinstance(response, dict):
|
|
msg = response.get('message', {})
|
|
content = msg.get('content', '') or ""
|
|
thinking = msg.get('thinking', '') or ""
|
|
|
|
return content, thinking
|
|
|
|
|
|
async def call_ollama(prompt: str, *, tag: str = "default", temperature: float = 0.7, thinking: "str | None" = None) -> dict:
    """Send a single-turn chat request to Ollama and return its reply.

    Args:
        prompt: Text sent as the only user message.
        tag: Label included in every log line written for this call.
        temperature: Sampling temperature forwarded in ``options``.
        thinking: Optional reasoning setting. When truthy it is forwarded to
            the client as the ``think`` argument; otherwise it is left out of
            the request.

    Returns:
        ``{"content": ..., "thinking": ...}`` holding the two strings that
        ``_extract_message`` extracts from the response.

    Raises:
        Exception: Any error raised by ``client.chat`` is logged with its
            traceback and re-raised unchanged.
    """
    start = time.perf_counter()
    start_dt = datetime.now()
    logger.info(
        "[LLM][%s] request model=%s host=%s prompt_chars=%d temp=%.2f thinking=%s",
        tag,
        OLLAMA_MODEL,
        OLLAMA_HOST,
        len(prompt),
        temperature,
        thinking,
    )

    def _log_call_time() -> None:
        # Wall-clock window of the call; written on success and on failure.
        logger.info(
            "[LLM][%s] call_time [%s --> %s]",
            tag,
            start_dt.strftime("%H:%M:%S"),
            datetime.now().strftime("%H:%M:%S"),
        )

    request = {
        "model": OLLAMA_MODEL,
        "messages": [{'role': 'user', 'content': prompt}],
        "stream": False,
        "options": {
            'temperature': temperature,
            'repeat_penalty': 1.1,
        },
    }
    if thinking:
        # The ollama client's chat() names this keyword ``think``; sending it
        # as ``thinking`` is rejected as an unexpected keyword argument.
        request["think"] = thinking

    try:
        response = await client.chat(**request)
    except Exception:
        elapsed_ms = (time.perf_counter() - start) * 1000
        _log_call_time()
        logger.exception("[LLM][%s] request failed after %.1fms", tag, elapsed_ms)
        raise

    # Use a separate name so the request parameter ``thinking`` is not
    # overwritten by the model's reasoning text.
    content, reasoning = _extract_message(response)
    elapsed_ms = (time.perf_counter() - start) * 1000
    _log_call_time()
    logger.info(
        "[LLM][%s] response in %.1fms response_type=%s content_chars=%d thinking_chars=%d",
        tag,
        elapsed_ms,
        type(response).__name__,
        len(content),
        len(reasoning),
    )

    if not content.strip():
        logger.warning("[LLM][%s] empty content returned by model", tag)

    return {"content": content, "thinking": reasoning}
|
|
|
|
async def call_vlm_ocr(image_bytes: bytes, language: str = 'auto') -> str:
    """Run the OCR/context prompt against the vision model for one image.

    Args:
        image_bytes: Raw image data attached to the user message.
        language: Value written to the request log line; this function does
            not otherwise use it.

    Returns:
        The ``content`` string extracted from the model response (possibly
        empty, in which case a warning is logged).

    Raises:
        Exception: Any error raised by ``client.chat`` is logged with its
            traceback and re-raised unchanged.
    """
    clock_fmt = "%H:%M:%S"
    t0 = time.perf_counter()
    began_at = datetime.now()
    logger.info(
        "[VLM][ocr] request model=%s host=%s image_bytes=%d language=%s",
        VLM_MODEL, OLLAMA_HOST, len(image_bytes), language,
    )

    # Single user turn: the fixed instruction prompt plus the image payload.
    user_turn = {
        'role': 'user',
        'content': VLM_OCR_CONTEXT_PROMPT,
        'images': [image_bytes],
    }

    try:
        reply = await client.chat(
            model=VLM_MODEL,
            messages=[user_turn],
            stream=False,
            options={'temperature': 0.3},
        )
    except Exception:
        took_ms = 1000 * (time.perf_counter() - t0)
        ended_at = datetime.now()
        logger.info(
            "[VLM][ocr] call_time [%s --> %s]",
            began_at.strftime(clock_fmt), ended_at.strftime(clock_fmt),
        )
        logger.exception("[VLM][ocr] request failed after %.1fms", took_ms)
        raise

    text, reasoning = _extract_message(reply)
    took_ms = 1000 * (time.perf_counter() - t0)
    ended_at = datetime.now()
    logger.info(
        "[VLM][ocr] call_time [%s --> %s]",
        began_at.strftime(clock_fmt), ended_at.strftime(clock_fmt),
    )
    logger.info(
        "[VLM][ocr] response in %.1fms response_type=%s content_chars=%d thinking_chars=%d",
        took_ms, type(reply).__name__, len(text), len(reasoning),
    )

    # An all-whitespace answer is still returned, but flagged in the log.
    if not text.strip():
        logger.warning("[VLM][ocr] empty content returned by model")

    return text
|