Files
llm-in-text/backend/llm.py
ydy0615 5434f3eb47 fix(backend): rename thinking parameter to think for Ollama API compatibility
The Ollama API expects "think" parameter instead of "thinking". Also updates the API base URL in the frontend configuration to point to the correct endpoint.
2026-02-19 18:55:49 +08:00

175 lines
5.1 KiB
Python

import os
import time
import logging
from datetime import datetime
import ollama
from dotenv import load_dotenv
load_dotenv()
# Deployment settings. Each value is taken from the process environment and
# falls back to the literal shown here when the variable is not set.
OLLAMA_MODEL = os.environ.get('OLLAMA_MODEL', 'gpt-oss:20b')  # model used by call_ollama
OLLAMA_HOST = os.environ.get('OLLAMA_HOST', 'http://192.168.0.120:11434')  # server address
VLM_MODEL = os.environ.get('VLM_MODEL', 'qwen3-vl:30b')  # model used by call_vlm_ocr
# Logger used for every message emitted by this module.
logger = logging.getLogger("llm")
# Single async Ollama client bound to the configured host; both request
# helpers in this module send their chat calls through it.
client = ollama.AsyncClient(host=OLLAMA_HOST)
# Instruction text sent as the user message (together with the image) by
# call_vlm_ocr. It asks the vision model for a compact, fixed-layout result
# (TEXT / KEY_DETAILS / LANGUAGE / SUMMARY) and forbids HTML comment
# delimiters because, per the prompt itself, the output is meant to be
# embedded inside an HTML comment. The text is sent verbatim, so any edit
# here changes what the model is asked to do.
VLM_OCR_CONTEXT_PROMPT = """You are an OCR and visual-context extractor for markdown writing assistance.
Your output will be embedded inside an HTML comment as hidden context for a text-completion model.
Requirements:
- Keep output compact: maximum 120 words.
- Use plain text only (no markdown code fences).
- Never output <!-- or -->.
- Do not invent unreadable text; mark uncertain characters with ?.
- Preserve original script for recognized text (do not forcibly translate).
Return exactly this format:
TEXT:
<exact transcription of visible text; use " | " for line breaks; write "(none)" if no readable text>
KEY_DETAILS:
- <3-5 short factual bullets about relevant objects/layout>
LANGUAGE:
<dominant language(s) in visible text, e.g. English / Chinese / Mixed>
SUMMARY:
<one short sentence, <= 20 words>"""
def _extract_message(response) -> tuple[str, str]:
content = ""
thinking = ""
if hasattr(response, 'message') and response.message:
content = response.message.content or ""
thinking = getattr(response.message, 'thinking', '') or ""
elif isinstance(response, dict):
msg = response.get('message', {})
content = msg.get('content', '') or ""
thinking = msg.get('thinking', '') or ""
return content, thinking
async def call_ollama(prompt: str, *, tag: str = "default", temperature: float = 0.7, thinking: str = None) -> dict:
    """Send one user message to the Ollama chat endpoint and return the reply.

    The request, its wall-clock window, and its outcome are all logged under
    the ``[LLM][<tag>]`` prefix.

    Args:
        prompt: Text sent as the single ``user`` message.
        tag: Label inserted into every log line produced by this call.
        temperature: Sampling temperature placed in the request options
            (``repeat_penalty`` is fixed at 1.1).
        thinking: Value forwarded to the client as the ``think`` argument.
            The argument is left out of the request when this is falsy.

    Returns:
        A dict with two string entries: ``"content"`` (the reply text) and
        ``"think"`` (the reasoning text). Either may be empty.

    Raises:
        Exception: Whatever the client call raises is logged with a
            traceback and then re-raised unchanged.
    """
    clock_fmt = "%H:%M:%S"
    t0 = time.perf_counter()
    began_at = datetime.now()
    logger.info(
        "[LLM][%s] request model=%s host=%s prompt_chars=%d temp=%.2f thinking=%s",
        tag, OLLAMA_MODEL, OLLAMA_HOST, len(prompt), temperature, thinking,
    )

    try:
        request = dict(
            model=OLLAMA_MODEL,
            messages=[{'role': 'user', 'content': prompt}],
            stream=False,
            options={'temperature': temperature, 'repeat_penalty': 1.1},
        )
        # Only mention "think" when the caller supplied a non-empty value.
        if thinking:
            request["think"] = thinking
        response = await client.chat(**request)
    except Exception:
        failed_after_ms = 1000 * (time.perf_counter() - t0)
        ended_at = datetime.now()
        logger.info(
            "[LLM][%s] call_time [%s --> %s]",
            tag, began_at.strftime(clock_fmt), ended_at.strftime(clock_fmt),
        )
        logger.exception("[LLM][%s] request failed after %.1fms", tag, failed_after_ms)
        raise
    else:
        reply_text, reasoning = _extract_message(response)
        took_ms = 1000 * (time.perf_counter() - t0)
        ended_at = datetime.now()
        logger.info(
            "[LLM][%s] call_time [%s --> %s]",
            tag, began_at.strftime(clock_fmt), ended_at.strftime(clock_fmt),
        )
        logger.info(
            "[LLM][%s] response in %.1fms response_type=%s content_chars=%d thinking_chars=%d",
            tag, took_ms, type(response).__name__, len(reply_text), len(reasoning),
        )
        # A whitespace-only reply is still returned, but flagged in the log.
        if not reply_text.strip():
            logger.warning("[LLM][%s] empty content returned by model", tag)
        return {"content": reply_text, "think": reasoning}
async def call_vlm_ocr(image_bytes: bytes, language: str = 'auto') -> str:
    """Ask the vision model to transcribe and describe a single image.

    Sends ``VLM_OCR_CONTEXT_PROMPT`` plus the image as one user message to
    ``VLM_MODEL`` with temperature 0.3, logging the request, its wall-clock
    window, and its outcome under the ``[VLM][ocr]`` prefix.

    Args:
        image_bytes: Raw image data attached to the message.
        language: Written to the request log only; it is not included in the
            request sent to the model.

    Returns:
        The reply text from the model (possibly empty). Any reasoning text in
        the response is only measured for logging and is not returned.

    Raises:
        Exception: Whatever the client call raises is logged with a
            traceback and then re-raised unchanged.
    """
    clock_fmt = "%H:%M:%S"
    t0 = time.perf_counter()
    began_at = datetime.now()
    logger.info(
        "[VLM][ocr] request model=%s host=%s image_bytes=%d language=%s",
        VLM_MODEL, OLLAMA_HOST, len(image_bytes), language,
    )

    try:
        user_message = {
            'role': 'user',
            'content': VLM_OCR_CONTEXT_PROMPT,
            'images': [image_bytes],
        }
        response = await client.chat(
            model=VLM_MODEL,
            messages=[user_message],
            stream=False,
            options={'temperature': 0.3},
        )
    except Exception:
        failed_after_ms = 1000 * (time.perf_counter() - t0)
        ended_at = datetime.now()
        logger.info(
            "[VLM][ocr] call_time [%s --> %s]",
            began_at.strftime(clock_fmt), ended_at.strftime(clock_fmt),
        )
        logger.exception("[VLM][ocr] request failed after %.1fms", failed_after_ms)
        raise
    else:
        ocr_text, reasoning = _extract_message(response)
        took_ms = 1000 * (time.perf_counter() - t0)
        ended_at = datetime.now()
        logger.info(
            "[VLM][ocr] call_time [%s --> %s]",
            began_at.strftime(clock_fmt), ended_at.strftime(clock_fmt),
        )
        logger.info(
            "[VLM][ocr] response in %.1fms response_type=%s content_chars=%d thinking_chars=%d",
            took_ms, type(response).__name__, len(ocr_text), len(reasoning),
        )
        # A whitespace-only reply is still returned, but flagged in the log.
        if not ocr_text.strip():
            logger.warning("[VLM][ocr] empty content returned by model")
        return ocr_text