Files
llm-in-text/backend/llm.py
ydy0615 818baa349a modified: backend/llm.py
modified:   src/components/MilkdownEditor.vue
	modified:   src/utils/config.js
	modified:   src/utils/i18n.js
2026-04-05 15:10:23 +08:00

186 lines
5.3 KiB
Python

import os
import time
import logging
import asyncio
from datetime import datetime
import ollama
from dotenv import load_dotenv
from prompts import get_vlm_ocr_prompt
# load_dotenv() runs before the lookups below so that any values it places
# in the process environment are visible to them.
load_dotenv()

# Model / host settings. Each one falls back to the literal default when the
# corresponding environment variable is not set.
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "gpt-oss:20b")
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
VLM_MODEL = os.environ.get("VLM_MODEL", "qwen3-vl:30b")

# Timeouts in seconds (10 minutes each, to leave room for large model loading).
COMPLETION_TIMEOUT = 10 * 60
OCR_TIMEOUT = 10 * 60
CONVERT_TIMEOUT = 10 * 60

# One async client and one logger are shared by every call in this module.
client = ollama.AsyncClient(host=OLLAMA_HOST)
logger = logging.getLogger("llm")
def _extract_message(response) -> tuple[str, str]:
content = ""
thinking = ""
if hasattr(response, 'message') and response.message:
content = response.message.content or ""
thinking = getattr(response.message, 'thinking', '') or ""
elif isinstance(response, dict):
msg = response.get('message', {})
content = msg.get('content', '') or ""
thinking = msg.get('thinking', '') or ""
return content, thinking
async def call_ollama(
    prompt: str,
    *,
    system_prompt: str | None = None,
    tag: str = "default",
    temperature: float = 0.7,
    thinking: str | None = None,
) -> dict:
    """Send one non-streaming chat request to Ollama and return its reply.

    Args:
        prompt: Text sent as the user message.
        system_prompt: Optional system message; skipped when it is ``None``,
            empty, or whitespace-only.
        tag: Label embedded in every log line emitted for this call.
        temperature: Sampling temperature placed in the request options.
        thinking: When truthy, forwarded to the client as the ``think``
            argument; otherwise ``think`` is not sent at all.

    Returns:
        ``{"content": <reply text>, "think": <thinking text>}``. Both values
        are strings and may be empty (an empty reply is logged as a warning).

    Raises:
        asyncio.CancelledError: Re-raised after logging when the call is
            cancelled.
        Exception: Any other failure, including the ``asyncio.wait_for``
            timeout after ``COMPLETION_TIMEOUT`` seconds, is logged with a
            traceback and re-raised.
    """
    t0 = time.perf_counter()
    began_at = datetime.now()

    def _elapsed_ms() -> float:
        # Milliseconds since the call started, on the monotonic clock.
        return (time.perf_counter() - t0) * 1000

    def _log_call_window() -> None:
        # Wall-clock start/end of the call, logged on every exit path.
        finished_at = datetime.now()
        logger.info(
            "[LLM][%s] call_time [%s --> %s]",
            tag,
            began_at.strftime("%H:%M:%S"),
            finished_at.strftime("%H:%M:%S"),
        )

    logger.info(
        "[LLM][%s] request model=%s host=%s prompt_chars=%d system_chars=%d temp=%.2f thinking=%s",
        tag,
        OLLAMA_MODEL,
        OLLAMA_HOST,
        len(prompt),
        len(system_prompt or ""),
        temperature,
        thinking,
    )

    try:
        chat_messages = [{"role": "user", "content": prompt}]
        if system_prompt and system_prompt.strip():
            # The system message, when present, goes ahead of the user message.
            chat_messages.insert(0, {"role": "system", "content": system_prompt})

        request = dict(
            model=OLLAMA_MODEL,
            messages=chat_messages,
            stream=False,
            options={"temperature": temperature, "repeat_penalty": 1.1},
        )
        if thinking:
            request["think"] = thinking

        pending = client.chat(**request)
        response = await asyncio.wait_for(pending, timeout=COMPLETION_TIMEOUT)
    except asyncio.CancelledError:
        took_ms = _elapsed_ms()
        _log_call_window()
        logger.warning("[LLM][%s] request cancelled after %.1fms", tag, took_ms)
        raise
    except Exception:
        took_ms = _elapsed_ms()
        _log_call_window()
        logger.exception("[LLM][%s] request failed after %.1fms", tag, took_ms)
        raise

    reply_text, reply_thinking = _extract_message(response)
    took_ms = _elapsed_ms()
    _log_call_window()
    logger.info(
        "[LLM][%s] response in %.1fms response_type=%s content_chars=%d thinking_chars=%d",
        tag,
        took_ms,
        type(response).__name__,
        len(reply_text),
        len(reply_thinking),
    )
    if reply_text.strip() == "":
        logger.warning("[LLM][%s] empty content returned by model", tag)
    return {"content": reply_text, "think": reply_thinking}
async def call_vlm_ocr(image_bytes: bytes, language: str = 'auto') -> str:
    """Run OCR on an image through the vision-language model.

    Sends one non-streaming chat request to ``VLM_MODEL`` whose single user
    message carries the text from ``get_vlm_ocr_prompt()`` plus the image,
    bounded by ``OCR_TIMEOUT`` seconds.

    Args:
        image_bytes: Image data, passed as the only entry of the message's
            ``images`` list.
        language: Written to the request log line only.
            NOTE(review): it is not forwarded to ``get_vlm_ocr_prompt()``,
            so it does not influence the request; confirm whether the
            prompt is meant to depend on it.

    Returns:
        The content text extracted from the response; may be empty (an
        empty reply is logged as a warning).

    Raises:
        asyncio.CancelledError: Re-raised after logging when the call is
            cancelled.
        Exception: Any other failure, including the ``asyncio.wait_for``
            timeout, is logged with a traceback and re-raised.
    """
    start = time.perf_counter()
    start_dt = datetime.now()

    def _log_call_time() -> None:
        # Wall-clock start/end of the call, logged on every exit path.
        logger.info(
            "[VLM][ocr] call_time [%s --> %s]",
            start_dt.strftime("%H:%M:%S"),
            datetime.now().strftime("%H:%M:%S"),
        )

    logger.info(
        "[VLM][ocr] request model=%s host=%s image_bytes=%d language=%s",
        VLM_MODEL,
        OLLAMA_HOST,
        len(image_bytes),
        language,
    )
    try:
        response = await asyncio.wait_for(
            client.chat(
                model=VLM_MODEL,
                messages=[{
                    'role': 'user',
                    'content': get_vlm_ocr_prompt(),
                    'images': [image_bytes],
                }],
                stream=False,
                options={'temperature': 0.3},
            ),
            timeout=OCR_TIMEOUT,
        )
    except asyncio.CancelledError:
        # CancelledError is not an Exception subclass, so without this
        # handler a cancelled OCR request would leave no trace in the log.
        # Mirrors the handling in call_ollama.
        elapsed_ms = (time.perf_counter() - start) * 1000
        _log_call_time()
        logger.warning("[VLM][ocr] request cancelled after %.1fms", elapsed_ms)
        raise
    except Exception:
        elapsed_ms = (time.perf_counter() - start) * 1000
        _log_call_time()
        logger.exception("[VLM][ocr] request failed after %.1fms", elapsed_ms)
        raise

    # Thinking text is only measured for the log line; callers get content.
    content, thinking = _extract_message(response)
    elapsed_ms = (time.perf_counter() - start) * 1000
    _log_call_time()
    logger.info(
        "[VLM][ocr] response in %.1fms response_type=%s content_chars=%d thinking_chars=%d",
        elapsed_ms,
        type(response).__name__,
        len(content),
        len(thinking),
    )
    if not content.strip():
        logger.warning("[VLM][ocr] empty content returned by model")
    return content