Add support for cancelling in-progress LLM completion requests via new /v1/completions/cancel endpoint with task tracking. Implement mermaid diagram rendering in the Milkdown editor with a new mermaidPlugin. Update copilotPlugin to properly abort requests with descriptive reasons. Refactor settings panel to handle system theme changes reactively. Add camera capture support for image uploads.
200 lines
5.8 KiB
Python
200 lines
5.8 KiB
Python
import asyncio
import logging
import os
import time
from datetime import datetime
from typing import Optional

import ollama
from dotenv import load_dotenv
|
|
|
|
# Read a .env file (if one is found) into the process environment before the
# os.getenv() lookups below, so the file can supply the overrides.
load_dotenv()

# Chat model used by call_ollama(); override with the OLLAMA_MODEL variable.
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'gpt-oss:20b')
# Base URL of the Ollama server; override with the OLLAMA_HOST variable.
OLLAMA_HOST = os.getenv('OLLAMA_HOST', 'http://192.168.0.120:11434')
# Vision-language model used by call_vlm_ocr(); override with VLM_MODEL.
VLM_MODEL = os.getenv('VLM_MODEL', 'qwen3-vl:30b')

# One async client, created at import time and shared by call_ollama() and
# call_vlm_ocr().
client = ollama.AsyncClient(host=OLLAMA_HOST)
logger = logging.getLogger("llm")
|
|
|
|
# Instruction text sent as the user message, alongside the image, by
# call_vlm_ocr(). The reply is meant to be embedded in an HTML comment, which
# is why the prompt forbids comment delimiters and markdown code fences.
VLM_OCR_CONTEXT_PROMPT = """You are an OCR and visual-context extractor for markdown writing assistance.

Your output will be embedded inside an HTML comment as hidden context for a text-completion model.

Requirements:
- Keep output compact: maximum 120 words.
- Use plain text only (no markdown code fences).
- Never output <!-- or -->.
- Do not invent unreadable text; mark uncertain characters with ?.
- Preserve original script for recognized text (do not forcibly translate).

Return exactly this format:

TEXT:
<exact transcription of visible text; use " | " for line breaks; write "(none)" if no readable text>

KEY_DETAILS:
- <3-5 short factual bullets about relevant objects/layout>

LANGUAGE:
<dominant language(s) in visible text, e.g. English / Chinese / Mixed>

SUMMARY:
<one short sentence, <= 20 words>"""
|
|
|
|
def _extract_message(response) -> tuple[str, str]:
|
|
content = ""
|
|
thinking = ""
|
|
|
|
if hasattr(response, 'message') and response.message:
|
|
content = response.message.content or ""
|
|
thinking = getattr(response.message, 'thinking', '') or ""
|
|
elif isinstance(response, dict):
|
|
msg = response.get('message', {})
|
|
content = msg.get('content', '') or ""
|
|
thinking = msg.get('thinking', '') or ""
|
|
|
|
return content, thinking
|
|
|
|
|
|
async def call_ollama(
    prompt: str,
    *,
    system_prompt: Optional[str] = None,
    tag: str = "default",
    temperature: float = 0.7,
    thinking: Optional[str] = None,
) -> dict:
    """Send one non-streaming chat request to Ollama and return its reply.

    Args:
        prompt: Text of the user message.
        system_prompt: Optional system message; it is skipped when ``None``,
            empty or whitespace-only.
        tag: Label inserted into every log line emitted for this call.
        temperature: Sampling temperature placed in the request ``options``.
        thinking: When truthy, forwarded to the client as the ``think``
            argument; otherwise ``think`` is not sent at all.

    Returns:
        ``{"content": <reply text>, "think": <thinking text>}``. Both values
        are strings and may be empty.

    Raises:
        asyncio.CancelledError: Re-raised after logging when the awaiting
            task is cancelled.
        Exception: Any other error raised while building or sending the
            request is logged with its traceback and re-raised.
    """
    start = time.perf_counter()
    start_dt = datetime.now()

    def _log_call_time() -> float:
        """Log the start/end wall-clock times and return elapsed milliseconds."""
        elapsed = (time.perf_counter() - start) * 1000
        end_dt = datetime.now()
        logger.info(
            "[LLM][%s] call_time [%s --> %s]",
            tag,
            start_dt.strftime("%H:%M:%S"),
            end_dt.strftime("%H:%M:%S"),
        )
        return elapsed

    logger.info(
        "[LLM][%s] request model=%s host=%s prompt_chars=%d system_chars=%d temp=%.2f thinking=%s",
        tag,
        OLLAMA_MODEL,
        OLLAMA_HOST,
        len(prompt),
        len(system_prompt or ""),
        temperature,
        thinking,
    )

    try:
        messages = []
        if system_prompt and system_prompt.strip():
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        request = {
            "model": OLLAMA_MODEL,
            "messages": messages,
            "stream": False,
            "options": {
                'temperature': temperature,
                'repeat_penalty': 1.1,
            },
        }
        if thinking:
            request["think"] = thinking

        response = await client.chat(**request)
    except asyncio.CancelledError:
        # CancelledError is not an Exception subclass on Python 3.8+, so it
        # needs its own handler to get timing logged before propagating.
        elapsed_ms = _log_call_time()
        logger.warning("[LLM][%s] request cancelled after %.1fms", tag, elapsed_ms)
        raise
    except Exception:
        elapsed_ms = _log_call_time()
        logger.exception("[LLM][%s] request failed after %.1fms", tag, elapsed_ms)
        raise

    # Use a separate name so the ``thinking`` request argument is not
    # overwritten by the thinking text returned by the model.
    content, reasoning = _extract_message(response)
    elapsed_ms = _log_call_time()
    logger.info(
        "[LLM][%s] response in %.1fms response_type=%s content_chars=%d thinking_chars=%d",
        tag,
        elapsed_ms,
        type(response).__name__,
        len(content),
        len(reasoning),
    )

    if not content.strip():
        logger.warning("[LLM][%s] empty content returned by model", tag)

    return {"content": content, "think": reasoning}
|
|
|
|
async def call_vlm_ocr(image_bytes: bytes, language: str = 'auto') -> str:
    """Run the OCR/visual-context prompt on an image with the vision model.

    Args:
        image_bytes: Image data, passed to the client in the user message's
            ``images`` list.
        language: Written to the request log line only; it is not forwarded
            to the model.

    Returns:
        The reply text produced by the model; may be empty.

    Raises:
        asyncio.CancelledError: Re-raised after logging when the awaiting
            task is cancelled.
        Exception: Any other error from the client call is logged with its
            traceback and re-raised.
    """
    start = time.perf_counter()
    start_dt = datetime.now()

    def _log_call_time() -> float:
        """Log the start/end wall-clock times and return elapsed milliseconds."""
        elapsed = (time.perf_counter() - start) * 1000
        end_dt = datetime.now()
        logger.info(
            "[VLM][ocr] call_time [%s --> %s]",
            start_dt.strftime("%H:%M:%S"),
            end_dt.strftime("%H:%M:%S"),
        )
        return elapsed

    logger.info(
        "[VLM][ocr] request model=%s host=%s image_bytes=%d language=%s",
        VLM_MODEL,
        OLLAMA_HOST,
        len(image_bytes),
        language,
    )

    try:
        response = await client.chat(
            model=VLM_MODEL,
            messages=[{
                'role': 'user',
                'content': VLM_OCR_CONTEXT_PROMPT,
                'images': [image_bytes],
            }],
            stream=False,
            options={'temperature': 0.3},
        )
    except asyncio.CancelledError:
        # Mirrors call_ollama(): CancelledError is not an Exception subclass
        # on Python 3.8+, so without this handler a cancelled OCR request
        # would leave no timing or cancellation record in the log.
        elapsed_ms = _log_call_time()
        logger.warning("[VLM][ocr] request cancelled after %.1fms", elapsed_ms)
        raise
    except Exception:
        elapsed_ms = _log_call_time()
        logger.exception("[VLM][ocr] request failed after %.1fms", elapsed_ms)
        raise

    content, reasoning = _extract_message(response)
    elapsed_ms = _log_call_time()
    logger.info(
        "[VLM][ocr] response in %.1fms response_type=%s content_chars=%d thinking_chars=%d",
        elapsed_ms,
        type(response).__name__,
        len(content),
        len(reasoning),
    )

    if not content.strip():
        logger.warning("[VLM][ocr] empty content returned by model")

    return content
|