Files
llm-in-text/backend/prompt.py
ydy0615 d8b7832b14 refactor: improve codebase structure and Univer integration
- Add AGENTS.md knowledge base with project documentation
- Move UserPreferences model to separate models.py file
- Extract API_KEY to environment variable for security
- Enhance Univer Editor with PPTX support and improved UI
- Improve file system handling with binary file detection
- Add HF_ENDPOINT mirror for better China connectivity
- Clean up unused imports and code structure
2026-04-11 09:24:14 +08:00

367 lines
11 KiB
Python

from datetime import datetime, timedelta, timezone
import re
from typing import Tuple
from models import UserPreferences
from prompts import get_language_guidance_map, get_system_prompt_template, get_inline_examples
def _get_current_datetime(timezone_pref: str = "auto") -> str:
# Default to UTC+8 if auto or not specified.
offset = 8
tz_info = " (UTC+8)"
if timezone_pref and timezone_pref != "auto":
# Parse values like "UTC+8" or "GMT-5".
match = re.search(r"([+-])(\d+)", timezone_pref)
if match:
sign = match.group(1)
hours = int(match.group(2))
offset = hours if sign == "+" else -hours
tz_info = f" ({timezone_pref})"
else:
tz_info = f" ({timezone_pref})"
now = datetime.now(timezone(timedelta(hours=offset)))
weekdays = [
"Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday",
]
weekday = weekdays[now.weekday()]
return (
f"{now.year}-{now.month:02d}-{now.day:02d} "
f"{weekday} {now.hour:02d}:{now.minute:02d}:{now.second:02d}{tz_info}"
)
def _sanitize_language_id(language_id: str) -> str:
if not language_id:
return "markdown"
allowed = []
for ch in language_id.strip():
if ch.isalnum() or ch in "-_+.":
allowed.append(ch)
value = "".join(allowed)[:32]
return value or "markdown"
def _normalize_newlines(text: str) -> str:
return (text or "").replace("\r\n", "\n").replace("\r", "\n")
def _prepare_context(prefix: str, suffix: str) -> Tuple[str, str]:
"""
Prepare prefix/suffix for model completion context.
Filter out potential web-scraping or legacy artifacts like <br>, <br/>, <br\\>.
"""
br_pattern = re.compile(r"<br\s*/?\s*\\?>", re.IGNORECASE)
clean_prefix = br_pattern.sub("", prefix or "")
clean_suffix = br_pattern.sub("", suffix or "")
return clean_prefix, clean_suffix
# A fence line: optional leading spaces/tabs, three backticks, then anything
# up to the end of the line.
FENCE_LINE_RE = re.compile(r"^[ \t]*```.*$")
# Same shape as FENCE_LINE_RE, but group 1 captures the text after the
# backticks (the fence info string), minus leading spaces/tabs.
FENCE_INFO_RE = re.compile(r"^[ \t]*```[ \t]*(.*)$")
# Case-insensitive search pattern that matches any of:
#   - a ```mermaid fence opener,
#   - one of the listed diagram keywords as a whole word,
#   - "graph" followed by a direction (TD, TB, BT, RL or LR).
# NOTE(review): several keywords ("pie", "journey", "timeline") are ordinary
# English words, so plain prose can match; confirm that is acceptable.
MERMAID_CONTEXT_RE = re.compile(
    r"```[ \t]*mermaid\b|"
    r"\b(flowchart|sequencediagram|classdiagram|statediagram(?:-v2)?|"
    r"erdiagram|journey|gantt|pie|mindmap|timeline|gitgraph|quadrantchart|xychart-beta)\b|"
    r"\bgraph[ \t]+(TD|TB|BT|RL|LR)\b",
    re.IGNORECASE,
)
def _cursor_in_fenced_code_block(prefix: str) -> bool:
    """Report whether the text before the cursor leaves a code fence open.

    Delegates to ``_active_fence_language``; any result other than ``"none"``
    means the cursor sits inside a fenced block.
    """
    fence_language = _active_fence_language(prefix)
    return fence_language != "none"
def _active_fence_language(prefix: str) -> str:
    """Return the language of the code fence that is open at the cursor.

    The prefix is scanned line by line, and every line matching
    ``FENCE_LINE_RE`` toggles the inside/outside state.

    Returns:
        - ``"none"``: the cursor is outside any fenced code block.
        - ``"unknown"``: the cursor is inside a fence with no usable language
          tag. This includes a fence whose tag is literally ``none``, which
          would otherwise be indistinguishable from the outside-of-fence
          sentinel.
        - ``"<language>"``: the lower-cased first token of the opening fence's
          info string, limited to alphanumerics and ``-_+.`` and capped at 32
          characters.
    """
    inside_fence = False
    language = "none"
    for line in _normalize_newlines(prefix).split("\n"):
        if not FENCE_LINE_RE.match(line):
            continue
        if inside_fence:
            # Any fence line closes the currently open block.
            inside_fence = False
            language = "none"
            continue

        inside_fence = True
        info_match = FENCE_INFO_RE.match(line)
        tokens = info_match.group(1).split() if info_match else []
        if not tokens:
            language = "unknown"
            continue
        tag = "".join(
            ch for ch in tokens[0] if ch.isalnum() or ch in "-_+."
        )[:32].lower()
        # "none" is reserved for "outside a fence"; never report it as a tag.
        language = tag if tag and tag != "none" else "unknown"
    return language if inside_fence else "none"
def _is_mermaid_context(prefix: str, suffix: str, cursor_fence_language: str) -> bool:
    """Decide whether the cursor is in or near Mermaid diagram source.

    True when the active fence language is ``"mermaid"``. Otherwise the last
    1200 characters of ``prefix`` and the first 400 characters of ``suffix``
    are joined with a newline and searched with ``MERMAID_CONTEXT_RE``.
    """
    if cursor_fence_language == "mermaid":
        return True
    window = "\n".join(((prefix or "")[-1200:], (suffix or "")[:400]))
    hit = MERMAID_CONTEXT_RE.search(window)
    return hit is not None
def prepare_prompt_context(prefix: str, suffix: str) -> Tuple[str, str]:
    """Public wrapper around ``_prepare_context``; returns its result as-is."""
    cleaned = _prepare_context(prefix, suffix)
    return cleaned
# Alias table for language ids. Keys are lower-case aliases (file extensions,
# dialect names, tool names); values are the canonical ids returned by
# _canonical_language_id. Canonical ids map to themselves.
LANGUAGE_SYNONYMS = {
    # Prose / plain text
    "md": "markdown",
    "markdown": "markdown",
    "txt": "text",
    "text": "text",
    "plain": "text",
    "plaintext": "text",
    # Scripting languages
    "py": "python",
    "python": "python",
    "js": "javascript",
    "javascript": "javascript",
    "jsx": "javascript",
    "node": "javascript",
    "ts": "typescript",
    "tsx": "typescript",
    "typescript": "typescript",
    # Data / configuration formats
    "json": "json",
    "jsonc": "json",
    "json5": "json",
    "yaml": "yaml",
    "yml": "yaml",
    "toml": "toml",
    "ini": "ini",
    "cfg": "ini",
    # Shells
    "bash": "bash",
    "shell": "bash",
    "sh": "bash",
    "zsh": "bash",
    "fish": "bash",
    "ps": "powershell",
    "ps1": "powershell",
    "powershell": "powershell",
    # SQL dialects
    "sql": "sql",
    "postgres": "sql",
    "postgresql": "sql",
    "mysql": "sql",
    "sqlite": "sql",
    # Markup / styling
    "html": "html",
    "xml": "xml",
    "svg": "xml",
    "css": "css",
    "scss": "css",
    "less": "css",
    "latex": "latex",
    "tex": "latex",
    "katex": "latex",
    "mermaid": "mermaid",
    # Compiled / general-purpose languages
    "c": "c",
    "c++": "cpp",
    "cpp": "cpp",
    "cxx": "cpp",
    "h": "c",
    "hpp": "cpp",
    "c#": "csharp",
    "cs": "csharp",
    "csharp": "csharp",
    "go": "go",
    "golang": "go",
    "rust": "rust",
    "rs": "rust",
    "java": "java",
    "kotlin": "kotlin",
    "swift": "swift",
    "ruby": "ruby",
    "rb": "ruby",
    "php": "php",
    "lua": "lua",
    "r": "r",
    "matlab": "matlab",
    "dart": "dart",
    # Build / tooling formats
    "docker": "dockerfile",
    "dockerfile": "dockerfile",
    "make": "makefile",
    "makefile": "makefile",
    "diff": "diff",
    "patch": "diff",
    "regex": "regex",
}
def _canonical_language_id(language_id: str) -> str:
    """Map an editor language id or alias to its canonical id.

    The trimmed, lower-cased raw id is looked up in ``LANGUAGE_SYNONYMS``
    first, so aliases containing characters that sanitizing would strip
    (``"c#"``) still resolve. Otherwise the id is sanitized with
    ``_sanitize_language_id`` and looked up again; unknown ids are returned
    sanitized and lower-cased. Missing or empty ids resolve to ``"markdown"``.
    """
    raw = (language_id or "").strip().lower()
    if raw in LANGUAGE_SYNONYMS:
        return LANGUAGE_SYNONYMS[raw]
    # _sanitize_language_id never returns an empty string, so no extra
    # emptiness check is needed here.
    safe = _sanitize_language_id(language_id).lower()
    return LANGUAGE_SYNONYMS.get(safe, safe)
# Groups of canonical language ids used when picking a fallback guidance
# template for languages that have no dedicated guidance entry.
# JavaScript-family ids.
_JS_LANGS = {"javascript", "typescript"}
# General-purpose programming language ids.
_CODE_LANGS = {"python", "go", "rust", "java", "kotlin", "swift", "ruby", "php", "lua", "c", "cpp", "csharp", "r", "matlab", "dart"}
def _language_guidance(language_id: str) -> str:
    """Return the extra system-prompt guidance for a language.

    Markdown gets no extra guidance (empty string). A language with its own
    non-empty entry in the guidance map uses that entry. Otherwise a shared
    template is used with ``{lang}`` replaced by the canonical id:
    ``"_js_code"`` for JavaScript-family ids and ``"_generic_code"`` for
    everything else. A missing template yields an empty string.
    """
    canonical = _canonical_language_id(language_id)
    if canonical == "markdown":
        return ""

    guidance_by_language = get_language_guidance_map()
    dedicated = guidance_by_language.get(canonical)
    if dedicated:
        return dedicated

    # Every non-JS id, whether or not it is listed in _CODE_LANGS, shares the
    # generic template.
    template_key = "_js_code" if canonical in _JS_LANGS else "_generic_code"
    template = guidance_by_language.get(template_key, "")
    return template.replace("{lang}", canonical)
def build_inline_system_prompt(language_id: str = "markdown") -> str:
    """Build the system prompt for inline completion in a given language.

    The ``{language_id}`` placeholder of the system prompt template is filled
    with the canonical language id. If language-specific guidance exists, it
    is appended on a new line. The result is stripped of surrounding
    whitespace.
    """
    canonical = _canonical_language_id(language_id)
    guidance = _language_guidance(canonical)
    prompt = get_system_prompt_template().replace("{language_id}", canonical)
    if guidance:
        prompt = prompt.rstrip() + "\n" + guidance.strip()
    return prompt.strip()
# Example text from get_inline_examples(), fetched once at import time and
# interpolated into the user prompt built by build_completion_prompts.
_INLINE_EXAMPLES = get_inline_examples()
def build_completion_prompts(
    prefix: str,
    suffix: str,
    language_id: str = "markdown",
    location: str = "",
    thinking_level: str = "low",
    preferences: UserPreferences | None = None,
) -> Tuple[str, str]:
    """Build the ``(system_prompt, user_prompt)`` pair for one completion.

    Args:
        prefix: Document text before the cursor.
        suffix: Document text after the cursor.
        language_id: Editor language id; canonicalized before use.
        location: Optional user location line; omitted when empty.
        thinking_level: Value echoed on the "Reasoning level" line.
        preferences: Optional user preferences. Its ``timezone`` selects the
            clock shown in the prompt; ``language`` and ``currency`` are
            listed unless unset or ``"auto"``.

    Returns:
        Both prompts, stripped of surrounding whitespace.
    """
    safe_language_id = _canonical_language_id(language_id)

    # Clean both sides of the cursor, then normalize line endings so the
    # boundary flags below only ever have to look for "\n".
    cleaned_prefix, cleaned_suffix = _prepare_context(prefix, suffix)
    recent_prefix = _normalize_newlines(cleaned_prefix)
    recent_suffix = _normalize_newlines(cleaned_suffix)

    cursor_fence_language = _active_fence_language(recent_prefix)
    in_fence_flag = "false" if cursor_fence_language == "none" else "true"
    if _is_mermaid_context(recent_prefix, recent_suffix, cursor_fence_language):
        mermaid_flag = "true"
    else:
        mermaid_flag = "false"
    prefix_newline_flag = "true" if recent_prefix.endswith("\n") else "false"
    suffix_newline_flag = "true" if recent_suffix.startswith("\n") else "false"

    current_time = _get_current_datetime(
        preferences.timezone if preferences else "auto"
    )
    location_info = f"\nUser location: {location}" if location else ""

    preference_lines = []
    if preferences:
        labelled_values = (
            ("Preferred language", preferences.language),
            ("Preferred currency", preferences.currency),
        )
        for label, value in labelled_values:
            if value and value != "auto":
                preference_lines.append(f"{label}: {value}")
    preferences_instruction = ""
    if preference_lines:
        preferences_instruction = "\nUser Preferences:\n" + "\n".join(preference_lines)

    user_prompt = f"""Current time: {current_time}{location_info}{preferences_instruction}
Reasoning level: {thinking_level}
Editor language: {safe_language_id}
=== STATE FLAGS ===
- CURSOR_IN_FENCED_CODE_BLOCK: {in_fence_flag}
- CURSOR_FENCE_LANGUAGE: {cursor_fence_language}
- MERMAID_CONTEXT: {mermaid_flag}
- PREFIX_ENDS_WITH_NEWLINE: {prefix_newline_flag}
- SUFFIX_STARTS_WITH_NEWLINE: {suffix_newline_flag}
=== TASK ===
Produce the best insertion text between PREFIX and SUFFIX.
Requirements:
- Non-empty and meaningful
- Concise unless structure needs more
- Follows markdown rules in system prompt
=== BOUNDARY DECISION GUIDE ===
Step 1: Check PREFIX_ENDS_WITH_NEWLINE
If false, ask: "Does output need to start on a new line?"
- YES if PREFIX ends with: ":", "steps:", "items:", heading text, or complete sentence before heading
- If YES: start output with \\n
Step 2: Check SUFFIX_STARTS_WITH_NEWLINE
If false, ask: "Does output need to end with a newline?"
- YES if SUFFIX starts with: heading (##), new paragraph, or list marker
- If YES: end output with \\n
Step 3: Choose newline type
- Use \\n\\n for: new paragraphs, before headings, starting lists
- Use \\n for: continuing within blocks, list items, table cells
- Exception: inside code fences, use \\n freely
=== CONTEXT NOTES ===
- OCR metadata (e.g., <OCR:description>) is hidden context, never copy to output
- Match PREFIX tone, style, and indentation
- Do not repeat text from SUFFIX beginning
=== EXAMPLES BY CATEGORY ===
{_INLINE_EXAMPLES}
=== NOW COMPLETE THE TASK ===
<PREFIX>
{recent_prefix}
</PREFIX>
<SUFFIX>
{recent_suffix}
</SUFFIX>
Output:"""

    system_prompt = build_inline_system_prompt(safe_language_id)
    return system_prompt.strip(), user_prompt.strip()
def build_prompt(
    prefix: str,
    suffix: str,
    language_id: str = "markdown",
    location: str = "",
    thinking_level: str = "low",
    preferences: UserPreferences | None = None,
) -> str:
    """Backward-compatible helper that returns only the user prompt.

    Forwards every argument to ``build_completion_prompts`` and discards the
    system prompt.
    """
    prompts = build_completion_prompts(
        prefix,
        suffix,
        language_id=language_id,
        location=location,
        thinking_level=thinking_level,
        preferences=preferences,
    )
    return prompts[1]