- Add AGENTS.md knowledge base with project documentation - Move UserPreferences model to separate models.py file - Extract API_KEY to environment variable for security - Enhance Univer Editor with PPTX support and improved UI - Improve file system handling with binary file detection - Add HF_ENDPOINT mirror for better China connectivity - Clean up unused imports and code structure
367 lines
11 KiB
Python
367 lines
11 KiB
Python
from datetime import datetime, timedelta, timezone
|
|
import re
|
|
from typing import Tuple
|
|
|
|
from models import UserPreferences
|
|
from prompts import get_language_guidance_map, get_system_prompt_template, get_inline_examples
|
|
|
|
|
|
def _get_current_datetime(timezone_pref: str = "auto") -> str:
    """Return the current wall-clock time as a prompt-friendly string.

    The result has the form ``YYYY-MM-DD Weekday HH:MM:SS (<zone label>)``.

    Args:
        timezone_pref: ``"auto"`` or an empty value selects the default
            UTC+8 zone. Otherwise the first signed offset found in the
            string is used, e.g. ``"UTC+8"``, ``"GMT-5"``, ``"UTC+5:30"`` or
            ``"+0530"``. A bare ``"UTC"``, ``"GMT"`` or ``"Z"`` means a zero
            offset.

    Returns:
        The formatted timestamp. If the preference cannot be turned into a
        valid offset, the time is computed for UTC+8 and labelled
        ``(UTC+8)``, so the label always names the zone that was really used.
    """
    # Default zone, used for "auto"/empty values and anything unparseable.
    offset = timedelta(hours=8)
    tz_info = " (UTC+8)"

    if timezone_pref and timezone_pref != "auto":
        match = re.search(r"([+-])(\d+)(?::(\d{2}))?", timezone_pref)
        if match:
            sign, digits, colon_minutes = match.groups()
            if colon_minutes is not None:
                hours, minutes = int(digits), int(colon_minutes)
            elif len(digits) in (3, 4):
                # Compact form such as "+0530" or "+800": the last two
                # digits are minutes.
                hours, minutes = int(digits[:-2]), int(digits[-2:])
            else:
                hours, minutes = int(digits), 0
            # datetime.timezone raises ValueError unless the offset is
            # strictly inside +/-24 hours, so reject anything larger and
            # keep the default instead of crashing.
            if hours <= 23 and minutes <= 59:
                magnitude = timedelta(hours=hours, minutes=minutes)
                offset = magnitude if sign == "+" else -magnitude
                tz_info = f" ({timezone_pref})"
        elif timezone_pref.strip().upper() in ("UTC", "GMT", "Z"):
            offset = timedelta(0)
            tz_info = f" ({timezone_pref})"

    now = datetime.now(timezone(offset))
    # Fixed English names keep the output independent of the process locale.
    weekdays = (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    )
    weekday = weekdays[now.weekday()]
    return (
        f"{now.year}-{now.month:02d}-{now.day:02d} "
        f"{weekday} {now.hour:02d}:{now.minute:02d}:{now.second:02d}{tz_info}"
    )
|
|
|
|
|
|
def _sanitize_language_id(language_id: str) -> str:
    """Reduce a language identifier to a short, prompt-safe token.

    Surrounding whitespace is dropped, only alphanumeric characters and
    ``-``, ``_``, ``+``, ``.`` are kept, and the result is capped at 32
    characters. Case is preserved.

    Returns:
        The cleaned identifier, or ``"markdown"`` when the input is empty or
        nothing survives the filtering.
    """
    if not language_id:
        return "markdown"
    kept = "".join(
        ch for ch in language_id.strip() if ch.isalnum() or ch in "-_+."
    )
    return kept[:32] or "markdown"
|
|
|
|
|
|
def _normalize_newlines(text: str) -> str:
    """Convert CRLF and lone CR line endings to LF.

    Any falsy input (``None``, ``""``) yields an empty string.
    """
    if not text:
        return ""
    # CRLF must be collapsed first so it does not become two newlines.
    without_crlf = text.replace("\r\n", "\n")
    return without_crlf.replace("\r", "\n")
|
|
|
|
|
|
def _prepare_context(prefix: str, suffix: str) -> Tuple[str, str]:
    """Strip stray HTML line-break tags from the text around the cursor.

    Removes ``<br>`` and ``<br/>`` in any letter case, with optional
    whitespace inside the tag and an optional backslash before the closing
    ``>``. Such tags are treated as web-scraping or legacy artifacts.

    Args:
        prefix: Text before the cursor; ``None`` is treated as empty.
        suffix: Text after the cursor; ``None`` is treated as empty.

    Returns:
        ``(clean_prefix, clean_suffix)``.
    """

    def _drop_break_tags(text: str) -> str:
        return re.sub(r"<br\s*/?\s*\\?>", "", text or "", flags=re.IGNORECASE)

    return _drop_break_tags(prefix), _drop_break_tags(suffix)
|
|
|
|
|
|
# A markdown fence line: optional leading spaces/tabs, three backticks, then
# anything up to the end of the line.
FENCE_LINE_RE = re.compile(r"^[ \t]*```.*$")

# Same shape as FENCE_LINE_RE, capturing the info string after the backticks.
FENCE_INFO_RE = re.compile(r"^[ \t]*```[ \t]*(.*)$")

# Case-insensitive heuristic for "this text is about a Mermaid diagram":
# a mermaid fence opener, a diagram-type keyword, or a `graph <direction>`
# header.
# NOTE(review): keywords such as "pie", "journey" and "timeline" are ordinary
# English words, so this can also match plain prose; confirm that is acceptable.
MERMAID_CONTEXT_RE = re.compile(
    "|".join(
        (
            r"```[ \t]*mermaid\b",
            r"\b(flowchart|sequencediagram|classdiagram|statediagram(?:-v2)?|"
            r"erdiagram|journey|gantt|pie|mindmap|timeline|gitgraph|quadrantchart|xychart-beta)\b",
            r"\bgraph[ \t]+(TD|TB|BT|RL|LR)\b",
        )
    ),
    re.IGNORECASE,
)
|
|
|
|
|
|
def _cursor_in_fenced_code_block(prefix: str) -> bool:
    """Report whether the text before the cursor leaves a code fence open.

    This is a thin wrapper over ``_active_fence_language``: the cursor counts
    as inside a fenced block whenever that helper returns anything other
    than ``"none"``.
    """
    fence_language = _active_fence_language(prefix)
    return fence_language != "none"
|
|
|
|
|
|
def _active_fence_language(prefix: str) -> str:
    """Return the language of the code fence that is open at the cursor.

    The prefix is scanned line by line. Every line matching ``FENCE_LINE_RE``
    toggles the fence state: the first opens a block, the next closes it,
    and so on.

    Args:
        prefix: Text before the cursor; line endings are normalized first.

    Returns:
        - ``"none"``: the cursor is outside any fenced code block.
        - ``"unknown"``: the cursor is inside a fence with no usable
          language tag.
        - ``"<language>"``: the lowercased, sanitized first token of the
          opening fence's info string (at most 32 characters).
    """
    # None means "outside a fence"; a string is the open fence's language.
    open_language = None
    for line in _normalize_newlines(prefix).split("\n"):
        if not FENCE_LINE_RE.match(line):
            continue
        if open_language is not None:
            # This fence line closes the block that was open.
            open_language = None
            continue

        info_match = FENCE_INFO_RE.match(line)
        info = info_match.group(1).strip() if info_match else ""
        tokens = info.split()
        tag = ""
        if tokens:
            kept = "".join(
                ch for ch in tokens[0] if ch.isalnum() or ch in "-_+."
            )
            tag = kept[:32].lower()
        # "none" is reserved for "outside a fence". A block tagged literally
        # as none would otherwise be reported as not being a block at all.
        if not tag or tag == "none":
            tag = "unknown"
        open_language = tag

    return "none" if open_language is None else open_language
|
|
|
|
|
|
def _is_mermaid_context(prefix: str, suffix: str, cursor_fence_language: str) -> bool:
    """Decide whether the cursor is in Mermaid-diagram territory.

    Returns True at once when the active fence language is ``"mermaid"``.
    Otherwise ``MERMAID_CONTEXT_RE`` is searched in a window made of the last
    1200 characters of ``prefix`` and the first 400 characters of ``suffix``,
    joined by a newline. ``None`` inputs are treated as empty strings.
    """
    if cursor_fence_language == "mermaid":
        return True

    window = "\n".join(((prefix or "")[-1200:], (suffix or "")[:400]))
    return bool(MERMAID_CONTEXT_RE.search(window))
|
|
|
|
|
|
def prepare_prompt_context(prefix: str, suffix: str) -> Tuple[str, str]:
    """Public wrapper around ``_prepare_context``.

    Returns the ``(prefix, suffix)`` pair exactly as that helper produces it.
    """
    cleaned_pair = _prepare_context(prefix, suffix)
    return cleaned_pair
|
|
|
|
|
|
# Maps lowercase language aliases to the canonical language id; consulted by
# _canonical_language_id. Entries are grouped by family for readability only.
LANGUAGE_SYNONYMS = {
    # Prose and plain text
    "md": "markdown", "markdown": "markdown",
    "txt": "text", "text": "text", "plain": "text", "plaintext": "text",
    # Python / JavaScript / TypeScript
    "py": "python", "python": "python",
    "js": "javascript", "javascript": "javascript",
    "jsx": "javascript", "node": "javascript",
    "ts": "typescript", "tsx": "typescript", "typescript": "typescript",
    # Data and configuration formats
    "json": "json", "jsonc": "json", "json5": "json",
    "yaml": "yaml", "yml": "yaml",
    "toml": "toml",
    "ini": "ini", "cfg": "ini",
    # Shells
    "bash": "bash", "shell": "bash", "sh": "bash", "zsh": "bash", "fish": "bash",
    "ps": "powershell", "ps1": "powershell", "powershell": "powershell",
    # SQL dialects
    "sql": "sql", "postgres": "sql", "postgresql": "sql",
    "mysql": "sql", "sqlite": "sql",
    # Markup, styling, math and diagrams
    "html": "html",
    "xml": "xml", "svg": "xml",
    "css": "css", "scss": "css", "less": "css",
    "latex": "latex", "tex": "latex", "katex": "latex",
    "mermaid": "mermaid",
    # C family
    "c": "c", "c++": "cpp", "cpp": "cpp", "cxx": "cpp", "h": "c", "hpp": "cpp",
    "c#": "csharp", "cs": "csharp", "csharp": "csharp",
    # Other general-purpose languages
    "go": "go", "golang": "go",
    "rust": "rust", "rs": "rust",
    "java": "java",
    "kotlin": "kotlin",
    "swift": "swift",
    "ruby": "ruby", "rb": "ruby",
    "php": "php",
    "lua": "lua",
    "r": "r",
    "matlab": "matlab",
    "dart": "dart",
    # Build and tooling formats
    "docker": "dockerfile", "dockerfile": "dockerfile",
    "make": "makefile", "makefile": "makefile",
    "diff": "diff", "patch": "diff",
    "regex": "regex",
}
|
|
|
|
|
|
def _canonical_language_id(language_id: str) -> str:
    """Map an arbitrary language identifier to its canonical form.

    The trimmed, lowercased identifier is looked up in ``LANGUAGE_SYNONYMS``
    first, so aliases containing characters that sanitizing would strip
    (``"c#"``) still resolve. Otherwise the identifier is sanitized with
    ``_sanitize_language_id``, lowercased and looked up again.

    Returns:
        The canonical id from ``LANGUAGE_SYNONYMS``, or the sanitized
        lowercase identifier when no alias matches. Empty input resolves to
        ``"markdown"`` through the sanitizer's own default.
    """
    # Raw lookup first: sanitizing turns "c#" into "c", which would resolve
    # to the wrong language.
    raw_key = (language_id or "").strip().lower()
    if raw_key in LANGUAGE_SYNONYMS:
        return LANGUAGE_SYNONYMS[raw_key]

    safe = _sanitize_language_id(language_id).lower()
    return LANGUAGE_SYNONYMS.get(safe, safe)
|
|
|
|
|
|
# Canonical ids that use the "_js_code" guidance template.
_JS_LANGS = {"javascript", "typescript"}

# Canonical ids of the general-purpose programming languages known to this
# module.
_CODE_LANGS = {
    "python",
    "go",
    "rust",
    "java",
    "kotlin",
    "swift",
    "ruby",
    "php",
    "lua",
    "c",
    "cpp",
    "csharp",
    "r",
    "matlab",
    "dart",
}
|
|
|
|
|
|
def _language_guidance(language_id: str) -> str:
    """Return the extra system-prompt guidance for a language.

    Lookup order, using the map from ``get_language_guidance_map()``:

    1. ``"markdown"`` gets no extra guidance (empty string).
    2. A non-empty entry keyed by the canonical language id is returned as is.
    3. JavaScript/TypeScript use the ``"_js_code"`` template.
    4. Every other language uses the ``"_generic_code"`` template.

    For templates, ``{lang}`` is replaced by the canonical id. A missing
    template yields an empty string.
    """
    canonical = _canonical_language_id(language_id)
    if canonical == "markdown":
        return ""

    guidance_map = get_language_guidance_map()
    specific = guidance_map.get(canonical)
    if specific:
        return specific

    # Languages in _CODE_LANGS and languages unknown to this module share the
    # same generic template, so only the JS family needs a separate key.
    template_key = "_js_code" if canonical in _JS_LANGS else "_generic_code"
    return guidance_map.get(template_key, "").replace("{lang}", canonical)
|
|
|
|
|
|
def build_inline_system_prompt(language_id: str = "markdown") -> str:
    """Build the system prompt for inline completion in a given language.

    The template from ``get_system_prompt_template()`` has its
    ``{language_id}`` placeholder replaced by the canonical language id.
    Any language-specific guidance is appended on a new line.

    Returns:
        The assembled prompt with surrounding whitespace stripped.
    """
    canonical = _canonical_language_id(language_id)
    guidance = _language_guidance(canonical)
    prompt = get_system_prompt_template().replace("{language_id}", canonical)
    if guidance:
        prompt = prompt.rstrip() + "\n" + guidance.strip()
    return prompt.strip()
|
|
|
|
|
|
# Examples payload fetched once at import time; it is interpolated as-is into
# the "EXAMPLES BY CATEGORY" section of the user prompt built in
# build_completion_prompts.
_INLINE_EXAMPLES = get_inline_examples()
|
|
|
|
|
|
def build_completion_prompts(
    prefix: str,
    suffix: str,
    language_id: str = "markdown",
    location: str = "",
    thinking_level: str = "low",
    preferences: UserPreferences | None = None,
) -> Tuple[str, str]:
    """Build the system and user prompts for an inline completion request.

    Args:
        prefix: Document text before the cursor.
        suffix: Document text after the cursor.
        language_id: Editor language; canonicalized before use.
        location: Optional user location, added to the prompt when non-empty.
        thinking_level: Value reported as the reasoning level in the prompt.
        preferences: Optional user preferences. ``timezone`` selects the zone
            for the current-time line; ``language`` and ``currency`` are
            listed in the prompt unless empty or ``"auto"``.

    Returns:
        ``(system_prompt, user_prompt)``, both stripped of surrounding
        whitespace.
    """
    safe_language_id = _canonical_language_id(language_id)

    # Clean the cursor context: drop <br> artifacts, then unify line endings.
    cleaned_prefix, cleaned_suffix = _prepare_context(prefix, suffix)
    recent_prefix = _normalize_newlines(cleaned_prefix)
    recent_suffix = _normalize_newlines(cleaned_suffix)

    # State flags are rendered as the literal words "true" / "false".
    cursor_fence_language = _active_fence_language(recent_prefix)
    in_fence_flag = "false" if cursor_fence_language == "none" else "true"
    mermaid_flag = (
        "true"
        if _is_mermaid_context(recent_prefix, recent_suffix, cursor_fence_language)
        else "false"
    )
    prefix_newline_flag = "true" if recent_prefix.endswith("\n") else "false"
    suffix_newline_flag = "true" if recent_suffix.startswith("\n") else "false"

    if preferences:
        tz_pref = preferences.timezone
    else:
        tz_pref = "auto"
    current_time = _get_current_datetime(tz_pref)
    location_info = f"\nUser location: {location}" if location else ""

    # Only explicit (non-"auto") preferences are mentioned to the model.
    pref_lines = []
    if preferences:
        for label, value in (
            ("language", preferences.language),
            ("currency", preferences.currency),
        ):
            if value and value != "auto":
                pref_lines.append(f"Preferred {label}: {value}")
    preferences_instruction = ""
    if pref_lines:
        preferences_instruction = "\nUser Preferences:\n" + "\n".join(pref_lines)

    # NOTE(review): the template text below is reproduced exactly as it
    # appears in the reviewed source; confirm whether any lines were meant to
    # carry leading indentation.
    user_prompt = f"""Current time: {current_time}{location_info}{preferences_instruction}
Reasoning level: {thinking_level}
Editor language: {safe_language_id}

=== STATE FLAGS ===
- CURSOR_IN_FENCED_CODE_BLOCK: {in_fence_flag}
- CURSOR_FENCE_LANGUAGE: {cursor_fence_language}
- MERMAID_CONTEXT: {mermaid_flag}
- PREFIX_ENDS_WITH_NEWLINE: {prefix_newline_flag}
- SUFFIX_STARTS_WITH_NEWLINE: {suffix_newline_flag}

=== TASK ===
Produce the best insertion text between PREFIX and SUFFIX.
Requirements:
- Non-empty and meaningful
- Concise unless structure needs more
- Follows markdown rules in system prompt

=== BOUNDARY DECISION GUIDE ===

Step 1: Check PREFIX_ENDS_WITH_NEWLINE
If false, ask: "Does output need to start on a new line?"
- YES if PREFIX ends with: ":", "steps:", "items:", heading text, or complete sentence before heading
- If YES: start output with \\n

Step 2: Check SUFFIX_STARTS_WITH_NEWLINE
If false, ask: "Does output need to end with a newline?"
- YES if SUFFIX starts with: heading (##), new paragraph, or list marker
- If YES: end output with \\n

Step 3: Choose newline type
- Use \\n\\n for: new paragraphs, before headings, starting lists
- Use \\n for: continuing within blocks, list items, table cells
- Exception: inside code fences, use \\n freely

=== CONTEXT NOTES ===
- OCR metadata (e.g., <OCR:description>) is hidden context, never copy to output
- Match PREFIX tone, style, and indentation
- Do not repeat text from SUFFIX beginning

=== EXAMPLES BY CATEGORY ===
{_INLINE_EXAMPLES}

=== NOW COMPLETE THE TASK ===

<PREFIX>
{recent_prefix}
</PREFIX>

<SUFFIX>
{recent_suffix}
</SUFFIX>

Output:"""

    system_prompt = build_inline_system_prompt(safe_language_id)
    return system_prompt.strip(), user_prompt.strip()
|
|
|
|
|
|
def build_prompt(
    prefix: str,
    suffix: str,
    language_id: str = "markdown",
    location: str = "",
    thinking_level: str = "low",
    preferences: UserPreferences | None = None,
) -> str:
    """Backward-compatible helper that returns only the user prompt body.

    All arguments are forwarded unchanged to ``build_completion_prompts``;
    the system prompt it also produces is discarded.
    """
    prompts = build_completion_prompts(
        prefix=prefix,
        suffix=suffix,
        language_id=language_id,
        location=location,
        thinking_level=thinking_level,
        preferences=preferences,
    )
    return prompts[1]
|