Files
llm-in-text/backend/prompt.py
ydy0615 d452d1747e feat: add language synonym mapping and canonicalization
Add LANGUAGE_SYNONYMS dictionary to map language aliases to canonical IDs,
_canonical_language_id() to normalize language identifiers, and
_language_guidance() to provide language-specific instructions for LLM
code generation. This improves language detection and ensures consistent
prompt context across different language format variations.
2026-03-14 18:20:39 +08:00

618 lines
19 KiB
Python

from datetime import datetime, timedelta, timezone
import re
from typing import Tuple
def _get_current_datetime(timezone_pref: str = "auto") -> str:
    """Return the current time as ``YYYY-MM-DD Weekday HH:MM:SS (label)``.

    Args:
        timezone_pref: Requested timezone. Supported forms:
            - ``"auto"``, empty or ``None``: UTC+8 (the module default).
            - A signed offset anywhere in the string, optionally with
              minutes after a colon: ``"UTC+8"``, ``"GMT-5"``, ``"UTC+5:30"``.
            - Bare ``"UTC"``, ``"GMT"`` or ``"Z"``: zero offset.
            - Any other name is looked up as an IANA zone key when the
              ``zoneinfo`` module and its data are available.

    Returns:
        The formatted timestamp. The parenthesised label echoes
        ``timezone_pref`` when it was honoured; otherwise the label is
        ``UTC+8`` so that the label always matches the time that is shown.
    """
    tz = timezone(timedelta(hours=8))
    label = " (UTC+8)"

    # Anything that is not a string is treated like "auto" instead of crashing.
    pref = timezone_pref.strip() if isinstance(timezone_pref, str) else ""
    if pref and pref.lower() != "auto":
        resolved = None
        if pref.upper() in ("UTC", "GMT", "Z"):
            resolved = timezone.utc
        else:
            match = re.search(r"([+-])(\d+)(?::(\d+))?", pref)
            if match:
                hours = int(match.group(2))
                minutes = int(match.group(3) or 0)
                # datetime.timezone rejects offsets of 24h or more, so an
                # out-of-range request falls back to the default zone.
                if hours < 24 and minutes < 60:
                    delta = timedelta(hours=hours, minutes=minutes)
                    resolved = timezone(-delta if match.group(1) == "-" else delta)
            else:
                try:
                    # Imported lazily: zoneinfo needs Python 3.9+ and tz data.
                    from zoneinfo import ZoneInfo

                    resolved = ZoneInfo(pref)
                except (ImportError, KeyError, ValueError, OSError):
                    resolved = None
        if resolved is not None:
            tz = resolved
            label = f" ({timezone_pref})"

    now = datetime.now(tz)
    # Fixed English names keep the output independent of the process locale.
    weekdays = (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    )
    weekday = weekdays[now.weekday()]
    return (
        f"{now.year}-{now.month:02d}-{now.day:02d} "
        f"{weekday} {now.hour:02d}:{now.minute:02d}:{now.second:02d}{label}"
    )
def _sanitize_language_id(language_id: str) -> str:
    """Reduce a language identifier to a short, prompt-safe token.

    Surrounding whitespace is dropped, every character that is neither
    alphanumeric nor one of ``-``, ``_``, ``+``, ``.`` is removed, and the
    result is capped at 32 characters. Case is preserved.

    Returns:
        The cleaned identifier, or ``"markdown"`` when the input is empty
        or nothing survives the filtering.
    """
    fallback = "markdown"
    if not language_id:
        return fallback
    kept = "".join(
        char
        for char in language_id.strip()
        if char.isalnum() or char in "-_+."
    )
    return kept[:32] or fallback
def _normalize_newlines(text: str) -> str:
    """Convert CRLF and lone CR line endings to LF.

    A falsy ``text`` (``None`` or ``""``) yields an empty string.
    """
    if not text:
        return ""
    # CRLF pairs must be collapsed first so they do not become two newlines.
    without_crlf = text.replace("\r\n", "\n")
    return without_crlf.replace("\r", "\n")
def _prepare_context(prefix: str, suffix: str) -> Tuple[str, str]:
    r"""Remove stray HTML line-break tags from the text around the cursor.

    Matches ``<br>``, ``<br/>`` and ``<br\>`` in any letter case, with
    optional whitespace inside the tag, and deletes every occurrence.

    Returns:
        ``(clean_prefix, clean_suffix)``; a falsy input becomes ``""``.
    """
    drop_breaks = re.compile(r"<br\s*/?\s*\\?>", re.IGNORECASE).sub
    return drop_breaks("", prefix or ""), drop_breaks("", suffix or "")
# Matches a whole line that acts as a Markdown code fence: optional leading
# spaces/tabs, three backticks, then anything up to the end of the line.
FENCE_LINE_RE = re.compile(r"^[ \t]*```.*$")
# Same line shape as FENCE_LINE_RE, but group 1 captures whatever follows the
# backticks (after optional spaces/tabs).
FENCE_INFO_RE = re.compile(r"^[ \t]*```[ \t]*(.*)$")
# Case-insensitive heuristic for "this text is about a Mermaid diagram":
#   - an opening ```mermaid fence,
#   - a standalone Mermaid diagram keyword (flowchart, gantt, pie, ...),
#   - or "graph" followed by a direction code (TD/TB/BT/RL/LR).
# NOTE(review): several keywords (pie, journey, timeline) are also ordinary
# English words, so prose can trigger a match - confirm this is acceptable.
MERMAID_CONTEXT_RE = re.compile(
    r"```[ \t]*mermaid\b|"
    r"\b(flowchart|sequencediagram|classdiagram|statediagram(?:-v2)?|"
    r"erdiagram|journey|gantt|pie|mindmap|timeline|gitgraph|quadrantchart|xychart-beta)\b|"
    r"\bgraph[ \t]+(TD|TB|BT|RL|LR)\b",
    re.IGNORECASE,
)
def _cursor_in_fenced_code_block(prefix: str) -> bool:
    """Tell whether the text before the cursor ends inside an open code fence.

    The answer comes from ``_active_fence_language``: every result other
    than ``"none"`` means a fence is currently open.
    """
    fence_language = _active_fence_language(prefix)
    return fence_language != "none"
def _active_fence_language(prefix: str) -> str:
    """Work out which fenced code block, if any, the cursor sits in.

    Scans ``prefix`` line by line and flips an open/closed state on every
    line matched by ``FENCE_LINE_RE``.

    Returns:
        - ``"none"`` when the cursor is outside any fence,
        - ``"unknown"`` when the open fence carries no usable language tag,
        - otherwise the lower-cased tag, limited to alphanumerics and
          ``-_+.`` and capped at 32 characters.
    """
    fence_open = False
    language = "none"
    for raw_line in _normalize_newlines(prefix).split("\n"):
        if not FENCE_LINE_RE.match(raw_line):
            continue
        if fence_open:
            # A second fence line closes the block that was open.
            fence_open, language = False, "none"
            continue
        info_match = FENCE_INFO_RE.match(raw_line)
        info = info_match.group(1).strip() if info_match else ""
        if info:
            # Only the first word of the info string names the language.
            tag = "".join(
                char
                for char in info.split()[0].strip()
                if char.isalnum() or char in "-_+."
            )
            language = tag[:32].lower() or "unknown"
        else:
            language = "unknown"
        fence_open = True
    return language if fence_open else "none"
def _is_mermaid_context(prefix: str, suffix: str, cursor_fence_language: str) -> bool:
    """Decide whether the completion is likely about a Mermaid diagram.

    True when the cursor is inside a ``mermaid`` fence, or when
    ``MERMAID_CONTEXT_RE`` matches a window made of the last 1200 characters
    of ``prefix`` and the first 400 characters of ``suffix``.
    """
    if cursor_fence_language == "mermaid":
        return True
    window = "\n".join(((prefix or "")[-1200:], (suffix or "")[:400]))
    return bool(MERMAID_CONTEXT_RE.search(window))
def prepare_prompt_context(prefix: str, suffix: str) -> Tuple[str, str]:
    """Public wrapper that returns the cleaned ``(prefix, suffix)`` pair.

    All work is delegated to ``_prepare_context``.
    """
    cleaned_prefix, cleaned_suffix = _prepare_context(prefix, suffix)
    return cleaned_prefix, cleaned_suffix
# Maps lower-case language aliases (file extensions, fence tags, editor ids)
# to the canonical identifier used in prompts. Every canonical value is also
# present as a key, so canonicalizing an already-canonical id is a no-op.
# NOTE(review): keys containing characters outside the sanitizer's allowed
# set (for example "c#") can only match when the lookup is performed on the
# un-sanitized identifier.
LANGUAGE_SYNONYMS = {
    # Prose formats
    "md": "markdown",
    "markdown": "markdown",
    "txt": "text",
    "text": "text",
    "plain": "text",
    "plaintext": "text",
    # Scripting / web languages
    "py": "python",
    "python": "python",
    "js": "javascript",
    "javascript": "javascript",
    "jsx": "javascript",
    "node": "javascript",
    "ts": "typescript",
    "tsx": "typescript",
    "typescript": "typescript",
    # Data / configuration formats
    "json": "json",
    "jsonc": "json",
    "json5": "json",
    "yaml": "yaml",
    "yml": "yaml",
    "toml": "toml",
    "ini": "ini",
    "cfg": "ini",
    # Shells
    "bash": "bash",
    "shell": "bash",
    "sh": "bash",
    "zsh": "bash",
    "fish": "bash",
    "ps": "powershell",
    "ps1": "powershell",
    "powershell": "powershell",
    # SQL dialects
    "sql": "sql",
    "postgres": "sql",
    "postgresql": "sql",
    "mysql": "sql",
    "sqlite": "sql",
    # Markup and styling
    "html": "html",
    "xml": "xml",
    "svg": "xml",
    "css": "css",
    "scss": "css",
    "less": "css",
    # Math and diagrams
    "latex": "latex",
    "tex": "latex",
    "katex": "latex",
    "mermaid": "mermaid",
    # Compiled / general-purpose languages
    "c": "c",
    "c++": "cpp",
    "cpp": "cpp",
    "cxx": "cpp",
    "h": "c",
    "hpp": "cpp",
    "c#": "csharp",
    "cs": "csharp",
    "csharp": "csharp",
    "go": "go",
    "golang": "go",
    "rust": "rust",
    "rs": "rust",
    "java": "java",
    "kotlin": "kotlin",
    "swift": "swift",
    "ruby": "ruby",
    "rb": "ruby",
    "php": "php",
    "lua": "lua",
    "r": "r",
    "matlab": "matlab",
    "dart": "dart",
    # Build tooling and miscellaneous
    "docker": "dockerfile",
    "dockerfile": "dockerfile",
    "make": "makefile",
    "makefile": "makefile",
    "diff": "diff",
    "patch": "diff",
    "regex": "regex",
}
def _canonical_language_id(language_id: str) -> str:
    """Map a language identifier or alias to its canonical id.

    The trimmed, lower-cased input is first looked up in
    ``LANGUAGE_SYNONYMS`` as-is. This lets aliases containing characters
    the sanitizer removes (such as ``"c#"``) resolve correctly; looking up
    only the sanitized form would turn ``"c#"`` into ``"c"``. The raw
    lookup is safe because it can only return values from the table.

    Identifiers with no direct match are sanitized, lower-cased and looked
    up again. Unknown identifiers are returned in their sanitized form.

    Returns:
        The canonical id. Empty or missing input yields ``"markdown"``,
        which is what ``_sanitize_language_id`` returns for such input.
    """
    if isinstance(language_id, str):
        raw = language_id.strip().lower()
        if raw in LANGUAGE_SYNONYMS:
            return LANGUAGE_SYNONYMS[raw]
    safe = _sanitize_language_id(language_id).lower()
    return LANGUAGE_SYNONYMS.get(safe, safe)
# Guidance text for languages whose instructions are fixed strings, keyed by
# canonical language id. Each entry starts with a newline.
_STATIC_LANGUAGE_GUIDANCE = {
    "mermaid": (
        "\nLanguage-specific guidance (mermaid):\n"
        "- Output valid Mermaid syntax only.\n"
        "- Prefer concise, syntactically correct diagram statements.\n"
        "- Avoid prose unless the user prompt explicitly requires it."
    ),
    "latex": (
        "\nLanguage-specific guidance (latex):\n"
        "- Output LaTeX math content only when completing LaTeX.\n"
        "- If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex:\n"
        "- Output raw LaTeX lines only.\n"
        "- Do not wrap with $ or $$."
    ),
    "json": (
        "\nLanguage-specific guidance (json):\n"
        "- Output strict JSON only (no comments, no trailing commas).\n"
        "- Ensure valid quotes and braces."
    ),
    "yaml": (
        "\nLanguage-specific guidance (yaml):\n"
        "- Output valid YAML only.\n"
        "- Use consistent indentation and avoid tabs."
    ),
    "toml": (
        "\nLanguage-specific guidance (toml):\n"
        "- Output valid TOML only.\n"
        "- Keep key types consistent."
    ),
    "ini": (
        "\nLanguage-specific guidance (ini):\n"
        "- Output valid INI only.\n"
        "- Keep section headers and key=value pairs consistent."
    ),
    "sql": (
        "\nLanguage-specific guidance (sql):\n"
        "- Output a single, valid SQL statement unless context requires multiple.\n"
        "- Prefer ANSI SQL when dialect is unclear."
    ),
    "bash": (
        "\nLanguage-specific guidance (bash):\n"
        "- Output POSIX-compatible shell when possible.\n"
        "- Avoid interactive prompts or destructive commands unless requested."
    ),
    "powershell": (
        "\nLanguage-specific guidance (powershell):\n"
        "- Output valid PowerShell commands.\n"
        "- Avoid destructive commands unless explicitly requested."
    ),
    "html": (
        "\nLanguage-specific guidance (html):\n"
        "- Output valid HTML only.\n"
        "- Keep markup minimal and well-formed."
    ),
    "css": (
        "\nLanguage-specific guidance (css):\n"
        "- Output valid CSS only.\n"
        "- Use concise, readable selectors."
    ),
    "diff": (
        "\nLanguage-specific guidance (diff):\n"
        "- Output a unified diff only.\n"
        "- Ensure @@ hunk headers and +/- lines are consistent."
    ),
    "regex": (
        "\nLanguage-specific guidance (regex):\n"
        "- Output the regex pattern only.\n"
        "- Avoid delimiters unless explicitly requested."
    ),
    "text": (
        "\nLanguage-specific guidance (text):\n"
        "- Output plain text only.\n"
        "- Avoid markdown formatting unless explicitly asked."
    ),
    "xml": (
        "\nLanguage-specific guidance (xml):\n"
        "- Output well-formed XML only.\n"
        "- Ensure matching tags and proper escaping."
    ),
    "dockerfile": (
        "\nLanguage-specific guidance (dockerfile):\n"
        "- Output valid Dockerfile instructions only.\n"
        "- Keep layers minimal and ordered logically."
    ),
    "makefile": (
        "\nLanguage-specific guidance (makefile):\n"
        "- Output valid Makefile syntax only.\n"
        "- Use tabs for recipe lines."
    ),
}


def _language_guidance(language_id: str) -> str:
    """Return the extra prompt instructions for a language.

    The identifier is canonicalized first. Markdown gets no extra guidance
    (empty string). Languages with fixed wording come from a lookup table.
    JavaScript and TypeScript share one template, and every remaining
    language, known or not, receives the generic "valid code" template.
    """
    canonical = _canonical_language_id(language_id)
    if canonical == "markdown":
        return ""

    fixed_text = _STATIC_LANGUAGE_GUIDANCE.get(canonical)
    if fixed_text is not None:
        return fixed_text

    header = f"\nLanguage-specific guidance ({canonical}):\n- Output valid {canonical} code.\n"
    if canonical in ("javascript", "typescript"):
        return header + "- Prefer modern syntax and avoid prose unless comments are needed."
    return header + "- Avoid prose unless context clearly expects comments or docstrings."
def build_inline_system_prompt(language_id: str = "markdown") -> str:
    """Build the system prompt for the inline (ghost-text) completion model.

    Args:
        language_id: Editor language identifier or alias; it is
            canonicalized before use.

    Returns:
        The fixed rule list with the canonical language id interpolated
        into its first sentence, followed by the language-specific guidance
        when there is any. The result has no leading or trailing whitespace.
    """
    safe_language_id = _canonical_language_id(language_id)
    language_guidance = _language_guidance(safe_language_id)
    # The only interpolated value below is {safe_language_id}. The doubled
    # braces in ```{{language}} are an f-string escape and emit the literal
    # text "{language}" as a placeholder for the model.
    system_prompt = f"""You are an inline completion engine for a {safe_language_id} editor with ghost-text suggestions.
Return only the insertion text that should be placed between PREFIX and SUFFIX.
Hard constraints you must follow:
1) Output-only contract:
- Output insertion text only.
- No explanations, no meta labels, no wrapper quotes around the whole answer.
2) Strict math formatting (KaTeX):
- If you output any math expression, it must be strict KaTeX-compatible math.
- Every formula must be wrapped with either $...$ (inline) or $$...$$ (block).
- Never output bare formulas without $ or $$ wrappers.
- Exception: If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex,
output raw LaTeX without $ or $$ wrappers.
3) Strict code formatting:
- Read CURSOR_IN_FENCED_CODE_BLOCK from the user prompt.
- If CURSOR_IN_FENCED_CODE_BLOCK=true:
- You are already inside a fenced code block.
- Never output triple backticks.
- Output code lines only.
- If CURSOR_IN_FENCED_CODE_BLOCK=false:
- Any code output must be in a fenced code block with a language tag:
```{{language}}
...
```
- Do not output code snippets as inline backticks.
- Choose the language tag from context (no default fallback tag instruction).
4) Mermaid-specific completion rules:
- Read CURSOR_FENCE_LANGUAGE and MERMAID_CONTEXT from the user prompt.
- If CURSOR_FENCE_LANGUAGE=mermaid:
- Output Mermaid statements only.
- Never output triple backticks.
- Never output prose explanations.
- If CURSOR_IN_FENCED_CODE_BLOCK=false and MERMAID_CONTEXT=true:
- Output a complete Mermaid fenced block:
```mermaid
...
```
- Keep Mermaid syntax valid and concise.
- Never mix Mermaid code and explanatory narration in one output.
5) Boundary newline repair:
- Read PREFIX_ENDS_WITH_NEWLINE and SUFFIX_STARTS_WITH_NEWLINE from the user prompt.
- Carefully reason about whether OUTPUT should start or end with a newline.
- If PREFIX lacks a required boundary newline, add it at OUTPUT start.
- If SUFFIX lacks a required boundary newline, add it at OUTPUT end.
- Ensure PREFIX + OUTPUT + SUFFIX is structurally natural.
6) Context stitching:
- Do not repeat text that already appears at the start of SUFFIX.
- Preserve nearby language, tone, punctuation, indentation, and markdown structure.
- Continue existing structures naturally (lists, tables, block quotes, headings).
7) OCR safety:
- PREFIX may include hidden OCR metadata tags like <OCR:...>.
- Never output any OCR tag.
- Never output OCR tag fragments such as <OCR:...>."""
    if language_guidance:
        # Trim both pieces so exactly one newline separates the rule list
        # from the appended guidance.
        system_prompt = f"{system_prompt.rstrip()}\n{language_guidance.strip()}"
    return system_prompt.strip()
# Few-shot examples embedded verbatim in the user prompt. Each example has an
# "[EXnn] title" line, optional state flags, the <PREFIX>/<SUFFIX> pair and the
# expected insertion text.
# NOTE(review): whitespace inside this literal is part of the prompt. In this
# view EX09's "Expected OUTPUT:" is followed directly by the EX10 header -
# presumably the intended output is a lone newline; confirm against the file.
INLINE_EXAMPLES = """[EX01] Prose continuation
<PREFIX>The quick brown fox </PREFIX>
<SUFFIX>jumps over the lazy dog.</SUFFIX>
Expected OUTPUT:
moved quietly and then
[EX02] Avoid repeating suffix beginning
<PREFIX>Our launch plan starts with </PREFIX>
<SUFFIX>phase one, followed by phase two.</SUFFIX>
Expected OUTPUT:
careful internal testing before
[EX03] Continue markdown checklist
<PREFIX>## TODO
- [ ] Buy milk
- [ ] </PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:
Write release notes and share draft with team
[EX04] Cursor outside code block, code must use fenced block
CURSOR_IN_FENCED_CODE_BLOCK=false
<PREFIX>Parse this JSON payload in Python:</PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:
```python
import json
data = json.loads(payload)
```
[EX05] Cursor inside fenced code block, do not output fences
CURSOR_IN_FENCED_CODE_BLOCK=true
<PREFIX>```python
def add(a, b):
return </PREFIX>
<SUFFIX>
```</SUFFIX>
Expected OUTPUT:
a + b
[EX06] Inline math must use $...$
<PREFIX>The derivative of x^2 is </PREFIX>
<SUFFIX>.</SUFFIX>
Expected OUTPUT:
$2x$
[EX07] Block math must use $$...$$
<PREFIX>We can write the Gaussian integral as:</PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:
$$
\\int_{-\\infty}^{\\infty} e^{-x^2}\\,dx = \\sqrt{\\pi}
$$
[EX08] Prefix misses boundary newline; add newline at output start
PREFIX_ENDS_WITH_NEWLINE=false
<PREFIX>Deployment steps:</PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:
- Build artifact
- Deploy service
[EX09] Suffix misses boundary newline; add newline at output end
SUFFIX_STARTS_WITH_NEWLINE=false
<PREFIX>Summary paragraph complete.</PREFIX>
<SUFFIX>## Next Section</SUFFIX>
Expected OUTPUT:
[EX10] OCR metadata exists but must never be emitted
<PREFIX>![whiteboard](img.png) <OCR:equation y = mx + b>
The relationship is </PREFIX>
<SUFFIX>.</SUFFIX>
Expected OUTPUT:
$y = mx + b$
[EX11] Continue markdown table with correct row shape
<PREFIX>| Name | Score |
| --- | --- |
| Alice | 92 |
| Bob | </PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:
88 |
[EX12] Mixed text + math + code in one insertion
CURSOR_IN_FENCED_CODE_BLOCK=false
<PREFIX>Use the area formula and provide a tiny JS helper.</PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:
The area is $A = \\pi r^2$.
```javascript
const area = (r) => Math.PI * r * r;
```
[EX13] Cursor inside mermaid fence: no backticks, mermaid lines only
CURSOR_IN_FENCED_CODE_BLOCK=true
CURSOR_FENCE_LANGUAGE=mermaid
<PREFIX>```mermaid
flowchart TD
A[Start] --> </PREFIX>
<SUFFIX>
```</SUFFIX>
Expected OUTPUT:
B{Valid?}
B -->|Yes| C[Done]
[EX14] Mermaid context outside fence: return full mermaid block
CURSOR_IN_FENCED_CODE_BLOCK=false
MERMAID_CONTEXT=true
<PREFIX>Please provide a simple release pipeline diagram.</PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:
```mermaid
flowchart LR
Build --> Test --> Deploy
```"""
def build_completion_prompts(
    prefix: str,
    suffix: str,
    language_id: str = "markdown",
    location: str = "",
    thinking_level: str = "low",
    preferences: object = None,
) -> Tuple[str, str]:
    """Build the ``(system_prompt, user_prompt)`` pair for one completion.

    Args:
        prefix: Document text before the cursor.
        suffix: Document text after the cursor.
        language_id: Editor language identifier or alias.
        location: Optional free-text user location added to the prompt.
        thinking_level: Reasoning hint passed through to the prompt.
        preferences: Optional user preferences. May be an object exposing
            ``timezone``, ``language`` and ``currency`` attributes, or a
            dict with those keys. Missing fields are treated as unset
            rather than raising ``AttributeError``.

    Returns:
        The system prompt and the user prompt, both stripped of leading and
        trailing whitespace.
    """

    def _pref(name: str):
        """Read one preference field; ``None`` when absent or unset."""
        if not preferences:
            return None
        if isinstance(preferences, dict):
            return preferences.get(name)
        return getattr(preferences, name, None)

    safe_language_id = _canonical_language_id(language_id)

    # Clean and normalize both sides before deriving any state flag, so the
    # flags describe exactly the text that is embedded in the prompt.
    recent_prefix, recent_suffix = _prepare_context(prefix, suffix)
    recent_prefix = _normalize_newlines(recent_prefix)
    recent_suffix = _normalize_newlines(recent_suffix)

    cursor_fence_language = _active_fence_language(recent_prefix)
    cursor_in_fenced_code_block = cursor_fence_language != "none"
    mermaid_context = _is_mermaid_context(
        recent_prefix, recent_suffix, cursor_fence_language
    )
    prefix_ends_with_newline = recent_prefix.endswith("\n")
    suffix_starts_with_newline = recent_suffix.startswith("\n")

    current_time = _get_current_datetime(_pref("timezone") or "auto")
    location_info = f"\nUser location: {location}" if location else ""

    # Only explicit, non-"auto" preferences are surfaced to the model.
    pref_info = []
    preferred_language = _pref("language")
    if preferred_language and preferred_language != "auto":
        pref_info.append(f"Preferred language: {preferred_language}")
    preferred_currency = _pref("currency")
    if preferred_currency and preferred_currency != "auto":
        pref_info.append(f"Preferred currency: {preferred_currency}")
    preferences_instruction = "\n".join(pref_info)
    if preferences_instruction:
        preferences_instruction = f"\nUser Preferences:\n{preferences_instruction}"

    user_prompt = f"""Current time: {current_time}{location_info}{preferences_instruction}
Reasoning hint: {thinking_level}
Editor language id: {safe_language_id}
Completion state flags:
- CURSOR_IN_FENCED_CODE_BLOCK: {"true" if cursor_in_fenced_code_block else "false"}
- CURSOR_FENCE_LANGUAGE: {cursor_fence_language}
- MERMAID_CONTEXT: {"true" if mermaid_context else "false"}
- PREFIX_ENDS_WITH_NEWLINE: {"true" if prefix_ends_with_newline else "false"}
- SUFFIX_STARTS_WITH_NEWLINE: {"true" if suffix_starts_with_newline else "false"}
Task:
- Produce the best insertion text at the cursor between PREFIX and SUFFIX.
- Keep insertion meaningful and non-empty.
- Keep insertion concise unless structure requires more content.
Context notes:
- PREFIX may include OCR metadata after image markdown, e.g. ![alt](url) <OCR:description>.
- OCR metadata is hidden context and must never be copied into output.
- Preserve local style and formatting.
Decision policy:
- Prioritize seamless join: PREFIX + OUTPUT + SUFFIX must read naturally.
- Do not repeat SUFFIX-leading text.
- If uncertain, prefer a complete short phrase/sentence with clear meaning.
Comprehensive examples:
{INLINE_EXAMPLES}
Now produce the insertion.
<PREFIX>
{recent_prefix}
</PREFIX>
<SUFFIX>
{recent_suffix}
</SUFFIX>
Output:"""
    system_prompt = build_inline_system_prompt(safe_language_id)
    return system_prompt.strip(), user_prompt.strip()
def build_prompt(
    prefix: str,
    suffix: str,
    language_id: str = "markdown",
    location: str = "",
    thinking_level: str = "low",
    preferences: object = None,
) -> str:
    """Legacy entry point that yields just the user prompt.

    Forwards every argument to ``build_completion_prompts`` and discards
    the system prompt from the returned pair.
    """
    prompts = build_completion_prompts(
        prefix=prefix,
        suffix=suffix,
        language_id=language_id,
        location=location,
        thinking_level=thinking_level,
        preferences=preferences,
    )
    return prompts[1]