from datetime import datetime, timedelta, timezone import re from typing import Tuple def _get_current_datetime(timezone_pref: str = "auto") -> str: # Default to UTC+8 if auto or not specified. offset = 8 tz_info = " (UTC+8)" if timezone_pref and timezone_pref != "auto": # Parse values like "UTC+8" or "GMT-5". match = re.search(r"([+-])(\d+)", timezone_pref) if match: sign = match.group(1) hours = int(match.group(2)) offset = hours if sign == "+" else -hours tz_info = f" ({timezone_pref})" else: tz_info = f" ({timezone_pref})" now = datetime.now(timezone(timedelta(hours=offset))) weekdays = [ "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", ] weekday = weekdays[now.weekday()] return ( f"{now.year}-{now.month:02d}-{now.day:02d} " f"{weekday} {now.hour:02d}:{now.minute:02d}:{now.second:02d}{tz_info}" ) def _sanitize_language_id(language_id: str) -> str: if not language_id: return "markdown" allowed = [] for ch in language_id.strip(): if ch.isalnum() or ch in "-_+.": allowed.append(ch) value = "".join(allowed)[:32] return value or "markdown" def _normalize_newlines(text: str) -> str: return (text or "").replace("\r\n", "\n").replace("\r", "\n") def _prepare_context(prefix: str, suffix: str) -> Tuple[str, str]: """ Prepare prefix/suffix for model completion context. Filter out potential web-scraping or legacy artifacts like
,
, . """ br_pattern = re.compile(r"", re.IGNORECASE) clean_prefix = br_pattern.sub("", prefix or "") clean_suffix = br_pattern.sub("", suffix or "") return clean_prefix, clean_suffix FENCE_LINE_RE = re.compile(r"^[ \t]*```.*$") FENCE_INFO_RE = re.compile(r"^[ \t]*```[ \t]*(.*)$") MERMAID_CONTEXT_RE = re.compile( r"```[ \t]*mermaid\b|" r"\b(flowchart|sequencediagram|classdiagram|statediagram(?:-v2)?|" r"erdiagram|journey|gantt|pie|mindmap|timeline|gitgraph|quadrantchart|xychart-beta)\b|" r"\bgraph[ \t]+(TD|TB|BT|RL|LR)\b", re.IGNORECASE, ) def _cursor_in_fenced_code_block(prefix: str) -> bool: """ Determine whether the cursor is currently inside a fenced code block. The state is computed by toggling on each markdown fence line that matches: ^[ \t]*```.*$ """ return _active_fence_language(prefix) != "none" def _active_fence_language(prefix: str) -> str: """ Return active fence language at cursor based on prefix. - "none": cursor is outside fenced code block - "unknown": cursor is inside a fence without language tag - "": cursor is inside a fenced block with language tag """ normalized = _normalize_newlines(prefix) in_fence = False active_language = "none" for line in normalized.split("\n"): if FENCE_LINE_RE.match(line): if in_fence: in_fence = False active_language = "none" else: info_match = FENCE_INFO_RE.match(line) info = info_match.group(1).strip() if info_match else "" if not info: active_language = "unknown" else: first_token = info.split()[0] lang_chars = [] for ch in first_token.strip(): if ch.isalnum() or ch in "-_+.": lang_chars.append(ch) active_language = "".join(lang_chars)[:32].lower() or "unknown" in_fence = True return active_language if in_fence else "none" def _is_mermaid_context(prefix: str, suffix: str, cursor_fence_language: str) -> bool: if cursor_fence_language == "mermaid": return True prefix_tail = (prefix or "")[-1200:] suffix_head = (suffix or "")[:400] combined = f"{prefix_tail}\n{suffix_head}" return MERMAID_CONTEXT_RE.search(combined) is not None def prepare_prompt_context(prefix: str, suffix: str) -> Tuple[str, str]: return _prepare_context(prefix, suffix) LANGUAGE_SYNONYMS = { "md": "markdown", "markdown": "markdown", "txt": "text", "text": "text", "plain": "text", "plaintext": "text", "py": "python", "python": "python", "js": "javascript", "javascript": "javascript", "jsx": "javascript", "node": "javascript", "ts": "typescript", "tsx": "typescript", "typescript": "typescript", "json": "json", "jsonc": "json", "json5": "json", "yaml": "yaml", "yml": "yaml", "toml": "toml", "ini": "ini", "cfg": "ini", "bash": "bash", "shell": "bash", "sh": "bash", "zsh": "bash", "fish": "bash", "ps": "powershell", "ps1": "powershell", "powershell": "powershell", "sql": "sql", "postgres": "sql", "postgresql": "sql", "mysql": "sql", "sqlite": "sql", "html": "html", "xml": "xml", "svg": "xml", "css": "css", "scss": "css", "less": "css", "latex": "latex", "tex": "latex", "katex": "latex", "mermaid": "mermaid", "c": "c", "c++": "cpp", "cpp": "cpp", "cxx": "cpp", "h": "c", "hpp": "cpp", "c#": "csharp", "cs": "csharp", "csharp": "csharp", "go": "go", "golang": "go", "rust": "rust", "rs": "rust", "java": "java", "kotlin": "kotlin", "swift": "swift", "ruby": "ruby", "rb": "ruby", "php": "php", "lua": "lua", "r": "r", "matlab": "matlab", "dart": "dart", "docker": "dockerfile", "dockerfile": "dockerfile", "make": "makefile", "makefile": "makefile", "diff": "diff", "patch": "diff", "regex": "regex", } def _canonical_language_id(language_id: str) -> str: safe = _sanitize_language_id(language_id).lower() if not safe: return "markdown" return LANGUAGE_SYNONYMS.get(safe, safe) def _language_guidance(language_id: str) -> str: canonical = _canonical_language_id(language_id) if canonical == "markdown": return "" if canonical == "mermaid": return """ Language-specific guidance (mermaid): - Output valid Mermaid syntax only. - Prefer concise, syntactically correct diagram statements. - Avoid prose unless the user prompt explicitly requires it.""" if canonical == "latex": return """ Language-specific guidance (latex): - Output LaTeX math content only when completing LaTeX. - If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex: - Output raw LaTeX lines only. - Do not wrap with $ or $$.""" if canonical == "json": return """ Language-specific guidance (json): - Output strict JSON only (no comments, no trailing commas). - Ensure valid quotes and braces.""" if canonical == "yaml": return """ Language-specific guidance (yaml): - Output valid YAML only. - Use consistent indentation and avoid tabs.""" if canonical == "toml": return """ Language-specific guidance (toml): - Output valid TOML only. - Keep key types consistent.""" if canonical == "ini": return """ Language-specific guidance (ini): - Output valid INI only. - Keep section headers and key=value pairs consistent.""" if canonical == "sql": return """ Language-specific guidance (sql): - Output a single, valid SQL statement unless context requires multiple. - Prefer ANSI SQL when dialect is unclear.""" if canonical == "bash": return """ Language-specific guidance (bash): - Output POSIX-compatible shell when possible. - Avoid interactive prompts or destructive commands unless requested.""" if canonical == "powershell": return """ Language-specific guidance (powershell): - Output valid PowerShell commands. - Avoid destructive commands unless explicitly requested.""" if canonical == "html": return """ Language-specific guidance (html): - Output valid HTML only. - Keep markup minimal and well-formed.""" if canonical == "css": return """ Language-specific guidance (css): - Output valid CSS only. - Use concise, readable selectors.""" if canonical == "diff": return """ Language-specific guidance (diff): - Output a unified diff only. - Ensure @@ hunk headers and +/- lines are consistent.""" if canonical == "regex": return """ Language-specific guidance (regex): - Output the regex pattern only. - Avoid delimiters unless explicitly requested.""" if canonical in {"javascript", "typescript"}: return f""" Language-specific guidance ({canonical}): - Output valid {canonical} code. - Prefer modern syntax and avoid prose unless comments are needed.""" if canonical in {"python", "go", "rust", "java", "kotlin", "swift", "ruby", "php", "lua", "c", "cpp", "csharp", "r", "matlab", "dart"}: return f""" Language-specific guidance ({canonical}): - Output valid {canonical} code. - Avoid prose unless context clearly expects comments or docstrings.""" if canonical == "text": return """ Language-specific guidance (text): - Output plain text only. - Avoid markdown formatting unless explicitly asked.""" if canonical == "xml": return """ Language-specific guidance (xml): - Output well-formed XML only. - Ensure matching tags and proper escaping.""" if canonical == "dockerfile": return """ Language-specific guidance (dockerfile): - Output valid Dockerfile instructions only. - Keep layers minimal and ordered logically.""" if canonical == "makefile": return """ Language-specific guidance (makefile): - Output valid Makefile syntax only. - Use tabs for recipe lines.""" return f""" Language-specific guidance ({canonical}): - Output valid {canonical} code. - Avoid prose unless context clearly expects comments or docstrings.""" def build_inline_system_prompt(language_id: str = "markdown") -> str: safe_language_id = _canonical_language_id(language_id) language_guidance = _language_guidance(safe_language_id) system_prompt = f"""You are an inline completion engine for a {safe_language_id} editor with ghost-text suggestions. Return only the insertion text that should be placed between PREFIX and SUFFIX. Hard constraints you must follow: 1) Output-only contract: - Output insertion text only. - No explanations, no meta labels, no wrapper quotes around the whole answer. 2) Strict math formatting (KaTeX): - If you output any math expression, it must be strict KaTeX-compatible math. - Every formula must be wrapped with either $...$ (inline) or $$...$$ (block). - Never output bare formulas without $ or $$ wrappers. - Exception: If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex, output raw LaTeX without $ or $$ wrappers. 3) Strict code formatting: - Read CURSOR_IN_FENCED_CODE_BLOCK from the user prompt. - If CURSOR_IN_FENCED_CODE_BLOCK=true: - You are already inside a fenced code block. - Never output triple backticks. - Output code lines only. - If CURSOR_IN_FENCED_CODE_BLOCK=false: - Any code output must be in a fenced code block with a language tag: ```{{language}} ... ``` - Do not output code snippets as inline backticks. - Choose the language tag from context (no default fallback tag instruction). 4) Mermaid-specific completion rules: - Read CURSOR_FENCE_LANGUAGE and MERMAID_CONTEXT from the user prompt. - If CURSOR_FENCE_LANGUAGE=mermaid: - Output Mermaid statements only. - Never output triple backticks. - Never output prose explanations. - If CURSOR_IN_FENCED_CODE_BLOCK=false and MERMAID_CONTEXT=true: - Output a complete Mermaid fenced block: ```mermaid ... ``` - Keep Mermaid syntax valid and concise. - Never mix Mermaid code and explanatory narration in one output. 5) Boundary newline repair: - Read PREFIX_ENDS_WITH_NEWLINE and SUFFIX_STARTS_WITH_NEWLINE from the user prompt. - Carefully reason about whether OUTPUT should start or end with a newline. - If PREFIX lacks a required boundary newline, add it at OUTPUT start. - If SUFFIX lacks a required boundary newline, add it at OUTPUT end. - Ensure PREFIX + OUTPUT + SUFFIX is structurally natural. 6) Context stitching: - Do not repeat text that already appears at the start of SUFFIX. - Preserve nearby language, tone, punctuation, indentation, and markdown structure. - Continue existing structures naturally (lists, tables, block quotes, headings). 7) OCR safety: - PREFIX may include hidden OCR metadata tags like . - Never output any OCR tag. - Never output OCR tag fragments such as .""" if language_guidance: system_prompt = f"{system_prompt.rstrip()}\n{language_guidance.strip()}" return system_prompt.strip() INLINE_EXAMPLES = """[EX01] Prose continuation The quick brown fox jumps over the lazy dog. Expected OUTPUT: moved quietly and then [EX02] Avoid repeating suffix beginning Our launch plan starts with phase one, followed by phase two. Expected OUTPUT: careful internal testing before [EX03] Continue markdown checklist ## TODO - [ ] Buy milk - [ ] Expected OUTPUT: Write release notes and share draft with team [EX04] Cursor outside code block, code must use fenced block CURSOR_IN_FENCED_CODE_BLOCK=false Parse this JSON payload in Python: Expected OUTPUT: ```python import json data = json.loads(payload) ``` [EX05] Cursor inside fenced code block, do not output fences CURSOR_IN_FENCED_CODE_BLOCK=true ```python def add(a, b): return ``` Expected OUTPUT: a + b [EX06] Inline math must use $...$ The derivative of x^2 is . Expected OUTPUT: $2x$ [EX07] Block math must use $$...$$ We can write the Gaussian integral as: Expected OUTPUT: $$ \\int_{-\\infty}^{\\infty} e^{-x^2}\\,dx = \\sqrt{\\pi} $$ [EX08] Prefix misses boundary newline; add newline at output start PREFIX_ENDS_WITH_NEWLINE=false Deployment steps: Expected OUTPUT: - Build artifact - Deploy service [EX09] Suffix misses boundary newline; add newline at output end SUFFIX_STARTS_WITH_NEWLINE=false Summary paragraph complete. ## Next Section Expected OUTPUT: [EX10] OCR metadata exists but must never be emitted ![whiteboard](img.png) The relationship is . Expected OUTPUT: $y = mx + b$ [EX11] Continue markdown table with correct row shape | Name | Score | | --- | --- | | Alice | 92 | | Bob | Expected OUTPUT: 88 | [EX12] Mixed text + math + code in one insertion CURSOR_IN_FENCED_CODE_BLOCK=false Use the area formula and provide a tiny JS helper. Expected OUTPUT: The area is $A = \\pi r^2$. ```javascript const area = (r) => Math.PI * r * r; ``` [EX13] Cursor inside mermaid fence: no backticks, mermaid lines only CURSOR_IN_FENCED_CODE_BLOCK=true CURSOR_FENCE_LANGUAGE=mermaid ```mermaid flowchart TD A[Start] --> ``` Expected OUTPUT: B{Valid?} B -->|Yes| C[Done] [EX14] Mermaid context outside fence: return full mermaid block CURSOR_IN_FENCED_CODE_BLOCK=false MERMAID_CONTEXT=true Please provide a simple release pipeline diagram. Expected OUTPUT: ```mermaid flowchart LR Build --> Test --> Deploy ```""" def build_completion_prompts( prefix: str, suffix: str, language_id: str = "markdown", location: str = "", thinking_level: str = "low", preferences: object = None, ) -> Tuple[str, str]: safe_language_id = _canonical_language_id(language_id) recent_prefix, recent_suffix = _prepare_context(prefix, suffix) recent_prefix = _normalize_newlines(recent_prefix) recent_suffix = _normalize_newlines(recent_suffix) cursor_fence_language = _active_fence_language(recent_prefix) cursor_in_fenced_code_block = cursor_fence_language != "none" mermaid_context = _is_mermaid_context( recent_prefix, recent_suffix, cursor_fence_language ) prefix_ends_with_newline = recent_prefix.endswith("\n") suffix_starts_with_newline = recent_suffix.startswith("\n") tz_pref = preferences.timezone if preferences else "auto" current_time = _get_current_datetime(tz_pref) location_info = f"\nUser location: {location}" if location else "" pref_info = [] if preferences: if preferences.language and preferences.language != "auto": pref_info.append(f"Preferred language: {preferences.language}") if preferences.currency and preferences.currency != "auto": pref_info.append(f"Preferred currency: {preferences.currency}") preferences_instruction = "\n".join(pref_info) if preferences_instruction: preferences_instruction = f"\nUser Preferences:\n{preferences_instruction}" user_prompt = f"""Current time: {current_time}{location_info}{preferences_instruction} Reasoning hint: {thinking_level} Editor language id: {safe_language_id} Completion state flags: - CURSOR_IN_FENCED_CODE_BLOCK: {"true" if cursor_in_fenced_code_block else "false"} - CURSOR_FENCE_LANGUAGE: {cursor_fence_language} - MERMAID_CONTEXT: {"true" if mermaid_context else "false"} - PREFIX_ENDS_WITH_NEWLINE: {"true" if prefix_ends_with_newline else "false"} - SUFFIX_STARTS_WITH_NEWLINE: {"true" if suffix_starts_with_newline else "false"} Task: - Produce the best insertion text at the cursor between PREFIX and SUFFIX. - Keep insertion meaningful and non-empty. - Keep insertion concise unless structure requires more content. Context notes: - PREFIX may include OCR metadata after image markdown, e.g. ![alt](url) . - OCR metadata is hidden context and must never be copied into output. - Preserve local style and formatting. Decision policy: - Prioritize seamless join: PREFIX + OUTPUT + SUFFIX must read naturally. - Do not repeat SUFFIX-leading text. - If uncertain, prefer a complete short phrase/sentence with clear meaning. Comprehensive examples: {INLINE_EXAMPLES} Now produce the insertion. {recent_prefix} {recent_suffix} Output:""" system_prompt = build_inline_system_prompt(safe_language_id) return system_prompt.strip(), user_prompt.strip() def build_prompt( prefix: str, suffix: str, language_id: str = "markdown", location: str = "", thinking_level: str = "low", preferences: object = None, ) -> str: """ Backward-compatible helper. Returns only the user prompt body. """ _, user_prompt = build_completion_prompts( prefix=prefix, suffix=suffix, language_id=language_id, location=location, thinking_level=thinking_level, preferences=preferences, ) return user_prompt