# llm-in-text/backend/prompt.py

from datetime import datetime, timedelta, timezone
import re
from typing import Tuple


def _get_current_datetime(timezone_pref: str = "auto") -> str:
    """
    Format the current date, weekday and time for the prompt header.

    timezone_pref may be:
    - "auto", empty or None: UTC+8 is used.
    - an offset such as "UTC+8", "GMT-5", "UTC+5:30" or "UTC+0530".
    - "UTC", "GMT" or "Z": zero offset.
    - any other text: tried as an IANA zone name (e.g. "Asia/Tokyo") when the
      zoneinfo module and its time zone data are available.

    A preference that cannot be resolved (unknown name, offset outside the
    range datetime.timezone accepts) falls back to UTC+8. The label in the
    result always names the zone that was actually used for the clock value,
    so the time and its label never disagree.

    Returns a string like "2024-01-31 Wednesday 09:05:07 (UTC+8)".
    """
    # Default to UTC+8 if auto, not specified, or not resolvable.
    tz = timezone(timedelta(hours=8))
    tz_info = " (UTC+8)"

    pref = str(timezone_pref or "").strip()
    if pref and pref.lower() != "auto":
        # Sign, 1-2 hour digits, optional minutes ("+5:30" / "+0530"). The
        # trailing lookahead makes "+0800" parse as 08:00 instead of 80 hours.
        match = re.search(r"([+-])(\d{1,2})(?::?(\d{2}))?(?!\d)", pref)
        if match:
            hours = int(match.group(2))
            minutes = int(match.group(3) or 0)
            # timezone() raises ValueError for offsets of 24 hours or more,
            # so out-of-range values keep the default instead of crashing.
            if hours <= 23 and minutes <= 59:
                delta = timedelta(hours=hours, minutes=minutes)
                tz = timezone(delta if match.group(1) == "+" else -delta)
                tz_info = f" ({pref})"
        elif pref.upper() in ("UTC", "GMT", "Z"):
            tz = timezone.utc
            tz_info = f" ({pref})"
        else:
            try:
                # Imported here so older interpreters without zoneinfo still
                # load this module and simply use the default zone.
                from zoneinfo import ZoneInfo

                tz = ZoneInfo(pref)
                tz_info = f" ({pref})"
            except (ImportError, KeyError, ValueError, OSError):
                # Unknown or malformed zone name, or no tz database: keep
                # the default zone and its truthful label.
                pass

    now = datetime.now(tz)
    # Fixed English names; strftime("%A") would depend on the process locale.
    weekdays = (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    )
    weekday = weekdays[now.weekday()]
    return (
        f"{now.year}-{now.month:02d}-{now.day:02d} "
        f"{weekday} {now.hour:02d}:{now.minute:02d}:{now.second:02d}{tz_info}"
    )


def _sanitize_language_id(language_id: str) -> str:
    """
    Reduce a language id to a short, prompt-safe token.

    Surrounding whitespace is dropped and only alphanumeric characters plus
    "-", "_", "+", "." and "#" are kept; the result is capped at 32
    characters. "#" is kept so ids such as "c#" survive and can still be
    matched against the synonym table instead of collapsing to "c".

    Returns "markdown" when the input is empty/None or nothing survives.
    """
    if not language_id:
        return "markdown"
    kept = "".join(
        ch for ch in language_id.strip() if ch.isalnum() or ch in "-_+.#"
    )
    return kept[:32] or "markdown"


def _normalize_newlines(text: str) -> str:
    """Return text with CRLF and lone CR line endings turned into LF ("" for falsy input)."""
    if not text:
        return ""
    # One pass: a CR optionally followed by LF becomes a single LF.
    return re.sub(r"\r\n?", "\n", text)


def _prepare_context(prefix: str, suffix: str) -> Tuple[str, str]:
    """
    Clean the text on both sides of the cursor before it goes into a prompt.

    Every HTML line-break tag (br, self-closing br, or the backslash variant,
    in any letter case) is deleted from both strings. A missing side (None or
    empty) comes back as an empty string.
    """

    def strip_breaks(text: str) -> str:
        return re.sub(r"<br\s*/?\s*\\?>", "", text or "", flags=re.IGNORECASE)

    return strip_breaks(prefix), strip_breaks(suffix)


# Matches a markdown fence line: optional leading spaces/tabs, three backticks,
# then anything up to the end of the line.
FENCE_LINE_RE = re.compile(r"^[ \t]*```.*$")
# Same shape as FENCE_LINE_RE, but captures the text that follows the backticks
# (and any spaces/tabs right after them) as group 1: the fence info string.
FENCE_INFO_RE = re.compile(r"^[ \t]*```[ \t]*(.*)$")
# Case-insensitive search pattern with three alternatives:
#   1. three backticks followed by the word "mermaid",
#   2. one of the listed diagram keywords as a whole word,
#   3. the word "graph" followed by a direction (TD, TB, BT, RL, LR).
# NOTE(review): several keywords ("pie", "journey", "timeline", ...) are also
# ordinary English words, so plain prose can match - confirm this is intended.
MERMAID_CONTEXT_RE = re.compile(
    r"```[ \t]*mermaid\b|"
    r"\b(flowchart|sequencediagram|classdiagram|statediagram(?:-v2)?|"
    r"erdiagram|journey|gantt|pie|mindmap|timeline|gitgraph|quadrantchart|xychart-beta)\b|"
    r"\bgraph[ \t]+(TD|TB|BT|RL|LR)\b",
    re.IGNORECASE,
)


def _cursor_in_fenced_code_block(prefix: str) -> bool:
    """
    Tell whether the text before the cursor leaves a fenced code block open.

    Delegates to _active_fence_language and reports True for any answer
    other than "none".
    """
    fence_language = _active_fence_language(prefix)
    return not (fence_language == "none")


def _active_fence_language(prefix: str) -> str:
    """
    Return active fence language at cursor based on prefix.
    - "none": cursor is outside fenced code block
    - "unknown": cursor is inside a fence without a usable language tag
    - "<language>": cursor is inside a fenced block with language tag

    The state toggles on every line matched by FENCE_LINE_RE. The tag is the
    first whitespace-separated token of the opening fence's info string,
    reduced to alphanumerics plus "-", "_", "+", "." and "#", capped at 32
    characters and lowercased.

    A fence literally tagged "none" is reported as "unknown": "none" is
    reserved for "outside a fence", and returning it for an open block would
    tell callers the cursor is not in a code block at all.
    """
    in_fence = False
    language = "none"
    for line in _normalize_newlines(prefix).split("\n"):
        if not FENCE_LINE_RE.match(line):
            continue
        if in_fence:
            # Any fence line closes the currently open block.
            in_fence = False
            continue

        in_fence = True
        info_match = FENCE_INFO_RE.match(line)
        info = info_match.group(1).strip() if info_match else ""
        first_token = info.split()[0] if info else ""
        tag = "".join(
            ch for ch in first_token if ch.isalnum() or ch in "-_+.#"
        )[:32].lower()
        # Keep the "none" sentinel unambiguous (see docstring).
        language = tag if tag and tag != "none" else "unknown"
    return language if in_fence else "none"


def _is_mermaid_context(prefix: str, suffix: str, cursor_fence_language: str) -> bool:
    """
    Decide whether the text around the cursor relates to a Mermaid diagram.

    True immediately when the cursor's fence language is "mermaid".
    Otherwise the last 1200 characters of prefix and the first 400 characters
    of suffix are joined with a newline and searched with MERMAID_CONTEXT_RE.
    """
    if cursor_fence_language != "mermaid":
        window = "\n".join(((prefix or "")[-1200:], (suffix or "")[:400]))
        # A match object is always truthy, so bool() mirrors "is not None".
        return bool(MERMAID_CONTEXT_RE.search(window))
    return True


def prepare_prompt_context(prefix: str, suffix: str) -> Tuple[str, str]:
    """Public entry point for _prepare_context; returns the cleaned (prefix, suffix) pair."""
    cleaned_prefix, cleaned_suffix = _prepare_context(prefix, suffix)
    return cleaned_prefix, cleaned_suffix


# Alias -> canonical language id.
# Keys are lowercase: _canonical_language_id lowercases its sanitized input
# before looking it up here, and returns ids that are not listed unchanged.
# The canonical values on the right are the names _language_guidance
# compares against when choosing language-specific prompt text.
LANGUAGE_SYNONYMS = {
    "md": "markdown",
    "markdown": "markdown",
    "txt": "text",
    "text": "text",
    "plain": "text",
    "plaintext": "text",
    "py": "python",
    "python": "python",
    "js": "javascript",
    "javascript": "javascript",
    "jsx": "javascript",
    "node": "javascript",
    "ts": "typescript",
    "tsx": "typescript",
    "typescript": "typescript",
    "json": "json",
    "jsonc": "json",
    "json5": "json",
    "yaml": "yaml",
    "yml": "yaml",
    "toml": "toml",
    "ini": "ini",
    "cfg": "ini",
    "bash": "bash",
    "shell": "bash",
    "sh": "bash",
    "zsh": "bash",
    "fish": "bash",
    "ps": "powershell",
    "ps1": "powershell",
    "powershell": "powershell",
    "sql": "sql",
    "postgres": "sql",
    "postgresql": "sql",
    "mysql": "sql",
    "sqlite": "sql",
    "html": "html",
    "xml": "xml",
    "svg": "xml",
    "css": "css",
    "scss": "css",
    "less": "css",
    "latex": "latex",
    "tex": "latex",
    "katex": "latex",
    "mermaid": "mermaid",
    "c": "c",
    "c++": "cpp",
    "cpp": "cpp",
    "cxx": "cpp",
    "h": "c",
    "hpp": "cpp",
    "c#": "csharp",
    "cs": "csharp",
    "csharp": "csharp",
    "go": "go",
    "golang": "go",
    "rust": "rust",
    "rs": "rust",
    "java": "java",
    "kotlin": "kotlin",
    "swift": "swift",
    "ruby": "ruby",
    "rb": "ruby",
    "php": "php",
    "lua": "lua",
    "r": "r",
    "matlab": "matlab",
    "dart": "dart",
    "docker": "dockerfile",
    "dockerfile": "dockerfile",
    "make": "makefile",
    "makefile": "makefile",
    "diff": "diff",
    "patch": "diff",
    "regex": "regex",
}


def _canonical_language_id(language_id: str) -> str:
    """
    Map a raw language id to its canonical name.

    The id is sanitized and lowercased, then looked up in LANGUAGE_SYNONYMS.
    Ids without an entry are returned in their sanitized, lowercased form;
    an empty result yields "markdown".
    """
    key = _sanitize_language_id(language_id).lower()
    if key:
        return LANGUAGE_SYNONYMS.get(key, key)
    return "markdown"


def _language_guidance(language_id: str) -> str:
    """
    Return the language-specific guidance paragraph for the system prompt.

    The id is canonicalized first. Markdown gets no extra guidance (empty
    string). Languages with dedicated wording are served from a lookup table;
    JavaScript/TypeScript share one template, and every other language
    (listed compiled/scripting languages and unknown ids alike) gets the
    generic "valid code" template.
    """
    name = _canonical_language_id(language_id)

    fixed_guidance = {
        "markdown": "",
        "mermaid": """
Language-specific guidance (mermaid):
- Output valid Mermaid syntax only.
- Prefer concise, syntactically correct diagram statements.
- Avoid prose unless the user prompt explicitly requires it.""",
        "latex": """
Language-specific guidance (latex):
- Output LaTeX math content only when completing LaTeX.
- If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex:
  - Output raw LaTeX lines only.
  - Do not wrap with $ or $$.""",
        "json": """
Language-specific guidance (json):
- Output strict JSON only (no comments, no trailing commas).
- Ensure valid quotes and braces.""",
        "yaml": """
Language-specific guidance (yaml):
- Output valid YAML only.
- Use consistent indentation and avoid tabs.""",
        "toml": """
Language-specific guidance (toml):
- Output valid TOML only.
- Keep key types consistent.""",
        "ini": """
Language-specific guidance (ini):
- Output valid INI only.
- Keep section headers and key=value pairs consistent.""",
        "sql": """
Language-specific guidance (sql):
- Output a single, valid SQL statement unless context requires multiple.
- Prefer ANSI SQL when dialect is unclear.""",
        "bash": """
Language-specific guidance (bash):
- Output POSIX-compatible shell when possible.
- Avoid interactive prompts or destructive commands unless requested.""",
        "powershell": """
Language-specific guidance (powershell):
- Output valid PowerShell commands.
- Avoid destructive commands unless explicitly requested.""",
        "html": """
Language-specific guidance (html):
- Output valid HTML only.
- Keep markup minimal and well-formed.""",
        "css": """
Language-specific guidance (css):
- Output valid CSS only.
- Use concise, readable selectors.""",
        "diff": """
Language-specific guidance (diff):
- Output a unified diff only.
- Ensure @@ hunk headers and +/- lines are consistent.""",
        "regex": """
Language-specific guidance (regex):
- Output the regex pattern only.
- Avoid delimiters unless explicitly requested.""",
        "text": """
Language-specific guidance (text):
- Output plain text only.
- Avoid markdown formatting unless explicitly asked.""",
        "xml": """
Language-specific guidance (xml):
- Output well-formed XML only.
- Ensure matching tags and proper escaping.""",
        "dockerfile": """
Language-specific guidance (dockerfile):
- Output valid Dockerfile instructions only.
- Keep layers minimal and ordered logically.""",
        "makefile": """
Language-specific guidance (makefile):
- Output valid Makefile syntax only.
- Use tabs for recipe lines.""",
    }

    dedicated = fixed_guidance.get(name)
    if dedicated is not None:
        return dedicated

    if name in ("javascript", "typescript"):
        return f"""
Language-specific guidance ({name}):
- Output valid {name} code.
- Prefer modern syntax and avoid prose unless comments are needed."""

    # Generic template: used for python, go, rust, ... and any unlisted id.
    return f"""
Language-specific guidance ({name}):
- Output valid {name} code.
- Avoid prose unless context clearly expects comments or docstrings."""


def build_inline_system_prompt(language_id: str = "markdown") -> str:
    """
    Build the system prompt for inline (ghost-text) completion.

    The language id is canonicalized and interpolated into the opening line.
    When _language_guidance returns text for that language, it is appended
    on a new line after the fixed rule set. The result is stripped of
    leading and trailing whitespace.
    """
    safe_language_id = _canonical_language_id(language_id)

    core_prompt = f"""You are an inline completion engine for a {safe_language_id} editor with ghost-text suggestions.

Return only the insertion text that should be placed between PREFIX and SUFFIX.

Hard constraints you must follow:
1) Output-only contract:
- Output insertion text only.
- No explanations, no meta labels, no wrapper quotes around the whole answer.

2) Strict math formatting (KaTeX):
- If you output any math expression, it must be strict KaTeX-compatible math.
- Every formula must be wrapped with either $...$ (inline) or $$...$$ (block).
- Never output bare formulas without $ or $$ wrappers.
- Exception: If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex,
  output raw LaTeX without $ or $$ wrappers.

3) Strict code formatting:
- Read CURSOR_IN_FENCED_CODE_BLOCK from the user prompt.
- If CURSOR_IN_FENCED_CODE_BLOCK=true:
  - You are already inside a fenced code block.
  - Never output triple backticks.
  - Output code lines only.
- If CURSOR_IN_FENCED_CODE_BLOCK=false:
  - Any code output must be in a fenced code block with a language tag:
    ```{{language}}
    ...
    ```
  - Do not output code snippets as inline backticks.
  - Choose the language tag from context (no default fallback tag instruction).

4) Mermaid-specific completion rules:
- Read CURSOR_FENCE_LANGUAGE and MERMAID_CONTEXT from the user prompt.
- If CURSOR_FENCE_LANGUAGE=mermaid:
  - Output Mermaid statements only.
  - Never output triple backticks.
  - Never output prose explanations.
- If CURSOR_IN_FENCED_CODE_BLOCK=false and MERMAID_CONTEXT=true:
  - Output a complete Mermaid fenced block:
    ```mermaid
    ...
    ```
  - Keep Mermaid syntax valid and concise.
- Never mix Mermaid code and explanatory narration in one output.

5) Boundary newline repair:
- Read PREFIX_ENDS_WITH_NEWLINE and SUFFIX_STARTS_WITH_NEWLINE from the user prompt.
- Carefully reason about whether OUTPUT should start or end with a newline.
- If PREFIX lacks a required boundary newline, add it at OUTPUT start.
- If SUFFIX lacks a required boundary newline, add it at OUTPUT end.
- Ensure PREFIX + OUTPUT + SUFFIX is structurally natural.

6) Context stitching:
- Do not repeat text that already appears at the start of SUFFIX.
- Preserve nearby language, tone, punctuation, indentation, and markdown structure.
- Continue existing structures naturally (lists, tables, block quotes, headings).

7) OCR safety:
- PREFIX may include hidden OCR metadata tags like <OCR:...>.
- Never output any OCR tag.
- Never output OCR tag fragments such as <OCR:...>."""

    extra_guidance = _language_guidance(safe_language_id)
    if not extra_guidance:
        return core_prompt.strip()

    # Guidance strings start with a newline; strip both sides so exactly one
    # newline separates the rule set from the appended paragraph.
    return "\n".join((core_prompt.rstrip(), extra_guidance.strip())).strip()


# Few-shot examples interpolated verbatim into the user prompt by
# build_completion_prompts (under "Comprehensive examples:").
# Each example shows a PREFIX/SUFFIX pair, optionally some state flags, and
# the expected insertion text.
# This is not a raw string: the doubled backslashes in the math examples
# become single backslashes in the prompt text.
INLINE_EXAMPLES = """[EX01] Prose continuation
<PREFIX>The quick brown fox </PREFIX>
<SUFFIX>jumps over the lazy dog.</SUFFIX>
Expected OUTPUT:
moved quietly and then

[EX02] Avoid repeating suffix beginning
<PREFIX>Our launch plan starts with </PREFIX>
<SUFFIX>phase one, followed by phase two.</SUFFIX>
Expected OUTPUT:
careful internal testing before

[EX03] Continue markdown checklist
<PREFIX>## TODO
- [ ] Buy milk
- [ ] </PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:
Write release notes and share draft with team

[EX04] Cursor outside code block, code must use fenced block
CURSOR_IN_FENCED_CODE_BLOCK=false
<PREFIX>Parse this JSON payload in Python:</PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:
```python
import json
data = json.loads(payload)
```

[EX05] Cursor inside fenced code block, do not output fences
CURSOR_IN_FENCED_CODE_BLOCK=true
<PREFIX>```python
def add(a, b):
    return </PREFIX>
<SUFFIX>
```</SUFFIX>
Expected OUTPUT:
a + b

[EX06] Inline math must use $...$
<PREFIX>The derivative of x^2 is </PREFIX>
<SUFFIX>.</SUFFIX>
Expected OUTPUT:
$2x$

[EX07] Block math must use $$...$$
<PREFIX>We can write the Gaussian integral as:</PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:
$$
\\int_{-\\infty}^{\\infty} e^{-x^2}\\,dx = \\sqrt{\\pi}
$$

[EX08] Prefix misses boundary newline; add newline at output start
PREFIX_ENDS_WITH_NEWLINE=false
<PREFIX>Deployment steps:</PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:

- Build artifact
- Deploy service

[EX09] Suffix misses boundary newline; add newline at output end
SUFFIX_STARTS_WITH_NEWLINE=false
<PREFIX>Summary paragraph complete.</PREFIX>
<SUFFIX>## Next Section</SUFFIX>
Expected OUTPUT:


[EX10] OCR metadata exists but must never be emitted
<PREFIX>![whiteboard](img.png) <OCR:equation y = mx + b>
The relationship is </PREFIX>
<SUFFIX>.</SUFFIX>
Expected OUTPUT:
$y = mx + b$

[EX11] Continue markdown table with correct row shape
<PREFIX>| Name | Score |
| --- | --- |
| Alice | 92 |
| Bob | </PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:
88 |

[EX12] Mixed text + math + code in one insertion
CURSOR_IN_FENCED_CODE_BLOCK=false
<PREFIX>Use the area formula and provide a tiny JS helper.</PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:
The area is $A = \\pi r^2$.

```javascript
const area = (r) => Math.PI * r * r;
```

[EX13] Cursor inside mermaid fence: no backticks, mermaid lines only
CURSOR_IN_FENCED_CODE_BLOCK=true
CURSOR_FENCE_LANGUAGE=mermaid
<PREFIX>```mermaid
flowchart TD
A[Start] --> </PREFIX>
<SUFFIX>
```</SUFFIX>
Expected OUTPUT:
B{Valid?}
B -->|Yes| C[Done]

[EX14] Mermaid context outside fence: return full mermaid block
CURSOR_IN_FENCED_CODE_BLOCK=false
MERMAID_CONTEXT=true
<PREFIX>Please provide a simple release pipeline diagram.</PREFIX>
<SUFFIX></SUFFIX>
Expected OUTPUT:
```mermaid
flowchart LR
Build --> Test --> Deploy
```"""


def build_completion_prompts(
    prefix: str,
    suffix: str,
    language_id: str = "markdown",
    location: str = "",
    thinking_level: str = "low",
    preferences: object = None,
) -> Tuple[str, str]:
    """
    Build the (system_prompt, user_prompt) pair for one inline completion.

    prefix/suffix are cleaned with _prepare_context and newline-normalized,
    then used to derive the state flags shown in the user prompt (fence
    state, fence language, mermaid context, boundary newlines).

    preferences, when truthy, is read for its timezone, language and
    currency attributes; language and currency are listed only when set to
    something other than "auto". location is included only when non-empty.

    Both returned strings are stripped of leading/trailing whitespace.
    """
    safe_language_id = _canonical_language_id(language_id)

    # Clean both sides of the cursor, then unify line endings.
    recent_prefix, recent_suffix = (
        _normalize_newlines(part) for part in _prepare_context(prefix, suffix)
    )

    # State flags derived from the cleaned context.
    cursor_fence_language = _active_fence_language(recent_prefix)
    cursor_in_fenced_code_block = not (cursor_fence_language == "none")
    mermaid_context = _is_mermaid_context(
        recent_prefix, recent_suffix, cursor_fence_language
    )
    prefix_ends_with_newline = recent_prefix.endswith("\n")
    suffix_starts_with_newline = recent_suffix.startswith("\n")

    # Header pieces: clock, optional location, optional user preferences.
    current_time = _get_current_datetime(
        "auto" if not preferences else preferences.timezone
    )

    location_info = ""
    if location:
        location_info = f"\nUser location: {location}"

    preference_lines = []
    if preferences:
        labelled_values = (
            ("Preferred language", preferences.language),
            ("Preferred currency", preferences.currency),
        )
        for label, value in labelled_values:
            if value and value != "auto":
                preference_lines.append(f"{label}: {value}")

    preferences_instruction = ""
    if preference_lines:
        preferences_instruction = "\nUser Preferences:\n" + "\n".join(
            preference_lines
        )

    user_prompt = f"""Current time: {current_time}{location_info}{preferences_instruction}
Reasoning hint: {thinking_level}
Editor language id: {safe_language_id}

Completion state flags:
- CURSOR_IN_FENCED_CODE_BLOCK: {"true" if cursor_in_fenced_code_block else "false"}
- CURSOR_FENCE_LANGUAGE: {cursor_fence_language}
- MERMAID_CONTEXT: {"true" if mermaid_context else "false"}
- PREFIX_ENDS_WITH_NEWLINE: {"true" if prefix_ends_with_newline else "false"}
- SUFFIX_STARTS_WITH_NEWLINE: {"true" if suffix_starts_with_newline else "false"}

Task:
- Produce the best insertion text at the cursor between PREFIX and SUFFIX.
- Keep insertion meaningful and non-empty.
- Keep insertion concise unless structure requires more content.

Context notes:
- PREFIX may include OCR metadata after image markdown, e.g. ![alt](url) <OCR:description>.
- OCR metadata is hidden context and must never be copied into output.
- Preserve local style and formatting.

Decision policy:
- Prioritize seamless join: PREFIX + OUTPUT + SUFFIX must read naturally.
- Do not repeat SUFFIX-leading text.
- If uncertain, prefer a complete short phrase/sentence with clear meaning.

Comprehensive examples:
{INLINE_EXAMPLES}

Now produce the insertion.

<PREFIX>
{recent_prefix}
</PREFIX>

<SUFFIX>
{recent_suffix}
</SUFFIX>

Output:"""

    return build_inline_system_prompt(safe_language_id).strip(), user_prompt.strip()


def build_prompt(
    prefix: str,
    suffix: str,
    language_id: str = "markdown",
    location: str = "",
    thinking_level: str = "low",
    preferences: object = None,
) -> str:
    """
    Backward-compatible helper that returns the user prompt only.

    Forwards every argument to build_completion_prompts and discards the
    system prompt half of the result.
    """
    prompts = build_completion_prompts(
        prefix=prefix,
        suffix=suffix,
        language_id=language_id,
        location=location,
        thinking_level=thinking_level,
        preferences=preferences,
    )
    return prompts[1]