# llm-in-text/backend/prompt.py

from datetime import datetime, timedelta, timezone
import re
from typing import Tuple

from models import UserPreferences
from prompts import get_language_guidance_map, get_system_prompt_template, get_inline_examples


def _get_current_datetime(timezone_pref: str = "auto") -> str:
    # Default to UTC+8 if auto or not specified.
    offset = 8
    tz_info = " (UTC+8)"

    if timezone_pref and timezone_pref != "auto":
        # Parse values like "UTC+8" or "GMT-5".
        match = re.search(r"([+-])(\d+)", timezone_pref)
        if match:
            sign = match.group(1)
            hours = int(match.group(2))
            offset = hours if sign == "+" else -hours
            tz_info = f" ({timezone_pref})"
        else:
            tz_info = f" ({timezone_pref})"

    now = datetime.now(timezone(timedelta(hours=offset)))
    weekdays = [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    ]
    weekday = weekdays[now.weekday()]
    return (
        f"{now.year}-{now.month:02d}-{now.day:02d} "
        f"{weekday} {now.hour:02d}:{now.minute:02d}:{now.second:02d}{tz_info}"
    )


def _sanitize_language_id(language_id: str) -> str:
    """Reduce *language_id* to a short, safe identifier.

    Surrounding whitespace is trimmed, every character that is neither
    alphanumeric nor one of ``- _ + .`` is dropped, and the result is capped
    at 32 characters. An empty input or an empty result yields "markdown".
    """
    if not language_id:
        return "markdown"
    kept = "".join(
        char
        for char in language_id.strip()
        if char.isalnum() or char in "-_+."
    )
    trimmed = kept[:32]
    if trimmed:
        return trimmed
    return "markdown"


def _normalize_newlines(text: str) -> str:
    """Convert CRLF and lone CR line endings to LF; falsy input becomes ""."""
    if not text:
        return ""
    # CRLF pairs must be collapsed first so they do not become two newlines.
    without_crlf = text.replace("\r\n", "\n")
    return without_crlf.replace("\r", "\n")


def _prepare_context(prefix: str, suffix: str) -> Tuple[str, str]:
    """
    Clean prefix/suffix before they are used as completion context.

    Removes HTML line-break artifacts (<br>, <br/>, <br\\>, in any letter case
    and with optional inner whitespace). A falsy prefix or suffix is treated
    as an empty string.
    """

    def _strip_line_breaks(text: str) -> str:
        return re.sub(r"<br\s*/?\s*\\?>", "", text or "", flags=re.IGNORECASE)

    return _strip_line_breaks(prefix), _strip_line_breaks(suffix)


# A markdown fence line: optional leading spaces/tabs, then three backticks.
FENCE_LINE_RE = re.compile(r"^[ \t]*```.*$")
# Same shape as FENCE_LINE_RE; group 1 captures the info string after the fence.
FENCE_INFO_RE = re.compile(r"^[ \t]*```[ \t]*(.*)$")
# Heuristic detector for Mermaid content. Matches either a ```mermaid fence
# anywhere, or a diagram-type keyword at the start of a line (leading
# spaces/tabs allowed). Keywords are anchored to the line start because
# several of them ("pie", "journey", "timeline", ...) are ordinary English
# words that would otherwise flag plain prose as Mermaid. A prose line that
# happens to begin with one of these words is still matched.
MERMAID_CONTEXT_RE = re.compile(
    r"```[ \t]*mermaid\b|"
    r"^[ \t]*(flowchart|sequencediagram|classdiagram|statediagram(?:-v2)?|"
    r"erdiagram|journey|gantt|pie|mindmap|timeline|gitgraph|quadrantchart|xychart-beta)\b|"
    r"^[ \t]*graph[ \t]+(TD|TB|BT|RL|LR)\b",
    re.IGNORECASE | re.MULTILINE,
)


def _cursor_in_fenced_code_block(prefix: str) -> bool:
    """
    Report whether the text before the cursor leaves a fenced code block open.

    Delegates to _active_fence_language, which toggles fence state on every
    line matching ^[ \t]*```.*$ and reports "none" when no fence is open.
    """
    fence_language = _active_fence_language(prefix)
    return fence_language != "none"


def _active_fence_language(prefix: str) -> str:
    """
    Work out which fenced code block, if any, is open at the end of prefix.

    Every line matching FENCE_LINE_RE flips the state: it opens a fence when
    none is open and closes the current one otherwise (regardless of any info
    string on the closing line).

    Returns:
    - "none": no fence is open at the cursor
    - "unknown": a fence is open but carries no usable language tag
    - "<language>": lower-cased first token of the opening fence's info
      string, restricted to alphanumerics and "-_+." and capped at 32 chars
    """
    # None means "outside any fence"; a string is the open fence's language.
    open_language = None
    for raw_line in _normalize_newlines(prefix).split("\n"):
        if not FENCE_LINE_RE.match(raw_line):
            continue
        if open_language is not None:
            # Already inside a fence: this fence line closes it.
            open_language = None
            continue
        found = FENCE_INFO_RE.match(raw_line)
        tokens = found.group(1).split() if found else []
        if not tokens:
            open_language = "unknown"
            continue
        kept = "".join(
            char for char in tokens[0] if char.isalnum() or char in "-_+."
        )
        open_language = kept[:32].lower() or "unknown"
    return "none" if open_language is None else open_language


def _is_mermaid_context(prefix: str, suffix: str, cursor_fence_language: str) -> bool:
    """Decide whether the cursor sits in Mermaid diagram content.

    True when the active fence language is "mermaid". Otherwise the last 1200
    characters of prefix and the first 400 of suffix are joined with a
    newline and searched with MERMAID_CONTEXT_RE.
    """
    if cursor_fence_language != "mermaid":
        window = "\n".join(((prefix or "")[-1200:], (suffix or "")[:400]))
        return bool(MERMAID_CONTEXT_RE.search(window))
    return True


def prepare_prompt_context(prefix: str, suffix: str) -> Tuple[str, str]:
    """Public entry point for context cleaning; forwards to _prepare_context."""
    clean_prefix, clean_suffix = _prepare_context(prefix, suffix)
    return clean_prefix, clean_suffix


# Ordered (canonical id, aliases) groups used to build LANGUAGE_SYNONYMS.
# A canonical id may appear in more than one group ("c", "cpp") so that the
# resulting key order stays stable.
_LANGUAGE_ALIAS_GROUPS = (
    ("markdown", ("md", "markdown")),
    ("text", ("txt", "text", "plain", "plaintext")),
    ("python", ("py", "python")),
    ("javascript", ("js", "javascript", "jsx", "node")),
    ("typescript", ("ts", "tsx", "typescript")),
    ("json", ("json", "jsonc", "json5")),
    ("yaml", ("yaml", "yml")),
    ("toml", ("toml",)),
    ("ini", ("ini", "cfg")),
    ("bash", ("bash", "shell", "sh", "zsh", "fish")),
    ("powershell", ("ps", "ps1", "powershell")),
    ("sql", ("sql", "postgres", "postgresql", "mysql", "sqlite")),
    ("html", ("html",)),
    ("xml", ("xml", "svg")),
    ("css", ("css", "scss", "less")),
    ("latex", ("latex", "tex", "katex")),
    ("mermaid", ("mermaid",)),
    ("c", ("c",)),
    ("cpp", ("c++", "cpp", "cxx")),
    ("c", ("h",)),
    ("cpp", ("hpp",)),
    ("csharp", ("c#", "cs", "csharp")),
    ("go", ("go", "golang")),
    ("rust", ("rust", "rs")),
    ("java", ("java",)),
    ("kotlin", ("kotlin",)),
    ("swift", ("swift",)),
    ("ruby", ("ruby", "rb")),
    ("php", ("php",)),
    ("lua", ("lua",)),
    ("r", ("r",)),
    ("matlab", ("matlab",)),
    ("dart", ("dart",)),
    ("dockerfile", ("docker", "dockerfile")),
    ("makefile", ("make", "makefile")),
    ("diff", ("diff", "patch")),
    ("regex", ("regex",)),
)

# Maps every known lower-case alias (including the canonical id itself) to
# its canonical language id.
LANGUAGE_SYNONYMS = {
    alias: canonical
    for canonical, aliases in _LANGUAGE_ALIAS_GROUPS
    for alias in aliases
}


def _canonical_language_id(language_id: str) -> str:
    """Map an editor or fence language id to its canonical lower-case form.

    Known aliases resolve through LANGUAGE_SYNONYMS ("py" -> "python",
    "c#" -> "csharp"). Unknown ids are returned sanitized and lower-cased.
    Empty input yields "markdown".
    """
    # Check the trimmed, lower-cased raw id first: the sanitizer drops "#",
    # which would otherwise turn "c#" into "c" and resolve it to C instead of
    # C#. A hit returns a value from the table, so no unsanitized text leaks.
    raw = (language_id or "").strip().lower()
    known = LANGUAGE_SYNONYMS.get(raw)
    if known is not None:
        return known
    # _sanitize_language_id never returns an empty string (it falls back to
    # "markdown"), so no further empty check is needed here.
    safe = _sanitize_language_id(language_id).lower()
    return LANGUAGE_SYNONYMS.get(safe, safe)


# Canonical ids that share the JavaScript-family guidance template.
_JS_LANGS = {
    "javascript",
    "typescript",
}
# Canonical ids of general-purpose programming languages.
_CODE_LANGS = {
    "c",
    "cpp",
    "csharp",
    "dart",
    "go",
    "java",
    "kotlin",
    "lua",
    "matlab",
    "php",
    "python",
    "r",
    "ruby",
    "rust",
    "swift",
}


def _language_guidance(language_id: str) -> str:
    """Return the extra system-prompt guidance for *language_id*.

    Markdown gets no guidance (empty string). Otherwise a non-empty entry for
    the canonical id in the guidance map wins. Failing that, the "_js_code"
    template is used for JavaScript-family languages and "_generic_code" for
    everything else, with "{lang}" replaced by the canonical id. A missing
    template yields an empty string.
    """
    canonical = _canonical_language_id(language_id)
    if canonical == "markdown":
        return ""

    guidance_by_language = get_language_guidance_map()
    specific = guidance_by_language.get(canonical)
    if specific:
        return specific

    # Languages in _CODE_LANGS and unrecognized ids both use the generic template.
    template_key = "_js_code" if canonical in _JS_LANGS else "_generic_code"
    return guidance_by_language.get(template_key, "").replace("{lang}", canonical)


def build_inline_system_prompt(language_id: str = "markdown") -> str:
    """Render the system prompt for inline completion in *language_id*.

    The template's "{language_id}" placeholder is filled with the canonical
    language id. Any language-specific guidance is appended on a new line.
    The result is stripped of surrounding whitespace.
    """
    canonical = _canonical_language_id(language_id)
    guidance = _language_guidance(canonical)
    rendered = get_system_prompt_template().replace("{language_id}", canonical)
    if not guidance:
        return rendered.strip()
    return "\n".join((rendered.rstrip(), guidance.strip())).strip()


# Loaded once at import time; interpolated into the "EXAMPLES BY CATEGORY"
# section of every user prompt built by build_completion_prompts.
_INLINE_EXAMPLES = get_inline_examples()


def build_completion_prompts(
    prefix: str,
    suffix: str,
    language_id: str = "markdown",
    location: str = "",
    thinking_level: str = "low",
    preferences: UserPreferences | None = None,
) -> Tuple[str, str]:
    """Build the (system_prompt, user_prompt) pair for an inline completion.

    Args:
        prefix: Document text before the cursor.
        suffix: Document text after the cursor.
        language_id: Editor language id; canonicalized before use.
        location: Optional user location, added to the prompt when non-empty.
        thinking_level: Value reported as the reasoning level in the prompt.
        preferences: Optional user preferences. Its timezone selects the clock
            shown; language and currency are listed unless unset or "auto".

    Returns:
        A tuple of the stripped system prompt and the stripped user prompt.
    """
    canonical_language = _canonical_language_id(language_id)

    # Strip <br> artifacts first, then unify line endings on both sides.
    prefix_text, suffix_text = (
        _normalize_newlines(part) for part in _prepare_context(prefix, suffix)
    )

    fence_language = _active_fence_language(prefix_text)
    in_mermaid = _is_mermaid_context(prefix_text, suffix_text, fence_language)

    # State flags are rendered as the literal words "true" / "false".
    in_fence_flag = "false" if fence_language == "none" else "true"
    mermaid_flag = "true" if in_mermaid else "false"
    prefix_newline_flag = "true" if prefix_text.endswith("\n") else "false"
    suffix_newline_flag = "true" if suffix_text.startswith("\n") else "false"

    timestamp = _get_current_datetime(preferences.timezone if preferences else "auto")
    location_line = f"\nUser location: {location}" if location else ""

    preference_lines = []
    if preferences:
        for label, value in (
            ("language", preferences.language),
            ("currency", preferences.currency),
        ):
            if value and value != "auto":
                preference_lines.append(f"Preferred {label}: {value}")
    preferences_block = ""
    if preference_lines:
        preferences_block = "\nUser Preferences:\n" + "\n".join(preference_lines)

    user_prompt = f"""Current time: {timestamp}{location_line}{preferences_block}
Reasoning level: {thinking_level}
Editor language: {canonical_language}

=== STATE FLAGS ===
- CURSOR_IN_FENCED_CODE_BLOCK: {in_fence_flag}
- CURSOR_FENCE_LANGUAGE: {fence_language}
- MERMAID_CONTEXT: {mermaid_flag}
- PREFIX_ENDS_WITH_NEWLINE: {prefix_newline_flag}
- SUFFIX_STARTS_WITH_NEWLINE: {suffix_newline_flag}

=== TASK ===
Produce the best insertion text between PREFIX and SUFFIX.
Requirements:
- Non-empty and meaningful
- Concise unless structure needs more
- Follows markdown rules in system prompt

=== BOUNDARY DECISION GUIDE ===

Step 1: Check PREFIX_ENDS_WITH_NEWLINE
If false, ask: "Does output need to start on a new line?"
  - YES if PREFIX ends with: ":", "steps:", "items:", heading text, or complete sentence before heading
  - If YES: start output with \\n

Step 2: Check SUFFIX_STARTS_WITH_NEWLINE
If false, ask: "Does output need to end with a newline?"
  - YES if SUFFIX starts with: heading (##), new paragraph, or list marker
  - If YES: end output with \\n

Step 3: Choose newline type
  - Use \\n\\n for: new paragraphs, before headings, starting lists
  - Use \\n for: continuing within blocks, list items, table cells
  - Exception: inside code fences, use \\n freely

=== CONTEXT NOTES ===
- OCR metadata (e.g., <OCR:description>) is hidden context, never copy to output
- Match PREFIX tone, style, and indentation
- Do not repeat text from SUFFIX beginning

=== EXAMPLES BY CATEGORY ===
{_INLINE_EXAMPLES}

=== NOW COMPLETE THE TASK ===

<PREFIX>
{prefix_text}
</PREFIX>

<SUFFIX>
{suffix_text}
</SUFFIX>

Output:"""

    system_prompt = build_inline_system_prompt(canonical_language)
    return system_prompt.strip(), user_prompt.strip()


def build_prompt(
    prefix: str,
    suffix: str,
    language_id: str = "markdown",
    location: str = "",
    thinking_level: str = "low",
    preferences: UserPreferences | None = None,
) -> str:
    """
    Backward-compatible helper: build both prompts via
    build_completion_prompts and return only the user prompt body.
    """
    prompts = build_completion_prompts(
        prefix=prefix, suffix=suffix, language_id=language_id,
        location=location, thinking_level=thinking_level, preferences=preferences,
    )
    # Index 0 is the system prompt; callers of this helper only want the user prompt.
    return prompts[1]