# llm-in-text/backend/prompt.py

from typing import Tuple
from datetime import datetime, timezone, timedelta

def _get_current_datetime(timezone_pref: str = "auto") -> str:
    """Return the current time as a Chinese-formatted string with a zone label.

    The result looks like ``2024年5月1日 星期三 09:05:03 (UTC+8)``.

    Args:
        timezone_pref: ``"auto"`` or an empty value selects the UTC+8 default.
            Otherwise the first signed number in the string is read as a UTC
            offset (``"UTC+8"``, ``"GMT-5"``, ``"UTC+5:30"``, ``"+0530"``).
            When the string contains no signed number it is tried as an IANA
            zone name via ``zoneinfo`` (only where that module and its zone
            data are available).

    Returns:
        The formatted timestamp. The trailing label is ``timezone_pref`` only
        when that preference was actually applied; if it could not be
        resolved, the time and the label both fall back to UTC+8 so the label
        never misdescribes the time shown.
    """
    import re

    # Default to UTC+8 if auto, not specified, or not resolvable.
    tz = timezone(timedelta(hours=8))
    tz_info = " (UTC+8)"

    if timezone_pref and timezone_pref != 'auto':
        resolved = None
        # Try to parse something like "UTC+8", "GMT+8" or "UTC+5:30".
        match = re.search(r'([+-])(\d+)(?::(\d{1,2}))?', timezone_pref)
        if match:
            sign, digits, minute_text = match.groups()
            if minute_text is None and len(digits) == 4:
                # Compact HHMM form such as "+0530".
                hours, minutes = int(digits[:2]), int(digits[2:])
            else:
                hours, minutes = int(digits), int(minute_text or 0)
            # datetime.timezone rejects offsets of 24 hours or more, so an
            # out-of-range value falls back to the default instead of raising.
            if hours < 24 and minutes < 60:
                delta = timedelta(hours=hours, minutes=minutes)
                resolved = timezone(delta if sign == '+' else -delta)
        else:
            try:
                # zoneinfo needs Python 3.9+ and installed zone data; both are
                # treated as optional here.
                from zoneinfo import ZoneInfo
                resolved = ZoneInfo(timezone_pref)
            except (ImportError, KeyError, ValueError, OSError):
                resolved = None

        if resolved is not None:
            tz = resolved
            tz_info = f" ({timezone_pref})"

    now = datetime.now(tz)
    weekdays = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"]
    weekday = weekdays[now.weekday()]
    return f"{now.year}年{now.month}月{now.day}日 {weekday} {now.hour:02d}:{now.minute:02d}:{now.second:02d}{tz_info}"

def _sanitize_language_id(language_id: str) -> str:
    """Reduce a language identifier to a short token safe to embed in the prompt.

    Surrounding whitespace is stripped, every character that is neither
    alphanumeric nor one of ``-``, ``_``, ``+``, ``.`` is dropped, and the
    result is cut to at most 32 characters. ``"markdown"`` is returned when
    the input is falsy or nothing survives the filtering.
    """
    fallback = "markdown"
    if not language_id:
        return fallback

    kept = "".join(
        char
        for char in language_id.strip()
        if char.isalnum() or char in "-_+."
    )
    truncated = kept[:32]
    return truncated if truncated else fallback


def _prepare_context(prefix: str, suffix: str) -> Tuple[str, str]:
    r"""
    Prepare prefix/suffix for model completion context.

    Filter out potential web-scraping or legacy artifacts like <br>, <br/>, <br\>.
    Matching is case-insensitive and tolerates whitespace inside the tag.

    Args:
        prefix: Text before the cursor; a falsy value is treated as "".
        suffix: Text after the cursor; a falsy value is treated as "".

    Returns:
        A ``(clean_prefix, clean_suffix)`` tuple with every matching tag removed.
    """
    # The docstring is a raw string because it contains a literal backslash
    # ("<br\>"), which would otherwise be an invalid escape sequence.
    import re
    br_pattern = re.compile(r'<br\s*/?\s*\\?>', re.IGNORECASE)
    clean_prefix = br_pattern.sub('', prefix or "")
    clean_suffix = br_pattern.sub('', suffix or "")
    return clean_prefix, clean_suffix


def prepare_prompt_context(prefix: str, suffix: str) -> Tuple[str, str]:
    """Return the cleaned ``(prefix, suffix)`` pair used as completion context.

    Public wrapper that forwards both arguments to ``_prepare_context`` and
    returns its result unchanged.
    """
    cleaned_pair = _prepare_context(prefix, suffix)
    return cleaned_pair


def build_prompt(
    prefix: str,
    suffix: str,
    language_id: str = "markdown",
    location: str = "",
    thinking_level: str = "low",
    preferences: object = None
) -> str:
    """Assemble the full instruction prompt for one inline-completion request.

    Args:
        prefix: Document text before the cursor.
        suffix: Document text after the cursor.
        language_id: Editor language identifier; sanitized before being
            embedded in the prompt.
        location: Optional user location; adds a "User location" line when
            non-empty.
        thinking_level: Accepted for interface compatibility; not referenced
            in this function's body.
        preferences: Optional object whose ``timezone``, ``language`` and
            ``currency`` attributes are read when the object is truthy.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    lang_token = _sanitize_language_id(language_id)
    before_cursor, after_cursor = _prepare_context(prefix, suffix)

    # Without preferences the timestamp helper is asked for its "auto" default.
    if preferences:
        tz_choice = preferences.timezone
    else:
        tz_choice = "auto"
    timestamp = _get_current_datetime(tz_choice)

    location_line = ""
    if location:
        location_line = f"\nUser location: {location}"

    # Only explicitly chosen (non-"auto") preferences are surfaced to the model.
    preference_block = ""
    if preferences:
        entries = []
        if preferences.language and preferences.language != 'auto':
            entries.append(f"Preferred language: {preferences.language}")
        if preferences.currency and preferences.currency != 'auto':
            entries.append(f"Preferred currency: {preferences.currency}")
        if entries:
            joined_entries = "\n".join(entries)
            preference_block = f"\nUser Preferences:\n{joined_entries}"

    prompt = f"""Current time: {timestamp}{location_line}{preference_block}

You are an inline completion engine for a {lang_token} editor with ghost-text suggestions.

Your job:
- Return ONLY the text that should be inserted at the cursor between PREFIX and SUFFIX.
- Prefer a meaningful, non-empty insertion with moderate length.
- Avoid overly short outputs with little information value.

Important context:
- PREFIX may contain OCR metadata inline after images, e.g. ![alt](url) <OCR:description>.
- The <OCR:...> is hidden context describing image content.
- Never copy, rewrite, or emit OCR tags in output.
- Never output <OCR: or >.

Hard rules:
1. Seamless join:
   PREFIX + OUTPUT + SUFFIX must read naturally as one continuous document.
2. No suffix repetition:
   Do NOT repeat text that already appears at the start of SUFFIX.
3. Balanced length:
   Prefer concise but meaningful continuation, not ultra-short fragments.
   Default target is 10-500 characters and 1-20 lines for plain prose.
   You may be longer when structure requires it (lists, tables, code blocks, math blocks).
4. Avoid trivial output:
   Do not output only punctuation or filler such as ".", ",", ";", ":".
   Do not output just one token unless it is structurally necessary.
5. Preserve local style:
   Match nearby language, tone, punctuation, spacing, and indentation.
6. Markdown awareness:
   Continue active list/checkbox/ordered-list patterns when applicable.
   Preserve indentation in nested list/code contexts.
   You may output full markdown structures when context needs them: headings, lists, tables, fenced code blocks, blockquotes, and LaTeX ($...$ / $$...$$).
   Close obvious unclosed inline markdown markers only when needed to bridge.
7. Strict output format:
   Output insertion text only.
   No explanations, labels, or wrapper quotes around the whole output.
   Markdown syntax is allowed when it is the intended insertion (including fenced code blocks and LaTeX).

Decision policy:
- If PREFIX already connects naturally to SUFFIX, add a brief but useful continuation when possible.
- If uncertain, prefer a complete short phrase or sentence with clear meaning.

Examples:
<PREFIX>The quick brown fox </PREFIX>
<SUFFIX>jumps over the lazy dog.</SUFFIX>
Output: "moved quietly and then "

<PREFIX>## TODO\\n- [ ] Buy milk\\n- [ ] </PREFIX>
<SUFFIX></SUFFIX>
Output: "Write release notes and share draft with team"

Now produce the insertion.

<PREFIX>
{before_cursor}
</PREFIX>

<SUFFIX>
{after_cursor}
</SUFFIX>

Output:"""

    return prompt.strip()