Filter out potential web-scraping or legacy artifacts like <br>, <br/>, <br\> from prefix/suffix before using in model completion context
143 lines
5.3 KiB
Python
143 lines
5.3 KiB
Python
from typing import Tuple
|
|
from datetime import datetime, timezone, timedelta
|
|
|
|
def _get_current_datetime(timezone_pref: str = "auto") -> str:
|
|
# Default to UTC+8 if auto or not specified
|
|
offset = 8
|
|
tz_info = " (UTC+8)"
|
|
|
|
if timezone_pref and timezone_pref != 'auto':
|
|
# Try to parse something like "UTC+8" or "GMT+8"
|
|
import re
|
|
match = re.search(r'([+-])(\d+)', timezone_pref)
|
|
if match:
|
|
sign = match.group(1)
|
|
hours = int(match.group(2))
|
|
offset = hours if sign == '+' else -hours
|
|
tz_info = f" ({timezone_pref})"
|
|
else:
|
|
tz_info = f" ({timezone_pref})"
|
|
|
|
now = datetime.now(timezone(timedelta(hours=offset)))
|
|
weekdays = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"]
|
|
weekday = weekdays[now.weekday()]
|
|
return f"{now.year}年{now.month}月{now.day}日 {weekday} {now.hour:02d}:{now.minute:02d}:{now.second:02d}{tz_info}"
|
|
|
|
def _sanitize_language_id(language_id: str) -> str:
|
|
if not language_id:
|
|
return "markdown"
|
|
allowed = []
|
|
for ch in language_id.strip():
|
|
if ch.isalnum() or ch in "-_+.":
|
|
allowed.append(ch)
|
|
value = "".join(allowed)[:32]
|
|
return value or "markdown"
|
|
|
|
|
|
def _prepare_context(prefix: str, suffix: str) -> Tuple[str, str]:
|
|
"""
|
|
Prepare prefix/suffix for model completion context.
|
|
Filter out potential web-scraping or legacy artifacts like <br>, <br/>, <br\>.
|
|
"""
|
|
import re
|
|
br_pattern = re.compile(r'<br\s*/?\s*\\?>', re.IGNORECASE)
|
|
clean_prefix = br_pattern.sub('', prefix or "")
|
|
clean_suffix = br_pattern.sub('', suffix or "")
|
|
return clean_prefix, clean_suffix
|
|
|
|
|
|
def prepare_prompt_context(prefix: str, suffix: str) -> Tuple[str, str]:
    """Public entry point that delegates to ``_prepare_context``.

    Args:
        prefix: Text before the cursor.
        suffix: Text after the cursor.

    Returns:
        The ``(prefix, suffix)`` pair exactly as returned by
        ``_prepare_context``.
    """
    cleaned = _prepare_context(prefix, suffix)
    return cleaned
|
|
|
|
|
|
def build_prompt(
    prefix: str,
    suffix: str,
    language_id: str = "markdown",
    location: str = "",
    thinking_level: str = "low",
    preferences: object = None
) -> str:
    """Assemble the full instruction prompt for the inline completion model.

    The prompt starts with a context header (current time, optional user
    location, optional user preferences), followed by the fixed instruction
    text, two worked examples, and finally the cleaned PREFIX / SUFFIX of the
    document being edited.

    Args:
        prefix: Document text before the cursor; passed through
            ``_prepare_context`` before being embedded.
        suffix: Document text after the cursor; passed through
            ``_prepare_context`` before being embedded.
        language_id: Editor language identifier; passed through
            ``_sanitize_language_id`` before being embedded.
        location: Optional user location; a "User location" line is added
            only when this is non-empty.
        thinking_level: Accepted for interface compatibility; not read by
            this function.
        preferences: Optional object whose ``timezone``, ``language`` and
            ``currency`` attributes are read when it is truthy. ``language``
            and ``currency`` are listed only when set and not ``"auto"``.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    safe_language_id = _sanitize_language_id(language_id)
    recent_prefix, recent_suffix = _prepare_context(prefix, suffix)

    # Without a preferences object the clock uses the "auto" timezone.
    if preferences:
        tz_pref = preferences.timezone
    else:
        tz_pref = "auto"
    current_time = _get_current_datetime(tz_pref)

    location_info = ""
    if location:
        location_info = f"\nUser location: {location}"

    # Collect only the preferences that are explicitly set (not "auto").
    pref_lines = []
    if preferences:
        labelled_values = (
            ("Preferred language", preferences.language),
            ("Preferred currency", preferences.currency),
        )
        pref_lines = [
            f"{label}: {value}"
            for label, value in labelled_values
            if value and value != 'auto'
        ]

    preferences_instruction = ""
    if pref_lines:
        preferences_instruction = "\nUser Preferences:\n" + "\n".join(pref_lines)

    prompt = f"""Current time: {current_time}{location_info}{preferences_instruction}

You are an inline completion engine for a {safe_language_id} editor with ghost-text suggestions.

Your job:
- Return ONLY the text that should be inserted at the cursor between PREFIX and SUFFIX.
- Prefer a meaningful, non-empty insertion with moderate length.
- Avoid overly short outputs with little information value.

Important context:
- PREFIX may contain OCR metadata inline after images, e.g.  <OCR:description>.
- The <OCR:...> is hidden context describing image content.
- Never copy, rewrite, or emit OCR tags in output.
- Never output <OCR: or >.

Hard rules:
1. Seamless join:
PREFIX + OUTPUT + SUFFIX must read naturally as one continuous document.
2. No suffix repetition:
Do NOT repeat text that already appears at the start of SUFFIX.
3. Balanced length:
Prefer concise but meaningful continuation, not ultra-short fragments.
Default target is 10-500 characters and 1-20 lines for plain prose.
You may be longer when structure requires it (lists, tables, code blocks, math blocks).
4. Avoid trivial output:
Do not output only punctuation or filler such as ".", ",", ";", ":".
Do not output just one token unless it is structurally necessary.
5. Preserve local style:
Match nearby language, tone, punctuation, spacing, and indentation.
6. Markdown awareness:
Continue active list/checkbox/ordered-list patterns when applicable.
Preserve indentation in nested list/code contexts.
You may output full markdown structures when context needs them: headings, lists, tables, fenced code blocks, blockquotes, and LaTeX ($...$ / $$...$$).
Close obvious unclosed inline markdown markers only when needed to bridge.
7. Strict output format:
Output insertion text only.
No explanations, labels, or wrapper quotes around the whole output.
Markdown syntax is allowed when it is the intended insertion (including fenced code blocks and LaTeX).

Decision policy:
- If PREFIX already connects naturally to SUFFIX, add a brief but useful continuation when possible.
- If uncertain, prefer a complete short phrase or sentence with clear meaning.

Examples:
<PREFIX>The quick brown fox </PREFIX>
<SUFFIX>jumps over the lazy dog.</SUFFIX>
Output: "moved quietly and then "

<PREFIX>## TODO\\n- [ ] Buy milk\\n- [ ] </PREFIX>
<SUFFIX></SUFFIX>
Output: "Write release notes and share draft with team"

Now produce the insertion.

<PREFIX>
{recent_prefix}
</PREFIX>

<SUFFIX>
{recent_suffix}
</SUFFIX>

Output:"""

    return prompt.strip()
|