feat: add language synonym mapping and canonicalization

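Add LANGUAGE_SYNONYMS dictionary to map language aliases to canonical IDs, _canonical_language_id() to normalize language identifiers, and _language_guidance() to provide language-specific instructions for LLM code generation. On the frontend, the copilot plugin now derives the language ID from the node enclosing the cursor (math nodes and fenced code blocks) and passes it through fetchSuggestion, replacing the hard-coded 'markdown' languageId in the request body. This improves language detection and ensures consistent prompt context across different language format variations.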
2026-03-14 18:20:39 +08:00
parent c0d4bf8b2b
commit d452d1747e
3 changed files with 266 additions and 6 deletions
--- a/backend/prompt.py
+++ b/backend/prompt.py
@@ -127,8 +127,205 @@ def prepare_prompt_context(prefix: str, suffix: str) -> Tuple[str, str]:
    return _prepare_context(prefix, suffix)


+# Maps a lowercase language alias to the canonical language id that
+# _canonical_language_id() returns. Every canonical id also appears as its own
+# key, so canonicalizing an already-canonical id is a no-op.
+LANGUAGE_SYNONYMS = {
+    # --- Prose / plain text ---
+    "md": "markdown",
+    "markdown": "markdown",
+    "txt": "text",
+    "text": "text",
+    "plain": "text",
+    "plaintext": "text",
+    # --- Scripting languages ---
+    "py": "python",
+    "python": "python",
+    "js": "javascript",
+    "javascript": "javascript",
+    "jsx": "javascript",
+    "node": "javascript",
+    "ts": "typescript",
+    "tsx": "typescript",
+    "typescript": "typescript",
+    # --- Data / config formats ---
+    # NOTE(review): jsonc/json5 are folded into "json", whose guidance in
+    # _language_guidance() says "no comments, no trailing commas" -- both
+    # dialects permit those. Confirm this collapse is intended.
+    "json": "json",
+    "jsonc": "json",
+    "json5": "json",
+    "yaml": "yaml",
+    "yml": "yaml",
+    "toml": "toml",
+    "ini": "ini",
+    "cfg": "ini",
+    # --- Shells ---
+    # NOTE(review): fish is mapped to "bash", whose guidance asks for
+    # POSIX-compatible shell; fish syntax is not POSIX-compatible. Confirm.
+    "bash": "bash",
+    "shell": "bash",
+    "sh": "bash",
+    "zsh": "bash",
+    "fish": "bash",
+    # NOTE(review): "ps" could also mean PostScript -- verify against callers.
+    "ps": "powershell",
+    "ps1": "powershell",
+    "powershell": "powershell",
+    # --- SQL dialects ---
+    "sql": "sql",
+    "postgres": "sql",
+    "postgresql": "sql",
+    "mysql": "sql",
+    "sqlite": "sql",
+    # --- Markup / styling ---
+    "html": "html",
+    "xml": "xml",
+    "svg": "xml",
+    # NOTE(review): scss/less are folded into "css", whose guidance says
+    # "Output valid CSS only"; preprocessor-only syntax would then be
+    # discouraged. Confirm this is intended.
+    "css": "css",
+    "scss": "css",
+    "less": "css",
+    # --- Math / diagrams ---
+    "latex": "latex",
+    "tex": "latex",
+    "katex": "latex",
+    "mermaid": "mermaid",
+    # --- Compiled languages ---
+    # NOTE(review): the "c++" and "c#" keys can only match if
+    # _sanitize_language_id() (not shown here) keeps '+' and '#' -- TODO confirm.
+    "c": "c",
+    "c++": "cpp",
+    "cpp": "cpp",
+    "cxx": "cpp",
+    "h": "c",
+    "hpp": "cpp",
+    "c#": "csharp",
+    "cs": "csharp",
+    "csharp": "csharp",
+    "go": "go",
+    "golang": "go",
+    "rust": "rust",
+    "rs": "rust",
+    "java": "java",
+    "kotlin": "kotlin",
+    "swift": "swift",
+    "ruby": "ruby",
+    "rb": "ruby",
+    "php": "php",
+    "lua": "lua",
+    "r": "r",
+    "matlab": "matlab",
+    "dart": "dart",
+    # --- Build / tooling formats ---
+    "docker": "dockerfile",
+    "dockerfile": "dockerfile",
+    "make": "makefile",
+    "makefile": "makefile",
+    "diff": "diff",
+    "patch": "diff",
+    "regex": "regex",
+}
+
+
+def _canonical_language_id(language_id: str) -> str:
+    """Normalize a language identifier to its canonical form.
+
+    The identifier is sanitized and lowercased, then looked up in
+    LANGUAGE_SYNONYMS. Identifiers without a synonym entry are returned in
+    their sanitized, lowercased form; an identifier that is empty after
+    sanitizing falls back to "markdown".
+    """
+    normalized = _sanitize_language_id(language_id).lower()
+    if normalized:
+        return LANGUAGE_SYNONYMS.get(normalized, normalized)
+    return "markdown"
+
+
+def _language_guidance(language_id: str) -> str:
+    """Build the language-specific guidance paragraph for the system prompt.
+
+    The identifier is canonicalized first. Markdown produces an empty string.
+    Every other language produces a block that starts with a newline, followed
+    by a "Language-specific guidance (<id>):" header and its bullet lines.
+    Languages without a dedicated entry in the table receive the generic
+    "valid code" bullets.
+    """
+    canonical = _canonical_language_id(language_id)
+    if canonical == "markdown":
+        return ""
+
+    # Bullets shared by all general-purpose languages and by unknown ids.
+    generic_bullets = (
+        f"- Output valid {canonical} code.",
+        "- Avoid prose unless context clearly expects comments or docstrings.",
+    )
+    # javascript and typescript share one variant of the generic bullets.
+    script_bullets = (
+        f"- Output valid {canonical} code.",
+        "- Prefer modern syntax and avoid prose unless comments are needed.",
+    )
+
+    bullets_by_language = {
+        "mermaid": (
+            "- Output valid Mermaid syntax only.",
+            "- Prefer concise, syntactically correct diagram statements.",
+            "- Avoid prose unless the user prompt explicitly requires it.",
+        ),
+        "latex": (
+            "- Output LaTeX math content only when completing LaTeX.",
+            "- If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex:",
+            "  - Output raw LaTeX lines only.",
+            "  - Do not wrap with $ or $$.",
+        ),
+        "json": (
+            "- Output strict JSON only (no comments, no trailing commas).",
+            "- Ensure valid quotes and braces.",
+        ),
+        "yaml": (
+            "- Output valid YAML only.",
+            "- Use consistent indentation and avoid tabs.",
+        ),
+        "toml": (
+            "- Output valid TOML only.",
+            "- Keep key types consistent.",
+        ),
+        "ini": (
+            "- Output valid INI only.",
+            "- Keep section headers and key=value pairs consistent.",
+        ),
+        "sql": (
+            "- Output a single, valid SQL statement unless context requires multiple.",
+            "- Prefer ANSI SQL when dialect is unclear.",
+        ),
+        "bash": (
+            "- Output POSIX-compatible shell when possible.",
+            "- Avoid interactive prompts or destructive commands unless requested.",
+        ),
+        "powershell": (
+            "- Output valid PowerShell commands.",
+            "- Avoid destructive commands unless explicitly requested.",
+        ),
+        "html": (
+            "- Output valid HTML only.",
+            "- Keep markup minimal and well-formed.",
+        ),
+        "css": (
+            "- Output valid CSS only.",
+            "- Use concise, readable selectors.",
+        ),
+        "diff": (
+            "- Output a unified diff only.",
+            "- Ensure @@ hunk headers and +/- lines are consistent.",
+        ),
+        "regex": (
+            "- Output the regex pattern only.",
+            "- Avoid delimiters unless explicitly requested.",
+        ),
+        "javascript": script_bullets,
+        "typescript": script_bullets,
+        "text": (
+            "- Output plain text only.",
+            "- Avoid markdown formatting unless explicitly asked.",
+        ),
+        "xml": (
+            "- Output well-formed XML only.",
+            "- Ensure matching tags and proper escaping.",
+        ),
+        "dockerfile": (
+            "- Output valid Dockerfile instructions only.",
+            "- Keep layers minimal and ordered logically.",
+        ),
+        "makefile": (
+            "- Output valid Makefile syntax only.",
+            "- Use tabs for recipe lines.",
+        ),
+    }
+
+    bullets = bullets_by_language.get(canonical, generic_bullets)
+    # The leading empty element reproduces the newline that opens the block.
+    return "\n".join(("", f"Language-specific guidance ({canonical}):", *bullets))
+
+
 def build_inline_system_prompt(language_id: str = "markdown") -> str:
-    safe_language_id = _sanitize_language_id(language_id)
+    safe_language_id = _canonical_language_id(language_id)
+    language_guidance = _language_guidance(safe_language_id)
+
    system_prompt = f"""You are an inline completion engine for a {safe_language_id} editor with ghost-text suggestions.

 Return only the insertion text that should be placed between PREFIX and SUFFIX.
@@ -142,6 +339,8 @@ Hard constraints you must follow:
 - If you output any math expression, it must be strict KaTeX-compatible math.
 - Every formula must be wrapped with either $...$ (inline) or $$...$$ (block).
 - Never output bare formulas without $ or $$ wrappers.
+- Exception: If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex,
+  output raw LaTeX without $ or $$ wrappers.

 3) Strict code formatting:
 - Read CURSOR_IN_FENCED_CODE_BLOCK from the user prompt.
@@ -187,6 +386,10 @@ Hard constraints you must follow:
 - PREFIX may include hidden OCR metadata tags like <OCR:...>.
 - Never output any OCR tag.
 - Never output OCR tag fragments such as <OCR:...>."""
+
+    if language_guidance:
+        system_prompt = f"{system_prompt.rstrip()}\n{language_guidance.strip()}"
+
    return system_prompt.strip()


@@ -319,7 +522,7 @@ def build_completion_prompts(
    thinking_level: str = "low",
    preferences: object = None,
 ) -> Tuple[str, str]:
-    safe_language_id = _sanitize_language_id(language_id)
+    safe_language_id = _canonical_language_id(language_id)
    recent_prefix, recent_suffix = _prepare_context(prefix, suffix)
    recent_prefix = _normalize_newlines(recent_prefix)
    recent_suffix = _normalize_newlines(recent_suffix)
--- a/src/plugins/copilotPlugin.ts
+++ b/src/plugins/copilotPlugin.ts
@@ -18,7 +18,7 @@ interface CopilotState {
 }

 interface CopilotConfig {
-  fetchSuggestion: (prefix: string, suffix: string, signal?: AbortSignal) => Promise<string>
+  fetchSuggestion: (prefix: string, suffix: string, languageId: string, signal?: AbortSignal) => Promise<string>
  debounceMs?: number
 }

@@ -187,6 +187,52 @@ function normalizeSuggestionText(raw: string): string {
  return text
 }

+/**
+ * Reduces a raw code-fence language string to a safe, lowercase identifier.
+ *
+ * Only the first whitespace-delimited token is considered, so an info string
+ * such as `ts title="app.ts"` yields `ts` instead of gluing the metadata onto
+ * the language name. Characters outside `[A-Za-z0-9_+.#-]` are dropped; `#` is
+ * kept so that `c#` / `f#` are not collapsed into `c` / `f`. The result is
+ * capped at 32 characters.
+ *
+ * @param value - Raw language or info string; may be empty.
+ * @returns The sanitized identifier, or `''` when nothing usable remains.
+ */
+function sanitizeLanguageId(value: string): string {
+  if (!value) return ''
+  // `split` with a limit of 1 always yields one element for a string input;
+  // the `?? ''` only keeps strict index checking satisfied.
+  const firstToken = value.trim().split(/\s+/, 1)[0] ?? ''
+  return firstToken.replace(/[^A-Za-z0-9_+.#-]/g, '').slice(0, 32).toLowerCase()
+}
+
+/**
+ * Reads the language label stored on a code-block node.
+ *
+ * Checks the `language`, `lang` and `info` attributes in that order and
+ * returns the first one that is a non-blank string, trimmed.
+ *
+ * @param node - The code-block node whose attributes are inspected.
+ * @returns The trimmed attribute value, or `''` when none is set.
+ */
+function getCodeBlockLanguage(node: ProseNode): string {
+  const attrs = node.attrs
+  const match = [attrs?.language, attrs?.lang, attrs?.info].find(
+    (candidate) => typeof candidate === 'string' && candidate.trim() !== '',
+  )
+  return typeof match === 'string' ? match.trim() : ''
+}
+
+/**
+ * Determines the language id for the node that encloses the cursor.
+ *
+ * Walks from the innermost ancestor of the selection start outwards (depth 0
+ * is not inspected). The first math-like node yields `'latex'`; the first
+ * code-block-like node yields its sanitized language, with `tex` / `latex` /
+ * `katex` unified to `'latex'` and an unlabeled block treated as markdown.
+ * When no such ancestor exists the result is `'markdown'`.
+ *
+ * @param view - Editor view whose current selection is inspected.
+ * @returns The language id to send with the completion request.
+ */
+function getCursorLanguageId(view: EditorView): string {
+  const fallback = 'markdown'
+  const mathNodeNames = ['math_inline', 'math_block', 'latex', 'math']
+  const codeNodeNames = ['code_block', 'codeBlock', 'code_fence', 'fence']
+  const latexAliases = ['tex', 'latex', 'katex']
+  const { $from } = view.state.selection
+
+  let depth = $from.depth
+  while (depth > 0) {
+    const node = $from.node(depth)
+    const typeName = node.type?.name || ''
+
+    if (mathNodeNames.includes(typeName)) return 'latex'
+
+    if (codeNodeNames.includes(typeName)) {
+      const lang = sanitizeLanguageId(getCodeBlockLanguage(node))
+      if (!lang) return fallback
+      return latexAliases.includes(lang) ? 'latex' : lang
+    }
+
+    depth -= 1
+  }
+
+  return fallback
+}
+
 async function insertGhostText(view: EditorView, suggestion: string, from: number, ctx: Ctx) {
  if (!suggestion) return

@@ -303,7 +349,8 @@ function doFetchSuggestion(
  const controller = new AbortController()
  runtime.abortController = controller

-  config.fetchSuggestion(prefix, suffix, controller.signal)
+  const languageId = getCursorLanguageId(view)
+  config.fetchSuggestion(prefix, suffix, languageId, controller.signal)
    .then((suggestion) => {
      if (!runtime.enabled) return
      if (runtime.requestSeq !== requestSeq) return
--- a/src/utils/api.js
+++ b/src/utils/api.js
@@ -60,7 +60,17 @@ async function getClientIP() {
    }
 }

-export async function fetchSuggestion(prefix, suffix, signal, apiUrl = API_URL) {
+export async function fetchSuggestion(prefix, suffix, languageId, signal, apiUrl = API_URL) {
+    let normalizedLanguageId = 'markdown'
+    if (typeof languageId === 'string' && languageId.trim()) {
+        normalizedLanguageId = languageId.trim()
+    } else if (languageId && typeof languageId === 'object' && 'aborted' in languageId) {
+        signal = languageId
+    }
+    if (typeof signal === 'string') {
+        apiUrl = signal
+        signal = undefined
+    }
    const requestId = generateRequestId()
    const cancelUrl = getCancelUrl(apiUrl)

@@ -94,7 +104,7 @@ export async function fetchSuggestion(prefix, suffix, signal, apiUrl = API_URL)
        const body = {
            prefix,
            suffix,
-            languageId: 'markdown',
+            languageId: normalizedLanguageId,
            model_thinking: settings.modelThinking,
            privacy_mode: settings.privacyMode,
            user_preferences: {