From d452d1747e92e73e2d78465905fc9882d5bd73f7 Mon Sep 17 00:00:00 2001 From: ydy0615 Date: Sat, 14 Mar 2026 18:20:39 +0800 Subject: [PATCH] feat: add language synonym mapping and canonicalization Add LANGUAGE_SYNONYMS dictionary to map language aliases to canonical IDs, _canonical_language_id() to normalize language identifiers, and _language_guidance() to provide language-specific instructions for LLM code generation. This improves language detection and ensures consistent prompt context across different language format variations. --- backend/prompt.py | 207 ++++++++++++++++++++++++++++++++++- src/plugins/copilotPlugin.ts | 51 ++++++++- src/utils/api.js | 14 ++- 3 files changed, 266 insertions(+), 6 deletions(-) diff --git a/backend/prompt.py b/backend/prompt.py index 6dafde4..fb8b0fa 100644 --- a/backend/prompt.py +++ b/backend/prompt.py @@ -127,8 +127,205 @@ def prepare_prompt_context(prefix: str, suffix: str) -> Tuple[str, str]: return _prepare_context(prefix, suffix) +LANGUAGE_SYNONYMS = { + "md": "markdown", + "markdown": "markdown", + "txt": "text", + "text": "text", + "plain": "text", + "plaintext": "text", + "py": "python", + "python": "python", + "js": "javascript", + "javascript": "javascript", + "jsx": "javascript", + "node": "javascript", + "ts": "typescript", + "tsx": "typescript", + "typescript": "typescript", + "json": "json", + "jsonc": "json", + "json5": "json", + "yaml": "yaml", + "yml": "yaml", + "toml": "toml", + "ini": "ini", + "cfg": "ini", + "bash": "bash", + "shell": "bash", + "sh": "bash", + "zsh": "bash", + "fish": "bash", + "ps": "powershell", + "ps1": "powershell", + "powershell": "powershell", + "sql": "sql", + "postgres": "sql", + "postgresql": "sql", + "mysql": "sql", + "sqlite": "sql", + "html": "html", + "xml": "xml", + "svg": "xml", + "css": "css", + "scss": "css", + "less": "css", + "latex": "latex", + "tex": "latex", + "katex": "latex", + "mermaid": "mermaid", + "c": "c", + "c++": "cpp", + "cpp": "cpp", + "cxx": "cpp", + "h": "c", + "hpp": "cpp", + "c#": "csharp", + "cs": "csharp", + "csharp": "csharp", + "go": "go", + "golang": "go", + "rust": "rust", + "rs": "rust", + "java": "java", + "kotlin": "kotlin", + "swift": "swift", + "ruby": "ruby", + "rb": "ruby", + "php": "php", + "lua": "lua", + "r": "r", + "matlab": "matlab", + "dart": "dart", + "docker": "dockerfile", + "dockerfile": "dockerfile", + "make": "makefile", + "makefile": "makefile", + "diff": "diff", + "patch": "diff", + "regex": "regex", +} + + +def _canonical_language_id(language_id: str) -> str: + safe = _sanitize_language_id(language_id).lower() + if not safe: + return "markdown" + return LANGUAGE_SYNONYMS.get(safe, safe) + + +def _language_guidance(language_id: str) -> str: + canonical = _canonical_language_id(language_id) + if canonical == "markdown": + return "" + if canonical == "mermaid": + return """ +Language-specific guidance (mermaid): +- Output valid Mermaid syntax only. +- Prefer concise, syntactically correct diagram statements. +- Avoid prose unless the user prompt explicitly requires it.""" + if canonical == "latex": + return """ +Language-specific guidance (latex): +- Output LaTeX math content only when completing LaTeX. +- If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex: + - Output raw LaTeX lines only. + - Do not wrap with $ or $$.""" + if canonical == "json": + return """ +Language-specific guidance (json): +- Output strict JSON only (no comments, no trailing commas). +- Ensure valid quotes and braces.""" + if canonical == "yaml": + return """ +Language-specific guidance (yaml): +- Output valid YAML only. +- Use consistent indentation and avoid tabs.""" + if canonical == "toml": + return """ +Language-specific guidance (toml): +- Output valid TOML only. +- Keep key types consistent.""" + if canonical == "ini": + return """ +Language-specific guidance (ini): +- Output valid INI only. +- Keep section headers and key=value pairs consistent.""" + if canonical == "sql": + return """ +Language-specific guidance (sql): +- Output a single, valid SQL statement unless context requires multiple. +- Prefer ANSI SQL when dialect is unclear.""" + if canonical == "bash": + return """ +Language-specific guidance (bash): +- Output POSIX-compatible shell when possible. +- Avoid interactive prompts or destructive commands unless requested.""" + if canonical == "powershell": + return """ +Language-specific guidance (powershell): +- Output valid PowerShell commands. +- Avoid destructive commands unless explicitly requested.""" + if canonical == "html": + return """ +Language-specific guidance (html): +- Output valid HTML only. +- Keep markup minimal and well-formed.""" + if canonical == "css": + return """ +Language-specific guidance (css): +- Output valid CSS only. +- Use concise, readable selectors.""" + if canonical == "diff": + return """ +Language-specific guidance (diff): +- Output a unified diff only. +- Ensure @@ hunk headers and +/- lines are consistent.""" + if canonical == "regex": + return """ +Language-specific guidance (regex): +- Output the regex pattern only. +- Avoid delimiters unless explicitly requested.""" + if canonical in {"javascript", "typescript"}: + return f""" +Language-specific guidance ({canonical}): +- Output valid {canonical} code. +- Prefer modern syntax and avoid prose unless comments are needed.""" + if canonical in {"python", "go", "rust", "java", "kotlin", "swift", "ruby", "php", "lua", "c", "cpp", "csharp", "r", "matlab", "dart"}: + return f""" +Language-specific guidance ({canonical}): +- Output valid {canonical} code. +- Avoid prose unless context clearly expects comments or docstrings.""" + if canonical == "text": + return """ +Language-specific guidance (text): +- Output plain text only. +- Avoid markdown formatting unless explicitly asked.""" + if canonical == "xml": + return """ +Language-specific guidance (xml): +- Output well-formed XML only. +- Ensure matching tags and proper escaping.""" + if canonical == "dockerfile": + return """ +Language-specific guidance (dockerfile): +- Output valid Dockerfile instructions only. +- Keep layers minimal and ordered logically.""" + if canonical == "makefile": + return """ +Language-specific guidance (makefile): +- Output valid Makefile syntax only. +- Use tabs for recipe lines.""" + return f""" +Language-specific guidance ({canonical}): +- Output valid {canonical} code. +- Avoid prose unless context clearly expects comments or docstrings.""" + + def build_inline_system_prompt(language_id: str = "markdown") -> str: - safe_language_id = _sanitize_language_id(language_id) + safe_language_id = _canonical_language_id(language_id) + language_guidance = _language_guidance(safe_language_id) + system_prompt = f"""You are an inline completion engine for a {safe_language_id} editor with ghost-text suggestions. Return only the insertion text that should be placed between PREFIX and SUFFIX. @@ -142,6 +339,8 @@ Hard constraints you must follow: - If you output any math expression, it must be strict KaTeX-compatible math. - Every formula must be wrapped with either $...$ (inline) or $$...$$ (block). - Never output bare formulas without $ or $$ wrappers. +- Exception: If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex, + output raw LaTeX without $ or $$ wrappers. 3) Strict code formatting: - Read CURSOR_IN_FENCED_CODE_BLOCK from the user prompt. @@ -187,6 +386,10 @@ Hard constraints you must follow: - PREFIX may include hidden OCR metadata tags like . - Never output any OCR tag. - Never output OCR tag fragments such as .""" + + if language_guidance: + system_prompt = f"{system_prompt.rstrip()}\n{language_guidance.strip()}" + return system_prompt.strip() @@ -319,7 +522,7 @@ def build_completion_prompts( thinking_level: str = "low", preferences: object = None, ) -> Tuple[str, str]: - safe_language_id = _sanitize_language_id(language_id) + safe_language_id = _canonical_language_id(language_id) recent_prefix, recent_suffix = _prepare_context(prefix, suffix) recent_prefix = _normalize_newlines(recent_prefix) recent_suffix = _normalize_newlines(recent_suffix) diff --git a/src/plugins/copilotPlugin.ts b/src/plugins/copilotPlugin.ts index 2cb085d..c7c3516 100644 --- a/src/plugins/copilotPlugin.ts +++ b/src/plugins/copilotPlugin.ts @@ -18,7 +18,7 @@ interface CopilotState { } interface CopilotConfig { - fetchSuggestion: (prefix: string, suffix: string, signal?: AbortSignal) => Promise + fetchSuggestion: (prefix: string, suffix: string, languageId: string, signal?: AbortSignal) => Promise debounceMs?: number } @@ -187,6 +187,52 @@ function normalizeSuggestionText(raw: string): string { return text } +function sanitizeLanguageId(value: string): string { + if (!value) return '' + const trimmed = value.trim() + if (!trimmed) return '' + let safe = '' + for (const ch of trimmed) { + if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9') || ch === '-' || ch === '_' || ch === '+' || ch === '.') { + safe += ch + } + } + return safe.slice(0, 32).toLowerCase() +} + +function getCodeBlockLanguage(node: ProseNode): string { + const candidates = [node.attrs?.language, node.attrs?.lang, node.attrs?.info] + for (const value of candidates) { + if (typeof value === 'string' && value.trim()) { + return value.trim() + } + } + return '' +} + +function getCursorLanguageId(view: EditorView): string { + const fallback = 'markdown' + const { $from } = view.state.selection + + for (let depth = $from.depth; depth > 0; depth -= 1) { + const node = $from.node(depth) + const typeName = node.type?.name || '' + + if (typeName === 'math_inline' || typeName === 'math_block' || typeName === 'latex' || typeName === 'math') { + return 'latex' + } + + if (typeName === 'code_block' || typeName === 'codeBlock' || typeName === 'code_fence' || typeName === 'fence') { + const lang = sanitizeLanguageId(getCodeBlockLanguage(node)) + if (!lang) return fallback + if (lang === 'tex' || lang === 'latex' || lang === 'katex') return 'latex' + return lang + } + } + + return fallback +} + async function insertGhostText(view: EditorView, suggestion: string, from: number, ctx: Ctx) { if (!suggestion) return @@ -303,7 +349,8 @@ function doFetchSuggestion( const controller = new AbortController() runtime.abortController = controller - config.fetchSuggestion(prefix, suffix, controller.signal) + const languageId = getCursorLanguageId(view) + config.fetchSuggestion(prefix, suffix, languageId, controller.signal) .then((suggestion) => { if (!runtime.enabled) return if (runtime.requestSeq !== requestSeq) return diff --git a/src/utils/api.js b/src/utils/api.js index 56a1f58..61c9350 100644 --- a/src/utils/api.js +++ b/src/utils/api.js @@ -60,7 +60,17 @@ async function getClientIP() { } } -export async function fetchSuggestion(prefix, suffix, signal, apiUrl = API_URL) { +export async function fetchSuggestion(prefix, suffix, languageId, signal, apiUrl = API_URL) { + let normalizedLanguageId = 'markdown' + if (typeof languageId === 'string' && languageId.trim()) { + normalizedLanguageId = languageId.trim() + } else if (languageId && typeof languageId === 'object' && 'aborted' in languageId) { + signal = languageId + } + if (typeof signal === 'string') { + apiUrl = signal + signal = undefined + } const requestId = generateRequestId() const cancelUrl = getCancelUrl(apiUrl) @@ -94,7 +104,7 @@ export async function fetchSuggestion(prefix, suffix, signal, apiUrl = API_URL) const body = { prefix, suffix, - languageId: 'markdown', + languageId: normalizedLanguageId, model_thinking: settings.modelThinking, privacy_mode: settings.privacyMode, user_preferences: {