feat: add language synonym mapping and canonicalization

Add LANGUAGE_SYNONYMS dictionary to map language aliases to canonical IDs,
_canonical_language_id() to normalize language identifiers, and
_language_guidance() to provide language-specific instructions for LLM
code generation. This improves language detection and ensures consistent
prompt context across different language format variations.
This commit is contained in:
2026-03-14 18:20:39 +08:00
parent c0d4bf8b2b
commit d452d1747e
3 changed files with 266 additions and 6 deletions

View File

@@ -127,8 +127,205 @@ def prepare_prompt_context(prefix: str, suffix: str) -> Tuple[str, str]:
return _prepare_context(prefix, suffix)
# Alias groups in lookup-table order: each entry pairs a canonical language id
# with the aliases that resolve to it. "c" and "cpp" each appear twice so the
# generated mapping keeps the key order c, c++, cpp, cxx, h, hpp.
_LANGUAGE_ALIAS_GROUPS = (
    ("markdown", ("md", "markdown")),
    ("text", ("txt", "text", "plain", "plaintext")),
    ("python", ("py", "python")),
    ("javascript", ("js", "javascript", "jsx", "node")),
    ("typescript", ("ts", "tsx", "typescript")),
    ("json", ("json", "jsonc", "json5")),
    ("yaml", ("yaml", "yml")),
    ("toml", ("toml",)),
    ("ini", ("ini", "cfg")),
    ("bash", ("bash", "shell", "sh", "zsh", "fish")),
    ("powershell", ("ps", "ps1", "powershell")),
    ("sql", ("sql", "postgres", "postgresql", "mysql", "sqlite")),
    ("html", ("html",)),
    ("xml", ("xml", "svg")),
    ("css", ("css", "scss", "less")),
    ("latex", ("latex", "tex", "katex")),
    ("mermaid", ("mermaid",)),
    ("c", ("c",)),
    ("cpp", ("c++", "cpp", "cxx")),
    ("c", ("h",)),
    ("cpp", ("hpp",)),
    ("csharp", ("c#", "cs", "csharp")),
    ("go", ("go", "golang")),
    ("rust", ("rust", "rs")),
    ("java", ("java",)),
    ("kotlin", ("kotlin",)),
    ("swift", ("swift",)),
    ("ruby", ("ruby", "rb")),
    ("php", ("php",)),
    ("lua", ("lua",)),
    ("r", ("r",)),
    ("matlab", ("matlab",)),
    ("dart", ("dart",)),
    ("dockerfile", ("docker", "dockerfile")),
    ("makefile", ("make", "makefile")),
    ("diff", ("diff", "patch")),
    ("regex", ("regex",)),
)

# Maps every known lower-case language alias to its canonical language id.
LANGUAGE_SYNONYMS = {
    alias: canonical
    for canonical, aliases in _LANGUAGE_ALIAS_GROUPS
    for alias in aliases
}
def _canonical_language_id(language_id: str) -> str:
    """Resolve a raw language identifier to its canonical id.

    The identifier is passed through ``_sanitize_language_id`` and lower-cased.
    An empty result yields ``"markdown"``. Otherwise the value is looked up in
    ``LANGUAGE_SYNONYMS`` and returned as-is when it has no entry there.
    """
    normalized = _sanitize_language_id(language_id).lower()
    if normalized:
        return LANGUAGE_SYNONYMS.get(normalized, normalized)
    return "markdown"
# Bullet lines for languages that have dedicated guidance, keyed by canonical id.
_LANGUAGE_GUIDANCE_BULLETS = {
    "mermaid": (
        "- Output valid Mermaid syntax only.",
        "- Prefer concise, syntactically correct diagram statements.",
        "- Avoid prose unless the user prompt explicitly requires it.",
    ),
    # NOTE(review): the last two bullets read like sub-items of the second one;
    # confirm whether they are meant to be indented in the prompt.
    "latex": (
        "- Output LaTeX math content only when completing LaTeX.",
        "- If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex:",
        "- Output raw LaTeX lines only.",
        "- Do not wrap with $ or $$.",
    ),
    "json": (
        "- Output strict JSON only (no comments, no trailing commas).",
        "- Ensure valid quotes and braces.",
    ),
    "yaml": (
        "- Output valid YAML only.",
        "- Use consistent indentation and avoid tabs.",
    ),
    "toml": (
        "- Output valid TOML only.",
        "- Keep key types consistent.",
    ),
    "ini": (
        "- Output valid INI only.",
        "- Keep section headers and key=value pairs consistent.",
    ),
    "sql": (
        "- Output a single, valid SQL statement unless context requires multiple.",
        "- Prefer ANSI SQL when dialect is unclear.",
    ),
    "bash": (
        "- Output POSIX-compatible shell when possible.",
        "- Avoid interactive prompts or destructive commands unless requested.",
    ),
    "powershell": (
        "- Output valid PowerShell commands.",
        "- Avoid destructive commands unless explicitly requested.",
    ),
    "html": (
        "- Output valid HTML only.",
        "- Keep markup minimal and well-formed.",
    ),
    "css": (
        "- Output valid CSS only.",
        "- Use concise, readable selectors.",
    ),
    "diff": (
        "- Output a unified diff only.",
        "- Ensure @@ hunk headers and +/- lines are consistent.",
    ),
    "regex": (
        "- Output the regex pattern only.",
        "- Avoid delimiters unless explicitly requested.",
    ),
    "text": (
        "- Output plain text only.",
        "- Avoid markdown formatting unless explicitly asked.",
    ),
    "xml": (
        "- Output well-formed XML only.",
        "- Ensure matching tags and proper escaping.",
    ),
    "dockerfile": (
        "- Output valid Dockerfile instructions only.",
        "- Keep layers minimal and ordered logically.",
    ),
    "makefile": (
        "- Output valid Makefile syntax only.",
        "- Use tabs for recipe lines.",
    ),
}


def _language_guidance(language_id: str) -> str:
    """Return the language-specific guidance block for ``language_id``.

    The id is canonicalized with ``_canonical_language_id`` first. Markdown
    gets no extra guidance (empty string). Every other language gets a string
    made of a leading newline, a ``Language-specific guidance (<id>):`` header
    and one bullet per line: dedicated bullets where the table has an entry,
    a JavaScript/TypeScript variant for those two, and a generic "valid code"
    pair for anything else, including unrecognized ids.
    """
    canonical = _canonical_language_id(language_id)
    if canonical == "markdown":
        return ""

    bullets = _LANGUAGE_GUIDANCE_BULLETS.get(canonical)
    if bullets is None:
        if canonical in {"javascript", "typescript"}:
            closing = "- Prefer modern syntax and avoid prose unless comments are needed."
        else:
            closing = "- Avoid prose unless context clearly expects comments or docstrings."
        bullets = (f"- Output valid {canonical} code.", closing)

    # The empty first element produces the leading newline of the block.
    return "\n".join(["", f"Language-specific guidance ({canonical}):", *bullets])
def build_inline_system_prompt(language_id: str = "markdown") -> str:
safe_language_id = _sanitize_language_id(language_id)
safe_language_id = _canonical_language_id(language_id)
language_guidance = _language_guidance(safe_language_id)
system_prompt = f"""You are an inline completion engine for a {safe_language_id} editor with ghost-text suggestions.
Return only the insertion text that should be placed between PREFIX and SUFFIX.
@@ -142,6 +339,8 @@ Hard constraints you must follow:
- If you output any math expression, it must be strict KaTeX-compatible math.
- Every formula must be wrapped with either $...$ (inline) or $$...$$ (block).
- Never output bare formulas without $ or $$ wrappers.
- Exception: If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex,
output raw LaTeX without $ or $$ wrappers.
3) Strict code formatting:
- Read CURSOR_IN_FENCED_CODE_BLOCK from the user prompt.
@@ -187,6 +386,10 @@ Hard constraints you must follow:
- PREFIX may include hidden OCR metadata tags like <OCR:...>.
- Never output any OCR tag.
- Never output OCR tag fragments such as <OCR:...>."""
if language_guidance:
system_prompt = f"{system_prompt.rstrip()}\n{language_guidance.strip()}"
return system_prompt.strip()
@@ -319,7 +522,7 @@ def build_completion_prompts(
thinking_level: str = "low",
preferences: object = None,
) -> Tuple[str, str]:
safe_language_id = _sanitize_language_id(language_id)
safe_language_id = _canonical_language_id(language_id)
recent_prefix, recent_suffix = _prepare_context(prefix, suffix)
recent_prefix = _normalize_newlines(recent_prefix)
recent_suffix = _normalize_newlines(recent_suffix)

View File

@@ -18,7 +18,7 @@ interface CopilotState {
}
interface CopilotConfig {
fetchSuggestion: (prefix: string, suffix: string, signal?: AbortSignal) => Promise<string>
fetchSuggestion: (prefix: string, suffix: string, languageId: string, signal?: AbortSignal) => Promise<string>
debounceMs?: number
}
@@ -187,6 +187,52 @@ function normalizeSuggestionText(raw: string): string {
return text
}
/**
 * Normalizes a code-fence language tag into a safe, lower-case identifier.
 *
 * Only the first whitespace-delimited token is considered, so an info string
 * such as `python title="demo"` yields `python` rather than a run-together
 * identifier. Within that token, every character other than an ASCII letter,
 * a digit, `-`, `_`, `+`, `.` or `#` is dropped; `#` is kept so that `c#`
 * does not collapse into `c`. The result is capped at 32 characters.
 *
 * @param value - Raw language tag or fence info string.
 * @returns The sanitized identifier, or `''` when nothing usable remains.
 */
function sanitizeLanguageId(value: string): string {
  if (!value) return ''
  // The language is the first word of an info string; anything after it is metadata.
  const [firstToken = ''] = value.trim().split(/\s+/)
  const safe = firstToken.replace(/[^A-Za-z0-9\-_+.#]/g, '')
  return safe.slice(0, 32).toLowerCase()
}
/**
 * Reads the language tag stored on a code-block node.
 *
 * Looks at `attrs.language`, `attrs.lang` and `attrs.info`, in that order,
 * and returns the first one that is a non-blank string, trimmed.
 *
 * @param node - Node whose attributes are inspected.
 * @returns The trimmed tag, or `''` when no attribute holds a non-blank string.
 */
function getCodeBlockLanguage(node: ProseNode): string {
  const attrs = node.attrs
  const firstTag = [attrs?.language, attrs?.lang, attrs?.info].find(
    (candidate) => typeof candidate === 'string' && candidate.trim() !== '',
  )
  return typeof firstTag === 'string' ? firstTag.trim() : ''
}
/**
 * Determines which language id applies at the start of the current selection.
 *
 * Visits the nodes returned by `$from.node(depth)` from `$from.depth` down to
 * depth 1 and stops at the first math or code-block node type it recognizes:
 * - math node types resolve to `'latex'`;
 * - code-block node types resolve to their sanitized language tag, with
 *   `tex`/`latex`/`katex` folded into `'latex'` and an empty tag falling back
 *   to `'markdown'`.
 *
 * @param view - Editor view whose selection is inspected.
 * @returns The resolved language id, or `'markdown'` when no such node is found.
 */
function getCursorLanguageId(view: EditorView): string {
  const defaultLanguage = 'markdown'
  const position = view.state.selection.$from

  let depth = position.depth
  while (depth > 0) {
    const ancestor = position.node(depth)
    switch (ancestor.type?.name || '') {
      case 'math_inline':
      case 'math_block':
      case 'latex':
      case 'math':
        return 'latex'
      case 'code_block':
      case 'codeBlock':
      case 'code_fence':
      case 'fence': {
        const tag = sanitizeLanguageId(getCodeBlockLanguage(ancestor))
        if (tag === '') return defaultLanguage
        return tag === 'tex' || tag === 'latex' || tag === 'katex' ? 'latex' : tag
      }
      default:
        break
    }
    depth -= 1
  }

  return defaultLanguage
}
async function insertGhostText(view: EditorView, suggestion: string, from: number, ctx: Ctx) {
if (!suggestion) return
@@ -303,7 +349,8 @@ function doFetchSuggestion(
const controller = new AbortController()
runtime.abortController = controller
config.fetchSuggestion(prefix, suffix, controller.signal)
const languageId = getCursorLanguageId(view)
config.fetchSuggestion(prefix, suffix, languageId, controller.signal)
.then((suggestion) => {
if (!runtime.enabled) return
if (runtime.requestSeq !== requestSeq) return

View File

@@ -60,7 +60,17 @@ async function getClientIP() {
}
}
export async function fetchSuggestion(prefix, suffix, signal, apiUrl = API_URL) {
export async function fetchSuggestion(prefix, suffix, languageId, signal, apiUrl = API_URL) {
let normalizedLanguageId = 'markdown'
if (typeof languageId === 'string' && languageId.trim()) {
normalizedLanguageId = languageId.trim()
} else if (languageId && typeof languageId === 'object' && 'aborted' in languageId) {
signal = languageId
}
if (typeof signal === 'string') {
apiUrl = signal
signal = undefined
}
const requestId = generateRequestId()
const cancelUrl = getCancelUrl(apiUrl)
@@ -94,7 +104,7 @@ export async function fetchSuggestion(prefix, suffix, signal, apiUrl = API_URL)
const body = {
prefix,
suffix,
languageId: 'markdown',
languageId: normalizedLanguageId,
model_thinking: settings.modelThinking,
privacy_mode: settings.privacyMode,
user_preferences: {