feat: add language synonym mapping and canonicalization
Add LANGUAGE_SYNONYMS dictionary to map language aliases to canonical IDs, _canonical_language_id() to normalize language identifiers, and _language_guidance() to provide language-specific instructions for LLM code generation. This improves language detection and ensures consistent prompt context across different language format variations.
This commit is contained in:
@@ -127,8 +127,205 @@ def prepare_prompt_context(prefix: str, suffix: str) -> Tuple[str, str]:
|
||||
return _prepare_context(prefix, suffix)
|
||||
|
||||
|
||||
# Lookup table from a lower-cased language alias (file extension, fence tag,
# dialect name, ...) to the canonical language id used in prompts.
# Every canonical id is also present as a key mapping to itself, so looking up
# an already-canonical id returns it unchanged.
LANGUAGE_SYNONYMS = {
    # Prose and plain text
    "md": "markdown", "markdown": "markdown",
    "txt": "text", "text": "text", "plain": "text", "plaintext": "text",
    # Scripting languages
    "py": "python", "python": "python",
    "js": "javascript", "javascript": "javascript", "jsx": "javascript", "node": "javascript",
    "ts": "typescript", "tsx": "typescript", "typescript": "typescript",
    # Data and configuration formats
    "json": "json", "jsonc": "json", "json5": "json",
    "yaml": "yaml", "yml": "yaml",
    "toml": "toml",
    "ini": "ini", "cfg": "ini",
    # Shells
    "bash": "bash", "shell": "bash", "sh": "bash", "zsh": "bash", "fish": "bash",
    "ps": "powershell", "ps1": "powershell", "powershell": "powershell",
    # SQL dialects
    "sql": "sql", "postgres": "sql", "postgresql": "sql", "mysql": "sql", "sqlite": "sql",
    # Markup and styling
    "html": "html",
    "xml": "xml", "svg": "xml",
    "css": "css", "scss": "css", "less": "css",
    # Math and diagrams
    "latex": "latex", "tex": "latex", "katex": "latex",
    "mermaid": "mermaid",
    # C family
    "c": "c", "c++": "cpp", "cpp": "cpp", "cxx": "cpp", "h": "c", "hpp": "cpp",
    "c#": "csharp", "cs": "csharp", "csharp": "csharp",
    # Other general-purpose languages
    "go": "go", "golang": "go",
    "rust": "rust", "rs": "rust",
    "java": "java",
    "kotlin": "kotlin",
    "swift": "swift",
    "ruby": "ruby", "rb": "ruby",
    "php": "php",
    "lua": "lua",
    "r": "r",
    "matlab": "matlab",
    "dart": "dart",
    # Build and tooling formats
    "docker": "dockerfile", "dockerfile": "dockerfile",
    "make": "makefile", "makefile": "makefile",
    "diff": "diff", "patch": "diff",
    "regex": "regex",
}
|
||||
|
||||
|
||||
def _canonical_language_id(language_id: str) -> str:
    """Resolve a raw language identifier to its canonical id.

    The identifier is passed through ``_sanitize_language_id`` and lower-cased.
    If nothing remains, ``"markdown"`` is returned. Otherwise the result is
    looked up in ``LANGUAGE_SYNONYMS``; identifiers without an entry are
    returned in their sanitized, lower-cased form.
    """
    normalized = _sanitize_language_id(language_id).lower()
    if normalized:
        # Unknown ids pass through unchanged rather than being rejected.
        return LANGUAGE_SYNONYMS.get(normalized, normalized)
    return "markdown"
|
||||
|
||||
|
||||
# Fixed guidance bullets keyed by canonical language id. Languages that are not
# listed here receive the generic "Output valid <id> code." bullets built in
# _language_guidance().
_GUIDANCE_BULLETS = {
    "mermaid": (
        "- Output valid Mermaid syntax only.",
        "- Prefer concise, syntactically correct diagram statements.",
        "- Avoid prose unless the user prompt explicitly requires it.",
    ),
    "latex": (
        "- Output LaTeX math content only when completing LaTeX.",
        "- If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex:",
        "- Output raw LaTeX lines only.",
        "- Do not wrap with $ or $$.",
    ),
    "json": (
        "- Output strict JSON only (no comments, no trailing commas).",
        "- Ensure valid quotes and braces.",
    ),
    "yaml": (
        "- Output valid YAML only.",
        "- Use consistent indentation and avoid tabs.",
    ),
    "toml": (
        "- Output valid TOML only.",
        "- Keep key types consistent.",
    ),
    "ini": (
        "- Output valid INI only.",
        "- Keep section headers and key=value pairs consistent.",
    ),
    "sql": (
        "- Output a single, valid SQL statement unless context requires multiple.",
        "- Prefer ANSI SQL when dialect is unclear.",
    ),
    "bash": (
        "- Output POSIX-compatible shell when possible.",
        "- Avoid interactive prompts or destructive commands unless requested.",
    ),
    "powershell": (
        "- Output valid PowerShell commands.",
        "- Avoid destructive commands unless explicitly requested.",
    ),
    "html": (
        "- Output valid HTML only.",
        "- Keep markup minimal and well-formed.",
    ),
    "css": (
        "- Output valid CSS only.",
        "- Use concise, readable selectors.",
    ),
    "diff": (
        "- Output a unified diff only.",
        "- Ensure @@ hunk headers and +/- lines are consistent.",
    ),
    "regex": (
        "- Output the regex pattern only.",
        "- Avoid delimiters unless explicitly requested.",
    ),
    "text": (
        "- Output plain text only.",
        "- Avoid markdown formatting unless explicitly asked.",
    ),
    "xml": (
        "- Output well-formed XML only.",
        "- Ensure matching tags and proper escaping.",
    ),
    "dockerfile": (
        "- Output valid Dockerfile instructions only.",
        "- Keep layers minimal and ordered logically.",
    ),
    "makefile": (
        "- Output valid Makefile syntax only.",
        "- Use tabs for recipe lines.",
    ),
}


def _language_guidance(language_id: str) -> str:
    """Return the language-specific guidance block for a system prompt.

    ``language_id`` is canonicalized with ``_canonical_language_id`` first.
    Markdown yields an empty string. Every other language yields a block that
    starts with a newline, followed by a
    ``Language-specific guidance (<canonical id>):`` header line and one bullet
    per line, with no trailing newline.
    """
    canonical = _canonical_language_id(language_id)
    if canonical == "markdown":
        return ""

    bullets = _GUIDANCE_BULLETS.get(canonical)
    if bullets is None:
        # Generic code guidance; only the closing bullet differs for JS/TS.
        if canonical in {"javascript", "typescript"}:
            closing = "- Prefer modern syntax and avoid prose unless comments are needed."
        else:
            closing = "- Avoid prose unless context clearly expects comments or docstrings."
        bullets = (f"- Output valid {canonical} code.", closing)

    # The leading empty element produces the newline that opens the block.
    return "\n".join(("", f"Language-specific guidance ({canonical}):", *bullets))
|
||||
|
||||
|
||||
def build_inline_system_prompt(language_id: str = "markdown") -> str:
|
||||
safe_language_id = _sanitize_language_id(language_id)
|
||||
safe_language_id = _canonical_language_id(language_id)
|
||||
language_guidance = _language_guidance(safe_language_id)
|
||||
|
||||
system_prompt = f"""You are an inline completion engine for a {safe_language_id} editor with ghost-text suggestions.
|
||||
|
||||
Return only the insertion text that should be placed between PREFIX and SUFFIX.
|
||||
@@ -142,6 +339,8 @@ Hard constraints you must follow:
|
||||
- If you output any math expression, it must be strict KaTeX-compatible math.
|
||||
- Every formula must be wrapped with either $...$ (inline) or $$...$$ (block).
|
||||
- Never output bare formulas without $ or $$ wrappers.
|
||||
- Exception: If CURSOR_IN_FENCED_CODE_BLOCK=true and CURSOR_FENCE_LANGUAGE is latex/tex/katex,
|
||||
output raw LaTeX without $ or $$ wrappers.
|
||||
|
||||
3) Strict code formatting:
|
||||
- Read CURSOR_IN_FENCED_CODE_BLOCK from the user prompt.
|
||||
@@ -187,6 +386,10 @@ Hard constraints you must follow:
|
||||
- PREFIX may include hidden OCR metadata tags like <OCR:...>.
|
||||
- Never output any OCR tag.
|
||||
- Never output OCR tag fragments such as <OCR:...>."""
|
||||
|
||||
if language_guidance:
|
||||
system_prompt = f"{system_prompt.rstrip()}\n{language_guidance.strip()}"
|
||||
|
||||
return system_prompt.strip()
|
||||
|
||||
|
||||
@@ -319,7 +522,7 @@ def build_completion_prompts(
|
||||
thinking_level: str = "low",
|
||||
preferences: object = None,
|
||||
) -> Tuple[str, str]:
|
||||
safe_language_id = _sanitize_language_id(language_id)
|
||||
safe_language_id = _canonical_language_id(language_id)
|
||||
recent_prefix, recent_suffix = _prepare_context(prefix, suffix)
|
||||
recent_prefix = _normalize_newlines(recent_prefix)
|
||||
recent_suffix = _normalize_newlines(recent_suffix)
|
||||
|
||||
@@ -18,7 +18,7 @@ interface CopilotState {
|
||||
}
|
||||
|
||||
/** Configuration accepted by the copilot ghost-text plugin. */
interface CopilotConfig {
  /**
   * Requests a completion for the text around the cursor.
   *
   * @param prefix - Document text before the cursor.
   * @param suffix - Document text after the cursor.
   * @param languageId - Language of the region the cursor is in.
   * @param signal - Optional abort signal for cancelling the request.
   * @returns The suggested insertion text.
   */
  fetchSuggestion: (prefix: string, suffix: string, languageId: string, signal?: AbortSignal) => Promise<string>
  /** Optional debounce interval in milliseconds. NOTE(review): exact use is not visible here; confirm against the plugin. */
  debounceMs?: number
}
|
||||
|
||||
@@ -187,6 +187,52 @@ function normalizeSuggestionText(raw: string): string {
|
||||
return text
|
||||
}
|
||||
|
||||
/**
 * Normalizes a raw language tag (for example a code-fence info string) into a
 * safe, lower-case language id.
 *
 * Only the first whitespace-delimited word is considered, so trailing fence
 * metadata such as `python title="app.py"` does not leak into the id. Within
 * that word only ASCII letters, digits and `-`, `_`, `+`, `.`, `#` are kept;
 * `#` is allowed so that tags like `c#` are not collapsed into `c`.
 *
 * @param value - Raw language tag; empty or whitespace-only input is allowed.
 * @returns The sanitized id, at most 32 characters long, or `''` when nothing
 *   usable remains.
 */
function sanitizeLanguageId(value: string): string {
  if (!value) return ''
  // Keep only the first word: anything after whitespace is fence metadata, not part of the language.
  const firstWord = value.trim().split(/\s+/)[0] ?? ''
  if (!firstWord) return ''
  // Drop every character outside the allow-list instead of rejecting the whole tag.
  const safe = firstWord.replace(/[^A-Za-z0-9_+.#-]/g, '')
  return safe.slice(0, 32).toLowerCase()
}
|
||||
|
||||
/**
 * Reads the language tag stored on a code-block node.
 *
 * The `language`, `lang` and `info` attributes are checked in that order; the
 * first one holding a non-blank string wins.
 *
 * @param node - Node whose attributes are inspected.
 * @returns The trimmed attribute value, or `''` when none of them is usable.
 */
function getCodeBlockLanguage(node: ProseNode): string {
  const attrs = node.attrs
  const firstUsable = [attrs?.language, attrs?.lang, attrs?.info].find(
    (candidate): candidate is string => typeof candidate === 'string' && candidate.trim() !== '',
  )
  return firstUsable === undefined ? '' : firstUsable.trim()
}
|
||||
|
||||
/**
 * Determines the language of the region that contains the selection start.
 *
 * Ancestors of the selection start are visited from the innermost node
 * outwards (depth 0 is not inspected). A math-type node yields `'latex'`. A
 * code-block-type node yields its sanitized language tag, with `tex`, `latex`
 * and `katex` all reported as `'latex'` and a missing tag reported as
 * `'markdown'`. If no such ancestor exists the result is `'markdown'`.
 *
 * @param view - Editor view whose current selection is examined.
 * @returns The language id for the cursor position.
 */
function getCursorLanguageId(view: EditorView): string {
  const defaultLanguage = 'markdown'
  const mathNodeNames = ['math_inline', 'math_block', 'latex', 'math']
  const codeNodeNames = ['code_block', 'codeBlock', 'code_fence', 'fence']
  const latexAliases = ['tex', 'latex', 'katex']

  const resolved = view.state.selection.$from
  let level = resolved.depth
  while (level > 0) {
    const ancestor = resolved.node(level)
    const kind = ancestor.type?.name || ''

    if (mathNodeNames.includes(kind)) return 'latex'

    if (codeNodeNames.includes(kind)) {
      // The innermost code block decides the answer; outer ancestors are not consulted.
      const tag = sanitizeLanguageId(getCodeBlockLanguage(ancestor))
      if (tag === '') return defaultLanguage
      return latexAliases.includes(tag) ? 'latex' : tag
    }

    level -= 1
  }

  return defaultLanguage
}
|
||||
|
||||
async function insertGhostText(view: EditorView, suggestion: string, from: number, ctx: Ctx) {
|
||||
if (!suggestion) return
|
||||
|
||||
@@ -303,7 +349,8 @@ function doFetchSuggestion(
|
||||
const controller = new AbortController()
|
||||
runtime.abortController = controller
|
||||
|
||||
config.fetchSuggestion(prefix, suffix, controller.signal)
|
||||
const languageId = getCursorLanguageId(view)
|
||||
config.fetchSuggestion(prefix, suffix, languageId, controller.signal)
|
||||
.then((suggestion) => {
|
||||
if (!runtime.enabled) return
|
||||
if (runtime.requestSeq !== requestSeq) return
|
||||
|
||||
@@ -60,7 +60,17 @@ async function getClientIP() {
|
||||
}
|
||||
}
|
||||
|
||||
export async function fetchSuggestion(prefix, suffix, signal, apiUrl = API_URL) {
|
||||
export async function fetchSuggestion(prefix, suffix, languageId, signal, apiUrl = API_URL) {
|
||||
let normalizedLanguageId = 'markdown'
|
||||
if (typeof languageId === 'string' && languageId.trim()) {
|
||||
normalizedLanguageId = languageId.trim()
|
||||
} else if (languageId && typeof languageId === 'object' && 'aborted' in languageId) {
|
||||
signal = languageId
|
||||
}
|
||||
if (typeof signal === 'string') {
|
||||
apiUrl = signal
|
||||
signal = undefined
|
||||
}
|
||||
const requestId = generateRequestId()
|
||||
const cancelUrl = getCancelUrl(apiUrl)
|
||||
|
||||
@@ -94,7 +104,7 @@ export async function fetchSuggestion(prefix, suffix, signal, apiUrl = API_URL)
|
||||
const body = {
|
||||
prefix,
|
||||
suffix,
|
||||
languageId: 'markdown',
|
||||
languageId: normalizedLanguageId,
|
||||
model_thinking: settings.modelThinking,
|
||||
privacy_mode: settings.privacyMode,
|
||||
user_preferences: {
|
||||
|
||||
Reference in New Issue
Block a user