feat(copilot): enhance OCR handling with inline tags and document serializer

- Replace HTML comment OCR metadata with inline `<OCR:...>` tags
- Implement serializer-based markdown conversion for prefix/suffix content
- Add extractTextFromOCR utility function for text extraction
- Enable Table, Diagram, and ListCheck features in MilkdownEditor
- Add periodic debug logging for document state analysis
2026-02-14 23:53:26 +08:00
parent 794fbf8493
commit 03bb21d5c6
5 changed files with 3496 additions and 225 deletions
--- a/backend/prompt.py
+++ b/backend/prompt.py
@@ -38,10 +38,10 @@ Your job:
 - Avoid overly short outputs with little information value.

 Important context:
- PREFIX may contain hidden OCR metadata in HTML comments such as <!--OCR:...-->.
- These comments are non-visible context only.
- Never copy, rewrite, or emit HTML comments in output.
- Never output <!-- or -->.
+- PREFIX may contain OCR metadata inline after images, e.g. ![alt](url) <OCR:description>.
+- The <OCR:...> is hidden context describing image content.
+- Never copy, rewrite, or emit OCR tags in output.
+- Never output the sequence <OCR: (ordinary uses of > such as blockquotes are fine).

 Hard rules:
 1. Seamless join:
--- a/package-lock.json
+++ b/package-lock.json
--- a/src/components/MilkdownEditor.vue
+++ b/src/components/MilkdownEditor.vue
@@ -99,7 +99,7 @@
 import { onMounted, onUnmounted, ref, computed } from 'vue'
 import { replaceAll } from '@milkdown/kit/utils'
 import { Crepe } from '@milkdown/crepe'
-import { editorViewCtx } from '@milkdown/kit/core'
+import { editorViewCtx, serializerCtx } from '@milkdown/kit/core'
 import { copilotPlugin, copilotConfigCtx, copilotGhostMark, setCopilotEnabled, COPILOT_PLUGIN_KEY, SIZE_LIMIT, checkSizeLimit } from '../plugins/copilotPlugin'
 import { fetchSuggestion } from '../utils/api.js'
 import { DEBUG, OCR_URL } from '../utils/config.js'
@@ -124,6 +124,7 @@ const aiButtonLabel = computed(() => {

 let crepe = null
 let markdownSyncTimer = null
+let debugLogTimer = null
 const objectUrls = new Set()
 const IMAGE_NODE_TYPES = new Set(['image', 'image-block', 'imageBlock'])

@@ -199,6 +200,57 @@ const scheduleMarkdownSync = () => {
  }, 120)
 }

+/**
+ * Debug helper: logs the Markdown before and after the current selection,
+ * plus the full document, to the console.
+ *
+ * Does nothing unless DEBUG is set, so document content is not dumped to the
+ * console each time this runs in normal use.
+ */
+const logDebugInfo = async () => {
+  if (!DEBUG || !crepe) return
+  try {
+    const markdown = await crepe.getMarkdown()
+    crepe.editor.action((ctx) => {
+      const view = ctx.get(editorViewCtx)
+      const { doc, schema, selection } = view.state
+      // PREFIX is everything before the selection start, SUFFIX everything after its end.
+      const { from, to } = selection
+      const serializer = ctx.get(serializerCtx)
+
+      // Serialize doc[start, end) to Markdown; fall back to plain text when the
+      // slice is empty, cannot be wrapped in a top-level node, or serializes to ''.
+      const plainText = (start, end) => doc.textBetween(start, end, '\n', '\n')
+      const sliceToMarkdown = (start, end) => {
+        const { content } = doc.slice(start, end)
+        const wrapped = content.size > 0
+          ? schema.topNodeType.createAndFill(undefined, content)
+          : null
+        return (wrapped && serializer(wrapped)) || plainText(start, end)
+      }
+
+      let prefixMarkdown = ''
+      let suffixMarkdown = ''
+      try {
+        prefixMarkdown = sliceToMarkdown(0, from)
+        suffixMarkdown = sliceToMarkdown(to, doc.content.size)
+      } catch (e) {
+        // Serialization threw: report it and log plain text for both sides instead.
+        console.error('[Debug] Serializer error:', e)
+        prefixMarkdown = plainText(0, from)
+        suffixMarkdown = plainText(to, doc.content.size)
+      }
+      console.log('[Debug] ===== Document State =====')
+      console.log('[Debug] PREFIX:', prefixMarkdown)
+      console.log('[Debug] SUFFIX:', suffixMarkdown)
+      console.log('[Debug] FULL MARKDOWN:', markdown)
+      console.log('[Debug] ==========================')
+    })
+  } catch (e) {
+    console.error('[Debug] Log failed:', e)
+  }
+}
+
 const clearCurrentSuggestion = (view) => {
  const state = COPILOT_PLUGIN_KEY.getState(view.state)
  if (state?.suggestion && state.from < state.to) {
@@ -261,6 +313,9 @@ onMounted(async () => {
        features: {
            [Crepe.Feature.Latex]: true,
            [Crepe.Feature.ImageBlock]: true,
+            [Crepe.Feature.Table]: true,
+            [Crepe.Feature.Diagram]: true,
+            [Crepe.Feature.ListCheck]: true,
        },
        featureConfigs: {
            [Crepe.Feature.Latex]: {
@@ -308,6 +363,7 @@ onMounted(async () => {
        refreshSizeAndLimit(ctx)
    })
    scheduleMarkdownSync()
+    debugLogTimer = setInterval(logDebugInfo, 20000)
    
    if (DEBUG) console.log('[Debug] Crepe editor created with copilot plugin')
 })
@@ -418,6 +474,10 @@ onUnmounted(() => {
        clearTimeout(markdownSyncTimer)
        markdownSyncTimer = null
    }
+    if (debugLogTimer) {
+        clearInterval(debugLogTimer)
+        debugLogTimer = null
+    }

    for (const url of Array.from(objectUrls)) {
        revokeObjectUrl(url)
--- a/src/plugins/copilotPlugin.ts
+++ b/src/plugins/copilotPlugin.ts
@@ -1,14 +1,15 @@
 import { Plugin, PluginKey, Selection } from '@milkdown/prose/state'
 import { $prose, $ctx, $markSchema } from '@milkdown/kit/utils'
-import { parserCtx } from '@milkdown/kit/core'
+import { parserCtx, serializerCtx } from '@milkdown/kit/core'
 import { Node as ProseNode, Fragment } from '@milkdown/prose/model'
 import type { Ctx } from '@milkdown/kit/core'
 import type { EditorView } from '@milkdown/prose/view'
-import { getOcrCache, checkSizeLimit as checkOcrSizeLimit, OCR_SIZE_LIMIT } from '../utils/ocrCache'
+import { getOcrCache, checkSizeLimit as checkOcrSizeLimit, OCR_SIZE_LIMIT, extractTextFromOCR } from '../utils/ocrCache'
+import { DEBUG } from '../utils/config' // shared debug flag; gates the verbose [Copilot] request logging

 const COPILOT_PLUGIN_KEY = new PluginKey('milkdown-copilot')
 const DEBOUNCE_MS = 1000
 const SIZE_LIMIT = OCR_SIZE_LIMIT
 const IMAGE_NODE_TYPES = new Set(['image', 'image-block', 'imageBlock'])

 interface CopilotState {
@@ -279,27 +280,57 @@ function extractImageFilenames(doc: ProseNode): string[] {
  return filenames
 }

-function buildPrefixWithOCR(prefix: string, doc: ProseNode, cursorPos: number): string {
-  const ocrEntries: string[] = []
+/**
+ * Rebuilds the Markdown prefix (document start up to cursorPos), appending an
+ * inline ` <OCR:...>` tag with cached OCR text after each image before the cursor.
+ * Returns prefixMarkdown as-is when there is no such image or serialization throws.
+ */
+function buildPrefixWithOCRFromMarkdown(
+  doc: ProseNode,
+  cursorPos: number,
+  prefixMarkdown: string,
+  serializer: any,
+  schema: any
+): string {
+  const imageNodes: Array<{pos: number, end: number, src: string, label: string}> = []

  doc.descendants((node: ProseNode, pos) => {
-    if (pos >= cursorPos) return false
-    if (!isImageNodeWithSrc(node)) return true
-
+    // Nodes that start at or after the cursor belong to the suffix, so skip them.
+    if (pos >= cursorPos) return false
+    if (!isImageNodeWithSrc(node)) return true
    const src = getImageSrc(node)
-    const ocrText = getOcrCache(src)
-    if (!ocrText) return true
-
    const label = getImageLabel(node)
-    const safeOcrText = ocrText.replace(/<!--|-->/g, '').trim()
-    if (!safeOcrText) return true
-
-    ocrEntries.push(`image(${label}): ${safeOcrText}`)
-    return true
+    imageNodes.push({ pos, end: pos + node.nodeSize, src, label })
+    return true
  })

-  if (!ocrEntries.length) return prefix
-  return `${prefix}\n\n<!--OCR:\n${ocrEntries.join('\n')}\n-->`
+  if (imageNodes.length === 0) return prefixMarkdown
+
+  // Serialize doc[from, to) to Markdown, or plain text if it cannot be wrapped in a top node.
+  const toMarkdown = (from: number, to: number): string => {
+    const sliceDoc = schema.topNodeType.createAndFill(undefined, doc.slice(from, to).content)
+    return sliceDoc ? serializer(sliceDoc) : doc.textBetween(from, to)
+  }
+
+  try {
+    const parts: string[] = []
+    let lastPos = 0
+    for (const img of imageNodes) {
+      if (img.pos > lastPos) parts.push(toMarkdown(lastPos, img.pos))
+      parts.push(`![${img.label}](${img.src})`)
+      const rawOcr = getOcrCache(img.src)
+      // '<' and '>' delimit the tag itself: drop them and keep the payload on one line.
+      const textOnly = (rawOcr ? extractTextFromOCR(rawOcr, 100) : '')
+        .replace(/[<>]/g, '').replace(/\s+/g, ' ').trim()
+      if (textOnly) parts.push(` <OCR:${textOnly}>`)
+      lastPos = img.end
+    }
+    if (lastPos < cursorPos) parts.push(toMarkdown(lastPos, cursorPos))
+    return parts.join('')
+  } catch (e) {
+    console.error('[Copilot] OCR prefix build failed:', e)
+    return prefixMarkdown
+  }
 }

 function doFetchSuggestion(view: EditorView, runtime: CopilotRuntime, pos: number, prefix: string, suffix: string) {
@@ -339,6 +370,7 @@ function scheduleFetch(view: EditorView, runtime: CopilotRuntime, pos: number, p
  if (!runtime.enabled) return

  const doc = view.state.doc
+  const schema = view.state.schema
  const imageFilenames = extractImageFilenames(doc)
  const { overLimit } = checkOcrSizeLimit(doc.content.size, imageFilenames)

@@ -347,7 +379,61 @@ function scheduleFetch(view: EditorView, runtime: CopilotRuntime, pos: number, p
    return
  }

-  const prefixWithOCR = buildPrefixWithOCR(prefix, doc, pos)
+  const serializer = runtime.ctx.get(serializerCtx)
+  
+  // 尝试使用 serializer 将文档切片转换为 Markdown
+  let prefixMarkdown = ''
+  let suffixMarkdown = ''
+  
+  try {
+    // 方法1: 使用 slice 创建文档节点
+    const prefixSlice = doc.slice(0, pos)
+    if (prefixSlice.content.size > 0) {
+      const prefixDoc = schema.topNodeType.createAndFill(undefined, prefixSlice.content)
+      if (prefixDoc) {
+        prefixMarkdown = serializer(prefixDoc)
+      }
+    }
+    if (!prefixMarkdown) {
+      // 方法2: 直接序列化整个文档然后截取
+      const fullMarkdown = serializer(doc)
+      const fullDoc = view.state.doc
+      const totalLen = fullDoc.content.size
+      if (totalLen > 0 && pos < totalLen) {
+        // 简单估算位置
+        prefixMarkdown = fullMarkdown.substring(0, Math.floor(fullMarkdown.length * pos / totalLen))
+      }
+    }
+    if (!prefixMarkdown) {
+      // 回退到 textBetween 但添加换行符
+      prefixMarkdown = doc.textBetween(0, pos, '\n', '\n')
+    }
+    
+    // Suffix
+    const suffixSlice = doc.slice(pos)
+    if (suffixSlice.content.size > 0) {
+      const suffixDoc = schema.topNodeType.createAndFill(undefined, suffixSlice.content)
+      if (suffixDoc) {
+        suffixMarkdown = serializer(suffixDoc)
+      }
+    }
+    if (!suffixMarkdown) {
+      suffixMarkdown = doc.textBetween(pos, doc.content.size, '\n', '\n')
+    }
+  } catch (e) {
+    console.error('[Copilot] Serializer error:', e)
+    prefixMarkdown = doc.textBetween(0, pos, '\n', '\n')
+    suffixMarkdown = doc.textBetween(pos, doc.content.size, '\n', '\n')
+  }
+
+  const prefixWithOCR = buildPrefixWithOCRFromMarkdown(doc, pos, prefixMarkdown, serializer, schema)
+
+  if (DEBUG) {
+    console.log('[Copilot] ===== LLM Request =====')
+    console.log('[Copilot] PREFIX:', prefixWithOCR)
+    console.log('[Copilot] SUFFIX:', suffixMarkdown)
+    console.log('[Copilot] ======================')
+  }

  if (runtime.debounceTimer) {
    clearTimeout(runtime.debounceTimer)
@@ -357,7 +443,7 @@ function scheduleFetch(view: EditorView, runtime: CopilotRuntime, pos: number, p
  const debounceMs = runtime.ctx.get(copilotConfigCtx.key).debounceMs ?? DEBOUNCE_MS
  runtime.debounceTimer = setTimeout(() => {
    runtime.debounceTimer = null
-    doFetchSuggestion(view, runtime, pos, prefixWithOCR, suffix)
+    doFetchSuggestion(view, runtime, pos, prefixWithOCR, suffixMarkdown)
  }, debounceMs)
 }

--- a/src/utils/ocrCache.js
+++ b/src/utils/ocrCache.js
@@ -43,3 +43,26 @@ export function checkSizeLimit(docTextSize, imageFilenames) {
 }

 export const OCR_SIZE_LIMIT = SIZE_LIMIT
+
+/**
+ * Extracts the body of the `TEXT:` section from an OCR result.
+ *
+ * Falls back to the whole (trimmed) input when there is no `TEXT:` label, and
+ * returns '' for non-string/empty input or when the text is exactly "(none)".
+ *
+ * @param {string} ocrText - Raw OCR result.
+ * @param {number} [maxLen=100] - Maximum number of code points to keep.
+ * @returns {string} The text, followed by '...' when it had to be shortened.
+ */
+export function extractTextFromOCR(ocrText, maxLen = 100) {
+  if (typeof ocrText !== 'string') return ''
+  // A label only ends the section when it is followed by ':' (like `TEXT:`), so
+  // ordinary words such as "language" or "summary" inside the text do not cut it.
+  // NOTE(review): assumes sibling labels are written `LABEL:` - confirm against the OCR prompt.
+  const match = ocrText.match(/\bTEXT:\s*([\s\S]*?)(?:\b(?:KEY_DETAILS|LANGUAGE|SUMMARY)\s*:|$)/i)
+  const text = (match ? match[1] : ocrText).trim()
+  if (text.toLowerCase() === '(none)') return ''
+  // Truncate by code point so an emoji or other astral character is never split in half.
+  const chars = Array.from(text)
+  return chars.length > maxLen ? `${chars.slice(0, maxLen).join('')}...` : text
+}