# llm-in-text/backend/main.py

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel
import os
import json
import re

# Application instance that the route decorator(s) in this module register on.
app = FastAPI()

# CORS settings: every origin, method and header is accepted and credentialed
# requests are allowed.
# NOTE(review): a wildcard origin combined with credentials is very
# permissive — confirm this is intended outside local development.
_cors_settings = dict(
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
app.add_middleware(CORSMiddleware, **_cors_settings)

class CompletionRequest(BaseModel):
    """Request body for the completion endpoint."""

    # Presumably the document text before the insertion point — confirm
    # against the client; it is passed as the first argument to build_prompt.
    prefix: str
    # Presumably the document text after the insertion point — confirm
    # against the client; it is passed as the second argument to build_prompt.
    suffix: str
    # Language identifier supplied by the client; "markdown" when omitted.
    languageId: str = "markdown"

def extract_completion_from_thinking(thinking: str) -> str:
    """Heuristically recover the completion text from a model's "thinking" output.

    Three strategies are tried in order; the first one that applies wins:

    1. Everything after a ``Continuation`` marker (case-insensitive),
       truncated at the first "It seems like".
    2. Everything from the first capitalised sentence that follows one of the
       lead-ins "We need to", "Then we should", "So we", "I will", "The" or
       "Thus". A leading "Probably " is dropped and the text is truncated at
       the first "It seems like ".
    3. The whole input, truncated at the first " Probably " and then at the
       first " The instruction:".

    Args:
        thinking: Raw reasoning text produced by the model; may be empty.

    Returns:
        The extracted text with surrounding whitespace stripped, or "" when
        *thinking* is empty.
    """
    if not thinking:
        return ""

    # Strategy 1: an explicit "Continuation:" style marker.
    marker = re.search(r'Continuation[:\s]*([\s\S]*)', thinking, re.IGNORECASE)
    if marker:
        tail = marker.group(1).strip()
        # DOTALL lets ".*" run across newlines so trailing commentary is cut
        # even when it spans several lines; without it the pattern only
        # matched when "It seems like" sat on the final line.
        tail = re.sub(
            r'\s*It seems like.*$', '', tail,
            flags=re.IGNORECASE | re.DOTALL,
        )
        return tail.strip()

    # Strategy 2: skip a descriptive lead-in and start at the sentence after it.
    lead_in = re.search(
        r'(?:We need to|Then we should|So we|I will|The|Thus)[,\s]+([A-Z][^.!?]*(?:[.!?]|$))',
        thinking,
    )
    if lead_in:
        # Keep the matched sentence and everything that follows it.
        tail = thinking[lead_in.start(1):].strip()
        # Only the leading word "Probably" (plus whitespace) is removed here.
        tail = re.sub(r'^Probably\s+', '', tail)
        # Cut at the first "It seems like " commentary, if any.
        tail = re.split(r'\s*It seems like\s', tail, flags=re.IGNORECASE)[0]
        return tail.strip()

    # Strategy 3: fall back to the whole text minus trailing commentary.
    tail = thinking.strip()
    tail = re.split(r'\s+Probably\s', tail, flags=re.IGNORECASE, maxsplit=1)[0]
    tail = re.split(r'\s+The instruction:', tail, flags=re.IGNORECASE, maxsplit=1)[0]

    return tail.strip()

def _read_message_fields(response):
    """Return ``(content, thinking)`` from an Ollama chat response.

    Accepts either an object exposing a ``message`` attribute or a plain
    dict; any missing or falsy field is returned as "".
    """
    if hasattr(response, 'message') and response.message:
        message = response.message
        return message.content or "", getattr(message, 'thinking', '') or ""
    if isinstance(response, dict):
        # "or {}" also covers a present-but-null "message" entry.
        message = response.get('message') or {}
        return message.get('content', '') or "", message.get('thinking', '') or ""
    return "", ""


@app.post("/v1/completions")
async def create_completion(request: CompletionRequest):
    """Generate a completion for the given prefix/suffix via Ollama.

    Builds a prompt with ``build_prompt``, sends it to the Ollama chat API in
    non-streaming mode, and returns the result as a ``text/event-stream``
    response. When the reply's ``content`` is empty but ``thinking`` is not,
    the completion is recovered from the thinking text.

    The model and host come from the ``OLLAMA_MODEL`` and ``OLLAMA_HOST``
    environment variables, with hard-coded defaults.

    Args:
        request: Parsed request body carrying ``prefix`` and ``suffix``.

    Returns:
        A ``StreamingResponse`` that emits at most one ``{"content": ...}``
        event followed by a ``{"done": true}`` event, or a ``JSONResponse``
        with status 500 and ``{"error": ...}`` if building the prompt or
        calling the model raises.
    """
    # Imported lazily, as in the original, so the module loads without them.
    from prompt import build_prompt
    import ollama

    print("[Backend] POST /v1/completions called")
    print(f"[Backend] Received request - prefix length: {len(request.prefix)}, suffix length: {len(request.suffix)}")

    ollama_model = os.getenv('OLLAMA_MODEL', 'gpt-oss:20b')
    ollama_host = os.getenv('OLLAMA_HOST', 'http://192.168.0.120:11434')

    print(f"[LLM] Using host: {ollama_host}, model: {ollama_model}")

    try:
        prompt = build_prompt(request.prefix, request.suffix)
        print(f"[Backend] Built prompt (first 100 chars): {prompt[:100]}...")
        print(f"[LLM] Full prompt:\n{prompt}\n")

        # Use the non-streaming API so the full reply is available at once.
        print("[LLM] Calling Ollama API (non-streaming)...")
        client = ollama.AsyncClient(host=ollama_host)
        response = await client.chat(
            model=ollama_model,
            messages=[{'role': 'user', 'content': prompt}],
            stream=False,
            options={
                'num_predict': 8192,
                'temperature': 0.2,
            },
        )

        print(f"[LLM] Response type: {type(response)}")

        content, thinking = _read_message_fields(response)

        print(f"[LLM] Original content: {repr(content[:100] if content else '')}...")
        print(f"[LLM] Thinking length: {len(thinking)}")
        print(f"[LLM] Thinking (first 200): {thinking[:200]}...")

        # If the reply has no content, fall back to the thinking text.
        if not content and thinking:
            print("[LLM] Content is empty, extracting from thinking...")
            content = extract_completion_from_thinking(thinking)
            print(f"[LLM] Extracted completion: {repr(content[:100])}...")

        print(f"[LLM] Final content length: {len(content)}")

        async def generate():
            # Emit the whole completion as one event, then the end marker.
            if content:
                print(f"[LLM] Yielding full content: {repr(content)}")
                yield f"data: {json.dumps({'content': content})}\n\n"
            yield f"data: {json.dumps({'done': True})}\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")

    except Exception as e:
        # Top-level boundary: log the failure and report it to the client.
        print(f"[Backend] Error: {e}")
        import traceback
        traceback.print_exc()
        return JSONResponse(content={"error": str(e)}, status_code=500)

if __name__ == "__main__":
    # Run the app with uvicorn when this file is executed as a script.
    import uvicorn

    bind_host = "0.0.0.0"
    bind_port = 8000
    print("[Backend] Starting server on http://0.0.0.0:8000")
    uvicorn.run(app, host=bind_host, port=bind_port)