"""FastAPI backend exposing ``POST /v1/completions`` backed by an Ollama chat model."""

import json
import os
import re
import traceback

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel

app = FastAPI()

# NOTE(review): every origin, method and header is allowed while credentials
# are enabled. Confirm this is intended before exposing the service outside a
# trusted network.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Request body accepted by POST /v1/completions.
class CompletionRequest(BaseModel):
    # Text before the insertion point; passed to build_prompt().
    prefix: str
    # Text after the insertion point; passed to build_prompt().
    suffix: str
    # Language identifier sent by the client; not read by this module.
    languageId: str = 'markdown'


def extract_completion_from_thinking(thinking: str) -> str:
    """Extract the continuation text from a model's ``thinking`` output.

    Three heuristics are tried in order and the first one that applies wins:

    1. Everything after a "Continuation" marker (case-insensitive), cut at
       the first "It seems like".
    2. The text starting at a capitalised sentence that follows one of a
       fixed set of reasoning openers ("We need to", "So we", ...), with a
       leading "Probably " removed and the text cut at "It seems like ".
    3. The whole text, cut at the first " Probably " and then at the first
       " The instruction:".

    Args:
        thinking: Raw reasoning text returned by the model.

    Returns:
        The extracted text, stripped of surrounding whitespace. An empty
        string is returned when ``thinking`` is empty.
    """
    if not thinking:
        return ""

    # Strategy 1: take whatever follows an explicit "Continuation" marker.
    continuation_match = re.search(
        r'Continuation[:\s]*([\s\S]*)', thinking, re.IGNORECASE
    )
    if continuation_match:
        result = continuation_match.group(1).strip()
        # Drop trailing reasoning. DOTALL is required so that ".*" also
        # consumes newlines; without it, reasoning that spans several lines
        # was left in the result.
        result = re.sub(
            r'\s*It seems like.*$',
            '',
            result,
            flags=re.IGNORECASE | re.DOTALL,
        )
        return result.strip()

    # Strategy 2: no marker. Look for a reasoning opener followed by a
    # capitalised sentence and keep everything from that sentence onward.
    continue_match = re.search(
        r'(?:We need to|Then we should|So we|I will|The|Thus)[,\s]+([A-Z][^.!?]*(?:[.!?]|$))',
        thinking,
    )
    if continue_match:
        result = thinking[continue_match.start(1):].strip()
        # Strip a leading "Probably " hedge word.
        result = re.sub(r'^Probably\s+', '', result)
        # Cut at the first "It seems like " phrase, if any.
        result = re.split(r'\s*It seems like\s', result, flags=re.IGNORECASE)[0]
        return result.strip()

    # Strategy 3: fall back to the raw text with trailing reasoning removed.
    result = thinking.strip()
    # Cut at the first " Probably ".
    result = re.split(r'\s+Probably\s', result, flags=re.IGNORECASE, maxsplit=1)[0]
    # Cut at the first " The instruction:".
    result = re.split(r'\s+The instruction:', result, flags=re.IGNORECASE, maxsplit=1)[0]
    return result.strip()


def _extract_message_fields(response):
    """Return ``(content, thinking)`` read from a chat response.

    Handles both an object exposing a truthy ``message`` attribute and a
    plain ``dict``. Missing or falsy values become empty strings; any other
    response shape yields two empty strings.
    """
    if hasattr(response, 'message') and response.message:
        message = response.message
        content = message.content or ""
        thinking = getattr(message, 'thinking', '') or ""
        return content, thinking
    if isinstance(response, dict):
        msg = response.get('message', {})
        return msg.get('content', '') or "", msg.get('thinking', '') or ""
    return "", ""


@app.post("/v1/completions")
async def create_completion(request: CompletionRequest):
    """Build a prompt from the request, query Ollama, and return the result.

    The model and host are read from the ``OLLAMA_MODEL`` and ``OLLAMA_HOST``
    environment variables on every call. The chat API is called without
    streaming; when the reply's ``content`` is empty but ``thinking`` is
    present, the completion is recovered with
    ``extract_completion_from_thinking``.

    Args:
        request: Validated request body carrying ``prefix`` and ``suffix``.

    Returns:
        A ``StreamingResponse`` with media type ``text/event-stream`` that
        emits one ``data:`` event holding the content (skipped when the
        content is empty) followed by a ``done`` event. If anything inside
        the ``try`` block raises, a ``JSONResponse`` with status 500 and the
        error text is returned instead.
    """
    # Imported lazily, at request time, as in the original module layout.
    from prompt import build_prompt
    import ollama

    print("[Backend] POST /v1/completions called")
    print(
        f"[Backend] Received request - prefix length: {len(request.prefix)}, "
        f"suffix length: {len(request.suffix)}"
    )

    OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'gpt-oss:20b')
    OLLAMA_HOST = os.getenv('OLLAMA_HOST', 'http://192.168.0.120:11434')
    print(f"[LLM] Using host: {OLLAMA_HOST}, model: {OLLAMA_MODEL}")

    try:
        prompt = build_prompt(request.prefix, request.suffix)
        print(f"[Backend] Built prompt (first 100 chars): {prompt[:100]}...")
        print(f"[LLM] Full prompt:\n{prompt}\n")

        # Use the non-streaming API so the full response arrives at once.
        print("[LLM] Calling Ollama API (non-streaming)...")
        client = ollama.AsyncClient(host=OLLAMA_HOST)
        response = await client.chat(
            model=OLLAMA_MODEL,
            messages=[{'role': 'user', 'content': prompt}],
            stream=False,
            options={
                'num_predict': 8192,
                'temperature': 0.2,
            },
        )

        print(f"[LLM] Response type: {type(response)}")

        # Pull out the visible answer and the reasoning text.
        content, thinking = _extract_message_fields(response)

        print(f"[LLM] Original content: {repr(content[:100] if content else '')}...")
        print(f"[LLM] Thinking length: {len(thinking)}")
        print(f"[LLM] Thinking (first 200): {thinking[:200]}...")

        # When the answer is empty, try to recover it from the reasoning.
        if not content and thinking:
            print("[LLM] Content is empty, extracting from thinking...")
            content = extract_completion_from_thinking(thinking)
            print(f"[LLM] Extracted completion: {repr(content[:100])}...")

        print(f"[LLM] Final content length: {len(content)}")

        # Emit the whole completion as a single event, then a "done" event.
        async def generate():
            if content:
                print(f"[LLM] Yielding full content: {repr(content)}")
                yield f"data: {json.dumps({'content': content})}\n\n"
            yield f"data: {json.dumps({'done': True})}\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")

    except Exception as e:
        # Top-level boundary: log the failure and report it to the client.
        print(f"[Backend] Error: {e}")
        traceback.print_exc()
        return JSONResponse(content={"error": str(e)}, status_code=500)


if __name__ == "__main__":
    import uvicorn

    print("[Backend] Starting server on http://0.0.0.0:8000")
    uvicorn.run(app, host="0.0.0.0", port=8000)