From e0054d4cbc89e1384d68425ffe11011a9a2bf95c Mon Sep 17 00:00:00 2001 From: ydy0615 Date: Sat, 11 Apr 2026 10:33:46 +0800 Subject: [PATCH] refactor(tts): use numpy and proper temp file cleanup for WAV encoding Update WAV encoding logic to convert audio to a NumPy array, employ a temporary file for safe write with soundfile, and ensure cleanup in a finally block. This resolves the BytesIO limitation and improves the reliability of the TTS endpoint. --- backend/tts_asr.py | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/backend/tts_asr.py b/backend/tts_asr.py index c3d3a9d..069805c 100644 --- a/backend/tts_asr.py +++ b/backend/tts_asr.py @@ -2,12 +2,13 @@ import asyncio import base64 import logging import os -from io import BytesIO +import tempfile from typing import Optional # 设置 Hugging Face 镜像源为国内镜像 os.environ.setdefault("HF_ENDPOINT", "https://hf-mirror.com") +import numpy as np import torch from fastapi import APIRouter, HTTPException from pydantic import BaseModel @@ -193,11 +194,11 @@ async def tts_endpoint(req: TTSRequest): text = req.text instruct = req.instruct or "" - speaker = req.speaker or "Vivian" try: # VoiceDesign 模型使用 generate_voice_design 方法 - wavs_sr = model.generate_voice_design( + # 返回 (wavs, sr),其中 wavs 是列表,wavs[0] 是第一个音频数据 + wavs, sr = model.generate_voice_design( text=text, language="Chinese", instruct=instruct, @@ -206,28 +207,36 @@ async def tts_endpoint(req: TTSRequest): logger.exception("TTS 推理失败") raise HTTPException(status_code=500, detail=f"TTS 推理失败: {e}") - # Normalize output - if isinstance(wavs_sr, tuple) and len(wavs_sr) == 2: - wav_data, sr = wavs_sr - else: - wav_data, sr = wavs_sr[0], wavs_sr[1] # type: ignore + # 获取第一个音频数据 + wav_data = wavs[0] if isinstance(wavs, (list, tuple)) else wavs + + # 转换为 numpy 数组 + if hasattr(wav_data, 'numpy'): + wav_data = wav_data.cpu().numpy() + wav_data = np.asarray(wav_data, dtype=np.float32) + + logger.debug("wav_data shape: %s, dtype: %s, sr: %s", wav_data.shape, wav_data.dtype, sr) # 编码 WAV 到内存 + tmp_path = None try: - import tempfile import soundfile as sf - # soundfile 不支持直接写入 BytesIO,需要用临时文件 - with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: - tmp_path = tmp.name - sf.write(tmp_path, wav_data, sr, format="WAV") + # 创建临时文件 + fd, tmp_path = tempfile.mkstemp(suffix=".wav") + os.close(fd) + sf.write(tmp_path, wav_data, sr) with open(tmp_path, "rb") as f: audio_bytes = f.read() - # 清理临时文件 - import os as _os - _os.unlink(tmp_path) except Exception as e: logger.exception("音频编码失败") raise HTTPException(status_code=500, detail=f"音频编码失败: {e}") + finally: + # 清理临时文件 + if tmp_path and os.path.exists(tmp_path): + try: + os.unlink(tmp_path) + except Exception: + pass # 计算时长(毫秒) duration_ms = int(len(wav_data) / sr * 1000) if sr > 0 else 0