- Fix delete agent 500: clean up FK records (agent_llm_logs, permissions, schedules, executions, team_members) and unbind goals/tasks before delete - Remove hardcoded personality templates in Android, replace with dynamic system prompt generation from name + description - Set promptSectionsEnabled=false to bypass PromptComposer for personality - Add Tencent Cloud Linux deployment guide (Docker Compose) - Accumulated backend service updates, frontend UI fixes, Android app changes Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
262 lines
9.3 KiB
Python
262 lines
9.3 KiB
Python
"""
|
||
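Voice API - voice interaction endpoints for the Android app.

Provides HTTP APIs for speech-to-text (ASR) and text-to-speech (TTS).
TTS prefers OpenAI TTS (requires OPENAI_API_KEY to be configured) and
otherwise falls back to the free Edge TTS.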
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import logging
|
||
import os
|
||
import subprocess
|
||
import sys
|
||
import tempfile
|
||
import time
|
||
from pathlib import Path
|
||
|
||
from fastapi import APIRouter, Depends, File, HTTPException, Query, UploadFile
|
||
from fastapi.responses import FileResponse
|
||
from pydantic import BaseModel, Field
|
||
|
||
from app.api.auth import get_current_user
|
||
from app.models.user import User
|
||
from app.core.config import settings
|
||
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Every route declared in this module is registered on this router,
# under the /api/v1/voice prefix and the "voice" tag.
router = APIRouter(tags=["voice"], prefix="/api/v1/voice")
|
||
|
||
# Maps each OpenAI voice name to the Edge TTS Chinese (zh-CN) voice that is
# used when synthesis falls back to Edge TTS.
# Note that "fable" and "shimmer" intentionally share the same Edge voice.
_EDGE_VOICE_MAP = {
    "alloy": "zh-CN-YunxiNeural",
    "echo": "zh-CN-YunyangNeural",
    "fable": "zh-CN-XiaoxiaoNeural",
    "onyx": "zh-CN-YunjianNeural",
    "nova": "zh-CN-XiaoyiNeural",
    "shimmer": "zh-CN-XiaoxiaoNeural",
}
|
||
|
||
|
||
class AsrResponse(BaseModel):
    """Response body of the speech-recognition (ASR) endpoint."""

    # Transcribed text (required).
    text: str = Field(..., description="识别出的文字")
    # Language code reported back to the client; defaults to "zh".
    language: str = Field("zh", description="语言代码")
|
||
|
||
|
||
class TtsRequest(BaseModel):
    """Request body of the text-to-speech (TTS) endpoint."""

    # Text to synthesize; validated to be 1..4000 characters long.
    text: str = Field(..., min_length=1, max_length=4000, description="要合成的文字")
    # Requested voice style; defaults to "alloy".
    voice: str = Field(
        "alloy",
        description="语音风格:alloy / echo / fable / onyx / nova / shimmer",
    )
|
||
|
||
|
||
class TtsResponse(BaseModel):
    """Response body of the text-to-speech (TTS) endpoint."""

    # URL from which the generated audio file can be downloaded.
    audio_url: str = Field(..., description="音频文件下载 URL")
    # Length of the text, in characters.
    text_length: int = Field(..., description="文字长度")
    # Voice style that was used.
    voice: str = Field(..., description="使用的语音风格")
|
||
|
||
|
||
# Directory where generated TTS audio files are stored: the "tts_outputs"
# sub-directory of settings.LOCAL_FILE_TOOLS_ROOT.
_TTS_DIR = Path(settings.LOCAL_FILE_TOOLS_ROOT).joinpath("tts_outputs")
|
||
|
||
|
||
@router.post("/asr", response_model=AsrResponse)
async def voice_to_text(
    file: UploadFile = File(..., description="音频文件(AAC/WAV/MP3/WebM/M4A)"),
    language: str = Query("zh", description="语言代码"),
    current_user: User = Depends(get_current_user),
):
    """Transcribe an uploaded audio file to text (ASR).

    The audio is forwarded as a multipart upload to the ``/audio/transcriptions``
    endpoint (model ``whisper-1``) of the service configured through
    ``settings.OPENAI_BASE_URL`` and ``settings.OPENAI_API_KEY``.  Intended for
    AAC recordings uploaded by the Android client; a 16000 Hz sample rate is
    recommended.

    Args:
        file: Uploaded audio file.  The extension of its file name selects the
            format; a name without an extension is treated as ``aac``.
        language: Language code passed through to the transcription service
            and echoed in the response.
        current_user: Authenticated user (used for logging only).

    Returns:
        AsrResponse with the recognized text and the requested language.

    Raises:
        HTTPException: 400 for a missing file name, unsupported extension,
            empty upload or an upload larger than 25 MB; 503 when no API key
            is configured; 502 when the transcription service cannot be
            reached or answers with an error / unparsable body.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    ext = file.filename.rsplit(".", 1)[-1].lower() if "." in file.filename else "aac"
    allowed = {"aac", "wav", "mp3", "webm", "m4a", "ogg", "flac", "mpeg"}
    if ext not in allowed:
        raise HTTPException(status_code=400, detail=f"不支持的音频格式: {ext}")

    # Fail fast on missing configuration, before buffering the upload.
    api_key = (getattr(settings, "OPENAI_API_KEY", "") or "").strip()
    if not api_key:
        raise HTTPException(status_code=503, detail="ASR 服务未配置 (OPENAI_API_KEY)")

    base_url = (
        getattr(settings, "OPENAI_BASE_URL", "https://api.openai.com/v1")
        or "https://api.openai.com/v1"
    ).strip()

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large body into memory.
    max_bytes = 25 * 1024 * 1024  # 25 MB
    content = await file.read(max_bytes + 1)
    if len(content) > max_bytes:
        raise HTTPException(
            status_code=400,
            detail=f"文件过大 (上限 {max_bytes // 1024 // 1024} MB)",
        )
    if not content:
        raise HTTPException(status_code=400, detail="音频文件为空")

    # Imported lazily, like elsewhere in this module.
    import httpx

    # The bytes are sent straight from memory; no temporary file is needed.
    # The upload name only has to carry the right extension.
    files_payload = {
        "file": (f"audio.{ext}", content, f"audio/{ext}"),
        "model": (None, "whisper-1"),
        "language": (None, language),
    }

    try:
        async with httpx.AsyncClient(timeout=120) as client:
            resp = await client.post(
                f"{base_url.rstrip('/')}/audio/transcriptions",
                headers={"Authorization": f"Bearer {api_key}"},
                files=files_payload,
            )
    except httpx.HTTPError as exc:
        # Timeouts / connection failures are upstream problems, not a 500.
        logger.error("Whisper API 请求失败: %s", exc)
        raise HTTPException(status_code=502, detail="语音识别服务不可用") from exc

    if resp.status_code != 200:
        logger.error("Whisper API 错误 %d: %s", resp.status_code, resp.text[:500])
        raise HTTPException(
            status_code=502,
            detail=f"语音识别服务返回错误 (HTTP {resp.status_code})",
        )

    try:
        data = resp.json()
    except ValueError as exc:
        logger.error("Whisper API 返回了无法解析的响应: %s", resp.text[:500])
        raise HTTPException(status_code=502, detail="语音识别服务返回了无效响应") from exc

    # Tolerate a missing or null "text" field.
    text = (data.get("text") if isinstance(data, dict) else None) or ""
    logger.info("ASR 完成: user=%s file=%s len=%d", current_user.id, file.filename, len(text))

    return AsrResponse(text=text, language=language)
|
||
|
||
|
||
@router.post("/tts", response_model=TtsResponse)
async def text_to_voice(
    req: TtsRequest,
    current_user: User = Depends(get_current_user),
):
    """Synthesize speech from text (TTS) and return a download URL.

    OpenAI TTS (``/audio/speech``, model ``tts-1``) is tried first when a
    non-placeholder ``settings.OPENAI_API_KEY`` is configured.  If it is not
    configured, answers with a non-200 status, or raises, synthesis falls back
    to Edge TTS.  The audio is written as an MP3 file into ``_TTS_DIR``; the
    Android client plays the returned ``audio_url`` with ExoPlayer.

    Args:
        req: Text and requested voice.  An unknown voice falls back to
            ``"alloy"``.
        current_user: Authenticated user (used for the file name and logging).

    Returns:
        TtsResponse with the audio URL, the length of the submitted text and
        the voice actually used.

    Raises:
        HTTPException: 400 when the text is empty after stripping whitespace;
            502 when Edge TTS fails.
    """
    import secrets

    valid_voices = {"alloy", "echo", "fable", "onyx", "nova", "shimmer"}
    voice = req.voice if req.voice in valid_voices else "alloy"

    text = req.text.strip()
    if not text:
        # Whitespace-only input passes the model's min_length check but
        # cannot be synthesized; reject it instead of failing upstream.
        raise HTTPException(status_code=400, detail="文本内容为空")
    if len(text) > 4000:
        # Defensive only: the request model already caps the text at 4000.
        text = text[:4000] + "..."

    _TTS_DIR.mkdir(parents=True, exist_ok=True)
    # The download route takes no authentication, so the name must be
    # unguessable.  The random token also prevents two requests from the same
    # user within one second from overwriting each other's file.
    filename = f"tts_{current_user.id}_{int(time.time())}_{secrets.token_hex(16)}.mp3"
    filepath = _TTS_DIR / filename
    audio_url = f"/api/v1/voice/audio/{filename}"

    api_key = (getattr(settings, "OPENAI_API_KEY", "") or "").strip()
    base_url = (
        getattr(settings, "OPENAI_BASE_URL", "https://api.openai.com/v1")
        or "https://api.openai.com/v1"
    ).strip()

    # Treat template placeholder keys as "not configured".  "sk-your-" is a
    # prefix of such placeholders, so it is matched with startswith().
    openai_configured = (
        bool(api_key)
        and api_key != "your-openai-api-key"
        and not api_key.startswith("sk-your-")
    )

    if openai_configured:
        import httpx

        try:
            async with httpx.AsyncClient(timeout=60) as client:
                resp = await client.post(
                    f"{base_url.rstrip('/')}/audio/speech",
                    headers={
                        "Authorization": f"Bearer {api_key}",
                        "Content-Type": "application/json",
                    },
                    json={"model": "tts-1", "voice": voice, "input": text},
                )

            if resp.status_code == 200:
                filepath.write_bytes(resp.content)
                logger.info(
                    "OpenAI TTS 完成: user=%s text_len=%d voice=%s",
                    current_user.id, len(req.text), voice,
                )
                return TtsResponse(
                    audio_url=audio_url,
                    text_length=len(req.text),
                    voice=voice,
                )
            logger.warning("OpenAI TTS 失败 (%d), 回退到 Edge TTS", resp.status_code)
        except Exception as exc:
            # Deliberately broad: any OpenAI-side failure falls back to Edge TTS.
            logger.warning("OpenAI TTS 异常: %s, 回退到 Edge TTS", exc)

    # Fallback: Edge TTS (free, needs no API key).
    edge_voice = _EDGE_VOICE_MAP.get(voice, "zh-CN-YunxiNeural")
    try:
        await _edge_tts_synthesize(text, edge_voice, str(filepath))
    except Exception as exc:
        logger.error("Edge TTS 失败: %s", exc)
        # Do not leave a partial or empty output file behind.
        filepath.unlink(missing_ok=True)
        raise HTTPException(status_code=502, detail=f"TTS 服务不可用: {exc}") from exc

    logger.info(
        "Edge TTS 完成: user=%s text_len=%d edge_voice=%s",
        current_user.id, len(req.text), edge_voice,
    )
    return TtsResponse(
        audio_url=audio_url,
        text_length=len(req.text),
        voice=voice,
    )
|
||
|
||
|
||
async def _edge_tts_synthesize(
    text: str, voice: str, output_path: str, timeout: float = 120.0
) -> None:
    """Synthesize speech by running the ``edge-tts`` command-line tool.

    The CLI runs as a separate process, which keeps it clear of this
    process's asyncio event loop.

    Args:
        text: Text to synthesize.
        voice: Edge TTS voice name passed to ``--voice``.
        output_path: Path of the audio file to write (``--write-media``).
        timeout: Maximum number of seconds to wait for the CLI before it is
            killed.

    Raises:
        RuntimeError: If the CLI times out, exits with a non-zero status, or
            finishes without producing the output file.
        OSError: If the executable cannot be started at all.
    """
    import shutil

    # Look on PATH first, then next to the running interpreter.  A POSIX
    # virtualenv puts console scripts in the interpreter's own directory,
    # Windows puts them in a "Scripts" directory.
    interpreter_dir = os.path.dirname(sys.executable)
    candidate_dirs = [
        os.environ.get("PATH", ""),
        interpreter_dir,
        os.path.join(interpreter_dir, "Scripts"),
        os.path.join(interpreter_dir, "..", "Scripts"),
    ]
    search_path = os.pathsep.join(d for d in candidate_dirs if d)
    exe = (
        shutil.which("edge-tts")
        or shutil.which("edge-tts", path=search_path)
        or "edge-tts"  # last resort: let the OS resolve it
    )

    logger.debug("edge-tts exe: %s", exe)
    proc = await asyncio.create_subprocess_exec(
        exe,
        # "--text=<value>" instead of "--text", "<value>": argparse would
        # otherwise mistake text that starts with "-" for another option.
        f"--text={text}",
        "--voice", voice,
        "--write-media", output_path,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )

    try:
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        # A hung CLI must not block the request forever: kill and reap it.
        try:
            proc.kill()
        except ProcessLookupError:
            pass  # the process exited on its own in the meantime
        await proc.wait()
        raise RuntimeError(f"edge-tts CLI 超时 ({timeout:g}s)") from None

    out_str = stdout.decode(errors="replace") if stdout else ""
    err_str = stderr.decode(errors="replace") if stderr else ""

    if proc.returncode != 0:
        raise RuntimeError(f"edge-tts CLI 失败 (exit={proc.returncode}): {err_str or out_str}")

    if not Path(output_path).is_file():
        raise RuntimeError(f"edge-tts 未生成输出文件: {out_str} {err_str}")

    logger.info("edge-tts CLI 成功: %d bytes", Path(output_path).stat().st_size)
|
||
|
||
|
||
@router.get("/audio/{filename}")
async def get_tts_audio(filename: str):
    """Serve a generated TTS audio file from ``_TTS_DIR``.

    No authentication dependency is declared on this route.
    NOTE(review): the original comment justifies this by saying the URL
    contains a random name - confirm the producer really generates
    unguessable file names.

    Args:
        filename: Bare file name inside ``_TTS_DIR``.

    Returns:
        FileResponse streaming the file as ``audio/mpeg``.

    Raises:
        HTTPException: 400 when the name contains ``..`` or a path separator;
            404 when no such file exists.
    """
    # Reject anything that could escape the TTS output directory.
    forbidden_tokens = ("..", "/", "\\")
    if any(token in filename for token in forbidden_tokens):
        raise HTTPException(status_code=400, detail="非法文件名")

    audio_path = _TTS_DIR / filename
    if audio_path.is_file():
        return FileResponse(audio_path, media_type="audio/mpeg")

    raise HTTPException(status_code=404, detail="音频文件不存在或已过期")
|