Files
aiagent/backend/app/api/voice.py
renjianbo beff3fac8d fix: delete agent 500 error + dynamic personality + deployment guide
- Fix delete agent 500: clean up FK records (agent_llm_logs, permissions,
  schedules, executions, team_members) and unbind goals/tasks before delete
- Remove hardcoded personality templates in Android, replace with dynamic
  system prompt generation from name + description
- Set promptSectionsEnabled=false to bypass PromptComposer for personality
- Add Tencent Cloud Linux deployment guide (Docker Compose)
- Accumulated backend service updates, frontend UI fixes, Android app changes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-29 01:17:21 +08:00

262 lines
9.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
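Voice API — voice interaction endpoints for the Android app.
Provides HTTP APIs for speech-to-text (ASR) and text-to-speech (TTS).
TTS prefers OpenAI TTS (requires OPENAI_API_KEY to be configured); otherwise it falls back to the free Edge TTS.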
"""
from __future__ import annotations
import asyncio
import logging
import os
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from fastapi import APIRouter, Depends, File, HTTPException, Query, UploadFile
from fastapi.responses import FileResponse
from pydantic import BaseModel, Field
from app.api.auth import get_current_user
from app.models.user import User
from app.core.config import settings
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router for the voice endpoints; its routes are registered under the /api/v1/voice prefix.
router = APIRouter(prefix="/api/v1/voice", tags=["voice"])
# Mapping from OpenAI voice names to Edge TTS Chinese (zh-CN) voice names.
# Used to pick an Edge TTS voice when synthesis falls back from OpenAI TTS.
# Note that "fable" and "shimmer" map to the same Edge voice.
_EDGE_VOICE_MAP = {
    "alloy": "zh-CN-YunxiNeural",
    "echo": "zh-CN-YunyangNeural",
    "fable": "zh-CN-XiaoxiaoNeural",
    "onyx": "zh-CN-YunjianNeural",
    "nova": "zh-CN-XiaoyiNeural",
    "shimmer": "zh-CN-XiaoxiaoNeural",
}
class AsrResponse(BaseModel):
    """Response body of the speech-recognition (ASR) endpoint."""

    # Recognized text (required).
    text: str = Field(
        ...,
        description="识别出的文字",
    )
    # Language code; defaults to "zh".
    language: str = Field(
        description="语言代码",
        default="zh",
    )
class TtsRequest(BaseModel):
    """Request body of the text-to-speech (TTS) endpoint."""

    # Text to synthesize (required); between 1 and 4000 characters.
    text: str = Field(
        ...,
        description="要合成的文字",
        min_length=1,
        max_length=4000,
    )
    # Voice style name; defaults to "alloy".
    voice: str = Field(
        description="语音风格alloy / echo / fable / onyx / nova / shimmer",
        default="alloy",
    )
class TtsResponse(BaseModel):
    """Response body of the text-to-speech (TTS) endpoint."""

    # URL from which the generated audio file can be downloaded (required).
    audio_url: str = Field(
        ...,
        description="音频文件下载 URL",
    )
    # Length of the submitted text (required).
    text_length: int = Field(
        ...,
        description="文字长度",
    )
    # Voice style that was used (required).
    voice: str = Field(
        ...,
        description="使用的语音风格",
    )
# Directory holding generated TTS audio files: "tts_outputs" under
# settings.LOCAL_FILE_TOOLS_ROOT.
_TTS_DIR = Path(settings.LOCAL_FILE_TOOLS_ROOT) / "tts_outputs"
@router.post("/asr", response_model=AsrResponse)
async def voice_to_text(
    file: UploadFile = File(..., description="音频文件AAC/WAV/MP3/WebM/M4A"),
    language: str = Query("zh", description="语言代码"),
    current_user: User = Depends(get_current_user),
):
    """Transcribe an uploaded audio file to text (ASR).

    The upload is validated (file name, extension, size) and then forwarded
    to the ``/audio/transcriptions`` endpoint of the configured
    OpenAI-compatible API using the ``whisper-1`` model.

    Args:
        file: Uploaded audio file. Its extension must be one of aac, wav,
            mp3, webm, m4a, ogg, flac or mpeg; a name without an extension
            is treated as aac.
        language: Language code forwarded to the transcription API and
            echoed back in the response.
        current_user: Authenticated user; only used for logging here.

    Returns:
        AsrResponse carrying the recognized text and the language code.

    Raises:
        HTTPException: 400 for a missing file name, an unsupported
            extension, an empty upload or an upload larger than 25 MB;
            503 when OPENAI_API_KEY is not configured; 502 when the
            transcription service cannot be reached, answers with a
            non-200 status, or returns a body that cannot be parsed.
    """
    # --- Validate the upload -------------------------------------------
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    _stem, dot, suffix = file.filename.rpartition(".")
    ext = suffix.lower() if dot else "aac"
    allowed = {"aac", "wav", "mp3", "webm", "m4a", "ogg", "flac", "mpeg"}
    if ext not in allowed:
        raise HTTPException(status_code=400, detail=f"不支持的音频格式: {ext}")

    content = await file.read()
    if not content:
        # Reject here instead of spending an upstream call that can only fail.
        raise HTTPException(status_code=400, detail="音频文件为空")
    max_bytes = 25 * 1024 * 1024  # 25 MB
    if len(content) > max_bytes:
        raise HTTPException(
            status_code=400,
            detail=f"文件过大 ({len(content) / 1024 / 1024:.1f} MB)",
        )

    # --- Resolve upstream configuration --------------------------------
    default_base_url = "https://api.openai.com/v1"
    api_key = (getattr(settings, "OPENAI_API_KEY", "") or "").strip()
    base_url = (
        getattr(settings, "OPENAI_BASE_URL", default_base_url) or default_base_url
    ).strip()
    if not api_key:
        raise HTTPException(status_code=503, detail="ASR 服务未配置 (OPENAI_API_KEY)")

    # --- Call the transcription API ------------------------------------
    import httpx

    # The audio is already in memory, so it is sent as bytes directly; no
    # temporary file (and no blocking disk I/O on the event loop) is needed.
    # The upstream only needs a file name with the right extension.
    form_parts = {
        "file": (f"audio.{ext}", content, f"audio/{ext}"),
        "model": (None, "whisper-1"),
        "language": (None, language),
    }
    try:
        async with httpx.AsyncClient(timeout=120) as client:
            resp = await client.post(
                f"{base_url.rstrip('/')}/audio/transcriptions",
                headers={"Authorization": f"Bearer {api_key}"},
                files=form_parts,
            )
    except httpx.HTTPError as exc:
        # Timeouts / connection failures are an upstream problem: report 502
        # rather than letting them bubble up as an unhandled 500.
        logger.error("Whisper API 请求失败: %s", exc)
        raise HTTPException(status_code=502, detail="语音识别服务不可用") from exc

    if resp.status_code != 200:
        logger.error("Whisper API 错误 %d: %s", resp.status_code, resp.text[:500])
        raise HTTPException(
            status_code=502,
            detail=f"语音识别服务返回错误 (HTTP {resp.status_code})",
        )

    try:
        payload = resp.json()
    except ValueError as exc:
        logger.error("Whisper API 返回了非 JSON 响应: %s", resp.text[:500])
        raise HTTPException(
            status_code=502,
            detail="语音识别服务返回了无法解析的响应",
        ) from exc

    # Tolerate a missing / null "text" field or a non-object payload.
    text = (payload.get("text") if isinstance(payload, dict) else None) or ""

    logger.info(
        "ASR 完成: user=%s file=%s len=%d",
        current_user.id,
        file.filename,
        len(text),
    )
    return AsrResponse(text=text, language=language)
@router.post("/tts", response_model=TtsResponse)
async def text_to_voice(
    req: TtsRequest,
    current_user: User = Depends(get_current_user),
):
    """Synthesize speech for the given text (TTS).

    OpenAI TTS (``tts-1``) is tried first when a non-placeholder
    OPENAI_API_KEY is configured; on any failure, or when no usable key is
    configured, synthesis falls back to Edge TTS. The audio is written to
    the TTS output directory and a download URL is returned.

    Args:
        req: Text to synthesize and the requested voice. Unknown voice
            names are replaced by "alloy".
        current_user: Authenticated user; used in the output file name and
            for logging.

    Returns:
        TtsResponse with the audio URL, the length of the submitted text
        and the voice that was used.

    Raises:
        HTTPException: 400 when the text is empty after stripping
            whitespace; 502 when Edge TTS synthesis fails.
    """
    import secrets

    valid_voices = {"alloy", "echo", "fable", "onyx", "nova", "shimmer"}
    voice = req.voice if req.voice in valid_voices else "alloy"

    # TtsRequest caps the text at 4000 characters and stripping can only
    # shorten it, so no further truncation is needed. Whitespace-only input
    # passes the model's min_length check, so reject it here.
    text = req.text.strip()
    if not text:
        raise HTTPException(status_code=400, detail="文本内容为空")

    _TTS_DIR.mkdir(parents=True, exist_ok=True)
    # The random token keeps the name unguessable (the download route takes
    # no authentication) and prevents two requests from the same user within
    # one second from overwriting each other's output.
    filename = (
        f"tts_{current_user.id}_{int(time.time())}_{secrets.token_hex(8)}.mp3"
    )
    filepath = _TTS_DIR / filename
    audio_url = f"/api/v1/voice/audio/{filename}"

    default_base_url = "https://api.openai.com/v1"
    api_key = (getattr(settings, "OPENAI_API_KEY", "") or "").strip()
    base_url = (
        getattr(settings, "OPENAI_BASE_URL", default_base_url) or default_base_url
    ).strip()

    # Template values left over from sample configuration are not real keys.
    # "sk-your-" is matched as a prefix so values such as "sk-your-key-here"
    # are recognized as placeholders too.
    is_placeholder = api_key == "your-openai-api-key" or api_key.startswith("sk-your-")

    # --- Preferred path: OpenAI TTS ------------------------------------
    if api_key and not is_placeholder:
        import httpx

        try:
            async with httpx.AsyncClient(timeout=60) as client:
                resp = await client.post(
                    f"{base_url.rstrip('/')}/audio/speech",
                    headers={
                        "Authorization": f"Bearer {api_key}",
                        "Content-Type": "application/json",
                    },
                    json={"model": "tts-1", "voice": voice, "input": text},
                )
            if resp.status_code == 200:
                filepath.write_bytes(resp.content)
                logger.info(
                    "OpenAI TTS 完成: user=%s text_len=%d voice=%s",
                    current_user.id,
                    len(req.text),
                    voice,
                )
                return TtsResponse(
                    audio_url=audio_url,
                    text_length=len(req.text),
                    voice=voice,
                )
            logger.warning("OpenAI TTS 失败 (%d), 回退到 Edge TTS", resp.status_code)
        except Exception as exc:
            # Deliberately broad: any OpenAI-side problem should degrade to
            # the Edge TTS fallback instead of failing the request.
            logger.warning("OpenAI TTS 异常: %s, 回退到 Edge TTS", exc)

    # --- Fallback path: Edge TTS (no API key required) -----------------
    edge_voice = _EDGE_VOICE_MAP.get(voice, "zh-CN-YunxiNeural")
    try:
        await _edge_tts_synthesize(text, edge_voice, str(filepath))
    except Exception as exc:
        logger.error("Edge TTS 失败: %s", exc)
        # Do not leave a partial or empty file behind under a served name.
        filepath.unlink(missing_ok=True)
        raise HTTPException(status_code=502, detail=f"TTS 服务不可用: {exc}") from exc

    logger.info(
        "Edge TTS 完成: user=%s text_len=%d edge_voice=%s",
        current_user.id,
        len(req.text),
        edge_voice,
    )
    return TtsResponse(
        audio_url=audio_url,
        text_length=len(req.text),
        voice=voice,
    )
async def _edge_tts_synthesize(
    text: str,
    voice: str,
    output_path: str,
    timeout: float = 120.0,
) -> None:
    """Synthesize speech by running the ``edge-tts`` command-line tool.

    The tool is run as a child process and asked to write the audio to
    ``output_path``.

    Args:
        text: Text to synthesize.
        voice: Edge TTS voice name (for example "zh-CN-YunxiNeural").
        output_path: Path of the audio file the tool should write.
        timeout: Maximum number of seconds to wait for the tool before it
            is killed.

    Raises:
        RuntimeError: If the tool times out, exits with a non-zero status,
            or does not produce a non-empty output file.
        OSError: If the executable cannot be started.
    """
    import shutil

    # Locate the executable: the regular PATH first, then PATH extended with
    # "Scripts" directories next to the running interpreter.
    interpreter_dir = os.path.dirname(sys.executable)
    extended_path = os.pathsep.join(
        (
            os.environ.get("PATH", ""),
            os.path.join(interpreter_dir, "Scripts"),
            os.path.join(interpreter_dir, "..", "Scripts"),
        )
    )
    exe = (
        shutil.which("edge-tts")
        or shutil.which("edge-tts", path=extended_path)
        or "edge-tts"  # last resort: let the OS resolve the name
    )
    logger.debug("edge-tts exe: %s", exe)

    # Values are attached with "--opt=value" so that text (or a path)
    # beginning with "-" is never interpreted as a command-line option.
    proc = await asyncio.create_subprocess_exec(
        exe,
        f"--text={text}",
        f"--voice={voice}",
        f"--write-media={output_path}",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        raise RuntimeError(f"edge-tts CLI 超时 ({timeout:g}s)") from None
    finally:
        # On timeout or cancellation the child may still be running: kill
        # and reap it so it cannot linger as an orphan process.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass
            await proc.wait()

    out_str = stdout.decode(errors="replace") if stdout else ""
    err_str = stderr.decode(errors="replace") if stderr else ""
    if proc.returncode != 0:
        raise RuntimeError(
            f"edge-tts CLI 失败 (exit={proc.returncode}): {err_str or out_str}"
        )

    produced = Path(output_path)
    # A missing or zero-byte file means nothing playable was produced.
    if not produced.is_file() or produced.stat().st_size == 0:
        raise RuntimeError(f"edge-tts 未生成输出文件: {out_str} {err_str}")
    logger.info("edge-tts CLI 成功: %d bytes", produced.stat().st_size)
@router.get("/audio/{filename}")
async def get_tts_audio(filename: str):
    """Serve a generated TTS audio file from the TTS output directory.

    This route declares no authentication dependency.
    NOTE(review): access control therefore relies on the file name being
    hard to guess — confirm that the code generating the names ensures that.

    Args:
        filename: Bare file name inside the TTS output directory.

    Returns:
        FileResponse streaming the file as ``audio/mpeg``.

    Raises:
        HTTPException: 400 if the name contains "..", "/" or a backslash;
            404 if no such file exists.
    """
    # Reject anything that could step outside the output directory.
    forbidden_tokens = ("..", "/", "\\")
    if any(token in filename for token in forbidden_tokens):
        raise HTTPException(status_code=400, detail="非法文件名")

    audio_path = _TTS_DIR / filename
    if audio_path.is_file():
        return FileResponse(audio_path, media_type="audio/mpeg")
    raise HTTPException(status_code=404, detail="音频文件不存在或已过期")