# aiagent/backend/app/api/voice.py

"""
语音 API — Android 应用语音交互接口

提供语音转文字 (ASR) 和文字转语音 (TTS) 的 HTTP API。
TTS 优先使用 OpenAI TTS（需配置 OPENAI_API_KEY），否则使用免费的 Edge TTS。
"""

from __future__ import annotations

import asyncio
import logging
import os
import subprocess
import sys
import tempfile
import time
from pathlib import Path

from fastapi import APIRouter, Depends, File, HTTPException, Query, UploadFile
from fastapi.responses import FileResponse
from pydantic import BaseModel, Field

from app.api.auth import get_current_user
from app.models.user import User
from app.core.config import settings

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Router for the voice endpoints; its routes are registered under the
# "/api/v1/voice" prefix and tagged "voice".
router = APIRouter(prefix="/api/v1/voice", tags=["voice"])

# Maps each OpenAI TTS voice name accepted by the /tts endpoint to the Edge TTS
# Chinese (zh-CN) voice that is used when synthesis falls back to Edge TTS.
_EDGE_VOICE_MAP = {
    "alloy": "zh-CN-YunxiNeural",
    "echo": "zh-CN-YunyangNeural",
    "fable": "zh-CN-XiaoxiaoNeural",
    "onyx": "zh-CN-YunjianNeural",
    "nova": "zh-CN-XiaoyiNeural",
    "shimmer": "zh-CN-XiaoxiaoNeural",
}


class AsrResponse(BaseModel):
    """Response body of the speech-recognition (ASR) endpoint."""

    # Recognized text (required).
    text: str = Field(default=..., description="识别出的文字")
    # Language code of the recognition request; defaults to "zh".
    language: str = Field("zh", description="语言代码")


class TtsRequest(BaseModel):
    """Request body of the text-to-speech (TTS) endpoint."""

    # Text to synthesize (required, 1 to 4000 characters).
    text: str = Field(
        default=...,
        min_length=1,
        max_length=4000,
        description="要合成的文字",
    )
    # Requested voice style; defaults to "alloy".
    voice: str = Field(
        "alloy",
        description="语音风格：alloy / echo / fable / onyx / nova / shimmer",
    )


class TtsResponse(BaseModel):
    """Response body of the text-to-speech (TTS) endpoint."""

    # URL from which the generated audio file can be downloaded (required).
    audio_url: str = Field(default=..., description="音频文件下载 URL")
    # Length of the submitted text (required).
    text_length: int = Field(default=..., description="文字长度")
    # Voice style that was actually used (required).
    voice: str = Field(default=..., description="使用的语音风格")


# Directory for TTS output files: the /tts endpoint writes generated audio here
# and the /audio/{filename} endpoint serves files from it.
_TTS_DIR = Path(settings.LOCAL_FILE_TOOLS_ROOT) / "tts_outputs"


@router.post("/asr", response_model=AsrResponse)
async def voice_to_text(
    file: UploadFile = File(..., description="音频文件（AAC/WAV/MP3/WebM/M4A）"),
    language: str = Query("zh", description="语言代码"),
    current_user: User = Depends(get_current_user),
):
    """
    Speech-to-text (ASR).

    Validates the uploaded audio file, forwards it to the
    ``/audio/transcriptions`` endpoint (model ``whisper-1``) of the API
    configured through ``OPENAI_API_KEY`` / ``OPENAI_BASE_URL``, and returns
    the recognized text. Intended for recordings uploaded by the Android
    client; a 16000 Hz sample rate is recommended.

    Args:
        file: Uploaded audio file; the format is taken from the filename
            extension ("aac" is assumed when there is none).
        language: Language code passed to the transcription API and echoed
            back in the response.
        current_user: Authenticated user (used for logging only).

    Returns:
        AsrResponse with the recognized text and the requested language.

    Raises:
        HTTPException(400): missing filename, unsupported extension, empty
            upload, or upload larger than 25 MB.
        HTTPException(503): no ``OPENAI_API_KEY`` is configured.
        HTTPException(502): the transcription service is unreachable,
            answers with a non-200 status, or returns an unparsable body.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    ext = file.filename.rsplit(".", 1)[-1].lower() if "." in file.filename else "aac"
    allowed = {"aac", "wav", "mp3", "webm", "m4a", "ogg", "flac", "mpeg"}
    if ext not in allowed:
        raise HTTPException(status_code=400, detail=f"不支持的音频格式: {ext}")

    # Check configuration before touching the upload so an unconfigured
    # service fails fast instead of buffering the file first.
    api_key = (getattr(settings, "OPENAI_API_KEY", "") or "").strip()
    if not api_key:
        raise HTTPException(status_code=503, detail="ASR 服务未配置 (OPENAI_API_KEY)")
    base_url = (
        getattr(settings, "OPENAI_BASE_URL", "https://api.openai.com/v1")
        or "https://api.openai.com/v1"
    ).strip()

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without loading an arbitrarily large file into memory.
    max_bytes = 25 * 1024 * 1024  # 25 MB
    content = await file.read(max_bytes + 1)
    if len(content) > max_bytes:
        raise HTTPException(
            status_code=400,
            detail=f"文件过大 (超过 {max_bytes // 1024 // 1024} MB)",
        )
    if not content:
        raise HTTPException(status_code=400, detail="音频文件为空")

    import httpx

    # The bytes are sent directly; no temporary file is needed, which avoids
    # blocking disk I/O inside the event loop and any cleanup on failure.
    try:
        async with httpx.AsyncClient(timeout=120) as client:
            resp = await client.post(
                f"{base_url.rstrip('/')}/audio/transcriptions",
                headers={"Authorization": f"Bearer {api_key}"},
                data={"model": "whisper-1", "language": language},
                files={"file": (f"audio.{ext}", content, f"audio/{ext}")},
            )
    except httpx.HTTPError as exc:
        # Timeouts / connection failures are upstream problems, not server bugs.
        logger.error("Whisper API 请求失败: %s", exc)
        raise HTTPException(status_code=502, detail="语音识别服务不可用") from exc

    if resp.status_code != 200:
        logger.error("Whisper API 错误 %d: %s", resp.status_code, resp.text[:500])
        raise HTTPException(
            status_code=502,
            detail=f"语音识别服务返回错误 (HTTP {resp.status_code})",
        )

    try:
        data = resp.json()
    except ValueError as exc:
        logger.error("Whisper API 返回了无法解析的响应: %s", resp.text[:500])
        raise HTTPException(status_code=502, detail="语音识别服务返回了无效响应") from exc

    # Tolerate a missing or null "text" field rather than failing validation.
    text = (data.get("text") or "") if isinstance(data, dict) else ""
    logger.info("ASR 完成: user=%s file=%s len=%d", current_user.id, file.filename, len(text))

    return AsrResponse(text=text, language=language)


@router.post("/tts", response_model=TtsResponse)
async def text_to_voice(
    req: TtsRequest,
    current_user: User = Depends(get_current_user),
):
    """
    Text-to-speech (TTS).

    Uses OpenAI TTS (model ``tts-1``) when a usable ``OPENAI_API_KEY`` is
    configured; when no key is configured, or the OpenAI call fails, falls
    back to the free Edge TTS. The audio is written to the TTS output
    directory and the response carries the URL from which it can be
    downloaded (the Android client plays it with ExoPlayer).

    Args:
        req: Text to synthesize and the requested voice; an unknown voice
            falls back to "alloy".
        current_user: Authenticated user (its id is part of the filename).

    Returns:
        TtsResponse with the download URL, the length of the submitted text
        and the voice that was used.

    Raises:
        HTTPException(400): the text is empty after stripping whitespace.
        HTTPException(502): Edge TTS synthesis failed.
    """
    import secrets

    valid_voices = {"alloy", "echo", "fable", "onyx", "nova", "shimmer"}
    voice = req.voice if req.voice in valid_voices else "alloy"

    # TtsRequest already caps the text at 4000 characters, so only the
    # whitespace-only case (which passes min_length=1) needs handling here.
    text = req.text.strip()
    if not text:
        raise HTTPException(status_code=400, detail="要合成的文字为空")

    _TTS_DIR.mkdir(parents=True, exist_ok=True)
    # The random token keeps two requests from the same user within one second
    # from overwriting each other, and makes the name hard to guess (the
    # download route takes only the filename).
    filename = f"tts_{current_user.id}_{int(time.time())}_{secrets.token_hex(8)}.mp3"
    filepath = _TTS_DIR / filename

    api_key = (getattr(settings, "OPENAI_API_KEY", "") or "").strip()
    base_url = (
        getattr(settings, "OPENAI_BASE_URL", "https://api.openai.com/v1")
        or "https://api.openai.com/v1"
    ).strip()
    # Template values such as "sk-your-..." are placeholders, not real keys.
    is_placeholder = api_key == "your-openai-api-key" or api_key.startswith("sk-your-")

    synthesized = False
    if api_key and not is_placeholder:
        import httpx

        # Deliberately broad: any OpenAI-side failure should fall back to
        # Edge TTS instead of failing the request.
        try:
            async with httpx.AsyncClient(timeout=60) as client:
                resp = await client.post(
                    f"{base_url.rstrip('/')}/audio/speech",
                    headers={
                        "Authorization": f"Bearer {api_key}",
                        "Content-Type": "application/json",
                    },
                    json={"model": "tts-1", "voice": voice, "input": text},
                )

            if resp.status_code == 200:
                filepath.write_bytes(resp.content)
                logger.info("OpenAI TTS 完成: user=%s text_len=%d voice=%s", current_user.id, len(req.text), voice)
                synthesized = True
            else:
                logger.warning("OpenAI TTS 失败 (%d), 回退到 Edge TTS", resp.status_code)
        except Exception as exc:
            logger.warning("OpenAI TTS 异常: %s, 回退到 Edge TTS", exc)

    # Fallback: Edge TTS (free, needs no API key).
    if not synthesized:
        edge_voice = _EDGE_VOICE_MAP.get(voice, "zh-CN-YunxiNeural")
        try:
            await _edge_tts_synthesize(text, edge_voice, str(filepath))
        except Exception as exc:
            logger.error("Edge TTS 失败: %s", exc)
            raise HTTPException(status_code=502, detail=f"TTS 服务不可用: {exc}") from exc
        logger.info("Edge TTS 完成: user=%s text_len=%d edge_voice=%s", current_user.id, len(req.text), edge_voice)

    return TtsResponse(
        audio_url=f"/api/v1/voice/audio/{filename}",
        text_length=len(req.text),
        voice=voice,
    )


async def _edge_tts_synthesize(
    text: str,
    voice: str,
    output_path: str,
    timeout: float = 120.0,
) -> None:
    """Synthesize *text* into an audio file by running the ``edge-tts`` CLI.

    The CLI runs as a separate process (keeping it clear of this process's
    asyncio event loop) and writes the audio to *output_path*.

    Args:
        text: Text to synthesize.
        voice: Edge TTS voice name, e.g. "zh-CN-YunxiNeural".
        output_path: Path of the audio file to create.
        timeout: Maximum number of seconds to wait for the CLI to finish.

    Raises:
        RuntimeError: the CLI timed out, exited with a non-zero status, or
            produced no (or an empty) output file.
        OSError: the executable could not be started.
    """
    import shutil

    # Console scripts may sit next to the interpreter (POSIX virtualenvs) or in
    # a "Scripts" directory (Windows) that is not on PATH, so search those too.
    interpreter_dir = os.path.dirname(sys.executable)
    candidates = (
        os.environ.get("PATH", ""),
        interpreter_dir,
        os.path.join(interpreter_dir, "Scripts"),
        os.path.join(interpreter_dir, "..", "Scripts"),
    )
    search_path = os.pathsep.join(entry for entry in candidates if entry)
    # Last resort: hand the bare name to the OS and let it resolve it.
    exe = (
        shutil.which("edge-tts")
        or shutil.which("edge-tts", path=search_path)
        or "edge-tts"
    )

    logger.debug("edge-tts exe: %s", exe)
    # "--opt=value" form: a value that starts with "-" (e.g. text "-5 度")
    # would otherwise be parsed as another option and make the CLI fail.
    proc = await asyncio.create_subprocess_exec(
        exe,
        f"--text={text}",
        f"--voice={voice}",
        f"--write-media={output_path}",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        # Do not leave a hung child process behind.
        try:
            proc.kill()
        except ProcessLookupError:
            pass  # already exited
        await proc.wait()
        raise RuntimeError(f"edge-tts CLI 超时 (>{timeout:g}s)") from None

    out_str = stdout.decode(errors="replace") if stdout else ""
    err_str = stderr.decode(errors="replace") if stderr else ""

    if proc.returncode != 0:
        raise RuntimeError(f"edge-tts CLI 失败 (exit={proc.returncode}): {err_str or out_str}")

    result = Path(output_path)
    # An empty file is as useless to the player as a missing one.
    if not result.is_file() or result.stat().st_size == 0:
        raise RuntimeError(f"edge-tts 未生成输出文件: {out_str} {err_str}")

    logger.info("edge-tts CLI 成功: %d bytes", result.stat().st_size)


@router.get("/audio/{filename}")
async def get_tts_audio(filename: str):
    """Serve a generated TTS audio file from the TTS output directory.

    The route declares no authentication dependency.
    NOTE(review): access control therefore rests on the filename being hard
    to guess -- confirm that the code producing these files uses
    unguessable names.

    Raises:
        HTTPException(400): the name contains "..", "/" or a backslash.
        HTTPException(404): no such file exists in the TTS output directory.
    """
    # Reject anything that could point outside the TTS output directory.
    forbidden_tokens = ("..", "/", "\\")
    if any(token in filename for token in forbidden_tokens):
        raise HTTPException(status_code=400, detail="非法文件名")

    audio_path = _TTS_DIR / filename
    if audio_path.is_file():
        return FileResponse(audio_path, media_type="audio/mpeg")

    raise HTTPException(status_code=404, detail="音频文件不存在或已过期")