# aiagent/backend/app/core/metrics.py

"""
Prometheus 指标收集 — HTTP 请求、业务指标、系统指标
使用 prometheus_client 原生实现，无额外框架依赖。
"""
import logging
import os
import sys
import time

import psutil
from fastapi import FastAPI, Request, Response
from prometheus_client import Counter, Histogram, Gauge, Info, generate_latest, CONTENT_TYPE_LATEST
from starlette.middleware.base import BaseHTTPMiddleware


# ─── HTTP metrics ───

# Upper bucket bounds (in bytes) shared by the two body-size histograms below.
_BODY_SIZE_BUCKETS = (100, 1024, 10240, 102400, 1048576)

# Total number of HTTP requests, split by method, endpoint and status.
http_requests_total = Counter(
    name="http_requests_total",
    documentation="HTTP 请求总数",
    labelnames=("method", "endpoint", "status"),
)

# Distribution of HTTP request body sizes, split by method and endpoint.
http_request_size_bytes = Histogram(
    name="http_request_size_bytes",
    documentation="HTTP 请求体大小",
    labelnames=("method", "endpoint"),
    buckets=_BODY_SIZE_BUCKETS,
)

# Distribution of HTTP response body sizes, split by method, endpoint and status.
http_response_size_bytes = Histogram(
    name="http_response_size_bytes",
    documentation="HTTP 响应体大小",
    labelnames=("method", "endpoint", "status"),
    buckets=_BODY_SIZE_BUCKETS,
)

# Distribution of HTTP request latency in seconds (buckets from 10 ms to 60 s).
http_request_duration_seconds = Histogram(
    name="http_request_duration_seconds",
    documentation="HTTP 请求延迟",
    labelnames=("method", "endpoint", "status"),
    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10, 30, 60),
)

# ─── Business metrics ───

# Total number of agent executions, per agent and outcome status.
agent_executions_total = Counter(
    name="agent_executions_total",
    documentation="Agent 执行总次数",
    labelnames=("agent_id", "agent_name", "status"),
)

# Duration of a single agent execution in seconds (buckets from 1 s to 30 min).
agent_execution_duration_seconds = Histogram(
    name="agent_execution_duration_seconds",
    documentation="Agent 单次执行耗时",
    labelnames=("agent_id", "agent_name"),
    buckets=(1, 5, 10, 30, 60, 120, 300, 600, 1800),
)

# Total number of workflow executions, per workflow and outcome status.
workflow_executions_total = Counter(
    name="workflow_executions_total",
    documentation="工作流执行总次数",
    labelnames=("workflow_id", "status"),
)

# Duration of a single workflow node execution in seconds, per node type.
workflow_node_duration_seconds = Histogram(
    name="workflow_node_duration_seconds",
    documentation="工作流节点执行耗时",
    labelnames=("node_type",),
    buckets=(0.1, 0.5, 1, 2, 5, 10, 30, 60),
)

# Total number of LLM API calls, per model, provider and outcome status.
llm_calls_total = Counter(
    name="llm_calls_total",
    documentation="LLM API 调用总次数",
    labelnames=("model", "provider", "status"),
)

# Duration of a single LLM call in seconds, per model and provider.
llm_call_duration_seconds = Histogram(
    name="llm_call_duration_seconds",
    documentation="LLM 单次调用耗时",
    labelnames=("model", "provider"),
    buckets=(0.5, 1, 2, 5, 10, 20, 30, 60),
)

# LLM token consumption per model; the "type" label is "input" or "output".
llm_token_usage_total = Counter(
    name="llm_token_usage_total",
    documentation="LLM Token 用量",
    labelnames=("model", "type"),
)

# Total number of tool calls, per tool and outcome status.
tool_calls_total = Counter(
    name="tool_calls_total",
    documentation="工具调用总次数",
    labelnames=("tool_name", "status"),
)

# Current number of knowledge-base entries.
knowledge_entries_total = Gauge(
    name="knowledge_entries_total",
    documentation="知识库条目总数",
)

# Total number of knowledge-base queries, per outcome status.
knowledge_queries_total = Counter(
    name="knowledge_queries_total",
    documentation="知识库查询总次数",
    labelnames=("status",),
)

# Number of currently active sessions.
active_sessions = Gauge(
    name="active_sessions",
    documentation="当前活跃会话数",
)

# Total number of logins, per client type and outcome status.
login_total = Counter(
    name="login_total",
    documentation="登录总次数",
    labelnames=("client_type", "status"),
)

# Total number of push notifications, per channel and outcome status.
push_notifications_total = Counter(
    name="push_notifications_total",
    documentation="推送通知总次数",
    labelnames=("channel", "status"),
)

# Total number of scheduled task runs, per task type and outcome status.
scheduled_tasks_total = Counter(
    name="scheduled_tasks_total",
    documentation="定时任务执行总次数",
    labelnames=("task_type", "status"),
)

# ─── System metrics ───

# CPU utilisation of this process, in percent.
process_cpu_percent = Gauge(
    "process_cpu_percent", "进程 CPU 使用率 (%)"
)

# Memory usage of this process in bytes; the "type" label is "rss" or "vms".
process_memory_bytes = Gauge(
    "process_memory_bytes", "进程内存使用 (bytes)",
    ["type"]
)

# Number of file descriptors opened by this process.
#
# prometheus_client's built-in process collector already publishes a series
# named "process_open_fds" on platforms where /proc is readable (Linux), and
# the default registry rejects a second collector with the same name by raising
# ValueError, which would abort the import of this module. In that case fall
# back to an unregistered gauge: `.set()` keeps working for the code that
# updates it, while the built-in collector keeps supplying the exported value.
try:
    process_open_fds = Gauge(
        "process_open_fds", "进程打开的文件描述符数"
    )
except ValueError:
    process_open_fds = Gauge(
        "process_open_fds", "进程打开的文件描述符数",
        registry=None,
    )

# Number of active database connections.
db_connections_active = Gauge(
    "db_connections_active", "活跃数据库连接数"
)

# Redis connection state: 1 = connected, 0 = disconnected.
redis_connected = Gauge(
    "redis_connected", "Redis 连接状态 (1=已连接 / 0=断开)"
)

# ─── Application info ───

# Static build/runtime information exported as a Prometheus Info metric.
app_info = Info("tiangong_app", "天工智能体平台信息")
app_info.info({
    "version": "1.0.0",
    "framework": "FastAPI",
    # `sys.version` begins with the interpreter's version number; keep only that
    # first whitespace-delimited token. Read it from `sys` directly instead of
    # through `os.sys`, which is an undocumented attribute of the os module.
    "python": sys.version.split()[0],
})


class PrometheusMiddleware(BaseHTTPMiddleware):
    """Middleware that records request count and latency for each HTTP request.

    For every request except ``/metrics`` it increments ``http_requests_total``
    and observes ``http_request_duration_seconds``, both labelled with the
    request method, the endpoint and the response status code.

    NOTE(review): the request/response body-size histograms declared in this
    module are not observed by this middleware.
    """

    async def dispatch(self, request: Request, call_next):
        """Forward the request downstream and record its metrics.

        Args:
            request: The incoming request.
            call_next: Callable that passes the request to the next handler and
                returns its response.

        Returns:
            The response produced by ``call_next``, unmodified.

        Raises:
            Whatever ``call_next`` raises. The request is still recorded in
            that case, with status ``"500"``.
        """
        # Do not instrument the scrape endpoint itself.
        if request.url.path == "/metrics":
            return await call_next(request)

        # perf_counter() is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments the way time.time() can.
        started = time.perf_counter()
        status_code = 500  # reported if call_next raises before a response exists
        try:
            response = await call_next(request)
            status_code = response.status_code
            return response
        finally:
            elapsed = time.perf_counter() - started
            # Resolve the label after call_next: the matched route is only
            # present in the scope once routing has happened.
            labels = {
                "method": request.method,
                "endpoint": self._endpoint_label(request),
                "status": str(status_code),
            }
            http_requests_total.labels(**labels).inc()
            http_request_duration_seconds.labels(**labels).observe(elapsed)

    @staticmethod
    def _endpoint_label(request: Request) -> str:
        """Return a bounded-cardinality value for the ``endpoint`` label.

        Prefers the matched route's path template (e.g. ``/agents/{agent_id}``)
        when the router has stored the matched route in ``scope["route"]``
        (FastAPI does this for its API routes), so that path parameters do not
        create one time series per distinct URL. Falls back to the raw URL path
        when no route information is available (e.g. unmatched requests).

        A single trailing-slash-insensitive form is used, and the root path is
        reported as ``"/"`` rather than an empty string.

        NOTE(review): for routes inside a mounted sub-application the template
        is presumably relative to the mount point — confirm if mounts are used.
        """
        route = request.scope.get("route")
        template = getattr(route, "path", None)
        path = template if isinstance(template, str) else request.url.path
        return path.rstrip("/") or "/"


def setup_metrics(app: FastAPI) -> None:
    """Register the Prometheus endpoints on *app*.

    Adds two routes, both hidden from the OpenAPI schema:

    * ``GET /metrics`` — refreshes the system gauges and returns all registered
      metrics in the Prometheus exposition format.
    * ``GET /metrics/system`` — returns a JSON snapshot of this process's CPU,
      memory and file-descriptor usage, intended for internal debugging.

    Both handlers are plain ``def`` functions on purpose: they call
    ``cpu_percent(interval=...)``, which blocks for the given interval. FastAPI
    runs non-async path functions in a threadpool, so the blocking sample does
    not stall the event loop.

    Args:
        app: The FastAPI application to attach the routes to.
    """

    # /metrics endpoint — scraped by Prometheus
    @app.get("/metrics", include_in_schema=False)
    def metrics_endpoint():
        _update_system_metrics()
        return Response(
            content=generate_latest(),
            media_type=CONTENT_TYPE_LATEST,
        )

    # /metrics/system endpoint — for internal debugging
    @app.get("/metrics/system", include_in_schema=False)
    def system_metrics_detail():
        process = psutil.Process(os.getpid())
        mem = process.memory_info()
        return {
            # Blocks for 0.1 s while sampling CPU usage.
            "cpu_percent": process.cpu_percent(interval=0.1),
            "memory_rss_mb": round(mem.rss / 1024 / 1024, 2),
            "memory_vms_mb": round(mem.vms / 1024 / 1024, 2),
            # None when the descriptor count cannot be read.
            "open_fds": _safe_get_fds(process),
        }


def _update_system_metrics() -> None:
    """Refresh the process-level gauges; called on every ``/metrics`` scrape.

    Sets the RSS/VMS memory gauges, the CPU percentage gauge (sampled over a
    blocking 0.05 s interval) and, when it can be read, the open file
    descriptor gauge.

    Collection is best-effort: any failure is logged as a warning and
    suppressed so that a problem reading system statistics never breaks the
    scrape itself.
    """
    try:
        process = psutil.Process(os.getpid())
        mem = process.memory_info()
        process_memory_bytes.labels(type="rss").set(mem.rss)
        process_memory_bytes.labels(type="vms").set(mem.vms)
        process_cpu_percent.set(process.cpu_percent(interval=0.05))

        fds = _safe_get_fds(process)
        if fds is not None:
            process_open_fds.set(fds)
    except Exception:
        # Deliberately broad: this runs inside the scrape handler and must not
        # fail it. Log instead of silently discarding the error so that a
        # persistent collection problem is visible.
        logging.getLogger(__name__).warning(
            "Failed to update system metrics", exc_info=True
        )


def _safe_get_fds(process) -> int | None:
    try:
        return process.num_fds()
    except (AttributeError, psutil.AccessDenied, OSError):
        return None


# ─── 便捷记录函数 ───

def record_llm_call(model: str, provider: str, duration: float,
                    input_tokens: int = 0, output_tokens: int = 0,
                    status: str = "success") -> None:
    """Record one LLM API call.

    Increments ``llm_calls_total`` for (model, provider, status), observes
    *duration* in ``llm_call_duration_seconds`` for (model, provider), and adds
    each positive token count to ``llm_token_usage_total`` under the matching
    ``type`` label ("input" / "output"). Token counts that are not greater
    than zero are skipped.
    """
    llm_calls_total.labels(model=model, provider=provider, status=status).inc()
    llm_call_duration_seconds.labels(model=model, provider=provider).observe(duration)

    token_counts = (("input", input_tokens), ("output", output_tokens))
    for direction, amount in token_counts:
        if amount > 0:
            llm_token_usage_total.labels(model=model, type=direction).inc(amount)


def record_agent_execution(agent_id: str, agent_name: str, duration: float,
                           status: str = "success") -> None:
    """Record one agent execution.

    Increments ``agent_executions_total`` for (agent_id, agent_name, status)
    and observes *duration* in ``agent_execution_duration_seconds`` for
    (agent_id, agent_name).
    """
    # Labels common to both metrics.
    agent_labels = {"agent_id": agent_id, "agent_name": agent_name}

    run_counter = agent_executions_total.labels(status=status, **agent_labels)
    run_counter.inc()

    run_timer = agent_execution_duration_seconds.labels(**agent_labels)
    run_timer.observe(duration)


def record_tool_call(tool_name: str, status: str = "success") -> None:
    """Increment ``tool_calls_total`` for the given tool name and status."""
    series = tool_calls_total.labels(status=status, tool_name=tool_name)
    series.inc()


def record_workflow_execution(workflow_id: str, status: str = "success") -> None:
    """Increment ``workflow_executions_total`` for the given workflow and status."""
    series = workflow_executions_total.labels(status=status, workflow_id=workflow_id)
    series.inc()