"""Prometheus metrics collection: HTTP request, business and system metrics.

Built directly on ``prometheus_client``, with no additional framework
dependency.
"""

import logging
import os
import sys
import time

import psutil
from fastapi import FastAPI, Request, Response
from prometheus_client import (
    CONTENT_TYPE_LATEST,
    Counter,
    Gauge,
    Histogram,
    Info,
    generate_latest,
)
from starlette.middleware.base import BaseHTTPMiddleware

logger = logging.getLogger(__name__)

# ─── HTTP metrics ───
http_requests_total = Counter(
    "http_requests_total",
    "HTTP 请求总数",
    ["method", "endpoint", "status"],
)

# NOTE(review): nothing in this module observes the two size histograms
# below; confirm whether they are recorded elsewhere.
http_request_size_bytes = Histogram(
    "http_request_size_bytes",
    "HTTP 请求体大小",
    ["method", "endpoint"],
    buckets=[100, 1024, 10240, 102400, 1048576],
)

http_response_size_bytes = Histogram(
    "http_response_size_bytes",
    "HTTP 响应体大小",
    ["method", "endpoint", "status"],
    buckets=[100, 1024, 10240, 102400, 1048576],
)

http_request_duration_seconds = Histogram(
    "http_request_duration_seconds",
    "HTTP 请求延迟",
    ["method", "endpoint", "status"],
    buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10, 30, 60],
)

# ─── Business metrics ───
agent_executions_total = Counter(
    "agent_executions_total",
    "Agent 执行总次数",
    ["agent_id", "agent_name", "status"],
)

agent_execution_duration_seconds = Histogram(
    "agent_execution_duration_seconds",
    "Agent 单次执行耗时",
    ["agent_id", "agent_name"],
    buckets=[1, 5, 10, 30, 60, 120, 300, 600, 1800],
)

workflow_executions_total = Counter(
    "workflow_executions_total",
    "工作流执行总次数",
    ["workflow_id", "status"],
)

workflow_node_duration_seconds = Histogram(
    "workflow_node_duration_seconds",
    "工作流节点执行耗时",
    ["node_type"],
    buckets=[0.1, 0.5, 1, 2, 5, 10, 30, 60],
)

llm_calls_total = Counter(
    "llm_calls_total",
    "LLM API 调用总次数",
    ["model", "provider", "status"],
)

llm_call_duration_seconds = Histogram(
    "llm_call_duration_seconds",
    "LLM 单次调用耗时",
    ["model", "provider"],
    buckets=[0.5, 1, 2, 5, 10, 20, 30, 60],
)

llm_token_usage_total = Counter(
    "llm_token_usage_total",
    "LLM Token 用量",
    ["model", "type"],  # the "type" label is either "input" or "output"
)

tool_calls_total = Counter(
    "tool_calls_total",
    "工具调用总次数",
    ["tool_name", "status"],
)

knowledge_entries_total = Gauge(
    "knowledge_entries_total",
    "知识库条目总数",
)

knowledge_queries_total = Counter(
    "knowledge_queries_total",
    "知识库查询总次数",
    ["status"],
)

active_sessions = Gauge(
    "active_sessions",
    "当前活跃会话数",
)

login_total = Counter(
    "login_total",
    "登录总次数",
    ["client_type", "status"],
)

push_notifications_total = Counter(
    "push_notifications_total",
    "推送通知总次数",
    ["channel", "status"],
)

scheduled_tasks_total = Counter(
    "scheduled_tasks_total",
    "定时任务执行总次数",
    ["task_type", "status"],
)

# ─── System metrics ───
process_cpu_percent = Gauge(
    "process_cpu_percent",
    "进程 CPU 使用率 (%)",
)

process_memory_bytes = Gauge(
    "process_memory_bytes",
    "进程内存使用 (bytes)",
    ["type"],  # label values used in this module: "rss" and "vms"
)

# prometheus_client's default ProcessCollector already exports a metric named
# ``process_open_fds`` on platforms where /proc is readable (Linux), and the
# registry rejects a second collector with the same name by raising
# ValueError. Guard the registration so importing this module does not fail
# there; when the built-in metric exists this name is bound to None and the
# built-in value is the one that gets scraped.
try:
    process_open_fds = Gauge(
        "process_open_fds",
        "进程打开的文件描述符数",
    )
except ValueError:
    process_open_fds = None

db_connections_active = Gauge(
    "db_connections_active",
    "活跃数据库连接数",
)

redis_connected = Gauge(
    "redis_connected",
    "Redis 连接状态 (1=已连接 / 0=断开)",
)

# ─── Application info ───
app_info = Info("tiangong_app", "天工智能体平台信息")
app_info.info({
    "version": "1.0.0",
    "framework": "FastAPI",
    # First whitespace-separated token of sys.version, e.g. "3.11.4".
    "python": sys.version.split()[0],
})


def _endpoint_label(request: Request) -> str:
    """Return the value used for the ``endpoint`` label of HTTP metrics.

    Prefers the path template of the matched route (for example
    ``/items/{item_id}``) so that path parameters do not create one time
    series per distinct URL. Falls back to the raw request path when no
    route object with a ``path`` attribute is present in the ASGI scope
    (for example when no route matched).

    A trailing slash is stripped; the root path is reported as ``"/"``
    rather than as an empty string.

    NOTE(review): for mounted sub-applications the route template is
    presumably relative to the mount point — confirm if mounts are used.
    """
    route = request.scope.get("route")
    path = getattr(route, "path", None) or request.url.path
    return path.rstrip("/") or "/"


class PrometheusMiddleware(BaseHTTPMiddleware):
    """Middleware that records the HTTP request count and latency metrics."""

    async def dispatch(self, request: Request, call_next):
        """Forward the request and record its count and duration.

        Requests to ``/metrics`` are passed through without being recorded.
        If the downstream application raises, the request is recorded with
        status ``500`` and the exception propagates unchanged.
        """
        # Do not instrument the scrape endpoint itself.
        if request.url.path == "/metrics":
            return await call_next(request)

        # perf_counter is monotonic, so a wall-clock adjustment during the
        # request cannot yield a negative or inflated duration.
        started = time.perf_counter()
        status_code = 500  # reported if call_next raises
        try:
            response = await call_next(request)
            status_code = response.status_code
            return response
        finally:
            elapsed = time.perf_counter() - started
            method = request.method
            # Resolved after call_next so that the router has had the
            # chance to store the matched route in the scope.
            endpoint = _endpoint_label(request)
            status = str(status_code)
            http_requests_total.labels(
                method=method, endpoint=endpoint, status=status
            ).inc()
            http_request_duration_seconds.labels(
                method=method, endpoint=endpoint, status=status
            ).observe(elapsed)


def setup_metrics(app: FastAPI) -> None:
    """Register the ``/metrics`` and ``/metrics/system`` endpoints on *app*.

    Both handlers are plain ``def`` functions on purpose: they call psutil's
    ``cpu_percent`` with a positive interval, which sleeps, and FastAPI runs
    non-async handlers in a worker thread instead of on the event loop.
    """

    # /metrics endpoint, scraped by Prometheus.
    @app.get("/metrics", include_in_schema=False)
    def metrics_endpoint():
        _update_system_metrics()
        return Response(
            content=generate_latest(),
            media_type=CONTENT_TYPE_LATEST,
        )

    # /metrics/system endpoint, for internal debugging.
    @app.get("/metrics/system", include_in_schema=False)
    def system_metrics_detail():
        process = psutil.Process(os.getpid())
        mem = process.memory_info()
        return {
            "cpu_percent": process.cpu_percent(interval=0.1),
            "memory_rss_mb": round(mem.rss / 1024 / 1024, 2),
            "memory_vms_mb": round(mem.vms / 1024 / 1024, 2),
            "open_fds": _safe_get_fds(process),
        }


def _update_system_metrics() -> None:
    """Refresh the process gauges; called on every ``/metrics`` scrape.

    Best effort: any failure is logged and suppressed so that a problem
    reading process statistics never makes the scrape itself fail.
    """
    try:
        process = psutil.Process(os.getpid())
        mem = process.memory_info()
        process_memory_bytes.labels(type="rss").set(mem.rss)
        process_memory_bytes.labels(type="vms").set(mem.vms)
        process_cpu_percent.set(process.cpu_percent(interval=0.05))
        fds = _safe_get_fds(process)
        # process_open_fds is None when the built-in collector owns the name.
        if fds is not None and process_open_fds is not None:
            process_open_fds.set(fds)
    except Exception as exc:  # deliberate catch-all at a best-effort boundary
        logger.warning("Failed to refresh process metrics: %s", exc)


def _safe_get_fds(process) -> int | None:
    """Return ``process.num_fds()``, or ``None`` when it cannot be obtained.

    ``None`` is returned when the method is missing (AttributeError), access
    is denied, or the call fails with an OSError.
    """
    try:
        return process.num_fds()
    except (AttributeError, psutil.AccessDenied, OSError):
        return None


# ─── Convenience recording helpers ───
def record_llm_call(
    model: str,
    provider: str,
    duration: float,
    input_tokens: int = 0,
    output_tokens: int = 0,
    status: str = "success",
) -> None:
    """Record one LLM call: call count, duration and token usage.

    Token counters are incremented only for strictly positive token counts.
    """
    llm_calls_total.labels(model=model, provider=provider, status=status).inc()
    llm_call_duration_seconds.labels(model=model, provider=provider).observe(duration)
    if input_tokens > 0:
        llm_token_usage_total.labels(model=model, type="input").inc(input_tokens)
    if output_tokens > 0:
        llm_token_usage_total.labels(model=model, type="output").inc(output_tokens)


def record_agent_execution(
    agent_id: str,
    agent_name: str,
    duration: float,
    status: str = "success",
) -> None:
    """Record one agent execution: execution count and duration."""
    agent_executions_total.labels(
        agent_id=agent_id, agent_name=agent_name, status=status
    ).inc()
    agent_execution_duration_seconds.labels(
        agent_id=agent_id, agent_name=agent_name
    ).observe(duration)


def record_tool_call(tool_name: str, status: str = "success") -> None:
    """Increment the tool call counter for *tool_name* and *status*."""
    tool_calls_total.labels(tool_name=tool_name, status=status).inc()


def record_workflow_execution(workflow_id: str, status: str = "success") -> None:
    """Increment the workflow execution counter for *workflow_id* and *status*."""
    workflow_executions_total.labels(workflow_id=workflow_id, status=status).inc()