Files
aiagent/backend/app/core/metrics.py
renjianbo beff3fac8d fix: delete agent 500 error + dynamic personality + deployment guide
- Fix delete agent 500: clean up FK records (agent_llm_logs, permissions,
  schedules, executions, team_members) and unbind goals/tasks before delete
- Remove hardcoded personality templates in Android, replace with dynamic
  system prompt generation from name + description
- Set promptSectionsEnabled=false to bypass PromptComposer for personality
- Add Tencent Cloud Linux deployment guide (Docker Compose)
- Accumulated backend service updates, frontend UI fixes, Android app changes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-29 01:17:21 +08:00

253 lines
7.3 KiB
Python

"""
Prometheus 指标收集 — HTTP 请求、业务指标、系统指标
使用 prometheus_client 原生实现,无额外框架依赖。
"""
import logging
import os
import sys
import time

import psutil
from fastapi import FastAPI, Request, Response
from prometheus_client import Counter, Histogram, Gauge, Info, generate_latest, CONTENT_TYPE_LATEST
from starlette.middleware.base import BaseHTTPMiddleware
# ─── HTTP metrics ───

# Bucket upper bounds (bytes) shared by the request/response size histograms.
_HTTP_SIZE_BUCKETS = (100, 1024, 10240, 102400, 1048576)

# Total number of HTTP requests, by method, endpoint and response status.
http_requests_total = Counter(
    "http_requests_total",
    "HTTP 请求总数",
    labelnames=["method", "endpoint", "status"],
)

# Distribution of HTTP request body sizes.
http_request_size_bytes = Histogram(
    "http_request_size_bytes",
    "HTTP 请求体大小",
    labelnames=["method", "endpoint"],
    buckets=_HTTP_SIZE_BUCKETS,
)

# Distribution of HTTP response body sizes.
http_response_size_bytes = Histogram(
    "http_response_size_bytes",
    "HTTP 响应体大小",
    labelnames=["method", "endpoint", "status"],
    buckets=_HTTP_SIZE_BUCKETS,
)

# Distribution of HTTP request latency in seconds.
http_request_duration_seconds = Histogram(
    "http_request_duration_seconds",
    "HTTP 请求延迟",
    labelnames=["method", "endpoint", "status"],
    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10, 30, 60),
)
# ─── Business metrics ───

# Total number of agent executions.
agent_executions_total = Counter(
    "agent_executions_total",
    "Agent 执行总次数",
    labelnames=["agent_id", "agent_name", "status"],
)

# Duration of a single agent execution, in seconds.
agent_execution_duration_seconds = Histogram(
    "agent_execution_duration_seconds",
    "Agent 单次执行耗时",
    labelnames=["agent_id", "agent_name"],
    buckets=(1, 5, 10, 30, 60, 120, 300, 600, 1800),
)

# Total number of workflow executions.
workflow_executions_total = Counter(
    "workflow_executions_total",
    "工作流执行总次数",
    labelnames=["workflow_id", "status"],
)

# Duration of a single workflow node execution, in seconds.
workflow_node_duration_seconds = Histogram(
    "workflow_node_duration_seconds",
    "工作流节点执行耗时",
    labelnames=["node_type"],
    buckets=(0.1, 0.5, 1, 2, 5, 10, 30, 60),
)

# Total number of LLM API calls.
llm_calls_total = Counter(
    "llm_calls_total",
    "LLM API 调用总次数",
    labelnames=["model", "provider", "status"],
)

# Duration of a single LLM call, in seconds.
llm_call_duration_seconds = Histogram(
    "llm_call_duration_seconds",
    "LLM 单次调用耗时",
    labelnames=["model", "provider"],
    buckets=(0.5, 1, 2, 5, 10, 20, 30, 60),
)

# LLM token consumption; the "type" label is "input" or "output".
llm_token_usage_total = Counter(
    "llm_token_usage_total",
    "LLM Token 用量",
    labelnames=["model", "type"],
)

# Total number of tool invocations.
tool_calls_total = Counter(
    "tool_calls_total",
    "工具调用总次数",
    labelnames=["tool_name", "status"],
)

# Current number of knowledge-base entries.
knowledge_entries_total = Gauge(
    "knowledge_entries_total",
    "知识库条目总数",
)

# Total number of knowledge-base queries.
knowledge_queries_total = Counter(
    "knowledge_queries_total",
    "知识库查询总次数",
    labelnames=["status"],
)

# Number of currently active sessions.
active_sessions = Gauge(
    "active_sessions",
    "当前活跃会话数",
)

# Total number of login attempts.
login_total = Counter(
    "login_total",
    "登录总次数",
    labelnames=["client_type", "status"],
)

# Total number of push notifications.
push_notifications_total = Counter(
    "push_notifications_total",
    "推送通知总次数",
    labelnames=["channel", "status"],
)

# Total number of scheduled-task runs.
scheduled_tasks_total = Counter(
    "scheduled_tasks_total",
    "定时任务执行总次数",
    labelnames=["task_type", "status"],
)
# ─── System metrics ───

# CPU utilisation of this process, in percent.
process_cpu_percent = Gauge(
    "process_cpu_percent",
    "进程 CPU 使用率 (%)",
)

# Memory used by this process in bytes; the "type" label is "rss" or "vms".
process_memory_bytes = Gauge(
    "process_memory_bytes",
    "进程内存使用 (bytes)",
    labelnames=["type"],
)

# Number of file descriptors opened by this process.
process_open_fds = Gauge(
    "process_open_fds",
    "进程打开的文件描述符数",
)

# Number of active database connections.
db_connections_active = Gauge(
    "db_connections_active",
    "活跃数据库连接数",
)

# Redis connection state (1 = connected, 0 = disconnected).
redis_connected = Gauge(
    "redis_connected",
    "Redis 连接状态 (1=已连接 / 0=断开)",
)
# ─── Application info ───
# Static labels exposed through the ``tiangong_app`` Info metric.
app_info = Info("tiangong_app", "天工智能体平台信息")
app_info.info({
    "version": "1.0.0",
    "framework": "FastAPI",
    # First whitespace-separated token of ``sys.version`` (the interpreter
    # version number). Read from ``sys`` directly: ``os.sys`` only works
    # because ``os`` happens to import ``sys`` internally.
    "python": sys.version.split()[0],
})
class PrometheusMiddleware(BaseHTTPMiddleware):
    """Middleware that records per-request Prometheus metrics.

    For every request except the scrape endpoint itself, it increments
    ``http_requests_total`` and observes ``http_request_duration_seconds``,
    labelled by HTTP method, endpoint and response status code.

    NOTE(review): ``http_request_size_bytes`` and ``http_response_size_bytes``
    are not observed here — confirm whether another module feeds them.
    """

    # Path of the scrape endpoint; skipped so scrapes do not count themselves.
    _METRICS_PATH = "/metrics"

    @staticmethod
    def _endpoint_label(request: Request) -> str:
        """Return the value used for the ``endpoint`` label of *request*.

        Prefers the matched route's path template (e.g. ``/agents/{id}``) so
        that path parameters do not create one time series per distinct URL.
        Falls back to the raw URL path when no route was matched. A trailing
        slash is stripped, but the root path stays ``"/"`` instead of
        collapsing to an empty label value.
        """
        # The router stores the matched route in the ASGI scope while handling
        # the request, so it is only available after ``call_next`` has run.
        route = request.scope.get("route")
        template = getattr(route, "path", None)
        raw = template if template else request.url.path
        return raw.rstrip("/") or "/"

    async def dispatch(self, request: Request, call_next):
        """Forward the request and record its count and latency.

        Args:
            request: The incoming request.
            call_next: Callable that forwards the request to the next handler
                and returns its response.

        Returns:
            The response produced by ``call_next``, unchanged.

        Any exception raised by ``call_next`` propagates to the caller; the
        request is then recorded with status ``"500"``.
        """
        if request.url.path == self._METRICS_PATH:
            return await call_next(request)

        # Monotonic clock: immune to wall-clock adjustments during the request.
        started = time.perf_counter()
        status_code = 500  # reported when call_next raises
        try:
            response = await call_next(request)
            status_code = response.status_code
            return response
        finally:
            elapsed = time.perf_counter() - started
            labels = {
                "method": request.method,
                "endpoint": self._endpoint_label(request),
                "status": str(status_code),
            }
            http_requests_total.labels(**labels).inc()
            http_request_duration_seconds.labels(**labels).observe(elapsed)
def setup_metrics(app: FastAPI) -> None:
    """Register the Prometheus scrape endpoint and a debug endpoint on *app*.

    Routes added (both excluded from the OpenAPI schema):

    * ``GET /metrics`` — refreshes the process gauges, then returns the output
      of ``generate_latest()`` with the Prometheus content type.
    * ``GET /metrics/system`` — JSON snapshot of this process's CPU, memory
      and file-descriptor usage, for internal debugging.

    Both handlers are plain ``def`` functions on purpose: sampling CPU usage
    with a non-zero ``interval`` blocks the calling thread, and FastAPI runs
    non-async handlers in its threadpool, so the event loop is not stalled
    while a scrape is in progress.

    Args:
        app: The FastAPI application to attach the routes to.
    """

    # /metrics — scraped by Prometheus.
    @app.get("/metrics", include_in_schema=False)
    def metrics_endpoint():
        _update_system_metrics()
        return Response(
            content=generate_latest(),
            media_type=CONTENT_TYPE_LATEST,
        )

    # /metrics/system — internal debugging aid.
    @app.get("/metrics/system", include_in_schema=False)
    def system_metrics_detail():
        process = psutil.Process(os.getpid())
        mem = process.memory_info()
        return {
            # Blocks for 0.1 s to take a CPU sample.
            "cpu_percent": process.cpu_percent(interval=0.1),
            "memory_rss_mb": round(mem.rss / 1024 / 1024, 2),
            "memory_vms_mb": round(mem.vms / 1024 / 1024, 2),
            "open_fds": _safe_get_fds(process),
        }
def _update_system_metrics() -> None:
    """Refresh the process-level gauges (memory, CPU, open file descriptors).

    Called by the ``/metrics`` endpoint on every scrape. The update is
    best-effort: any failure is logged at debug level and otherwise ignored so
    that a problem reading process statistics never breaks the scrape itself.
    """
    try:
        process = psutil.Process(os.getpid())
        mem = process.memory_info()
        process_memory_bytes.labels(type="rss").set(mem.rss)
        process_memory_bytes.labels(type="vms").set(mem.vms)
        # Blocks for 0.05 s to take a CPU sample.
        process_cpu_percent.set(process.cpu_percent(interval=0.05))
        fds = _safe_get_fds(process)
        # None means the descriptor count is unavailable; keep the old value.
        if fds is not None:
            process_open_fds.set(fds)
    except Exception:
        # Deliberately broad: metrics collection must not fail the request,
        # but the cause should still be discoverable in the logs.
        logging.getLogger(__name__).debug(
            "Failed to update system metrics", exc_info=True
        )
def _safe_get_fds(process) -> int | None:
try:
return process.num_fds()
except (AttributeError, psutil.AccessDenied, OSError):
return None
# ─── Convenience recording helpers ───
def record_llm_call(model: str, provider: str, duration: float,
                    input_tokens: int = 0, output_tokens: int = 0,
                    status: str = "success") -> None:
    """Record one LLM API call: call count, latency and token usage.

    Args:
        model: Model name, used as the ``model`` label.
        provider: Provider name, used as the ``provider`` label.
        duration: Value observed in ``llm_call_duration_seconds``.
        input_tokens: Input token count; recorded only when positive.
        output_tokens: Output token count; recorded only when positive.
        status: Value of the ``status`` label on the call counter.
    """
    call_counter = llm_calls_total.labels(
        model=model, provider=provider, status=status
    )
    call_counter.inc()

    latency = llm_call_duration_seconds.labels(model=model, provider=provider)
    latency.observe(duration)

    # Input is handled before output, and non-positive counts are skipped.
    for token_type, amount in (("input", input_tokens), ("output", output_tokens)):
        if amount > 0:
            llm_token_usage_total.labels(model=model, type=token_type).inc(amount)
def record_agent_execution(agent_id: str, agent_name: str, duration: float,
                           status: str = "success") -> None:
    """Record one agent execution: execution count and duration.

    Args:
        agent_id: Value of the ``agent_id`` label.
        agent_name: Value of the ``agent_name`` label.
        duration: Value observed in ``agent_execution_duration_seconds``.
        status: Value of the ``status`` label on the execution counter.
    """
    # Labels shared by the counter and the histogram.
    identity = {"agent_id": agent_id, "agent_name": agent_name}

    run_counter = agent_executions_total.labels(**identity, status=status)
    run_counter.inc()

    run_timer = agent_execution_duration_seconds.labels(**identity)
    run_timer.observe(duration)
def record_tool_call(tool_name: str, status: str = "success") -> None:
    """Increment ``tool_calls_total`` for *tool_name* with the given *status*."""
    call_counter = tool_calls_total.labels(tool_name=tool_name, status=status)
    call_counter.inc()
def record_workflow_execution(workflow_id: str, status: str = "success") -> None:
    """Increment ``workflow_executions_total`` for *workflow_id* with *status*."""
    run_counter = workflow_executions_total.labels(
        workflow_id=workflow_id, status=status
    )
    run_counter.inc()