- Fix delete agent 500: clean up FK records (agent_llm_logs, permissions, schedules, executions, team_members) and unbind goals/tasks before delete - Remove hardcoded personality templates in Android, replace with dynamic system prompt generation from name + description - Set promptSectionsEnabled=false to bypass PromptComposer for personality - Add Tencent Cloud Linux deployment guide (Docker Compose) - Accumulated backend service updates, frontend UI fixes, Android app changes Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
253 lines
7.3 KiB
Python
253 lines
7.3 KiB
Python
"""
|
|
Prometheus 指标收集 — HTTP 请求、业务指标、系统指标
|
|
使用 prometheus_client 原生实现,无额外框架依赖。
|
|
"""
|
|
import logging
import os
import sys
import time

import psutil
from fastapi import FastAPI, Request, Response
from prometheus_client import Counter, Histogram, Gauge, Info, generate_latest, CONTENT_TYPE_LATEST
from starlette.middleware.base import BaseHTTPMiddleware
|
|
|
|
|
|
# ─── HTTP metrics ───

# Number of HTTP requests, partitioned by method, endpoint and status code.
http_requests_total = Counter(
    "http_requests_total",
    "HTTP 请求总数",
    labelnames=("method", "endpoint", "status"),
)

# Distribution of request body sizes in bytes (buckets: 100 B .. 1 MiB).
# NOTE(review): nothing in the visible part of this file observes into this
# histogram — confirm it is fed from somewhere else.
http_request_size_bytes = Histogram(
    "http_request_size_bytes",
    "HTTP 请求体大小",
    labelnames=("method", "endpoint"),
    buckets=(100, 1024, 10_240, 102_400, 1_048_576),
)

# Distribution of response body sizes in bytes (buckets: 100 B .. 1 MiB).
# NOTE(review): nothing in the visible part of this file observes into this
# histogram — confirm it is fed from somewhere else.
http_response_size_bytes = Histogram(
    "http_response_size_bytes",
    "HTTP 响应体大小",
    labelnames=("method", "endpoint", "status"),
    buckets=(100, 1024, 10_240, 102_400, 1_048_576),
)

# Distribution of request latency in seconds (buckets: 10 ms .. 60 s).
http_request_duration_seconds = Histogram(
    "http_request_duration_seconds",
    "HTTP 请求延迟",
    labelnames=("method", "endpoint", "status"),
    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10, 30, 60),
)
|
|
|
|
# ─── Business metrics ───

# Number of agent executions, partitioned by agent identity and outcome.
agent_executions_total = Counter(
    "agent_executions_total",
    "Agent 执行总次数",
    labelnames=("agent_id", "agent_name", "status"),
)

# Duration of a single agent execution in seconds (buckets: 1 s .. 30 min).
agent_execution_duration_seconds = Histogram(
    "agent_execution_duration_seconds",
    "Agent 单次执行耗时",
    labelnames=("agent_id", "agent_name"),
    buckets=(1, 5, 10, 30, 60, 120, 300, 600, 1800),
)

# Number of workflow executions, partitioned by workflow and outcome.
workflow_executions_total = Counter(
    "workflow_executions_total",
    "工作流执行总次数",
    labelnames=("workflow_id", "status"),
)

# Duration of a single workflow node in seconds (buckets: 100 ms .. 60 s).
workflow_node_duration_seconds = Histogram(
    "workflow_node_duration_seconds",
    "工作流节点执行耗时",
    labelnames=("node_type",),
    buckets=(0.1, 0.5, 1, 2, 5, 10, 30, 60),
)
|
|
|
|
# Number of LLM API calls, partitioned by model, provider and outcome.
llm_calls_total = Counter(
    "llm_calls_total",
    "LLM API 调用总次数",
    labelnames=("model", "provider", "status"),
)

# Duration of a single LLM call in seconds (buckets: 0.5 s .. 60 s).
llm_call_duration_seconds = Histogram(
    "llm_call_duration_seconds",
    "LLM 单次调用耗时",
    labelnames=("model", "provider"),
    buckets=(0.5, 1, 2, 5, 10, 20, 30, 60),
)

# Tokens consumed per model. The "type" label distinguishes the token
# direction; the original annotation lists the values as input / output.
llm_token_usage_total = Counter(
    "llm_token_usage_total",
    "LLM Token 用量",
    labelnames=("model", "type"),
)

# Number of tool invocations, partitioned by tool name and outcome.
tool_calls_total = Counter(
    "tool_calls_total",
    "工具调用总次数",
    labelnames=("tool_name", "status"),
)
|
|
|
|
# Current number of knowledge-base entries.
# NOTE(review): this is a Gauge carrying the `_total` suffix, which Prometheus
# naming conventions reserve for counters; left unchanged because renaming
# would alter the exported series name.
knowledge_entries_total = Gauge("knowledge_entries_total", "知识库条目总数")

# Number of knowledge-base queries, partitioned by outcome.
knowledge_queries_total = Counter(
    "knowledge_queries_total",
    "知识库查询总次数",
    labelnames=("status",),
)

# Number of currently active sessions.
active_sessions = Gauge("active_sessions", "当前活跃会话数")

# Number of login attempts, partitioned by client type and outcome.
login_total = Counter(
    "login_total",
    "登录总次数",
    labelnames=("client_type", "status"),
)

# Number of push notifications, partitioned by channel and outcome.
push_notifications_total = Counter(
    "push_notifications_total",
    "推送通知总次数",
    labelnames=("channel", "status"),
)

# Number of scheduled-task runs, partitioned by task type and outcome.
scheduled_tasks_total = Counter(
    "scheduled_tasks_total",
    "定时任务执行总次数",
    labelnames=("task_type", "status"),
)
|
|
|
|
# ─── System metrics ───

# CPU utilisation of this process, in percent.
process_cpu_percent = Gauge("process_cpu_percent", "进程 CPU 使用率 (%)")

# Memory used by this process in bytes; the "type" label is "rss" or "vms".
process_memory_bytes = Gauge(
    "process_memory_bytes",
    "进程内存使用 (bytes)",
    labelnames=("type",),
)

# Number of file descriptors this process has open.
# NOTE(review): prometheus_client's built-in process collector can export a
# series with this same name on Linux — confirm that registering this Gauge on
# the default registry does not raise a duplicate-timeseries error there.
process_open_fds = Gauge("process_open_fds", "进程打开的文件描述符数")

# Number of active database connections.
# NOTE(review): not updated anywhere in the visible part of this file.
db_connections_active = Gauge("db_connections_active", "活跃数据库连接数")

# Redis connectivity flag (1 = connected, 0 = disconnected).
# NOTE(review): not updated anywhere in the visible part of this file.
redis_connected = Gauge("redis_connected", "Redis 连接状态 (1=已连接 / 0=断开)")
|
|
|
|
# ─── Application info ───

# Static facts about the running application, exposed as an Info metric.
app_info = Info("tiangong_app", "天工智能体平台信息")
app_info.info({
    "version": "1.0.0",
    "framework": "FastAPI",
    # The first whitespace-separated token of sys.version is the interpreter
    # version (e.g. "3.11.4"). Read it from sys itself: the previous `os.sys`
    # spelling only worked because the os module happens to import sys.
    "python": sys.version.split()[0],
})
|
|
|
|
|
|
class PrometheusMiddleware(BaseHTTPMiddleware):
    """Middleware that records a request count and a latency sample per request.

    For every request except the scrape endpoint ``/metrics`` it increments
    ``http_requests_total`` and observes ``http_request_duration_seconds``,
    both labelled with the HTTP method, the request path and the status code.
    """

    async def dispatch(self, request: Request, call_next):
        """Forward the request downstream and record its metrics.

        Args:
            request: The incoming request.
            call_next: Callable that passes the request to the next handler.

        Returns:
            The downstream response, unmodified.

        Raises:
            Whatever the downstream handler raises; the request is still
            recorded, with status ``500``, before the exception propagates.
        """
        # Do not instrument the scrape endpoint itself.
        if request.url.path == "/metrics":
            return await call_next(request)

        # Monotonic clock: wall-clock time can jump (NTP, manual changes) and
        # would then yield negative or inflated latencies.
        started = time.perf_counter()
        status_code = 500  # reported when the downstream handler raises
        try:
            response = await call_next(request)
            status_code = response.status_code
            return response
        finally:
            elapsed = time.perf_counter() - started

            # Drop trailing slashes so "/x" and "/x/" share one series, but
            # keep the root path as "/" instead of collapsing it to "".
            # NOTE(review): the raw path is used as a label value, so routes
            # with path parameters produce one series per distinct value —
            # consider labelling by route template instead.
            endpoint = request.url.path.rstrip("/") or "/"

            labels = {
                "method": request.method,
                "endpoint": endpoint,
                "status": str(status_code),
            }
            http_requests_total.labels(**labels).inc()
            http_request_duration_seconds.labels(**labels).observe(elapsed)
|
|
|
|
|
|
def setup_metrics(app: FastAPI) -> None:
    """Register the metrics endpoints on *app*.

    Two routes are added, both excluded from the OpenAPI schema:

    * ``GET /metrics`` — refreshes the process gauges, then returns the
      output of ``generate_latest()`` for Prometheus to scrape.
    * ``GET /metrics/system`` — JSON snapshot of this process (CPU percent,
      RSS/VMS in MiB, open file descriptors) for internal debugging.

    Both handlers are deliberately plain ``def`` functions rather than
    ``async def``: sampling CPU usage with a non-zero ``interval`` sleeps for
    that long, and in a coroutine that sleep would stall the event loop.
    FastAPI runs synchronous handlers in its threadpool instead.

    Args:
        app: The FastAPI application to attach the routes to.
    """

    # /metrics — scraped by Prometheus.
    @app.get("/metrics", include_in_schema=False)
    def metrics_endpoint():
        _update_system_metrics()
        return Response(
            content=generate_latest(),
            media_type=CONTENT_TYPE_LATEST,
        )

    # /metrics/system — internal debugging aid.
    @app.get("/metrics/system", include_in_schema=False)
    def system_metrics_detail():
        process = psutil.Process(os.getpid())
        mem = process.memory_info()
        return {
            # Blocks for 0.1 s while psutil samples CPU usage.
            "cpu_percent": process.cpu_percent(interval=0.1),
            "memory_rss_mb": round(mem.rss / 1024 / 1024, 2),
            "memory_vms_mb": round(mem.vms / 1024 / 1024, 2),
            # None when the descriptor count cannot be read.
            "open_fds": _safe_get_fds(process),
        }
|
|
|
|
|
|
def _update_system_metrics() -> None:
    """Refresh the process-level gauges; called on every ``/metrics`` scrape.

    Sets ``process_memory_bytes`` (``rss`` and ``vms``), ``process_cpu_percent``
    and, when the count is available, ``process_open_fds``.

    The update is best-effort: any failure is logged as a warning and
    swallowed so that a problem reading process statistics never breaks the
    scrape. Note that the CPU sample blocks the caller for 0.05 s.
    """
    try:
        process = psutil.Process(os.getpid())
        mem = process.memory_info()
        process_memory_bytes.labels(type="rss").set(mem.rss)
        process_memory_bytes.labels(type="vms").set(mem.vms)
        process_cpu_percent.set(process.cpu_percent(interval=0.05))

        fds = _safe_get_fds(process)
        if fds is not None:
            process_open_fds.set(fds)
    except Exception:
        # Stay best-effort, but leave a trace instead of failing silently.
        logging.getLogger(__name__).warning(
            "Failed to refresh process metrics", exc_info=True
        )
|
|
|
|
|
|
def _safe_get_fds(process) -> int | None:
|
|
try:
|
|
return process.num_fds()
|
|
except (AttributeError, psutil.AccessDenied, OSError):
|
|
return None
|
|
|
|
|
|
# ─── Convenience recording helpers ───
|
|
|
|
def record_llm_call(model: str, provider: str, duration: float,
                    input_tokens: int = 0, output_tokens: int = 0,
                    status: str = "success") -> None:
    """Record one LLM API call.

    Increments ``llm_calls_total``, observes ``llm_call_duration_seconds`` and
    adds any positive token counts to ``llm_token_usage_total``.

    Args:
        model: Value for the ``model`` label.
        provider: Value for the ``provider`` label.
        duration: Value observed into the duration histogram.
        input_tokens: Input token count; only added when greater than zero.
        output_tokens: Output token count; only added when greater than zero.
        status: Value for the ``status`` label of the call counter.
    """
    llm_calls_total.labels(model=model, provider=provider, status=status).inc()
    llm_call_duration_seconds.labels(model=model, provider=provider).observe(duration)

    # Input first, then output; non-positive counts are skipped.
    for token_kind, amount in (("input", input_tokens), ("output", output_tokens)):
        if amount > 0:
            llm_token_usage_total.labels(model=model, type=token_kind).inc(amount)
|
|
|
|
|
|
def record_agent_execution(agent_id: str, agent_name: str, duration: float,
                           status: str = "success") -> None:
    """Record one agent execution.

    Increments ``agent_executions_total`` and observes
    ``agent_execution_duration_seconds`` for the given agent.

    Args:
        agent_id: Value for the ``agent_id`` label.
        agent_name: Value for the ``agent_name`` label.
        duration: Value observed into the duration histogram.
        status: Value for the ``status`` label of the execution counter.
    """
    # Labels shared by the counter and the histogram.
    identity = {"agent_id": agent_id, "agent_name": agent_name}

    agent_executions_total.labels(status=status, **identity).inc()
    agent_execution_duration_seconds.labels(**identity).observe(duration)
|
|
|
|
|
|
def record_tool_call(tool_name: str, status: str = "success") -> None:
    """Increment ``tool_calls_total`` for *tool_name* with the given *status*."""
    series = tool_calls_total.labels(tool_name=tool_name, status=status)
    series.inc()
|
|
|
|
|
|
def record_workflow_execution(workflow_id: str, status: str = "success") -> None:
    """Increment ``workflow_executions_total`` for *workflow_id* with the given *status*."""
    series = workflow_executions_total.labels(workflow_id=workflow_id, status=status)
    series.inc()
|