# Tiangong Agent Platform — Prometheus alerting rules
#
# One rule group covering service availability, HTTP error ratios, API
# latency, agent / LLM failure ratios, and process resource usage.
# Every ratio alert divides aggregated rates so that the left- and
# right-hand sides share the same label set; dividing raw series that
# differ in the `status` label would only match each failure series with
# itself and always yield 1.
groups:
  - name: tiangong_alerts
    rules:
      # ─── Service availability ───
      # Fires when the scrape target of job "backend" has been down for 1m.
      - alert: BackendDown
        expr: up{job="backend"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "后端服务不可用"
          # The label reference must be wrapped in {{ }} to be expanded.
          description: "FastAPI 后端 ({{ $labels.instance }}) 已宕机超过 1 分钟"

      # ─── HTTP error ratio ───
      # Share of 5xx responses among all backend requests (0..1), so the
      # 0.05 threshold and humanizePercentage both operate on a ratio
      # rather than on an absolute requests-per-second value.
      - alert: HighErrorRate
        expr: |-
          sum by (job) (rate(http_requests_total{job="backend", status=~"5.."}[5m]))
            /
          sum by (job) (rate(http_requests_total{job="backend"}[5m]))
            > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "HTTP 5xx 错误率过高"
          description: "后端 5xx 错误率 {{ $value | humanizePercentage }} 超过 5% 阈值"

      # Share of 4xx responses among all backend requests (0..1).
      - alert: High4xxRate
        expr: |-
          sum by (job) (rate(http_requests_total{job="backend", status=~"4.."}[5m]))
            /
          sum by (job) (rate(http_requests_total{job="backend"}[5m]))
            > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "HTTP 4xx 错误率偏高"
          description: "后端 4xx 错误率 {{ $value | humanizePercentage }} 超过 20%"

      # ─── Latency ───
      # P95 of the request-duration histogram over 5m. No aggregation is
      # applied, so the quantile is evaluated separately for each label
      # set present on the bucket series.
      - alert: HighLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="backend"}[5m])) > 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API 延迟过高"
          description: "P95 延迟 {{ $value }}s 超过 3 秒"

      # ─── Agent execution failures ───
      # Failure ratio per agent. Both sides are aggregated by agent_name so
      # they match one-to-one and the result keeps the agent_name label
      # used in the description.
      - alert: AgentExecutionFailureRate
        expr: |-
          sum by (agent_name) (rate(agent_executions_total{status="failure"}[15m]))
            /
          sum by (agent_name) (rate(agent_executions_total[15m]))
            > 0.3
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Agent 执行失败率过高"
          description: "Agent {{ $labels.agent_name }} 的失败率超过 30%"

      # ─── LLM call failures ───
      # Failure ratio per model, aggregated by the model label on both
      # sides for the same reason as the agent rule above.
      - alert: LLMCallFailureRate
        expr: |-
          sum by (model) (rate(llm_calls_total{status="failure"}[10m]))
            /
          sum by (model) (rate(llm_calls_total[10m]))
            > 0.2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "LLM 调用失败率过高"
          description: "模型 {{ $labels.model }} 调用失败率超过 20%"

      # ─── System resources ───
      # RSS converted from bytes to GiB (three divisions by 1024) and
      # compared against 4.
      - alert: HighMemoryUsage
        expr: process_memory_bytes{type="rss"} / 1024 / 1024 / 1024 > 4
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "内存使用过高"
          description: "进程内存使用 {{ $value }} GB 超过 4GB"

      # Process CPU percentage above 80 for 10m.
      - alert: HighCPUUsage
        expr: process_cpu_percent > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "CPU 使用率过高"
          description: "进程 CPU 使用率 {{ $value }}% 超过 80%"