Files
aiagent/prometheus-alert-rules.yml
renjianbo beff3fac8d fix: delete agent 500 error + dynamic personality + deployment guide
- Fix delete agent 500: clean up FK records (agent_llm_logs, permissions,
  schedules, executions, team_members) and unbind goals/tasks before delete
- Remove hardcoded personality templates in Android, replace with dynamic
  system prompt generation from name + description
- Set promptSectionsEnabled=false to bypass PromptComposer for personality
- Add Tencent Cloud Linux deployment guide (Docker Compose)
- Accumulated backend service updates, frontend UI fixes, Android app changes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-29 01:17:21 +08:00

82 lines
2.8 KiB
YAML

# Tiangong agent platform — Prometheus alerting rules.
#
# One rule group covering backend availability, HTTP error ratios, request
# latency, agent / LLM failure ratios, and process resource usage.
#
# Ratio alerts divide two aggregated rates. Both sides are aggregated onto
# the same label set so that PromQL one-to-one vector matching pairs them;
# dividing a `status="failure"` series by the unfiltered series directly
# would only ever match failure against failure and yield a constant 1.
# When there is no traffic the division yields NaN, which never satisfies
# the `>` comparison, so idle periods do not fire.
groups:
  - name: tiangong_alerts
    rules:
      # ─── Service availability ───
      # Fires when the scrape target of the backend job has been unreachable
      # for a full minute.
      - alert: BackendDown
        expr: up{job="backend"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "后端服务不可用"
          # Annotation templates need the {{ ... }} delimiters; a bare
          # "$labels.instance" would be emitted as literal text.
          description: "FastAPI 后端 ({{ $labels.instance }}) 已宕机超过 1 分钟"

      # ─── HTTP error ratio ───
      # Share of backend requests answered with a 5xx status over 5 minutes.
      # The value is a 0..1 ratio, which is what humanizePercentage expects.
      - alert: HighErrorRate
        expr: |-
          sum by (job) (rate(http_requests_total{job="backend", status=~"5.."}[5m]))
            /
          sum by (job) (rate(http_requests_total{job="backend"}[5m]))
            > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "HTTP 5xx 错误率过高"
          description: "后端 5xx 错误率 {{ $value | humanizePercentage }} 超过 5% 阈值"

      # Share of backend requests answered with a 4xx status over 5 minutes.
      - alert: High4xxRate
        expr: |-
          sum by (job) (rate(http_requests_total{job="backend", status=~"4.."}[5m]))
            /
          sum by (job) (rate(http_requests_total{job="backend"}[5m]))
            > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "HTTP 4xx 错误率偏高"
          description: "后端 4xx 错误率 {{ $value | humanizePercentage }} 超过 20%"

      # ─── Latency ───
      # P95 request duration above 3 seconds. The bucket rate is not
      # aggregated, so the quantile is computed separately for every label
      # combination the histogram exposes.
      # NOTE(review): wrap the rate in `sum by (le)` if a single job-wide
      # P95 is wanted instead — confirm the intended granularity.
      - alert: HighLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="backend"}[5m])) > 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API 延迟过高"
          description: "P95 延迟 {{ $value }}s 超过 3 秒"

      # ─── Agent execution failures ───
      # Per-agent failure ratio over 15 minutes. Aggregating by agent_name
      # drops the `status` label from both sides so they can be matched, and
      # keeps the label the description template reads.
      - alert: AgentExecutionFailureRate
        expr: |-
          sum by (agent_name) (rate(agent_executions_total{status="failure"}[15m]))
            /
          sum by (agent_name) (rate(agent_executions_total[15m]))
            > 0.3
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Agent 执行失败率过高"
          description: "Agent {{ $labels.agent_name }} 的失败率超过 30%"

      # ─── LLM call failures ───
      # Per-model failure ratio over 10 minutes, aggregated by `model` for
      # the same matching reason as the agent rule.
      - alert: LLMCallFailureRate
        expr: |-
          sum by (model) (rate(llm_calls_total{status="failure"}[10m]))
            /
          sum by (model) (rate(llm_calls_total[10m]))
            > 0.2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "LLM 调用失败率过高"
          description: "模型 {{ $labels.model }} 调用失败率超过 20%"

      # ─── System resources ───
      # RSS converted from bytes by dividing by 1024 three times, compared
      # against a threshold of 4.
      # NOTE(review): process_memory_bytes looks like an application-defined
      # metric — confirm the exporter publishes a type="rss" series.
      - alert: HighMemoryUsage
        expr: process_memory_bytes{type="rss"} / 1024 / 1024 / 1024 > 4
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "内存使用过高"
          description: "进程内存使用 {{ $value }} GB 超过 4GB"

      # Process CPU usage above 80 for 10 minutes.
      # NOTE(review): presumably a 0-100 percentage gauge — confirm against
      # the exporter.
      - alert: HighCPUUsage
        expr: process_cpu_percent > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "CPU 使用率过高"
          description: "进程 CPU 使用率 {{ $value }}% 超过 80%"