- Fix delete agent 500: clean up FK records (agent_llm_logs, permissions, schedules, executions, team_members) and unbind goals/tasks before delete - Remove hardcoded personality templates in Android, replace with dynamic system prompt generation from name + description - Set promptSectionsEnabled=false to bypass PromptComposer for personality - Add Tencent Cloud Linux deployment guide (Docker Compose) - Accumulated backend service updates, frontend UI fixes, Android app changes Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
82 lines
2.8 KiB
YAML
82 lines
2.8 KiB
YAML
# 天工智能体平台 — Prometheus 告警规则
|
|
groups:
|
|
- name: tiangong_alerts
|
|
rules:
|
|
# ─── 服务可用性 ───
|
|
# Fires when the `up` series for the "backend" scrape job has been 0 for one minute.
- alert: BackendDown
  expr: up{job="backend"} == 0
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "后端服务不可用"
    # Annotations are Go templates: a label value is only interpolated inside
    # {{ ... }}. A bare "$labels.instance" would be sent to receivers verbatim.
    description: "FastAPI 后端 ({{ $labels.instance }}) 已宕机超过 1 分钟"
|
|
|
|
# ─── HTTP 错误率 ───
|
|
# Fires when more than 5% of backend HTTP requests over the last 5 minutes
# returned a 5xx status, sustained for 5 minutes.
- alert: HighErrorRate
  # The threshold and the humanizePercentage annotation both describe a ratio,
  # so the 5xx request rate is divided by the total request rate. Comparing the
  # raw per-second rate against 0.05 would alert on absolute traffic volume.
  # Both sides are summed so the division is between two label-less vectors;
  # with no traffic the result is NaN and the comparison yields no alert.
  expr: |-
    sum(rate(http_requests_total{job="backend", status=~"5.."}[5m]))
      /
    sum(rate(http_requests_total{job="backend"}[5m]))
      > 0.05
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "HTTP 5xx 错误率过高"
    description: "后端 5xx 错误率 {{ $value | humanizePercentage }} 超过 5% 阈值"
|
|
|
|
# Fires when more than 20% of backend HTTP requests over the last 5 minutes
# returned a 4xx status, sustained for 10 minutes.
- alert: High4xxRate
  # Expressed as 4xx rate / total rate so that the 0.2 threshold and the
  # humanizePercentage annotation refer to an actual ratio rather than to an
  # absolute requests-per-second figure.
  # Both sides are summed so the division is between two label-less vectors;
  # with no traffic the result is NaN and the comparison yields no alert.
  expr: |-
    sum(rate(http_requests_total{job="backend", status=~"4.."}[5m]))
      /
    sum(rate(http_requests_total{job="backend"}[5m]))
      > 0.2
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "HTTP 4xx 错误率偏高"
    description: "后端 4xx 错误率 {{ $value | humanizePercentage }} 超过 20%"
|
|
|
|
# ─── 延迟 ───
|
|
# Warns when the 95th-percentile request duration, estimated from the
# http_request_duration_seconds histogram buckets of the "backend" job over a
# 5-minute window, stays above 3 seconds for 5 minutes.
# No aggregation is applied, so the quantile is evaluated separately for each
# label combination present on the bucket series.
- alert: HighLatency
  for: 5m
  expr: >-
    histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="backend"}[5m])) > 3
  annotations:
    description: 'P95 延迟 {{ $value }}s 超过 3 秒'
    summary: 'API 延迟过高'
  labels:
    severity: warning
|
|
|
|
# ─── Agent 执行失败 ───
|
|
# Warns when more than 30% of an agent's executions over the last 15 minutes
# failed, sustained for 10 minutes.
- alert: AgentExecutionFailureRate
  # Both sides are aggregated by agent_name before dividing. Without the
  # aggregation, the binary operator matches series on their full label set
  # (including status), so each failure series would only be divided by itself
  # and the ratio would be 1 whenever any failure occurred.
  # Assumes agent_executions_total carries an agent_name label, as the
  # description template below implies - confirm against the exporter.
  expr: |-
    sum by (agent_name) (rate(agent_executions_total{status="failure"}[15m]))
      /
    sum by (agent_name) (rate(agent_executions_total[15m]))
      > 0.3
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "Agent 执行失败率过高"
    description: "Agent {{ $labels.agent_name }} 的失败率超过 30%"
|
|
|
|
# ─── LLM 调用失败 ───
|
|
# Warns when more than 20% of calls to a model over the last 10 minutes failed,
# sustained for 5 minutes.
- alert: LLMCallFailureRate
  # Both sides are aggregated by model before dividing. Dividing the raw
  # vectors would match on the full label set (including status), pairing each
  # failure series with itself and producing a constant ratio of 1.
  # Assumes llm_calls_total carries a model label, as the description template
  # below implies - confirm against the exporter.
  expr: |-
    sum by (model) (rate(llm_calls_total{status="failure"}[10m]))
      /
    sum by (model) (rate(llm_calls_total[10m]))
      > 0.2
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "LLM 调用失败率过高"
    description: "模型 {{ $labels.model }} 调用失败率超过 20%"
|
|
|
|
# ─── 系统资源 ───
|
|
# Warns when process_memory_bytes{type="rss"}, divided by 1024 three times
# (bytes -> GiB), stays above 4 for 10 minutes.
- alert: HighMemoryUsage
  for: 10m
  expr: 'process_memory_bytes{type="rss"} / 1024 / 1024 / 1024 > 4'
  annotations:
    description: '进程内存使用 {{ $value }} GB 超过 4GB'
    summary: '内存使用过高'
  labels:
    severity: warning
|
|
|
|
# Warns when the process_cpu_percent gauge stays above 80 for 10 minutes.
- alert: HighCPUUsage
  for: 10m
  expr: 'process_cpu_percent > 80'
  annotations:
    description: '进程 CPU 使用率 {{ $value }}% 超过 80%'
    summary: 'CPU 使用率过高'
  labels:
    severity: warning
|