# File: aiagent/backend/app/api/alert_rules.py
# Date: 2026-01-19 00:09:36 +08:00
# Size: 415 lines, 13 KiB
# Language: Python

"""
Alert rule management API.
"""
from fastapi import APIRouter, Depends, Query, HTTPException, status
from sqlalchemy.orm import Session
from pydantic import BaseModel
from typing import List, Optional, Dict, Any
from datetime import datetime
import logging
from app.core.database import get_db
from app.models.alert_rule import AlertRule, AlertLog
from app.api.auth import get_current_user
from app.models.user import User
from app.core.exceptions import NotFoundError, ValidationError, ConflictError
from app.services.alert_service import AlertService
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Extra response descriptions attached to every route registered on the router
# below (status code -> description).
_COMMON_ERROR_RESPONSES = {
    401: {"description": "未授权"},
    404: {"description": "资源不存在"},
    400: {"description": "请求参数错误"},
    500: {"description": "服务器内部错误"},
}

# All endpoints in this module are mounted under /api/v1/alert-rules.
router = APIRouter(
    prefix="/api/v1/alert-rules",
    tags=["alert-rules"],
    responses=_COMMON_ERROR_RESPONSES,
)
class AlertRuleCreate(BaseModel):
    """Request body for creating an alert rule."""

    # Rule name; the create endpoint rejects a name the same user already uses.
    name: str
    description: Optional[str] = None
    # One of: execution_failed, execution_timeout, error_rate, resource_usage
    alert_type: str
    # One of: workflow, agent, system
    target_type: str
    target_id: Optional[str] = None
    # Free-form condition mapping; the manual check endpoint reads an optional
    # "severity" key from it.
    conditions: Dict[str, Any]
    # One of: email, webhook, internal
    notification_type: str
    notification_config: Optional[Dict[str, Any]] = None
    enabled: bool = True
class AlertRuleUpdate(BaseModel):
    """Request body for partially updating an alert rule.

    Every field is optional and defaults to ``None``; the update endpoint
    leaves a column untouched when the corresponding field is ``None``.
    """

    name: Optional[str] = None
    description: Optional[str] = None
    # Validated by the update endpoint against the supported alert types.
    alert_type: Optional[str] = None
    # Validated by the update endpoint against the supported target types.
    target_type: Optional[str] = None
    target_id: Optional[str] = None
    conditions: Optional[Dict[str, Any]] = None
    # Validated by the update endpoint against the supported notification types.
    notification_type: Optional[str] = None
    notification_config: Optional[Dict[str, Any]] = None
    enabled: Optional[bool] = None
class AlertRuleResponse(BaseModel):
    """Response schema describing a stored alert rule."""

    id: str
    name: str
    description: Optional[str]
    alert_type: str
    target_type: str
    target_id: Optional[str]
    conditions: Dict[str, Any]
    notification_type: str
    notification_config: Optional[Dict[str, Any]]
    enabled: bool
    # Incremented by the manual check endpoint each time the rule fires.
    trigger_count: int
    last_triggered_at: Optional[datetime]
    # Owner of the rule; every endpoint in this module filters on it.
    user_id: str
    created_at: datetime
    updated_at: datetime

    class Config:
        # Allow the schema to be populated from object attributes (the
        # endpoints return ORM instances rather than dicts).
        from_attributes = True
class AlertLogResponse(BaseModel):
    """Response schema describing one alert log entry."""

    id: str
    # Identifier of the alert rule this entry belongs to.
    rule_id: str
    alert_type: str
    severity: str
    message: str
    details: Optional[Dict[str, Any]]
    # The endpoints in this module write "pending" on creation and
    # "acknowledged" once a user confirms the alert.
    status: str
    notification_type: Optional[str]
    notification_result: Optional[str]
    triggered_at: datetime
    # Both acknowledgement fields are filled in by the acknowledge endpoint.
    acknowledged_at: Optional[datetime]
    acknowledged_by: Optional[str]

    class Config:
        # Allow the schema to be populated from object attributes (the
        # endpoints return ORM instances rather than dicts).
        from_attributes = True
@router.get("", response_model=List[AlertRuleResponse])
async def get_alert_rules(
    skip: int = Query(0, ge=0, description="跳过记录数"),
    limit: int = Query(100, ge=1, le=100, description="每页记录数"),
    alert_type: Optional[str] = Query(None, description="告警类型筛选"),
    enabled: Optional[bool] = Query(None, description="是否启用筛选"),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """List the current user's alert rules, newest first.

    Args:
        skip: Number of rows to skip.
        limit: Maximum number of rows to return (1-100).
        alert_type: When non-empty, only rules of this alert type are returned.
        enabled: When given, only rules with this enabled flag are returned.
        db: Database session.
        current_user: Authenticated user; only this user's rules are listed.

    Returns:
        The matching ``AlertRule`` rows ordered by ``created_at`` descending.
    """
    # Always scope to the caller; optional filters are appended on demand.
    criteria = [AlertRule.user_id == current_user.id]
    if alert_type:
        criteria.append(AlertRule.alert_type == alert_type)
    if enabled is not None:
        # Compare against None so an explicit ``enabled=false`` still filters.
        criteria.append(AlertRule.enabled == enabled)

    page = (
        db.query(AlertRule)
        .filter(*criteria)
        .order_by(AlertRule.created_at.desc())
        .offset(skip)
        .limit(limit)
    )
    return page.all()
@router.post("", response_model=AlertRuleResponse, status_code=status.HTTP_201_CREATED)
async def create_alert_rule(
    rule_data: AlertRuleCreate,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Create an alert rule owned by the current user.

    Args:
        rule_data: Definition of the new rule.
        db: Database session.
        current_user: Authenticated user who will own the rule.

    Returns:
        The newly persisted ``AlertRule``.

    Raises:
        ValidationError: If the alert type, target type or notification type
            is not one of the supported values.
        ConflictError: If the user already has a rule with the same name.
    """
    # (submitted value, supported values, error label), checked in this order.
    enum_checks = (
        (
            rule_data.alert_type,
            ('execution_failed', 'execution_timeout', 'error_rate', 'resource_usage'),
            "不支持的告警类型",
        ),
        (
            rule_data.target_type,
            ('workflow', 'agent', 'system'),
            "不支持的目标类型",
        ),
        (
            rule_data.notification_type,
            ('email', 'webhook', 'internal'),
            "不支持的通知方式",
        ),
    )
    for submitted, supported, label in enum_checks:
        if submitted not in supported:
            raise ValidationError(f"{label}: {submitted}")

    # Rule names must be unique per user.
    duplicate = db.query(AlertRule).filter(
        AlertRule.name == rule_data.name,
        AlertRule.user_id == current_user.id,
    ).first()
    if duplicate:
        raise ConflictError(f"告警规则名称 '{rule_data.name}' 已存在")

    # Copy the request fields onto the ORM object and attach the owner.
    copied_fields = (
        "name",
        "description",
        "alert_type",
        "target_type",
        "target_id",
        "conditions",
        "notification_type",
        "notification_config",
        "enabled",
    )
    column_values = {field: getattr(rule_data, field) for field in copied_fields}
    alert_rule = AlertRule(**column_values, user_id=current_user.id)

    db.add(alert_rule)
    db.commit()
    # Reload so database-populated attributes are present on the instance.
    db.refresh(alert_rule)

    logger.info(f"用户 {current_user.username} 创建了告警规则: {alert_rule.name} ({alert_rule.id})")
    return alert_rule
@router.get("/{rule_id}", response_model=AlertRuleResponse)
async def get_alert_rule(
    rule_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return a single alert rule owned by the current user.

    Args:
        rule_id: Identifier of the rule to fetch.
        db: Database session.
        current_user: Authenticated user; the rule must belong to this user.

    Returns:
        The matching ``AlertRule``.

    Raises:
        NotFoundError: If no rule with this id belongs to the current user.
    """
    owned_rules = db.query(AlertRule).filter(
        AlertRule.id == rule_id,
        AlertRule.user_id == current_user.id,
    )
    found = owned_rules.first()
    if found:
        return found
    # A rule owned by someone else is reported the same way as a missing one.
    raise NotFoundError(f"告警规则不存在: {rule_id}")
@router.put("/{rule_id}", response_model=AlertRuleResponse)
async def update_alert_rule(
    rule_id: str,
    rule_data: AlertRuleUpdate,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Partially update an alert rule owned by the current user.

    Fields of ``rule_data`` that are ``None`` are left unchanged.

    Args:
        rule_id: Identifier of the rule to update.
        rule_data: New values; ``None`` means "keep the current value".
        db: Database session.
        current_user: Authenticated user; the rule must belong to this user.

    Returns:
        The updated ``AlertRule``.

    Raises:
        NotFoundError: If no rule with this id belongs to the current user.
        ConflictError: If the new name is already used by another rule of the
            same user.
        ValidationError: If the alert type, target type or notification type
            is not one of the supported values.
    """
    rule = db.query(AlertRule).filter(
        AlertRule.id == rule_id,
        AlertRule.user_id == current_user.id,
    ).first()
    if not rule:
        raise NotFoundError(f"告警规则不存在: {rule_id}")

    # The name is handled first because it needs a uniqueness lookup that
    # excludes the rule being edited.
    if rule_data.name is not None:
        name_clash = db.query(AlertRule).filter(
            AlertRule.name == rule_data.name,
            AlertRule.user_id == current_user.id,
            AlertRule.id != rule_id,
        ).first()
        if name_clash:
            raise ConflictError(f"告警规则名称 '{rule_data.name}' 已存在")
        rule.name = rule_data.name

    # Fields restricted to a fixed set of values: field -> (supported, label).
    restricted = {
        "alert_type": (
            ('execution_failed', 'execution_timeout', 'error_rate', 'resource_usage'),
            "不支持的告警类型",
        ),
        "target_type": (
            ('workflow', 'agent', 'system'),
            "不支持的目标类型",
        ),
        "notification_type": (
            ('email', 'webhook', 'internal'),
            "不支持的通知方式",
        ),
    }
    # The remaining fields are applied in this fixed order, so the first
    # invalid value encountered is the one reported.
    remaining_fields = (
        "description",
        "alert_type",
        "target_type",
        "target_id",
        "conditions",
        "notification_type",
        "notification_config",
        "enabled",
    )
    for field in remaining_fields:
        new_value = getattr(rule_data, field)
        if new_value is None:
            continue
        if field in restricted:
            supported, label = restricted[field]
            if new_value not in supported:
                raise ValidationError(f"{label}: {new_value}")
        setattr(rule, field, new_value)

    db.commit()
    db.refresh(rule)

    logger.info(f"用户 {current_user.username} 更新了告警规则: {rule.name} ({rule.id})")
    return rule
@router.delete("/{rule_id}", status_code=status.HTTP_200_OK)
async def delete_alert_rule(
    rule_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Delete an alert rule owned by the current user.

    Args:
        rule_id: Identifier of the rule to delete.
        db: Database session.
        current_user: Authenticated user; the rule must belong to this user.

    Returns:
        A dict with a confirmation ``message``.

    Raises:
        NotFoundError: If no rule with this id belongs to the current user.
    """
    target = db.query(AlertRule).filter(
        AlertRule.id == rule_id,
        AlertRule.user_id == current_user.id,
    ).first()
    if not target:
        raise NotFoundError(f"告警规则不存在: {rule_id}")

    # Remember the name now; it is logged after the row has been deleted.
    deleted_name = target.name
    db.delete(target)
    db.commit()

    logger.info(f"用户 {current_user.username} 删除了告警规则: {deleted_name} ({rule_id})")
    return {"message": "告警规则已删除"}
@router.get("/{rule_id}/logs", response_model=List[AlertLogResponse])
async def get_alert_logs(
    rule_id: str,
    skip: int = Query(0, ge=0, description="跳过记录数"),
    limit: int = Query(50, ge=1, le=100, description="每页记录数"),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """List the log entries of one alert rule, most recently triggered first.

    Args:
        rule_id: Identifier of the rule whose logs are requested.
        skip: Number of rows to skip.
        limit: Maximum number of rows to return (1-100).
        db: Database session.
        current_user: Authenticated user; the rule must belong to this user.

    Returns:
        The matching ``AlertLog`` rows ordered by ``triggered_at`` descending.

    Raises:
        NotFoundError: If no rule with this id belongs to the current user.
    """
    # Ownership gate: logs are only served for rules the caller owns.
    ownership = db.query(AlertRule).filter(
        AlertRule.id == rule_id,
        AlertRule.user_id == current_user.id,
    )
    if not ownership.first():
        raise NotFoundError(f"告警规则不存在: {rule_id}")

    newest_first = (
        db.query(AlertLog)
        .filter(AlertLog.rule_id == rule_id)
        .order_by(AlertLog.triggered_at.desc())
    )
    return newest_first.offset(skip).limit(limit).all()
@router.post("/{rule_id}/acknowledge/{log_id}", status_code=status.HTTP_200_OK)
async def acknowledge_alert(
    rule_id: str,
    log_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Mark one alert log entry of the current user's rule as acknowledged.

    Args:
        rule_id: Identifier of the rule the log entry belongs to.
        log_id: Identifier of the log entry to acknowledge.
        db: Database session.
        current_user: Authenticated user; recorded as the acknowledger.

    Returns:
        A dict with a confirmation ``message`` and the refreshed ``log`` object.

    Raises:
        NotFoundError: If the rule does not belong to the current user, or the
            log entry does not exist under that rule.
    """
    # Ownership gate: the rule must exist and belong to the caller.
    owned_rule = db.query(AlertRule).filter(
        AlertRule.id == rule_id,
        AlertRule.user_id == current_user.id,
    ).first()
    if not owned_rule:
        raise NotFoundError(f"告警规则不存在: {rule_id}")

    # The log entry must belong to that same rule.
    alert_log = db.query(AlertLog).filter(
        AlertLog.id == log_id,
        AlertLog.rule_id == rule_id,
    ).first()
    if not alert_log:
        raise NotFoundError(f"告警日志不存在: {log_id}")

    # Record who acknowledged the alert and when (naive UTC timestamp).
    acknowledgement = {
        "status": "acknowledged",
        "acknowledged_at": datetime.utcnow(),
        "acknowledged_by": current_user.id,
    }
    for column, value in acknowledgement.items():
        setattr(alert_log, column, value)

    db.commit()
    db.refresh(alert_log)
    return {"message": "告警已确认", "log": alert_log}
@router.post("/check", status_code=status.HTTP_200_OK)
async def check_alerts(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Manually run the alert check for the current user's enabled rules.

    Only rules whose ``alert_type`` is ``error_rate`` are evaluated here; rules
    of any other type are skipped. For each rule that fires, an ``AlertLog`` row
    is written, the rule's trigger statistics are updated, and a notification
    is sent through ``AlertService``.

    A failure while processing one rule is logged and does not abort the run;
    the session is rolled back so the remaining rules can still be processed.

    Args:
        db: Database session.
        current_user: Authenticated user whose rules are checked.

    Returns:
        A dict with a summary ``message`` and ``triggered_count``, the number
        of rules that fired and whose notification call completed.
    """
    enabled_rules = db.query(AlertRule).filter(
        AlertRule.enabled == True,  # noqa: E712 - SQL expression, not a bool test
        AlertRule.user_id == current_user.id,
    ).all()

    # Snapshot the ids while the rows are freshly loaded, so the error handler
    # can name the failing rule without touching the ORM object again (its
    # attributes may be expired or the session may be in a failed state).
    rules_with_ids = [(rule.id, rule) for rule in enabled_rules]

    triggered_count = 0
    for rule_id, rule in rules_with_ids:
        try:
            if rule.alert_type != 'error_rate':
                continue

            # Error-rate rules are evaluated by the alert service.
            if not await AlertService.check_error_rate(db, rule):
                continue

            # One timestamp for both the log row and the rule statistics.
            fired_at = datetime.utcnow()
            # Defensive: tolerate a missing conditions mapping.
            # NOTE(review): column nullability is not visible here - confirm.
            conditions = rule.conditions or {}

            alert_log = AlertLog(
                rule_id=rule_id,
                alert_type=rule.alert_type,
                severity=conditions.get('severity', 'warning'),
                message=f"错误率告警: {rule.target_type} 错误率超过阈值",
                details={
                    "target_type": rule.target_type,
                    "target_id": rule.target_id,
                },
                status='pending',
                notification_type=rule.notification_type,
                triggered_at=fired_at,
            )
            db.add(alert_log)

            # Defensive: treat an unset counter as zero.
            rule.trigger_count = (rule.trigger_count or 0) + 1
            rule.last_triggered_at = fired_at

            db.commit()
            db.refresh(alert_log)

            await AlertService.send_notification(db, alert_log, rule)
            triggered_count += 1
        except Exception as e:
            # Without a rollback, a failed flush/commit leaves the session
            # unusable and every following rule would fail as well.
            db.rollback()
            logger.exception(f"检查告警规则失败 {rule_id}: {str(e)}")

    return {
        "message": f"告警检查完成,触发 {triggered_count} 个告警",
        "triggered_count": triggered_count,
    }