"""Alert rule management API.

Exposes CRUD endpoints for alert rules, read/acknowledge endpoints for the
alert logs a rule has produced, and a manual "check now" endpoint. Every
query is scoped to the authenticated user via ``AlertRule.user_id``.
"""
import logging
from datetime import datetime
from typing import List, Optional, Dict, Any

from fastapi import APIRouter, Depends, Query, HTTPException, status
from pydantic import BaseModel
from sqlalchemy.orm import Session

from app.core.database import get_db
from app.models.alert_rule import AlertRule, AlertLog
from app.api.auth import get_current_user
from app.models.user import User
from app.core.exceptions import NotFoundError, ValidationError, ConflictError
from app.services.alert_service import AlertService

logger = logging.getLogger(__name__)

router = APIRouter(
    prefix="/api/v1/alert-rules",
    tags=["alert-rules"],
    responses={
        401: {"description": "未授权"},
        404: {"description": "资源不存在"},
        400: {"description": "请求参数错误"},
        500: {"description": "服务器内部错误"}
    }
)

# Closed sets of values accepted by the create and update endpoints.
_VALID_ALERT_TYPES = ('execution_failed', 'execution_timeout', 'error_rate', 'resource_usage')
_VALID_TARGET_TYPES = ('workflow', 'agent', 'system')
_VALID_NOTIFICATION_TYPES = ('email', 'webhook', 'internal')


def _validate_alert_type(alert_type: str) -> None:
    """Raise ``ValidationError`` unless *alert_type* is a supported alert type."""
    if alert_type not in _VALID_ALERT_TYPES:
        raise ValidationError(f"不支持的告警类型: {alert_type}")


def _validate_target_type(target_type: str) -> None:
    """Raise ``ValidationError`` unless *target_type* is a supported target type."""
    if target_type not in _VALID_TARGET_TYPES:
        raise ValidationError(f"不支持的目标类型: {target_type}")


def _validate_notification_type(notification_type: str) -> None:
    """Raise ``ValidationError`` unless *notification_type* is a supported channel."""
    if notification_type not in _VALID_NOTIFICATION_TYPES:
        raise ValidationError(f"不支持的通知方式: {notification_type}")


def _get_owned_rule(db: Session, rule_id: str, user: User) -> AlertRule:
    """Return the rule with *rule_id* owned by *user*.

    Raises:
        NotFoundError: no rule with that id exists for this user. A rule that
            exists but belongs to someone else is reported the same way.
    """
    rule = db.query(AlertRule).filter(
        AlertRule.id == rule_id,
        AlertRule.user_id == user.id
    ).first()
    if not rule:
        raise NotFoundError(f"告警规则不存在: {rule_id}")
    return rule


class AlertRuleCreate(BaseModel):
    """Request body for creating an alert rule."""
    name: str
    description: Optional[str] = None
    alert_type: str  # execution_failed, execution_timeout, error_rate, resource_usage
    target_type: str  # workflow, agent, system
    target_id: Optional[str] = None
    conditions: Dict[str, Any]
    notification_type: str  # email, webhook, internal
    notification_config: Optional[Dict[str, Any]] = None
    enabled: bool = True


class AlertRuleUpdate(BaseModel):
    """Request body for updating an alert rule; every field is optional."""
    name: Optional[str] = None
    description: Optional[str] = None
    alert_type: Optional[str] = None
    target_type: Optional[str] = None
    target_id: Optional[str] = None
    conditions: Optional[Dict[str, Any]] = None
    notification_type: Optional[str] = None
    notification_config: Optional[Dict[str, Any]] = None
    enabled: Optional[bool] = None


class AlertRuleResponse(BaseModel):
    """Response model for an alert rule, populated from ORM attributes."""
    id: str
    name: str
    description: Optional[str]
    alert_type: str
    target_type: str
    target_id: Optional[str]
    conditions: Dict[str, Any]
    notification_type: str
    notification_config: Optional[Dict[str, Any]]
    enabled: bool
    trigger_count: int
    last_triggered_at: Optional[datetime]
    user_id: str
    created_at: datetime
    updated_at: datetime

    class Config:
        from_attributes = True


class AlertLogResponse(BaseModel):
    """Response model for an alert log entry, populated from ORM attributes."""
    id: str
    rule_id: str
    alert_type: str
    severity: str
    message: str
    details: Optional[Dict[str, Any]]
    status: str
    notification_type: Optional[str]
    notification_result: Optional[str]
    triggered_at: datetime
    acknowledged_at: Optional[datetime]
    acknowledged_by: Optional[str]

    class Config:
        from_attributes = True


@router.get("", response_model=List[AlertRuleResponse])
async def get_alert_rules(
    skip: int = Query(0, ge=0, description="跳过记录数"),
    limit: int = Query(100, ge=1, le=100, description="每页记录数"),
    alert_type: Optional[str] = Query(None, description="告警类型筛选"),
    enabled: Optional[bool] = Query(None, description="是否启用筛选"),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    List the current user's alert rules.

    Supports pagination (``skip``/``limit``) and optional filtering by
    ``alert_type`` and ``enabled``. Results are ordered newest first by
    ``created_at``.
    """
    query = db.query(AlertRule).filter(AlertRule.user_id == current_user.id)

    # Optional filters. An empty alert_type string is treated as "no filter".
    if alert_type:
        query = query.filter(AlertRule.alert_type == alert_type)
    if enabled is not None:
        query = query.filter(AlertRule.enabled == enabled)

    # Ordering and pagination.
    return query.order_by(AlertRule.created_at.desc()).offset(skip).limit(limit).all()


@router.post("", response_model=AlertRuleResponse, status_code=status.HTTP_201_CREATED)
async def create_alert_rule(
    rule_data: AlertRuleCreate,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Create an alert rule owned by the current user.

    Raises:
        ValidationError: ``alert_type``, ``target_type`` or
            ``notification_type`` is not one of the supported values.
        ConflictError: the user already has a rule with the same name.
    """
    # Validate the enumerated fields before touching the database.
    _validate_alert_type(rule_data.alert_type)
    _validate_target_type(rule_data.target_type)
    _validate_notification_type(rule_data.notification_type)

    # Rule names must be unique per user.
    existing_rule = db.query(AlertRule).filter(
        AlertRule.name == rule_data.name,
        AlertRule.user_id == current_user.id
    ).first()
    if existing_rule:
        raise ConflictError(f"告警规则名称 '{rule_data.name}' 已存在")

    alert_rule = AlertRule(
        name=rule_data.name,
        description=rule_data.description,
        alert_type=rule_data.alert_type,
        target_type=rule_data.target_type,
        target_id=rule_data.target_id,
        conditions=rule_data.conditions,
        notification_type=rule_data.notification_type,
        notification_config=rule_data.notification_config,
        enabled=rule_data.enabled,
        user_id=current_user.id
    )

    db.add(alert_rule)
    db.commit()
    db.refresh(alert_rule)

    logger.info(f"用户 {current_user.username} 创建了告警规则: {alert_rule.name} ({alert_rule.id})")

    return alert_rule


@router.get("/{rule_id}", response_model=AlertRuleResponse)
async def get_alert_rule(
    rule_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Return a single alert rule owned by the current user.

    Raises:
        NotFoundError: the rule does not exist for this user.
    """
    return _get_owned_rule(db, rule_id, current_user)


@router.put("/{rule_id}", response_model=AlertRuleResponse)
async def update_alert_rule(
    rule_id: str,
    rule_data: AlertRuleUpdate,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Update an alert rule owned by the current user.

    Only fields whose value is not ``None`` are applied, so a field cannot be
    reset to null through this endpoint.

    Raises:
        NotFoundError: the rule does not exist for this user.
        ConflictError: the new name is already used by another of the user's
            rules.
        ValidationError: ``alert_type``, ``target_type`` or
            ``notification_type`` is not one of the supported values.
    """
    rule = _get_owned_rule(db, rule_id, current_user)

    if rule_data.name is not None:
        # The new name must not collide with any *other* rule of this user.
        existing_rule = db.query(AlertRule).filter(
            AlertRule.name == rule_data.name,
            AlertRule.user_id == current_user.id,
            AlertRule.id != rule_id
        ).first()
        if existing_rule:
            raise ConflictError(f"告警规则名称 '{rule_data.name}' 已存在")
        rule.name = rule_data.name

    if rule_data.description is not None:
        rule.description = rule_data.description

    if rule_data.alert_type is not None:
        _validate_alert_type(rule_data.alert_type)
        rule.alert_type = rule_data.alert_type

    if rule_data.target_type is not None:
        _validate_target_type(rule_data.target_type)
        rule.target_type = rule_data.target_type

    if rule_data.target_id is not None:
        rule.target_id = rule_data.target_id

    if rule_data.conditions is not None:
        rule.conditions = rule_data.conditions

    if rule_data.notification_type is not None:
        _validate_notification_type(rule_data.notification_type)
        rule.notification_type = rule_data.notification_type

    if rule_data.notification_config is not None:
        rule.notification_config = rule_data.notification_config

    if rule_data.enabled is not None:
        rule.enabled = rule_data.enabled

    db.commit()
    db.refresh(rule)

    logger.info(f"用户 {current_user.username} 更新了告警规则: {rule.name} ({rule.id})")

    return rule


@router.delete("/{rule_id}", status_code=status.HTTP_200_OK)
async def delete_alert_rule(
    rule_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Delete an alert rule owned by the current user.

    Raises:
        NotFoundError: the rule does not exist for this user.
    """
    rule = _get_owned_rule(db, rule_id, current_user)

    # Read the name before deleting; the instance is unusable after commit.
    rule_name = rule.name
    db.delete(rule)
    db.commit()

    logger.info(f"用户 {current_user.username} 删除了告警规则: {rule_name} ({rule_id})")

    return {"message": "告警规则已删除"}


@router.get("/{rule_id}/logs", response_model=List[AlertLogResponse])
async def get_alert_logs(
    rule_id: str,
    skip: int = Query(0, ge=0, description="跳过记录数"),
    limit: int = Query(50, ge=1, le=100, description="每页记录数"),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    List the logs of one alert rule, most recently triggered first.

    Raises:
        NotFoundError: the rule does not exist for this user.
    """
    # Ownership check only; the rule itself is not needed below.
    _get_owned_rule(db, rule_id, current_user)

    return db.query(AlertLog).filter(
        AlertLog.rule_id == rule_id
    ).order_by(AlertLog.triggered_at.desc()).offset(skip).limit(limit).all()


@router.post("/{rule_id}/acknowledge/{log_id}", status_code=status.HTTP_200_OK)
async def acknowledge_alert(
    rule_id: str,
    log_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Mark an alert log entry as acknowledged by the current user.

    Raises:
        NotFoundError: the rule does not exist for this user, or the log
            entry does not exist under that rule.
    """
    # Ownership check only; the rule itself is not needed below.
    _get_owned_rule(db, rule_id, current_user)

    alert_log = db.query(AlertLog).filter(
        AlertLog.id == log_id,
        AlertLog.rule_id == rule_id
    ).first()
    if not alert_log:
        raise NotFoundError(f"告警日志不存在: {log_id}")

    alert_log.status = "acknowledged"
    # NOTE(review): utcnow() yields a naive datetime; presumably the column
    # stores naive UTC - confirm before switching to an aware timestamp.
    alert_log.acknowledged_at = datetime.utcnow()
    alert_log.acknowledged_by = current_user.id

    db.commit()
    db.refresh(alert_log)

    return {"message": "告警已确认", "log": alert_log}


@router.post("/check", status_code=status.HTTP_200_OK)
async def check_alerts(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Manually run an alert check over the current user's enabled rules.

    Only rules whose ``alert_type`` is ``'error_rate'`` are evaluated; other
    rule types are skipped. For each rule that should trigger, an ``AlertLog``
    is stored, the rule's trigger counters are updated and a notification is
    sent. A failure on one rule is logged and does not stop the others.

    Returns:
        A dict with a human-readable ``message`` and the ``triggered_count``.
    """
    rules = db.query(AlertRule).filter(
        AlertRule.enabled == True,  # noqa: E712 - SQL expression, not a Python comparison
        AlertRule.user_id == current_user.id
    ).all()

    triggered_count = 0

    for rule in rules:
        # Captured first so the error handler never has to touch the ORM
        # object, which may be expired or bound to a failed transaction.
        rule_id = None
        try:
            rule_id = rule.id

            if rule.alert_type != 'error_rate':
                continue

            # Error-rate rules are evaluated by the alert service.
            should_trigger = await AlertService.check_error_rate(db, rule)
            if not should_trigger:
                continue

            # One timestamp for the whole trigger event so the log entry and
            # the rule's last_triggered_at agree exactly.
            now = datetime.utcnow()

            alert_log = AlertLog(
                rule_id=rule.id,
                alert_type=rule.alert_type,
                severity=rule.conditions.get('severity', 'warning'),
                message=f"错误率告警: {rule.target_type} 错误率超过阈值",
                details={
                    "target_type": rule.target_type,
                    "target_id": rule.target_id
                },
                status='pending',
                notification_type=rule.notification_type,
                triggered_at=now
            )
            db.add(alert_log)

            rule.trigger_count += 1
            rule.last_triggered_at = now

            db.commit()
            db.refresh(alert_log)

            # NOTE(review): if sending fails, the alert log is already
            # committed but triggered_count is not incremented - confirm
            # whether that is intended.
            await AlertService.send_notification(db, alert_log, rule)

            triggered_count += 1
        except Exception as e:
            # Discard any pending changes and reset the session; without this
            # a failed flush/commit leaves the session unusable and every
            # remaining rule in the loop would fail too.
            db.rollback()
            logger.exception(f"检查告警规则失败 {rule_id}: {str(e)}")
            continue

    return {
        "message": f"告警检查完成,触发 {triggered_count} 个告警",
        "triggered_count": triggered_count
    }