"""
Alert rule management API.
"""
|
|
from fastapi import APIRouter, Depends, Query, HTTPException, status
|
|
from sqlalchemy.orm import Session
|
|
from pydantic import BaseModel
|
|
from typing import List, Optional, Dict, Any
|
|
from datetime import datetime
|
|
import logging
|
|
from app.core.database import get_db
|
|
from app.models.alert_rule import AlertRule, AlertLog
|
|
from app.api.auth import get_current_user
|
|
from app.models.user import User
|
|
from app.core.exceptions import NotFoundError, ValidationError, ConflictError
|
|
from app.services.alert_service import AlertService
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Router that groups every alert-rule endpoint defined in this module under
# one URL prefix and tag, with a shared set of error-response descriptions.
router = APIRouter(
    prefix="/api/v1/alert-rules",
    tags=["alert-rules"],
    responses={
        401: {"description": "未授权"},  # unauthorized
        404: {"description": "资源不存在"},  # resource not found
        400: {"description": "请求参数错误"},  # invalid request parameters
        500: {"description": "服务器内部错误"},  # internal server error
    },
)
|
|
|
|
|
|
class AlertRuleCreate(BaseModel):
    """Request body for creating an alert rule.

    ``alert_type``, ``target_type`` and ``notification_type`` are declared as
    plain strings, so this model does not restrict their values itself; the
    values noted beside each field are the intended ones.
    """

    name: str
    description: Optional[str] = None
    # Intended values: execution_failed, execution_timeout, error_rate, resource_usage
    alert_type: str
    # Intended values: workflow, agent, system
    target_type: str
    target_id: Optional[str] = None
    conditions: Dict[str, Any]
    # Intended values: email, webhook, internal
    notification_type: str
    notification_config: Optional[Dict[str, Any]] = None
    enabled: bool = True
|
|
|
|
|
|
class AlertRuleUpdate(BaseModel):
    """Request body for updating an alert rule.

    Every field is optional and defaults to ``None``.
    """

    name: Optional[str] = None
    description: Optional[str] = None
    alert_type: Optional[str] = None
    target_type: Optional[str] = None
    target_id: Optional[str] = None
    conditions: Optional[Dict[str, Any]] = None
    notification_type: Optional[str] = None
    notification_config: Optional[Dict[str, Any]] = None
    enabled: Optional[bool] = None
|
|
|
|
|
|
class AlertRuleResponse(BaseModel):
    """Response schema for a single alert rule.

    ``from_attributes`` is enabled so the model can be populated from objects
    that expose these fields as attributes; the route handlers in this module
    return ``AlertRule`` ORM rows directly.
    """

    # Declared through ``model_config`` because class-based ``Config`` is
    # deprecated in Pydantic v2 (``from_attributes`` is a v2 setting).
    model_config = {"from_attributes": True}

    id: str
    name: str
    description: Optional[str]
    alert_type: str
    target_type: str
    target_id: Optional[str]
    conditions: Dict[str, Any]
    notification_type: str
    notification_config: Optional[Dict[str, Any]]
    enabled: bool
    trigger_count: int
    last_triggered_at: Optional[datetime]
    user_id: str
    created_at: datetime
    updated_at: datetime
|
|
|
|
|
|
class AlertLogResponse(BaseModel):
    """Response schema for a single alert log entry.

    ``from_attributes`` is enabled so the model can be populated from objects
    that expose these fields as attributes; the log-listing route in this
    module returns ``AlertLog`` ORM rows directly.
    """

    # Declared through ``model_config`` because class-based ``Config`` is
    # deprecated in Pydantic v2 (``from_attributes`` is a v2 setting).
    model_config = {"from_attributes": True}

    id: str
    rule_id: str
    alert_type: str
    severity: str
    message: str
    details: Optional[Dict[str, Any]]
    status: str
    notification_type: Optional[str]
    notification_result: Optional[str]
    triggered_at: datetime
    acknowledged_at: Optional[datetime]
    acknowledged_by: Optional[str]
|
|
|
|
|
|
@router.get("", response_model=List[AlertRuleResponse])
async def get_alert_rules(
    skip: int = Query(0, ge=0, description="跳过记录数"),
    limit: int = Query(100, ge=1, le=100, description="每页记录数"),
    alert_type: Optional[str] = Query(None, description="告警类型筛选"),
    enabled: Optional[bool] = Query(None, description="是否启用筛选"),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """
    List the current user's alert rules, newest first.

    Supports pagination (``skip`` / ``limit``) and optional filtering by
    ``alert_type`` and ``enabled``.
    """
    # Always scope to the caller; the optional filters are ANDed on top.
    criteria = [AlertRule.user_id == current_user.id]
    if alert_type:
        criteria.append(AlertRule.alert_type == alert_type)
    if enabled is not None:
        criteria.append(AlertRule.enabled == enabled)

    page = (
        db.query(AlertRule)
        .filter(*criteria)
        .order_by(AlertRule.created_at.desc())
        .offset(skip)
        .limit(limit)
    )
    return page.all()
|
|
|
|
|
|
@router.post("", response_model=AlertRuleResponse, status_code=status.HTTP_201_CREATED)
async def create_alert_rule(
    rule_data: AlertRuleCreate,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """
    Create an alert rule owned by the current user.

    Raises ``ValidationError`` when ``alert_type``, ``target_type`` or
    ``notification_type`` is not a supported value (checked in that order),
    and ``ConflictError`` when the user already has a rule with this name.
    """
    if rule_data.alert_type not in (
        'execution_failed', 'execution_timeout', 'error_rate', 'resource_usage',
    ):
        raise ValidationError(f"不支持的告警类型: {rule_data.alert_type}")

    if rule_data.target_type not in ('workflow', 'agent', 'system'):
        raise ValidationError(f"不支持的目标类型: {rule_data.target_type}")

    if rule_data.notification_type not in ('email', 'webhook', 'internal'):
        raise ValidationError(f"不支持的通知方式: {rule_data.notification_type}")

    # Rule names must be unique per user.
    name_taken = (
        db.query(AlertRule)
        .filter(
            AlertRule.name == rule_data.name,
            AlertRule.user_id == current_user.id,
        )
        .first()
    )
    if name_taken:
        raise ConflictError(f"告警规则名称 '{rule_data.name}' 已存在")

    # Copy the payload fields one-to-one onto the new row.
    copied_fields = (
        "name",
        "description",
        "alert_type",
        "target_type",
        "target_id",
        "conditions",
        "notification_type",
        "notification_config",
        "enabled",
    )
    column_values = {field: getattr(rule_data, field) for field in copied_fields}
    alert_rule = AlertRule(user_id=current_user.id, **column_values)

    db.add(alert_rule)
    db.commit()
    db.refresh(alert_rule)

    logger.info(f"用户 {current_user.username} 创建了告警规则: {alert_rule.name} ({alert_rule.id})")
    return alert_rule
|
|
|
|
|
|
@router.get("/{rule_id}", response_model=AlertRuleResponse)
async def get_alert_rule(
    rule_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """
    Return one alert rule of the current user.

    Raises ``NotFoundError`` when no rule with ``rule_id`` belongs to the
    current user.
    """
    owned_rules = db.query(AlertRule).filter(
        AlertRule.id == rule_id,
        AlertRule.user_id == current_user.id,
    )
    found = owned_rules.first()
    if found is None:
        raise NotFoundError(f"告警规则不存在: {rule_id}")
    return found
|
|
|
|
|
|
@router.put("/{rule_id}", response_model=AlertRuleResponse)
async def update_alert_rule(
    rule_id: str,
    rule_data: AlertRuleUpdate,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """
    Update one of the current user's alert rules.

    Only payload fields whose value is not ``None`` are applied. Raises
    ``NotFoundError`` when the rule is not found for this user,
    ``ConflictError`` when another of the user's rules already has the new
    name, and ``ValidationError`` when a supplied ``alert_type``,
    ``target_type`` or ``notification_type`` is not a supported value.
    """
    rule = (
        db.query(AlertRule)
        .filter(AlertRule.id == rule_id, AlertRule.user_id == current_user.id)
        .first()
    )
    if rule is None:
        raise NotFoundError(f"告警规则不存在: {rule_id}")

    def apply_if_given(field: str) -> None:
        """Copy ``field`` from the payload onto the rule unless it is None."""
        value = getattr(rule_data, field)
        if value is not None:
            setattr(rule, field, value)

    # Checks and assignments run in a fixed field order; the first failing
    # check raises before the commit below is reached.
    if rule_data.name is not None:
        # The new name must not collide with another rule of the same user.
        name_clash = (
            db.query(AlertRule)
            .filter(
                AlertRule.name == rule_data.name,
                AlertRule.user_id == current_user.id,
                AlertRule.id != rule_id,
            )
            .first()
        )
        if name_clash:
            raise ConflictError(f"告警规则名称 '{rule_data.name}' 已存在")
    apply_if_given("name")

    apply_if_given("description")

    if rule_data.alert_type is not None and rule_data.alert_type not in (
        'execution_failed', 'execution_timeout', 'error_rate', 'resource_usage',
    ):
        raise ValidationError(f"不支持的告警类型: {rule_data.alert_type}")
    apply_if_given("alert_type")

    if rule_data.target_type is not None and rule_data.target_type not in (
        'workflow', 'agent', 'system',
    ):
        raise ValidationError(f"不支持的目标类型: {rule_data.target_type}")
    apply_if_given("target_type")

    apply_if_given("target_id")
    apply_if_given("conditions")

    if rule_data.notification_type is not None and rule_data.notification_type not in (
        'email', 'webhook', 'internal',
    ):
        raise ValidationError(f"不支持的通知方式: {rule_data.notification_type}")
    apply_if_given("notification_type")

    apply_if_given("notification_config")
    apply_if_given("enabled")

    db.commit()
    db.refresh(rule)

    logger.info(f"用户 {current_user.username} 更新了告警规则: {rule.name} ({rule.id})")
    return rule
|
|
|
|
|
|
@router.delete("/{rule_id}", status_code=status.HTTP_200_OK)
async def delete_alert_rule(
    rule_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """
    Delete one of the current user's alert rules.

    Raises ``NotFoundError`` when no rule with ``rule_id`` belongs to the
    current user. Returns a confirmation message on success.
    """
    owned_rules = db.query(AlertRule).filter(
        AlertRule.id == rule_id,
        AlertRule.user_id == current_user.id,
    )
    doomed = owned_rules.first()
    if doomed is None:
        raise NotFoundError(f"告警规则不存在: {rule_id}")

    # Keep the name for the log line; it is read before the row is deleted.
    rule_name = doomed.name
    db.delete(doomed)
    db.commit()

    logger.info(f"用户 {current_user.username} 删除了告警规则: {rule_name} ({rule_id})")
    return {"message": "告警规则已删除"}
|
|
|
|
|
|
@router.get("/{rule_id}/logs", response_model=List[AlertLogResponse])
async def get_alert_logs(
    rule_id: str,
    skip: int = Query(0, ge=0, description="跳过记录数"),
    limit: int = Query(50, ge=1, le=100, description="每页记录数"),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """
    List the logs of one alert rule, most recently triggered first.

    Raises ``NotFoundError`` when no rule with ``rule_id`` belongs to the
    current user.
    """
    # Ownership check: the rule must exist and belong to the caller.
    owned_rule = (
        db.query(AlertRule)
        .filter(AlertRule.id == rule_id, AlertRule.user_id == current_user.id)
        .first()
    )
    if owned_rule is None:
        raise NotFoundError(f"告警规则不存在: {rule_id}")

    log_page = (
        db.query(AlertLog)
        .filter(AlertLog.rule_id == rule_id)
        .order_by(AlertLog.triggered_at.desc())
        .offset(skip)
        .limit(limit)
    )
    return log_page.all()
|
|
|
|
|
|
@router.post("/{rule_id}/acknowledge/{log_id}", status_code=status.HTTP_200_OK)
async def acknowledge_alert(
    rule_id: str,
    log_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """
    Mark one alert log entry as acknowledged by the current user.

    Raises ``NotFoundError`` when the rule does not belong to the current
    user, or when no log with ``log_id`` exists under that rule. Returns a
    confirmation message together with the updated log entry.
    """
    # Ownership check: the rule must exist and belong to the caller.
    owned_rule = (
        db.query(AlertRule)
        .filter(AlertRule.id == rule_id, AlertRule.user_id == current_user.id)
        .first()
    )
    if owned_rule is None:
        raise NotFoundError(f"告警规则不存在: {rule_id}")

    # The log entry must belong to that same rule.
    entry = (
        db.query(AlertLog)
        .filter(AlertLog.id == log_id, AlertLog.rule_id == rule_id)
        .first()
    )
    if entry is None:
        raise NotFoundError(f"告警日志不存在: {log_id}")

    entry.status = "acknowledged"
    # NOTE(review): naive UTC timestamp; datetime.utcnow() is deprecated from
    # Python 3.12 - confirm the column's timezone handling before changing it.
    entry.acknowledged_at = datetime.utcnow()
    entry.acknowledged_by = current_user.id

    db.commit()
    db.refresh(entry)

    return {"message": "告警已确认", "log": entry}
|
|
|
|
|
|
@router.post("/check", status_code=status.HTTP_200_OK)
async def check_alerts(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """
    Manually run the alert check over the current user's enabled rules.

    Only rules whose ``alert_type`` is ``'error_rate'`` are evaluated here;
    enabled rules of any other type are loaded but skipped.

    For each rule that ``AlertService.check_error_rate`` reports as
    triggered, an ``AlertLog`` row is stored, the rule's ``trigger_count`` and
    ``last_triggered_at`` are updated, and ``AlertService.send_notification``
    is called. A failure while handling one rule is logged with its traceback,
    the session is rolled back, and processing continues with the next rule.

    Returns:
        A dict with a human-readable ``message`` and the ``triggered_count``
        of rules that were triggered and notified without raising.
    """
    rules = db.query(AlertRule).filter(
        AlertRule.enabled == True,  # noqa: E712 - builds a SQL expression
        AlertRule.user_id == current_user.id,
    ).all()

    triggered_count = 0

    for rule in rules:
        # Read the id up front so the error log below never has to touch the
        # instance after a rollback has expired it.
        rule_id = rule.id
        try:
            if rule.alert_type != 'error_rate':
                continue

            should_trigger = await AlertService.check_error_rate(db, rule)
            if not should_trigger:
                continue

            # One timestamp so the log row and the rule agree exactly.
            # NOTE(review): naive UTC, kept as the existing columns receive it.
            triggered_at = datetime.utcnow()
            # Tolerate a rule stored without conditions.
            conditions = rule.conditions or {}

            alert_log = AlertLog(
                rule_id=rule.id,
                alert_type=rule.alert_type,
                severity=conditions.get('severity', 'warning'),
                message=f"错误率告警: {rule.target_type} 错误率超过阈值",
                details={
                    "target_type": rule.target_type,
                    "target_id": rule.target_id,
                },
                status='pending',
                notification_type=rule.notification_type,
                triggered_at=triggered_at,
            )
            db.add(alert_log)
            # Treat a missing counter as zero instead of failing on None + 1.
            rule.trigger_count = (rule.trigger_count or 0) + 1
            rule.last_triggered_at = triggered_at
            db.commit()
            db.refresh(alert_log)

            await AlertService.send_notification(db, alert_log, rule)
            triggered_count += 1
        except Exception as e:
            # logger.exception keeps the traceback that logger.error dropped.
            logger.exception(f"检查告警规则失败 {rule_id}: {str(e)}")
            # Without a rollback, a failed flush/commit leaves the session
            # unusable for the remaining rules, and an AlertLog added before
            # the failure would be committed together with the next rule.
            db.rollback()
            continue

    return {
        "message": f"告警检查完成,触发 {triggered_count} 个告警",
        "triggered_count": triggered_count,
    }
|