"""
Alert service.

Provides alert detection (failed executions, execution timeouts, error rate)
and notification delivery (email, webhook, internal).
"""
import logging
from datetime import datetime, timedelta
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from typing import Dict, Any, List, Optional

import aiosmtplib
import httpx
from sqlalchemy import func, and_
from sqlalchemy.orm import Session

from app.models.alert_rule import AlertRule, AlertLog
from app.models.execution import Execution
from app.models.workflow import Workflow
from app.models.execution_log import ExecutionLog

logger = logging.getLogger(__name__)


class AlertService:
    """Evaluates alert rules against executions and dispatches notifications."""

    @staticmethod
    def _matches_target(rule: AlertRule, execution: Execution) -> bool:
        """Return whether ``execution`` belongs to the target ``rule`` is scoped to.

        A rule without ``target_id``, or with a target type other than
        ``'workflow'``/``'agent'``, matches every execution.
        """
        if not rule.target_id:
            return True
        if rule.target_type == 'workflow':
            return execution.workflow_id == rule.target_id
        if rule.target_type == 'agent':
            return execution.agent_id == rule.target_id
        return True

    @staticmethod
    def _filter_by_target(query, rule: AlertRule):
        """Restrict an ``Execution`` query to the rule's workflow/agent, if any."""
        if rule.target_id:
            if rule.target_type == 'workflow':
                return query.filter(Execution.workflow_id == rule.target_id)
            if rule.target_type == 'agent':
                return query.filter(Execution.agent_id == rule.target_id)
        return query

    @staticmethod
    async def check_execution_failed(
        db: Session,
        rule: AlertRule,
        execution: Execution
    ) -> bool:
        """
        Check whether a failed execution should trigger the rule.

        The rule fires when the number of failed executions for the rule's
        target within ``time_window`` seconds satisfies ``comparison``
        against ``threshold``.

        Args:
            db: Database session.
            rule: Alert rule; ``rule.conditions`` may provide ``threshold``
                (default 1), ``time_window`` in seconds (default 3600) and
                ``comparison`` (``'gt'``, ``'gte'`` or ``'eq'``; default
                ``'gt'``).
            execution: Execution record being evaluated.

        Returns:
            True if the alert should be triggered.
        """
        if execution.status != 'failed':
            return False
        if not AlertService._matches_target(rule, execution):
            return False

        # conditions may be unset on the rule; fall back to the defaults.
        conditions = rule.conditions or {}
        threshold = conditions.get('threshold', 1)
        time_window = conditions.get('time_window', 3600)
        comparison = conditions.get('comparison', 'gt')

        # NOTE(review): assumes Execution.created_at is stored as naive UTC,
        # matching datetime.utcnow() — confirm against the model.
        start_time = datetime.utcnow() - timedelta(seconds=time_window)

        query = db.query(func.count(Execution.id)).filter(
            Execution.status == 'failed',
            Execution.created_at >= start_time
        )
        failed_count = AlertService._filter_by_target(query, rule).scalar() or 0

        if comparison == 'gte':
            return failed_count >= threshold
        if comparison == 'eq':
            return failed_count == threshold
        # 'gt' and any unrecognised operator use strict greater-than.
        return failed_count > threshold

    @staticmethod
    async def check_execution_timeout(
        db: Session,
        rule: AlertRule,
        execution: Execution
    ) -> bool:
        """
        Check whether a running execution has exceeded the rule's timeout.

        Args:
            db: Database session (not used by this check).
            rule: Alert rule; ``rule.conditions['timeout_seconds']`` defaults
                to 3600.
            execution: Execution record being evaluated.

        Returns:
            True if the execution is still running and has been running for
            longer than the timeout; False otherwise, including when
            ``created_at`` is not set.
        """
        if execution.status != 'running':
            return False
        if not AlertService._matches_target(rule, execution):
            return False

        conditions = rule.conditions or {}
        timeout_seconds = conditions.get('timeout_seconds', 3600)

        if not execution.created_at:
            return False
        elapsed = (datetime.utcnow() - execution.created_at).total_seconds()
        return elapsed > timeout_seconds

    @staticmethod
    async def check_error_rate(
        db: Session,
        rule: AlertRule
    ) -> bool:
        """
        Check whether the failure ratio inside the time window reaches the threshold.

        Args:
            db: Database session.
            rule: Alert rule; ``rule.conditions`` may provide ``threshold``
                (ratio, default 0.1) and ``time_window`` in seconds
                (default 3600).

        Returns:
            True if ``failed / total >= threshold`` for the rule's target;
            False when there are no executions in the window.
        """
        conditions = rule.conditions or {}
        threshold = conditions.get('threshold', 0.1)
        time_window = conditions.get('time_window', 3600)

        start_time = datetime.utcnow() - timedelta(seconds=time_window)

        # Count in the database rather than loading every row just to count it.
        counts = AlertService._filter_by_target(
            db.query(func.count(Execution.id)).filter(
                Execution.created_at >= start_time
            ),
            rule
        )
        total_count = counts.scalar() or 0
        if not total_count:
            return False

        failed_count = counts.filter(Execution.status == 'failed').scalar() or 0
        return failed_count / total_count >= threshold

    @staticmethod
    async def _evaluate_rule(
        db: Session,
        rule: AlertRule,
        execution: Execution
    ) -> Optional[tuple]:
        """Run the check for ``rule.alert_type``.

        Returns ``(message, details)`` when the rule fires, otherwise ``None``
        (also for alert types this service does not know).
        """
        if rule.alert_type == 'execution_failed':
            if await AlertService.check_execution_failed(db, rule, execution):
                return (
                    f"执行失败告警: 工作流 {execution.workflow_id} 执行失败",
                    {
                        "execution_id": execution.id,
                        "workflow_id": execution.workflow_id,
                        "status": execution.status,
                        "error_message": execution.error_message
                    }
                )
        elif rule.alert_type == 'execution_timeout':
            if await AlertService.check_execution_timeout(db, rule, execution):
                elapsed = (
                    (datetime.utcnow() - execution.created_at).total_seconds()
                    if execution.created_at else 0
                )
                return (
                    f"执行超时告警: 工作流 {execution.workflow_id} 执行超时 ({elapsed:.0f}秒)",
                    {
                        "execution_id": execution.id,
                        "workflow_id": execution.workflow_id,
                        "elapsed_seconds": elapsed
                    }
                )
        elif rule.alert_type == 'error_rate':
            if await AlertService.check_error_rate(db, rule):
                return (
                    f"错误率告警: {rule.target_type} 错误率超过阈值",
                    {
                        "target_type": rule.target_type,
                        "target_id": rule.target_id
                    }
                )
        return None

    @staticmethod
    async def check_alerts_for_execution(
        db: Session,
        execution: Execution
    ) -> List[AlertLog]:
        """
        Evaluate every enabled alert rule relevant to an execution.

        Relevant rules are those scoped to the execution's workflow (or, if
        it has no workflow, its agent) — either to that specific id or to no
        id at all — plus all system-level rules. For each rule that fires an
        ``AlertLog`` is persisted, the rule's trigger statistics are updated
        and a notification is sent.

        Args:
            db: Database session.
            execution: Execution record to evaluate.

        Returns:
            The alert logs created for the rules that fired. A rule whose
            evaluation raises is logged and skipped.
        """
        triggered_logs = []

        query = db.query(AlertRule).filter(AlertRule.enabled == True)  # noqa: E712

        # System-level rules must be OR-ed with the target-specific clause:
        # chaining a second .filter() would AND them, and no rule can have
        # two target types at once, so nothing would ever match.
        is_system_rule = AlertRule.target_type == 'system'
        if execution.workflow_id:
            query = query.filter(
                (
                    (AlertRule.target_type == 'workflow') &
                    ((AlertRule.target_id == execution.workflow_id) |
                     (AlertRule.target_id.is_(None)))
                ) | is_system_rule
            )
        elif execution.agent_id:
            query = query.filter(
                (
                    (AlertRule.target_type == 'agent') &
                    ((AlertRule.target_id == execution.agent_id) |
                     (AlertRule.target_id.is_(None)))
                ) | is_system_rule
            )
        else:
            query = query.filter(is_system_rule)

        for rule in query.all():
            # Captured up front so the error path never has to touch the
            # (possibly invalid) session to read it.
            rule_id = None
            try:
                rule_id = rule.id
                outcome = await AlertService._evaluate_rule(db, rule, execution)
                if outcome is None:
                    continue
                alert_message, alert_details = outcome

                alert_log = AlertLog(
                    rule_id=rule_id,
                    alert_type=rule.alert_type,
                    severity=(rule.conditions or {}).get('severity', 'warning'),
                    message=alert_message,
                    details=alert_details,
                    status='pending',
                    notification_type=rule.notification_type,
                    triggered_at=datetime.utcnow()
                )
                db.add(alert_log)

                # Update rule statistics; tolerate a NULL counter.
                rule.trigger_count = (rule.trigger_count or 0) + 1
                rule.last_triggered_at = datetime.utcnow()

                db.commit()
                db.refresh(alert_log)

                await AlertService.send_notification(db, alert_log, rule)

                triggered_logs.append(alert_log)

            except Exception as e:
                # Reset the session so a failed flush/commit for this rule
                # does not break the remaining rules.
                db.rollback()
                logger.exception(f"检查告警规则失败 {rule_id}: {str(e)}")
                continue

        return triggered_logs

    @staticmethod
    async def send_notification(
        db: Session,
        alert_log: AlertLog,
        rule: AlertRule
    ) -> None:
        """
        Send the notification for an alert and record the outcome on the log.

        Sets ``alert_log.status`` to ``'sent'`` or ``'failed'`` and fills
        ``alert_log.notification_result``; the session is committed in either
        case. Delivery errors are logged, not raised.

        Args:
            db: Database session.
            alert_log: Alert log to notify about.
            rule: Alert rule holding the notification type and config.
        """
        try:
            if rule.notification_type == 'email':
                await AlertService.send_email_notification(alert_log, rule)
            elif rule.notification_type == 'webhook':
                await AlertService.send_webhook_notification(alert_log, rule)
            elif rule.notification_type == 'internal':
                # In-app notification: the persisted alert log is the notification.
                pass

            alert_log.status = 'sent'
            alert_log.notification_result = '通知发送成功'
        except Exception as e:
            logger.error(f"发送告警通知失败: {str(e)}")
            alert_log.status = 'failed'
            alert_log.notification_result = f"通知发送失败: {str(e)}"
        finally:
            db.commit()

    @staticmethod
    async def send_email_notification(
        alert_log: AlertLog,
        rule: AlertRule
    ) -> None:
        """
        Send an alert by email via SMTP.

        Reads ``smtp_host`` (default ``smtp.gmail.com``), ``smtp_port``
        (default 587), ``smtp_user``, ``smtp_password``, ``to_email`` and the
        optional ``use_tls`` flag from ``rule.notification_config``.

        Args:
            alert_log: Alert log to report.
            rule: Alert rule holding the SMTP configuration.

        Raises:
            ValueError: If ``to_email`` is missing from the configuration.
        """
        config = rule.notification_config or {}
        smtp_host = config.get('smtp_host', 'smtp.gmail.com')
        smtp_port = config.get('smtp_port', 587)
        smtp_user = config.get('smtp_user')
        smtp_password = config.get('smtp_password')
        to_email = config.get('to_email')

        if not to_email:
            raise ValueError("邮件通知配置缺少收件人地址")

        message = MIMEMultipart()
        message['From'] = smtp_user
        message['To'] = to_email
        message['Subject'] = f"告警通知: {rule.name}"

        body = f"""
告警规则: {rule.name}
告警类型: {alert_log.alert_type}
严重程度: {alert_log.severity}
告警消息: {alert_log.message}
触发时间: {alert_log.triggered_at}
"""

        if alert_log.details:
            body += f"\n详细信息:\n{alert_log.details}"

        message.attach(MIMEText(body, 'plain', 'utf-8'))

        # Port 465 speaks TLS from the first byte (implicit TLS); submission
        # ports such as 587 start in plaintext and upgrade via STARTTLS.
        # Forcing implicit TLS on 587 makes the handshake fail, so pick the
        # mode from the port unless the config says otherwise.
        implicit_tls = config.get('use_tls')
        if implicit_tls is None:
            implicit_tls = str(smtp_port) == '465'
        implicit_tls = bool(implicit_tls)

        await aiosmtplib.send(
            message,
            hostname=smtp_host,
            port=smtp_port,
            username=smtp_user,
            password=smtp_password,
            use_tls=implicit_tls,
            start_tls=not implicit_tls
        )

    @staticmethod
    async def send_webhook_notification(
        alert_log: AlertLog,
        rule: AlertRule
    ) -> None:
        """
        Send an alert as a JSON POST to the configured webhook.

        Reads ``webhook_url`` and optional ``headers`` from
        ``rule.notification_config``. The request uses a 10 second timeout.

        Args:
            alert_log: Alert log to report.
            rule: Alert rule holding the webhook configuration.

        Raises:
            ValueError: If ``webhook_url`` is missing from the configuration.
            httpx.HTTPStatusError: If the endpoint answers with a 4xx/5xx status.
        """
        config = rule.notification_config or {}
        webhook_url = config.get('webhook_url')

        if not webhook_url:
            raise ValueError("Webhook通知配置缺少URL")

        triggered_at = alert_log.triggered_at
        payload = {
            "rule_name": rule.name,
            "alert_type": alert_log.alert_type,
            "severity": alert_log.severity,
            "message": alert_log.message,
            "details": alert_log.details,
            "triggered_at": triggered_at.isoformat() if triggered_at else None
        }

        async with httpx.AsyncClient() as client:
            response = await client.post(
                webhook_url,
                json=payload,
                headers=config.get('headers') or {},
                timeout=10
            )
            response.raise_for_status()