Files
aiagent/backend/app/services/alert_service.py
2026-01-19 00:09:36 +08:00

392 lines
13 KiB
Python

"""
Alert service.
Provides alert detection and notification.
"""
from sqlalchemy.orm import Session
from sqlalchemy import func, and_
from datetime import datetime, timedelta
from typing import Dict, Any, List, Optional
import logging
from app.models.alert_rule import AlertRule, AlertLog
from app.models.execution import Execution
from app.models.workflow import Workflow
from app.models.execution_log import ExecutionLog
import httpx
import aiosmtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
logger = logging.getLogger(__name__)
class AlertService:
    """Evaluate alert rules against executions and deliver notifications.

    Every method is a static method; the class is only a namespace.
    Rule configuration is read from ``rule.conditions`` and
    ``rule.notification_config``; both are treated as empty mappings when unset.
    """

    @staticmethod
    def _matches_target(rule: AlertRule, execution: Execution) -> bool:
        """Return True unless the rule is pinned to a different workflow/agent."""
        if not rule.target_id:
            # No concrete target id: the rule applies to every execution.
            return True
        if rule.target_type == 'workflow':
            return execution.workflow_id == rule.target_id
        if rule.target_type == 'agent':
            return execution.agent_id == rule.target_id
        return True

    @staticmethod
    def _filter_by_target(query, rule: AlertRule):
        """Narrow an ``Execution`` query to the workflow/agent the rule targets."""
        if rule.target_id:
            if rule.target_type == 'workflow':
                return query.filter(Execution.workflow_id == rule.target_id)
            if rule.target_type == 'agent':
                return query.filter(Execution.agent_id == rule.target_id)
        return query

    @staticmethod
    def _elapsed_seconds(started_at: Optional[datetime]) -> Optional[float]:
        """Return the seconds elapsed since ``started_at``, or None when unset."""
        if not started_at:
            return None
        if started_at.tzinfo is not None:
            # Aware timestamp: compare against an aware "now" in the same zone,
            # because subtracting a naive datetime would raise TypeError.
            now = datetime.now(started_at.tzinfo)
        else:
            # NOTE(review): naive timestamps are assumed to be stored in UTC,
            # matching the utcnow() usage elsewhere in this class - confirm.
            now = datetime.utcnow()
        return (now - started_at).total_seconds()

    @staticmethod
    async def check_execution_failed(
        db: Session,
        rule: AlertRule,
        execution: Execution
    ) -> bool:
        """
        Check whether a failed execution should trigger the rule.

        The rule triggers when the number of failed executions inside the
        time window compares to the threshold as configured.

        Conditions read from ``rule.conditions``:
            threshold: failure count to compare against (default 1).
            time_window: look-back window in seconds (default 3600).
            comparison: 'gt', 'gte' or 'eq'; anything else behaves like 'gt'.

        Args:
            db: database session
            rule: alert rule
            execution: execution record

        Returns:
            True when the alert should be triggered.
        """
        if execution.status != 'failed':
            return False
        if not AlertService._matches_target(rule, execution):
            return False

        conditions = rule.conditions or {}
        threshold = conditions.get('threshold', 1)
        time_window = conditions.get('time_window', 3600)
        comparison = conditions.get('comparison', 'gt')

        start_time = datetime.utcnow() - timedelta(seconds=time_window)
        query = db.query(func.count(Execution.id)).filter(
            Execution.status == 'failed',
            Execution.created_at >= start_time,
        )
        query = AlertService._filter_by_target(query, rule)
        failed_count = query.scalar() or 0

        if comparison == 'gte':
            return failed_count >= threshold
        if comparison == 'eq':
            return failed_count == threshold
        # 'gt' and any unrecognised operator fall back to strict greater-than.
        return failed_count > threshold

    @staticmethod
    async def check_execution_timeout(
        db: Session,
        rule: AlertRule,
        execution: Execution
    ) -> bool:
        """
        Check whether a running execution has exceeded the rule's timeout.

        Conditions read from ``rule.conditions``:
            timeout_seconds: maximum allowed run time (default 3600).

        Args:
            db: database session (unused here; kept for a uniform signature)
            rule: alert rule
            execution: execution record

        Returns:
            True when the execution is still running, matches the rule target
            and has been running longer than the timeout.
        """
        if execution.status != 'running':
            return False
        if not AlertService._matches_target(rule, execution):
            return False

        conditions = rule.conditions or {}
        timeout_seconds = conditions.get('timeout_seconds', 3600)

        elapsed = AlertService._elapsed_seconds(execution.created_at)
        if elapsed is None:
            # Without a start timestamp the run time cannot be judged.
            return False
        return elapsed > timeout_seconds

    @staticmethod
    async def check_error_rate(
        db: Session,
        rule: AlertRule
    ) -> bool:
        """
        Check whether the failure ratio inside the time window reaches the threshold.

        Conditions read from ``rule.conditions``:
            threshold: failure ratio in [0, 1] (default 0.1, i.e. 10%).
            time_window: look-back window in seconds (default 3600).

        Args:
            db: database session
            rule: alert rule

        Returns:
            True when failed / total >= threshold; False when there are no
            executions in the window.
        """
        conditions = rule.conditions or {}
        threshold = conditions.get('threshold', 0.1)
        time_window = conditions.get('time_window', 3600)

        start_time = datetime.utcnow() - timedelta(seconds=time_window)
        query = AlertService._filter_by_target(
            db.query(Execution).filter(Execution.created_at >= start_time),
            rule,
        )

        # Count in the database instead of loading every row into memory.
        total_count = query.count()
        if not total_count:
            return False
        failed_count = query.filter(Execution.status == 'failed').count()
        return failed_count / total_count >= threshold

    @staticmethod
    async def _evaluate_rule(db: Session, rule: AlertRule, execution: Execution) -> tuple:
        """Run the check for the rule's alert type.

        Returns:
            A ``(should_trigger, message, details)`` tuple; message and details
            are empty when the rule does not trigger or its type is unknown.
        """
        alert_type = rule.alert_type

        if alert_type == 'execution_failed':
            if await AlertService.check_execution_failed(db, rule, execution):
                return (
                    True,
                    f"执行失败告警: 工作流 {execution.workflow_id} 执行失败",
                    {
                        "execution_id": execution.id,
                        "workflow_id": execution.workflow_id,
                        "status": execution.status,
                        "error_message": execution.error_message
                    },
                )
        elif alert_type == 'execution_timeout':
            if await AlertService.check_execution_timeout(db, rule, execution):
                elapsed = AlertService._elapsed_seconds(execution.created_at)
                if elapsed is None:
                    elapsed = 0
                return (
                    True,
                    f"执行超时告警: 工作流 {execution.workflow_id} 执行超时 ({elapsed:.0f}秒)",
                    {
                        "execution_id": execution.id,
                        "workflow_id": execution.workflow_id,
                        "elapsed_seconds": elapsed
                    },
                )
        elif alert_type == 'error_rate':
            if await AlertService.check_error_rate(db, rule):
                return (
                    True,
                    f"错误率告警: {rule.target_type} 错误率超过阈值",
                    {
                        "target_type": rule.target_type,
                        "target_id": rule.target_id
                    },
                )

        return False, "", {}

    @staticmethod
    async def check_alerts_for_execution(
        db: Session,
        execution: Execution
    ) -> List[AlertLog]:
        """
        Evaluate every enabled rule relevant to an execution.

        Relevant rules are system-level rules plus workflow (or agent) rules
        that either target this execution's workflow/agent or have no target id.
        For each triggered rule an ``AlertLog`` is stored, the rule statistics
        are updated and a notification is sent. A failure while handling one
        rule is logged and does not stop the remaining rules.

        Args:
            db: database session
            execution: execution record

        Returns:
            The alert logs created for the triggered rules.
        """
        triggered_logs: List[AlertLog] = []

        # System-level rules always apply. Target-specific rules are OR-ed in;
        # AND-ing them would require target_type to hold two values at once
        # and match nothing.
        target_filter = AlertRule.target_type == 'system'
        if execution.workflow_id:
            target_filter = target_filter | (
                (AlertRule.target_type == 'workflow') &
                ((AlertRule.target_id == execution.workflow_id) | (AlertRule.target_id.is_(None)))
            )
        elif execution.agent_id:
            target_filter = target_filter | (
                (AlertRule.target_type == 'agent') &
                ((AlertRule.target_id == execution.agent_id) | (AlertRule.target_id.is_(None)))
            )

        rules = db.query(AlertRule).filter(
            AlertRule.enabled == True,  # noqa: E712 - SQL expression, not a Python bool test
            target_filter,
        ).all()

        for rule in rules:
            # Captured up front so the error log does not need to touch the
            # ORM object after a rollback.
            rule_id = None
            try:
                rule_id = rule.id
                should_trigger, alert_message, alert_details = await AlertService._evaluate_rule(
                    db, rule, execution
                )
                if not should_trigger:
                    continue

                # Create the alert log.
                alert_log = AlertLog(
                    rule_id=rule.id,
                    alert_type=rule.alert_type,
                    severity=(rule.conditions or {}).get('severity', 'warning'),
                    message=alert_message,
                    details=alert_details,
                    status='pending',
                    notification_type=rule.notification_type,
                    triggered_at=datetime.utcnow()
                )
                db.add(alert_log)

                # Update rule statistics; tolerate a NULL counter.
                rule.trigger_count = (rule.trigger_count or 0) + 1
                rule.last_triggered_at = datetime.utcnow()
                db.commit()
                db.refresh(alert_log)

                # Send the notification.
                await AlertService.send_notification(db, alert_log, rule)
                triggered_logs.append(alert_log)
            except Exception as e:
                # Reset the session so a failed flush/commit for this rule
                # does not poison the processing of the remaining rules.
                db.rollback()
                logger.error(f"检查告警规则失败 {rule_id}: {str(e)}")
                continue

        return triggered_logs

    @staticmethod
    async def send_notification(
        db: Session,
        alert_log: AlertLog,
        rule: AlertRule
    ):
        """
        Send the notification for an alert and record the outcome on the log.

        ``alert_log.status`` becomes 'sent' on success and 'failed' when the
        channel raises; the result is committed in either case.

        Args:
            db: database session
            alert_log: alert log
            rule: alert rule
        """
        try:
            if rule.notification_type == 'email':
                await AlertService.send_email_notification(alert_log, rule)
            elif rule.notification_type == 'webhook':
                await AlertService.send_webhook_notification(alert_log, rule)
            elif rule.notification_type == 'internal':
                # In-app notification: the stored alert log is the notification.
                pass
            # NOTE(review): an unrecognised notification_type also ends up
            # marked as 'sent' - confirm this is intended.
            alert_log.status = 'sent'
            alert_log.notification_result = '通知发送成功'
        except Exception as e:
            logger.error(f"发送告警通知失败: {str(e)}")
            alert_log.status = 'failed'
            alert_log.notification_result = f"通知发送失败: {str(e)}"
        finally:
            db.commit()

    @staticmethod
    async def send_email_notification(
        alert_log: AlertLog,
        rule: AlertRule
    ):
        """
        Send the alert by email via SMTP.

        Keys read from ``rule.notification_config``:
            smtp_host (default 'smtp.gmail.com'), smtp_port (default 587),
            smtp_user, smtp_password, to_email (required),
            use_tls (optional): connect with implicit TLS; defaults to True
                except on port 587,
            start_tls (optional): upgrade with STARTTLS; defaults to the
                opposite of ``use_tls``.

        Args:
            alert_log: alert log
            rule: alert rule

        Raises:
            ValueError: when no recipient address is configured.
        """
        config = rule.notification_config or {}
        smtp_host = config.get('smtp_host', 'smtp.gmail.com')
        smtp_port = config.get('smtp_port', 587)
        smtp_user = config.get('smtp_user')
        smtp_password = config.get('smtp_password')
        to_email = config.get('to_email')
        if not to_email:
            raise ValueError("邮件通知配置缺少收件人地址")

        # Port 587 is the STARTTLS submission port: the connection starts in
        # plain text and is upgraded, so implicit TLS there fails the
        # handshake. Other ports keep implicit TLS unless configured otherwise.
        use_tls = config.get('use_tls', smtp_port != 587)
        start_tls = config.get('start_tls', not use_tls)

        # Build the message.
        message = MIMEMultipart()
        message['From'] = smtp_user
        message['To'] = to_email
        message['Subject'] = f"告警通知: {rule.name}"

        body = (
            "\n"
            f"告警规则: {rule.name}\n"
            f"告警类型: {alert_log.alert_type}\n"
            f"严重程度: {alert_log.severity}\n"
            f"告警消息: {alert_log.message}\n"
            f"触发时间: {alert_log.triggered_at}\n"
        )
        if alert_log.details:
            body += f"\n详细信息:\n{alert_log.details}"
        message.attach(MIMEText(body, 'plain', 'utf-8'))

        # Send the message.
        await aiosmtplib.send(
            message,
            hostname=smtp_host,
            port=smtp_port,
            username=smtp_user,
            password=smtp_password,
            use_tls=use_tls,
            start_tls=start_tls
        )

    @staticmethod
    async def send_webhook_notification(
        alert_log: AlertLog,
        rule: AlertRule
    ):
        """
        POST the alert as JSON to the configured webhook.

        Keys read from ``rule.notification_config``:
            webhook_url (required), headers (optional),
            timeout (optional, seconds, default 10).

        Args:
            alert_log: alert log
            rule: alert rule

        Raises:
            ValueError: when no webhook URL is configured.
            httpx.HTTPStatusError: when the endpoint answers with a 4xx/5xx status.
        """
        config = rule.notification_config or {}
        webhook_url = config.get('webhook_url')
        if not webhook_url:
            raise ValueError("Webhook通知配置缺少URL")

        # Build the request payload.
        triggered_at = alert_log.triggered_at
        payload = {
            "rule_name": rule.name,
            "alert_type": alert_log.alert_type,
            "severity": alert_log.severity,
            "message": alert_log.message,
            "details": alert_log.details,
            "triggered_at": triggered_at.isoformat() if triggered_at else None
        }

        # Send the HTTP request.
        async with httpx.AsyncClient() as client:
            response = await client.post(
                webhook_url,
                json=payload,
                headers=config.get('headers', {}),
                timeout=config.get('timeout', 10)
            )
            response.raise_for_status()