392 lines
13 KiB
Python
392 lines
13 KiB
Python
"""
Alert service.

Provides alert detection and notification functionality.
"""
|
|
from sqlalchemy.orm import Session
|
|
from sqlalchemy import func, and_
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, Any, List, Optional
|
|
import logging
|
|
from app.models.alert_rule import AlertRule, AlertLog
|
|
from app.models.execution import Execution
|
|
from app.models.workflow import Workflow
|
|
from app.models.execution_log import ExecutionLog
|
|
import httpx
|
|
import aiosmtplib
|
|
from email.mime.text import MIMEText
|
|
from email.mime.multipart import MIMEMultipart
|
|
|
|
# Module-level logger named after this module (via __name__).
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class AlertService:
    """Evaluate alert rules against executions and deliver notifications.

    Every method is a static coroutine or static helper; the class is only a
    namespace. Rules are read from ``AlertRule``, triggered alerts are
    persisted as ``AlertLog`` rows, and notifications are delivered by email
    (aiosmtplib) or webhook (httpx).
    """

    # Standard SMTP submission port; it expects STARTTLS, not implicit TLS.
    _STARTTLS_PORT = "587"

    @staticmethod
    def _conditions(rule: AlertRule) -> Dict[str, Any]:
        """Return the rule's condition mapping, treating a missing one as empty."""
        return rule.conditions or {}

    @staticmethod
    def _matches_target(rule: AlertRule, execution: Execution) -> bool:
        """Tell whether ``execution`` belongs to the target the rule is scoped to.

        A rule without ``target_id`` (or with a target type other than
        ``workflow``/``agent``) matches every execution.
        """
        if not rule.target_id:
            return True
        if rule.target_type == 'workflow':
            return execution.workflow_id == rule.target_id
        if rule.target_type == 'agent':
            return execution.agent_id == rule.target_id
        return True

    @staticmethod
    def _count_executions(
        db: Session,
        rule: AlertRule,
        start_time: datetime,
        status: Optional[str] = None
    ) -> int:
        """Count executions created since ``start_time`` within the rule's target scope.

        Args:
            db: Database session.
            rule: Alert rule whose ``target_type``/``target_id`` narrow the count.
            start_time: Lower bound (inclusive) on ``Execution.created_at``.
            status: When given, only executions with this status are counted.

        Returns:
            Number of matching executions (0 when the query yields nothing).
        """
        query = db.query(func.count(Execution.id)).filter(
            Execution.created_at >= start_time
        )
        if status is not None:
            query = query.filter(Execution.status == status)

        if rule.target_id:
            if rule.target_type == 'workflow':
                query = query.filter(Execution.workflow_id == rule.target_id)
            elif rule.target_type == 'agent':
                query = query.filter(Execution.agent_id == rule.target_id)

        return query.scalar() or 0

    @staticmethod
    async def check_execution_failed(
        db: Session,
        rule: AlertRule,
        execution: Execution
    ) -> bool:
        """Check whether a failed execution should trigger the rule.

        The number of failed executions inside the rule's time window (scoped
        to the rule's target) is compared against the rule's threshold.

        Args:
            db: Database session.
            rule: Alert rule; ``conditions`` may carry ``threshold`` (default 1),
                ``time_window`` in seconds (default 3600) and ``comparison``
                (``gt``/``gte``/``eq``, default ``gt``).
            execution: Execution record that was just observed.

        Returns:
            True when the alert should fire.
        """
        if execution.status != 'failed':
            return False
        if not AlertService._matches_target(rule, execution):
            return False

        conditions = AlertService._conditions(rule)
        threshold = conditions.get('threshold', 1)
        time_window = conditions.get('time_window', 3600)  # default: 1 hour
        comparison = conditions.get('comparison', 'gt')

        start_time = datetime.utcnow() - timedelta(seconds=time_window)
        failed_count = AlertService._count_executions(
            db, rule, start_time, status='failed'
        )

        if comparison == 'gte':
            return failed_count >= threshold
        if comparison == 'eq':
            return failed_count == threshold
        # 'gt' and any unrecognised operator fall back to strict greater-than.
        return failed_count > threshold

    @staticmethod
    async def check_execution_timeout(
        db: Session,
        rule: AlertRule,
        execution: Execution
    ) -> bool:
        """Check whether a running execution has exceeded the rule's timeout.

        Args:
            db: Database session (not used by this check).
            rule: Alert rule; ``conditions`` may carry ``timeout_seconds``
                (default 3600).
            execution: Execution record to inspect.

        Returns:
            True when the execution is still ``running``, matches the rule's
            target, and has been alive longer than the timeout.
        """
        if execution.status != 'running':
            return False
        if not AlertService._matches_target(rule, execution):
            return False
        if not execution.created_at:
            return False

        timeout_seconds = AlertService._conditions(rule).get('timeout_seconds', 3600)
        elapsed = (datetime.utcnow() - execution.created_at).total_seconds()
        return elapsed > timeout_seconds

    @staticmethod
    async def check_error_rate(
        db: Session,
        rule: AlertRule
    ) -> bool:
        """Check whether the failure ratio in the rule's time window reaches the threshold.

        Args:
            db: Database session.
            rule: Alert rule; ``conditions`` may carry ``threshold`` as a ratio
                (default 0.1) and ``time_window`` in seconds (default 3600).

        Returns:
            True when ``failed / total >= threshold``; False when there were no
            executions in the window.
        """
        conditions = AlertService._conditions(rule)
        threshold = conditions.get('threshold', 0.1)  # default: 10%
        time_window = conditions.get('time_window', 3600)  # default: 1 hour

        start_time = datetime.utcnow() - timedelta(seconds=time_window)

        # Count in the database instead of loading every row into memory.
        total_count = AlertService._count_executions(db, rule, start_time)
        if not total_count:
            return False
        failed_count = AlertService._count_executions(
            db, rule, start_time, status='failed'
        )

        return failed_count / total_count >= threshold

    @staticmethod
    async def _evaluate_rule(
        db: Session,
        rule: AlertRule,
        execution: Execution
    ) -> Optional[tuple]:
        """Run the check that corresponds to ``rule.alert_type``.

        Returns:
            ``(message, details)`` when the rule fires, otherwise None (also
            for alert types this service does not know).
        """
        if rule.alert_type == 'execution_failed':
            if await AlertService.check_execution_failed(db, rule, execution):
                return (
                    f"执行失败告警: 工作流 {execution.workflow_id} 执行失败",
                    {
                        "execution_id": execution.id,
                        "workflow_id": execution.workflow_id,
                        "status": execution.status,
                        "error_message": execution.error_message
                    }
                )
        elif rule.alert_type == 'execution_timeout':
            if await AlertService.check_execution_timeout(db, rule, execution):
                elapsed = (
                    (datetime.utcnow() - execution.created_at).total_seconds()
                    if execution.created_at else 0
                )
                return (
                    f"执行超时告警: 工作流 {execution.workflow_id} 执行超时 ({elapsed:.0f}秒)",
                    {
                        "execution_id": execution.id,
                        "workflow_id": execution.workflow_id,
                        "elapsed_seconds": elapsed
                    }
                )
        elif rule.alert_type == 'error_rate':
            if await AlertService.check_error_rate(db, rule):
                return (
                    f"错误率告警: {rule.target_type} 错误率超过阈值",
                    {
                        "target_type": rule.target_type,
                        "target_id": rule.target_id
                    }
                )
        return None

    @staticmethod
    async def check_alerts_for_execution(
        db: Session,
        execution: Execution
    ) -> List[AlertLog]:
        """Evaluate every enabled rule relevant to ``execution`` and fire alerts.

        Relevant rules are those scoped to the execution's workflow (or agent),
        either for that specific id or for all ids, plus all system-level
        rules. For each rule that fires, an ``AlertLog`` is persisted, the
        rule's trigger statistics are updated, and a notification is sent.
        A failure while handling one rule is logged and does not stop the
        remaining rules.

        Args:
            db: Database session.
            execution: Execution record to evaluate.

        Returns:
            The alert logs created by this call.
        """
        triggered_logs = []

        # System-level rules apply to every execution, so they are OR-ed with
        # the workflow/agent scope rather than chained as a second filter
        # (chained filters are AND-ed and would match nothing).
        system_scope = AlertRule.target_type == 'system'
        if execution.workflow_id:
            scope = (
                (AlertRule.target_type == 'workflow')
                & (
                    (AlertRule.target_id == execution.workflow_id)
                    | (AlertRule.target_id.is_(None))
                )
            ) | system_scope
        elif execution.agent_id:
            scope = (
                (AlertRule.target_type == 'agent')
                & (
                    (AlertRule.target_id == execution.agent_id)
                    | (AlertRule.target_id.is_(None))
                )
            ) | system_scope
        else:
            scope = system_scope

        rules = (
            db.query(AlertRule)
            .filter(AlertRule.enabled == True)  # noqa: E712 - SQL expression
            .filter(scope)
            .all()
        )

        for rule in rules:
            # Captured up front: after a rollback the instance is expired and
            # reading attributes would hit the database again.
            rule_id = rule.id
            try:
                outcome = await AlertService._evaluate_rule(db, rule, execution)
                if outcome is None:
                    continue
                alert_message, alert_details = outcome

                now = datetime.utcnow()
                alert_log = AlertLog(
                    rule_id=rule_id,
                    alert_type=rule.alert_type,
                    severity=AlertService._conditions(rule).get('severity', 'warning'),
                    message=alert_message,
                    details=alert_details,
                    status='pending',
                    notification_type=rule.notification_type,
                    triggered_at=now
                )
                db.add(alert_log)

                # Update rule statistics; tolerate a NULL counter.
                rule.trigger_count = (rule.trigger_count or 0) + 1
                rule.last_triggered_at = now

                db.commit()
                db.refresh(alert_log)

                await AlertService.send_notification(db, alert_log, rule)

                triggered_logs.append(alert_log)

            except Exception as e:
                logger.exception(f"检查告警规则失败 {rule_id}: {str(e)}")
                # Leave the session usable for the remaining rules; a failed
                # flush/commit otherwise poisons every later operation.
                try:
                    db.rollback()
                except Exception:
                    logger.exception(f"回滚数据库会话失败 {rule_id}")
                continue

        return triggered_logs

    @staticmethod
    async def send_notification(
        db: Session,
        alert_log: AlertLog,
        rule: AlertRule
    ):
        """Deliver the alert through the rule's notification channel.

        The outcome is written to ``alert_log.status`` (``sent``/``failed``)
        and ``alert_log.notification_result``, then committed. Delivery
        errors are logged and recorded, not raised.

        Args:
            db: Database session.
            alert_log: Alert log to deliver and update.
            rule: Alert rule providing ``notification_type`` and its config.
        """
        try:
            if rule.notification_type == 'email':
                await AlertService.send_email_notification(alert_log, rule)
            elif rule.notification_type == 'webhook':
                await AlertService.send_webhook_notification(alert_log, rule)
            # 'internal' (in-app) notifications need nothing beyond the
            # persisted alert log itself.

            alert_log.status = 'sent'
            alert_log.notification_result = '通知发送成功'

        except Exception as e:
            logger.exception(f"发送告警通知失败: {str(e)}")
            alert_log.status = 'failed'
            alert_log.notification_result = f"通知发送失败: {str(e)}"

        finally:
            db.commit()

    @staticmethod
    async def send_email_notification(
        alert_log: AlertLog,
        rule: AlertRule
    ):
        """Send the alert as a plain-text email.

        Reads ``smtp_host``, ``smtp_port``, ``smtp_user``, ``smtp_password``
        and ``to_email`` from ``rule.notification_config``. Optional
        ``use_tls`` / ``start_tls`` keys override the transport security;
        by default port 587 uses STARTTLS and every other port uses implicit
        TLS.

        Args:
            alert_log: Alert log to describe in the message.
            rule: Alert rule providing the name and SMTP configuration.

        Raises:
            ValueError: If no recipient address is configured.
        """
        config = rule.notification_config or {}
        smtp_host = config.get('smtp_host', 'smtp.gmail.com')
        smtp_port = config.get('smtp_port', 587)
        smtp_user = config.get('smtp_user')
        smtp_password = config.get('smtp_password')
        to_email = config.get('to_email')

        if not to_email:
            raise ValueError("邮件通知配置缺少收件人地址")

        message = MIMEMultipart()
        message['From'] = smtp_user
        message['To'] = to_email
        message['Subject'] = f"告警通知: {rule.name}"

        body = "\n".join([
            "",
            f"告警规则: {rule.name}",
            f"告警类型: {alert_log.alert_type}",
            f"严重程度: {alert_log.severity}",
            f"告警消息: {alert_log.message}",
            f"触发时间: {alert_log.triggered_at}",
            "",
        ])

        if alert_log.details:
            body += f"\n详细信息:\n{alert_log.details}"

        message.attach(MIMEText(body, 'plain', 'utf-8'))

        # Implicit TLS against the STARTTLS submission port fails the
        # handshake, so port 587 upgrades via STARTTLS instead.
        is_starttls_port = str(smtp_port) == AlertService._STARTTLS_PORT
        use_tls = bool(config.get('use_tls', not is_starttls_port))
        start_tls = bool(config.get('start_tls', not use_tls))

        await aiosmtplib.send(
            message,
            hostname=smtp_host,
            port=smtp_port,
            username=smtp_user,
            password=smtp_password,
            use_tls=use_tls,
            start_tls=start_tls
        )

    @staticmethod
    async def send_webhook_notification(
        alert_log: AlertLog,
        rule: AlertRule
    ):
        """POST the alert as JSON to the rule's webhook URL.

        Reads ``webhook_url`` and optional ``headers`` from
        ``rule.notification_config``. The request uses a 10 second timeout
        and a non-2xx response raises via ``raise_for_status``.

        Args:
            alert_log: Alert log to serialise into the payload.
            rule: Alert rule providing the name and webhook configuration.

        Raises:
            ValueError: If no webhook URL is configured.
        """
        config = rule.notification_config or {}
        webhook_url = config.get('webhook_url')

        if not webhook_url:
            raise ValueError("Webhook通知配置缺少URL")

        triggered_at = alert_log.triggered_at
        payload = {
            "rule_name": rule.name,
            "alert_type": alert_log.alert_type,
            "severity": alert_log.severity,
            "message": alert_log.message,
            "details": alert_log.details,
            "triggered_at": triggered_at.isoformat() if triggered_at else None
        }

        async with httpx.AsyncClient() as client:
            response = await client.post(
                webhook_url,
                json=payload,
                headers=config.get('headers', {}),
                timeout=10
            )
            response.raise_for_status()
|