""" 工作流瓶颈自动检测 — 分析执行数据,识别性能瓶颈 """ from __future__ import annotations import logging from typing import Any, Dict, List, Optional from collections import defaultdict from sqlalchemy import func from sqlalchemy.orm import Session from app.core.database import SessionLocal from app.models.execution import Execution from app.models.execution_log import ExecutionLog from app.models.agent_execution_log import AgentExecutionLog logger = logging.getLogger(__name__) class BottleneckDetector: """工作流瓶颈检测器""" def analyze_executions(self, hours: int = 24) -> List[Dict[str, Any]]: """分析最近N小时的执行数据,找出瓶颈。""" db: Optional[Session] = None try: db = SessionLocal() from datetime import datetime, timedelta since = datetime.now() - timedelta(hours=hours) # 1. 分析工作流执行节点耗时 logs = ( db.query(ExecutionLog) .filter( ExecutionLog.timestamp >= since, ExecutionLog.node_id.isnot(None), ) .all() ) # 按 node_type 聚合 node_stats = defaultdict(lambda: { "count": 0, "errors": 0, "durations": [], "total_duration": 0, "node_type": "", }) for log in logs: nt = log.node_type or "unknown" node_stats[nt]["count"] += 1 node_stats[nt]["node_type"] = nt if log.level == "ERROR": node_stats[nt]["errors"] += 1 if log.duration: node_stats[nt]["durations"].append(log.duration) node_stats[nt]["total_duration"] += log.duration # 计算统计值 results = [] for nt, stats in node_stats.items(): if stats["count"] < 3: continue durations = sorted(stats["durations"]) n = len(durations) avg = stats["total_duration"] / n if n > 0 else 0 results.append({ "node_type": nt, "count": stats["count"], "error_rate": round(stats["errors"] / stats["count"], 3), "avg_duration_ms": int(avg), "p50_ms": durations[n // 2] if n > 0 else 0, "p95_ms": durations[int(n * 0.95)] if n > 4 else (durations[-1] if n > 0 else 0), "p99_ms": durations[int(n * 0.99)] if n > 9 else (durations[-1] if n > 0 else 0), }) # 按耗时排序 results.sort(key=lambda x: x["p95_ms"], reverse=True) # 2. 分析 Agent 执行效率 agent_logs = ( db.query(AgentExecutionLog) .filter(AgentExecutionLog.created_at >= since) .all() ) agent_failure_rate = 0.0 agent_avg_tool_calls = 0.0 if agent_logs: failed = sum(1 for a in agent_logs if not a.success) agent_failure_rate = round(failed / len(agent_logs), 3) agent_avg_tool_calls = round( sum(a.tool_calls_made or 0 for a in agent_logs) / len(agent_logs), 1 ) # 3. 识别瓶颈 if results: overall_avg_p95 = sum(r["p95_ms"] for r in results) / len(results) for r in results: r["is_bottleneck"] = r["p95_ms"] > overall_avg_p95 * 3 r["is_problematic"] = r["error_rate"] > 0.2 r["is_inefficient"] = r["p50_ms"] > 30000 r["severity"] = "high" if (r["is_bottleneck"] or r["is_problematic"]) else ( "medium" if r["is_inefficient"] else "low" ) return results except Exception as e: logger.error("瓶颈检测失败: %s", e) return [] finally: if db: try: db.close() except Exception: pass def generate_recommendations(self, bottlenecks: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """根据瓶颈生成优化建议。""" recommendations = [] for b in bottlenecks: if not b.get("is_bottleneck") and not b.get("is_problematic"): continue rec = { "node_type": b["node_type"], "severity": b["severity"], "current_state": { "p95_ms": b["p95_ms"], "error_rate": b["error_rate"], "count": b["count"], }, "actions": [], } if b.get("is_bottleneck"): rec["actions"].append({ "type": "split_node", "description": f"节点 {b['node_type']} P95耗时 {b['p95_ms']}ms,建议拆分为并行子节点", "expected_improvement": "减少50-70%耗时", }) rec["actions"].append({ "type": "upgrade_model", "description": "考虑使用更快的模型或增加超时时间", "expected_improvement": "减少20-40%耗时", }) if b.get("is_problematic"): rec["actions"].append({ "type": "add_retry", "description": f"节点失败率 {b['error_rate']:.1%},建议添加重试逻辑和前置校验", "expected_improvement": "失败率降至5%以下", }) if b.get("is_inefficient"): rec["actions"].append({ "type": "add_cache", "description": f"节点平均耗时过长,建议添加结果缓存避免重复计算", "expected_improvement": "减少30-60%耗时", }) recommendations.append(rec) return recommendations def run_full_analysis(self, hours: int = 24) -> Dict[str, Any]: """运行完整分析:检测+建议。""" bottlenecks = self.analyze_executions(hours=hours) recommendations = self.generate_recommendations(bottlenecks) return { "period_hours": hours, "bottlenecks_found": len([b for b in bottlenecks if b.get("is_bottleneck")]), "problematic_nodes": len([b for b in bottlenecks if b.get("is_problematic")]), "nodes_analyzed": len(bottlenecks), "bottlenecks": bottlenecks, "recommendations": recommendations, } bottleneck_detector = BottleneckDetector()