# aiagent/backend/app/services/bottleneck_detector.py

"""
工作流瓶颈自动检测 — 分析执行数据，识别性能瓶颈
"""
from __future__ import annotations

import logging
from typing import Any, Dict, List, Optional
from collections import defaultdict

from sqlalchemy import func
from sqlalchemy.orm import Session

from app.core.database import SessionLocal
from app.models.execution import Execution
from app.models.execution_log import ExecutionLog
from app.models.agent_execution_log import AgentExecutionLog

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class BottleneckDetector:
    """Detect performance bottlenecks in workflow executions.

    Reads recent ``ExecutionLog`` rows, groups them by node type, computes
    count / error-rate / latency statistics per group, and flags node types
    that are slow or error-prone. Durations are presumably milliseconds
    (the result keys are suffixed ``_ms``) -- confirm against the model.
    """

    # Node types with fewer log rows than this are skipped entirely.
    MIN_SAMPLES = 3
    # A node type is a bottleneck when its p95 exceeds this multiple of the
    # mean p95 across all analyzed node types.
    BOTTLENECK_P95_FACTOR = 3
    # A node type is problematic when its error rate exceeds this fraction.
    ERROR_RATE_THRESHOLD = 0.2
    # A node type is inefficient when its p50 exceeds this value.
    INEFFICIENT_P50_MS = 30000

    def analyze_executions(self, hours: int = 24) -> List[Dict[str, Any]]:
        """Analyze the last ``hours`` hours of node execution logs.

        Args:
            hours: Size of the look-back window, in hours.

        Returns:
            One dict per node type that has at least ``MIN_SAMPLES`` log
            rows, sorted by ``p95_ms`` descending. Each dict carries
            ``node_type``, ``count``, ``error_rate``, ``avg_duration_ms``,
            ``p50_ms``, ``p95_ms``, ``p99_ms`` plus the classification keys
            ``is_bottleneck``, ``is_problematic``, ``is_inefficient`` and
            ``severity``. Any exception is logged with its traceback and an
            empty list is returned instead of raising.
        """
        from datetime import datetime, timedelta

        db: Optional[Session] = None
        try:
            db = SessionLocal()

            # Naive local time. Assumes ExecutionLog.timestamp is stored the
            # same way -- TODO confirm (a UTC column would skew the window).
            since = datetime.now() - timedelta(hours=hours)

            logs = (
                db.query(ExecutionLog)
                .filter(
                    ExecutionLog.timestamp >= since,
                    ExecutionLog.node_id.isnot(None),
                )
                .all()
            )

            results = self._summarize_node_stats(self._aggregate_node_stats(logs))
            # Slowest node types first.
            results.sort(key=lambda row: row["p95_ms"], reverse=True)
            self._classify(results)
            return results

        except Exception as e:
            # Best-effort API: callers get an empty list, but the traceback
            # is kept in the log so the failure can be diagnosed.
            logger.exception("瓶颈检测失败: %s", e)
            return []
        finally:
            if db is not None:
                try:
                    db.close()
                except Exception:
                    # Closing must never mask the analysis result.
                    logger.warning("关闭数据库会话失败", exc_info=True)

    @staticmethod
    def _aggregate_node_stats(logs: List[Any]) -> Dict[str, Dict[str, Any]]:
        """Group log rows by node type into count / errors / durations."""
        node_stats: Dict[str, Dict[str, Any]] = defaultdict(
            lambda: {"count": 0, "errors": 0, "durations": []}
        )
        for log in logs:
            stats = node_stats[log.node_type or "unknown"]
            stats["count"] += 1
            if log.level == "ERROR":
                stats["errors"] += 1
            # Truthiness check: rows whose duration is None *or* 0 are left
            # out of the timing statistics (they still count toward "count").
            if log.duration:
                stats["durations"].append(log.duration)
        return node_stats

    @staticmethod
    def _percentile(sorted_values: List[Any], percent: int) -> Any:
        """Return the nearest-rank percentile of an ascending list (0 if empty).

        Uses rank = ceil(n * percent / 100), computed with integer arithmetic
        so float rounding cannot push the rank up by one. ``percent`` is an
        integer such as 50, 95 or 99.
        """
        if not sorted_values:
            return 0
        rank = (len(sorted_values) * percent + 99) // 100
        return sorted_values[max(rank, 1) - 1]

    @classmethod
    def _summarize_node_stats(
        cls, node_stats: Dict[str, Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Turn aggregated per-node-type stats into result rows."""
        results: List[Dict[str, Any]] = []
        for node_type, stats in node_stats.items():
            if stats["count"] < cls.MIN_SAMPLES:
                continue
            raw_durations = stats["durations"]
            durations = sorted(raw_durations)
            avg = sum(raw_durations) / len(raw_durations) if raw_durations else 0
            results.append({
                "node_type": node_type,
                "count": stats["count"],
                "error_rate": round(stats["errors"] / stats["count"], 3),
                "avg_duration_ms": int(avg),  # truncated, not rounded
                "p50_ms": cls._percentile(durations, 50),
                "p95_ms": cls._percentile(durations, 95),
                "p99_ms": cls._percentile(durations, 99),
            })
        return results

    @classmethod
    def _classify(cls, results: List[Dict[str, Any]]) -> None:
        """Add the is_* flags and ``severity`` to every row, in place."""
        if not results:
            return
        # NOTE(review): the baseline mean includes each node's own p95, so
        # with non-negative durations a node can only exceed FACTOR x mean
        # when more than FACTOR node types are present.
        mean_p95 = sum(row["p95_ms"] for row in results) / len(results)
        for row in results:
            row["is_bottleneck"] = row["p95_ms"] > mean_p95 * cls.BOTTLENECK_P95_FACTOR
            row["is_problematic"] = row["error_rate"] > cls.ERROR_RATE_THRESHOLD
            row["is_inefficient"] = row["p50_ms"] > cls.INEFFICIENT_P50_MS
            if row["is_bottleneck"] or row["is_problematic"]:
                row["severity"] = "high"
            elif row["is_inefficient"]:
                row["severity"] = "medium"
            else:
                row["severity"] = "low"

    def generate_recommendations(self, bottlenecks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Build optimization suggestions for flagged node types.

        Args:
            bottlenecks: Rows as returned by ``analyze_executions``.

        Returns:
            One recommendation dict per row flagged ``is_bottleneck`` or
            ``is_problematic``, with ``node_type``, ``severity``,
            ``current_state`` and a list of ``actions``. Rows that are only
            ``is_inefficient`` are skipped; the cache action is added only
            on top of one of the other two flags.
        """
        recommendations = []

        for b in bottlenecks:
            if not b.get("is_bottleneck") and not b.get("is_problematic"):
                continue

            actions: List[Dict[str, str]] = []

            if b.get("is_bottleneck"):
                actions.append({
                    "type": "split_node",
                    "description": f"节点 {b['node_type']} P95耗时 {b['p95_ms']}ms，建议拆分为并行子节点",
                    "expected_improvement": "减少50-70%耗时",
                })
                actions.append({
                    "type": "upgrade_model",
                    "description": "考虑使用更快的模型或增加超时时间",
                    "expected_improvement": "减少20-40%耗时",
                })

            if b.get("is_problematic"):
                actions.append({
                    "type": "add_retry",
                    "description": f"节点失败率 {b['error_rate']:.1%}，建议添加重试逻辑和前置校验",
                    "expected_improvement": "失败率降至5%以下",
                })

            if b.get("is_inefficient"):
                actions.append({
                    "type": "add_cache",
                    "description": "节点平均耗时过长，建议添加结果缓存避免重复计算",
                    "expected_improvement": "减少30-60%耗时",
                })

            recommendations.append({
                "node_type": b["node_type"],
                "severity": b["severity"],
                "current_state": {
                    "p95_ms": b["p95_ms"],
                    "error_rate": b["error_rate"],
                    "count": b["count"],
                },
                "actions": actions,
            })

        return recommendations

    def run_full_analysis(self, hours: int = 24) -> Dict[str, Any]:
        """Run detection and recommendation generation in one call.

        Args:
            hours: Size of the look-back window, in hours.

        Returns:
            A dict with the window size, the number of rows flagged as
            bottleneck / problematic, the number of node types analyzed,
            the rows themselves and the generated recommendations.
        """
        bottlenecks = self.analyze_executions(hours=hours)
        recommendations = self.generate_recommendations(bottlenecks)
        return {
            "period_hours": hours,
            "bottlenecks_found": sum(1 for b in bottlenecks if b.get("is_bottleneck")),
            "problematic_nodes": sum(1 for b in bottlenecks if b.get("is_problematic")),
            "nodes_analyzed": len(bottlenecks),
            "bottlenecks": bottlenecks,
            "recommendations": recommendations,
        }


# Shared module-level instance; the class defines no per-instance state.
bottleneck_detector = BottleneckDetector()