Files
aiagent/backend/app/services/bottleneck_detector.py
renjianbo ab1589921a fix: 修复35个安全与功能缺陷,补全知识进化/数字孪生/行为采集模块
## 安全修复 (12项)
- Webhook接口添加全局Token认证,过滤敏感请求头
- 修复JWT Base64 padding公式,防止签名验证绕过
- 数据库密码/飞书Token从源码移除,改为环境变量
- 工作流引擎添加路径遍历防护 (_resolve_safe_path)
- eval()添加模板长度上限检查
- 审批API添加认证依赖
- 前端v-html增强XSS转义,console.log仅开发模式输出
- 500错误不再暴露内部异常详情

## Agent运行时修复 (7项)
- 删除_inject_knowledge_context中未定义db变量的finally块
- 工具执行添加try/except保护,异常不崩溃Agent
- LLM重试计入budget计数器
- self_review异常时passed=False
- max_iterations截断标记success=False
- 工具参数JSON解析失败时记录警告日志
- run()开始时重置_llm_invocations计数器

## 配置与基础设施
- DEBUG默认False,SQL_ECHO独立配置项
- init_db()补全13个缺失模型导入
- 新增WEBHOOK_AUTH_TOKEN/SQL_ECHO配置项
- 新增.env.example模板文件

## 前端修复 (12项)
- 登录改用URLSearchParams替代FormData
- 401拦截器通过Pinia store统一清理状态
- SSE流超时从60s延长至300s
- final/error事件时清除streamTimeout
- localStorage聊天记录添加24h TTL
- safeParseArgCount替代模板中裸JSON.parse
- fetchUser 401时同时清除user对象

## 新增模块
- 知识进化: knowledge_extractor/retriever/tasks
- 数字孪生: shadow_executor/comparison模型
- 行为采集: behavior_middleware/collector/fingerprint_engine
- 代码审查: code_review_agent/document_review_agent
- 反馈学习: feedback_learner
- 瓶颈检测/优化引擎/成本估算/需求估算
- 速率限制器 (rate_limiter)
- Alembic迁移 015-020

## 文档
- 商业化落地计划
- 8篇docs文档 (架构/API/部署/开发/贡献等)
- Docker Compose生产配置

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-10 19:50:20 +08:00

183 lines
6.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
工作流瓶颈自动检测 — 分析执行数据,识别性能瓶颈
"""
from __future__ import annotations
import logging
from typing import Any, Dict, List, Optional
from collections import defaultdict
from sqlalchemy import func
from sqlalchemy.orm import Session
from app.core.database import SessionLocal
from app.models.execution import Execution
from app.models.execution_log import ExecutionLog
from app.models.agent_execution_log import AgentExecutionLog
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)
class BottleneckDetector:
    """Detect performance bottlenecks in workflow executions.

    Recent ``ExecutionLog`` rows are grouped by node type, reduced to latency
    percentiles and an error rate, flagged against the thresholds below, and
    finally turned into optimisation recommendations.
    """

    # Node types with fewer log rows than this are left out of the analysis.
    MIN_SAMPLES = 3
    # A node type is a bottleneck when its P95 exceeds this multiple of the
    # mean P95 over all analysed node types.
    BOTTLENECK_P95_FACTOR = 3
    # A node type is problematic when its error rate exceeds this ratio.
    PROBLEMATIC_ERROR_RATE = 0.2
    # A node type is inefficient when its median (``p50_ms``) exceeds this.
    INEFFICIENT_P50_MS = 30000

    def analyze_executions(self, hours: int = 24) -> List[Dict[str, Any]]:
        """Analyse the execution logs of the last ``hours`` hours.

        Args:
            hours: Size of the look-back window, in hours.

        Returns:
            One dict per analysed node type, sorted by ``p95_ms`` descending.
            Each dict holds ``node_type``, ``count``, ``error_rate``,
            ``avg_duration_ms``, ``p50_ms``, ``p95_ms``, ``p99_ms`` plus the
            flags ``is_bottleneck``, ``is_problematic``, ``is_inefficient``
            and a ``severity`` of ``"high"``, ``"medium"`` or ``"low"``.
            An empty list is returned when nothing qualifies or when any
            error occurs (the error is logged, never raised).
        """
        from datetime import datetime, timedelta

        db: Optional[Session] = None
        try:
            db = SessionLocal()
            # NOTE(review): naive local time; presumably matches how the
            # timestamp columns are stored - confirm against the models.
            since = datetime.now() - timedelta(hours=hours)

            # 1. Per-node-type latency / error statistics.
            logs = (
                db.query(ExecutionLog)
                .filter(
                    ExecutionLog.timestamp >= since,
                    ExecutionLog.node_id.isnot(None),
                )
                .all()
            )
            results = self._summarize_node_logs(logs)
            results.sort(key=lambda row: row["p95_ms"], reverse=True)

            # 2. Agent efficiency. The return type has no slot for these
            # figures, so they are logged instead of being silently dropped.
            agent_logs = (
                db.query(AgentExecutionLog)
                .filter(AgentExecutionLog.created_at >= since)
                .all()
            )
            agent_failure_rate = 0.0
            agent_avg_tool_calls = 0.0
            if agent_logs:
                total = len(agent_logs)
                failed = sum(1 for entry in agent_logs if not entry.success)
                agent_failure_rate = round(failed / total, 3)
                agent_avg_tool_calls = round(
                    sum(entry.tool_calls_made or 0 for entry in agent_logs) / total, 1
                )
            logger.info(
                "Agent执行统计(最近%s小时): 总数=%d, 失败率=%.3f, 平均工具调用=%.1f",
                hours,
                len(agent_logs),
                agent_failure_rate,
                agent_avg_tool_calls,
            )

            # 3. Flag outliers in place.
            self._flag_bottlenecks(results)
            return results
        except Exception as e:
            # Top-level boundary: keep the "return [] on failure" contract,
            # but record the traceback so the failure can be diagnosed.
            logger.exception("瓶颈检测失败: %s", e)
            return []
        finally:
            if db is not None:
                try:
                    db.close()
                except Exception:
                    # Closing is best-effort; never mask the real outcome.
                    logger.debug("关闭数据库会话失败", exc_info=True)

    @staticmethod
    def _percentile(sorted_values: List[Any], fraction: float) -> Any:
        """Return the element at ``int(len * fraction)``, clamped to the last index; 0 if empty."""
        if not sorted_values:
            return 0
        last = len(sorted_values) - 1
        return sorted_values[min(int(len(sorted_values) * fraction), last)]

    def _summarize_node_logs(self, logs: List[Any]) -> List[Dict[str, Any]]:
        """Group log rows by ``node_type`` and compute count, error rate and latency figures."""
        grouped = defaultdict(
            lambda: {"count": 0, "errors": 0, "durations": [], "total_duration": 0}
        )
        for log in logs:
            bucket = grouped[log.node_type or "unknown"]
            bucket["count"] += 1
            if log.level == "ERROR":
                bucket["errors"] += 1
            # Rows with a missing or zero duration count toward the error
            # rate but are left out of the latency figures.
            if log.duration:
                bucket["durations"].append(log.duration)
                bucket["total_duration"] += log.duration

        summaries: List[Dict[str, Any]] = []
        for node_type, bucket in grouped.items():
            if bucket["count"] < self.MIN_SAMPLES:
                continue
            durations = sorted(bucket["durations"])
            timed = len(durations)
            average = bucket["total_duration"] / timed if timed else 0
            summaries.append({
                "node_type": node_type,
                "count": bucket["count"],
                "error_rate": round(bucket["errors"] / bucket["count"], 3),
                "avg_duration_ms": int(average),
                "p50_ms": self._percentile(durations, 0.5),
                "p95_ms": self._percentile(durations, 0.95),
                "p99_ms": self._percentile(durations, 0.99),
            })
        return summaries

    def _flag_bottlenecks(self, results: List[Dict[str, Any]]) -> None:
        """Add ``is_bottleneck`` / ``is_problematic`` / ``is_inefficient`` / ``severity`` to each row in place."""
        if not results:
            return
        # NOTE(review): the mean includes the node being tested, so with
        # non-negative durations a node can only exceed 3x the mean when at
        # least four node types are analysed. Kept as-is to preserve the
        # existing detection semantics.
        overall_avg_p95 = sum(row["p95_ms"] for row in results) / len(results)
        for row in results:
            row["is_bottleneck"] = row["p95_ms"] > overall_avg_p95 * self.BOTTLENECK_P95_FACTOR
            row["is_problematic"] = row["error_rate"] > self.PROBLEMATIC_ERROR_RATE
            row["is_inefficient"] = row["p50_ms"] > self.INEFFICIENT_P50_MS
            if row["is_bottleneck"] or row["is_problematic"]:
                row["severity"] = "high"
            elif row["is_inefficient"]:
                row["severity"] = "medium"
            else:
                row["severity"] = "low"

    def generate_recommendations(self, bottlenecks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Build optimisation recommendations for flagged node types.

        Args:
            bottlenecks: Rows as produced by ``analyze_executions``.

        Returns:
            One recommendation per row that is flagged as a bottleneck,
            problematic or inefficient. Each recommendation carries
            ``node_type``, ``severity``, ``current_state`` and a list of
            ``actions``. Rows with none of the three flags are skipped.
        """
        recommendations = []
        for b in bottlenecks:
            flagged = (
                b.get("is_bottleneck")
                or b.get("is_problematic")
                # Inefficient-only rows (severity "medium") must pass too,
                # otherwise their add_cache advice below is never emitted.
                or b.get("is_inefficient")
            )
            if not flagged:
                continue

            actions: List[Dict[str, Any]] = []
            if b.get("is_bottleneck"):
                actions.append({
                    "type": "split_node",
                    "description": f"节点 {b['node_type']} P95耗时 {b['p95_ms']}ms建议拆分为并行子节点",
                    "expected_improvement": "减少50-70%耗时",
                })
                actions.append({
                    "type": "upgrade_model",
                    "description": "考虑使用更快的模型或增加超时时间",
                    "expected_improvement": "减少20-40%耗时",
                })
            if b.get("is_problematic"):
                actions.append({
                    "type": "add_retry",
                    "description": f"节点失败率 {b['error_rate']:.1%},建议添加重试逻辑和前置校验",
                    "expected_improvement": "失败率降至5%以下",
                })
            if b.get("is_inefficient"):
                actions.append({
                    "type": "add_cache",
                    "description": "节点平均耗时过长,建议添加结果缓存避免重复计算",
                    "expected_improvement": "减少30-60%耗时",
                })

            recommendations.append({
                "node_type": b["node_type"],
                "severity": b["severity"],
                "current_state": {
                    "p95_ms": b["p95_ms"],
                    "error_rate": b["error_rate"],
                    "count": b["count"],
                },
                "actions": actions,
            })
        return recommendations

    def run_full_analysis(self, hours: int = 24) -> Dict[str, Any]:
        """Run detection and recommendation generation in one call.

        Args:
            hours: Size of the look-back window, in hours.

        Returns:
            A dict with ``period_hours``, ``bottlenecks_found``,
            ``problematic_nodes``, ``nodes_analyzed``, the raw
            ``bottlenecks`` rows and the derived ``recommendations``.
        """
        bottlenecks = self.analyze_executions(hours=hours)
        recommendations = self.generate_recommendations(bottlenecks)
        return {
            "period_hours": hours,
            "bottlenecks_found": sum(1 for b in bottlenecks if b.get("is_bottleneck")),
            "problematic_nodes": sum(1 for b in bottlenecks if b.get("is_problematic")),
            "nodes_analyzed": len(bottlenecks),
            "bottlenecks": bottlenecks,
            "recommendations": recommendations,
        }
# Module-level BottleneckDetector instance, created at import time
# (presumably the shared instance other modules import - confirm with callers).
bottleneck_detector = BottleneckDetector()