## Security fixes (12 items)
- Webhook endpoints now require a global token for authentication, and sensitive request headers are filtered
- Fixed the JWT Base64 padding formula to prevent signature-verification bypass (see the sketch below)
- Database passwords and Feishu tokens removed from source code in favor of environment variables
- Path-traversal protection added to the workflow engine (_resolve_safe_path, sketched below)
- eval() now enforces an upper bound on template length
- Approval API endpoints now require authentication
- Hardened XSS escaping for frontend v-html; console.log output only in development mode
- 500 responses no longer expose internal exception details

## Agent runtime fixes (7 items)
- Removed the finally block in _inject_knowledge_context that referenced an undefined db variable
- Tool execution is wrapped in try/except so exceptions no longer crash the Agent
- LLM retries now count against the budget counter
- self_review sets passed=False when it raises
- Truncation at max_iterations marks success=False
- A warning is logged when tool-argument JSON parsing fails
- run() resets the _llm_invocations counter on start

## Configuration & infrastructure
- DEBUG defaults to False; SQL_ECHO is its own config option
- init_db() now imports the 13 previously missing models
- New WEBHOOK_AUTH_TOKEN / SQL_ECHO config options
- New .env.example template file

## Frontend fixes (12 items)
- Login uses URLSearchParams instead of FormData
- The 401 interceptor clears state centrally through the Pinia store
- SSE stream timeout extended from 60s to 300s
- streamTimeout is cleared on final/error events
- localStorage chat history now carries a 24h TTL
- safeParseArgCount replaces bare JSON.parse in templates
- fetchUser also clears the user object on 401

## New modules
- Knowledge evolution: knowledge_extractor/retriever/tasks
- Digital twin: shadow_executor/comparison models
- Behavior collection: behavior_middleware/collector/fingerprint_engine
- Code review: code_review_agent/document_review_agent
- Feedback learning: feedback_learner
- Bottleneck detection / optimization engine / cost estimation / requirement estimation
- Rate limiter (rate_limiter)
- Alembic migrations 015-020

## Documentation
- Commercialization rollout plan
- 8 docs pages (architecture/API/deployment/development/contributing, etc.)
- Docker Compose production configuration

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
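Two of the security fixes above are easy to illustrate. The JWT fix concerns re-padding base64url segments, which omit `=` padding; here is a minimal sketch of the standard formula, with a hypothetical helper name (not the repo's actual code):

```python
import base64

def b64url_decode(segment: str) -> bytes:
    # Pad to a multiple of 4 with "="; (-len(segment)) % 4 yields 0-3
    # and never over-pads, which is the usual fix for padding bugs.
    return base64.urlsafe_b64decode(segment + "=" * (-len(segment) % 4))
```

For the path-traversal guard, a typical shape for a helper like `_resolve_safe_path`, sketched under the assumption of Python 3.9+ (`Path.is_relative_to`); the actual implementation may differ:

```python
from pathlib import Path

def _resolve_safe_path(base_dir: str, user_path: str) -> Path:
    # Resolve the candidate and reject anything outside base_dir,
    # which blocks "../" escapes and absolute-path injection.
    base = Path(base_dir).resolve()
    candidate = (base / user_path).resolve()
    if not candidate.is_relative_to(base):
        raise ValueError(f"path escapes base directory: {user_path}")
    return candidate
```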
"""
|
||
工作流瓶颈自动检测 — 分析执行数据,识别性能瓶颈
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
from typing import Any, Dict, List, Optional
|
||
from collections import defaultdict
|
||
|
||
from sqlalchemy import func
|
||
from sqlalchemy.orm import Session
|
||
|
||
from app.core.database import SessionLocal
|
||
from app.models.execution import Execution
|
||
from app.models.execution_log import ExecutionLog
|
||
from app.models.agent_execution_log import AgentExecutionLog
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class BottleneckDetector:
    """Workflow bottleneck detector."""

    def analyze_executions(self, hours: int = 24) -> List[Dict[str, Any]]:
        """Analyze executions from the last N hours and surface bottlenecks."""
        db: Optional[Session] = None
        try:
            db = SessionLocal()
            since = datetime.now() - timedelta(hours=hours)

            # 1. Collect per-node timing from workflow execution logs.
            logs = (
                db.query(ExecutionLog)
                .filter(
                    ExecutionLog.timestamp >= since,
                    ExecutionLog.node_id.isnot(None),
                )
                .all()
            )

            # Aggregate by node_type.
            node_stats = defaultdict(lambda: {
                "count": 0, "errors": 0, "durations": [],
                "total_duration": 0, "node_type": "",
            })

            for log in logs:
                nt = log.node_type or "unknown"
                node_stats[nt]["count"] += 1
                node_stats[nt]["node_type"] = nt
                if log.level == "ERROR":
                    node_stats[nt]["errors"] += 1
                if log.duration:
                    node_stats[nt]["durations"].append(log.duration)
                    node_stats[nt]["total_duration"] += log.duration

            # Compute summary statistics; node types with fewer than 3
            # samples are too sparse to judge and are skipped.
            results = []
            for nt, stats in node_stats.items():
                if stats["count"] < 3:
                    continue
                durations = sorted(stats["durations"])
                n = len(durations)
                avg = stats["total_duration"] / n if n > 0 else 0
                # Percentiles use the nearest-rank method; int(n * 0.95) and
                # int(n * 0.99) are always <= n - 1, so indexing is safe.
                results.append({
                    "node_type": nt,
                    "count": stats["count"],
                    "error_rate": round(stats["errors"] / stats["count"], 3),
                    "avg_duration_ms": int(avg),
                    "p50_ms": durations[n // 2] if n > 0 else 0,
                    "p95_ms": durations[int(n * 0.95)] if n > 4 else (durations[-1] if n > 0 else 0),
                    "p99_ms": durations[int(n * 0.99)] if n > 9 else (durations[-1] if n > 0 else 0),
                })

            # Slowest node types first.
            results.sort(key=lambda x: x["p95_ms"], reverse=True)

            # 2. Analyze Agent execution efficiency.
            agent_logs = (
                db.query(AgentExecutionLog)
                .filter(AgentExecutionLog.created_at >= since)
                .all()
            )

            agent_failure_rate = 0.0
            agent_avg_tool_calls = 0.0
            if agent_logs:
                failed = sum(1 for a in agent_logs if not a.success)
                agent_failure_rate = round(failed / len(agent_logs), 3)
                agent_avg_tool_calls = round(
                    sum(a.tool_calls_made or 0 for a in agent_logs) / len(agent_logs), 1
                )
            # These aggregates are not part of the return value; surface them
            # in the log so the computation is not silently discarded.
            logger.info(
                "agent stats: failure_rate=%s avg_tool_calls=%s",
                agent_failure_rate, agent_avg_tool_calls,
            )

            # 3. Flag bottlenecks.
            if results:
                overall_avg_p95 = sum(r["p95_ms"] for r in results) / len(results)
                for r in results:
                    r["is_bottleneck"] = r["p95_ms"] > overall_avg_p95 * 3
                    r["is_problematic"] = r["error_rate"] > 0.2
                    r["is_inefficient"] = r["p50_ms"] > 30000
                    r["severity"] = "high" if (r["is_bottleneck"] or r["is_problematic"]) else (
                        "medium" if r["is_inefficient"] else "low"
                    )

            return results

        except Exception as e:
            logger.error("Bottleneck detection failed: %s", e)
            return []
        finally:
            if db:
                try:
                    db.close()
                except Exception:
                    pass

    def generate_recommendations(self, bottlenecks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Generate optimization suggestions for detected bottlenecks."""
        recommendations = []

        for b in bottlenecks:
            # Include inefficient-only nodes as well, so the add_cache
            # action below is reachable for medium-severity entries.
            if not (b.get("is_bottleneck") or b.get("is_problematic") or b.get("is_inefficient")):
                continue

            rec = {
                "node_type": b["node_type"],
                "severity": b["severity"],
                "current_state": {
                    "p95_ms": b["p95_ms"],
                    "error_rate": b["error_rate"],
                    "count": b["count"],
                },
                "actions": [],
            }

            if b.get("is_bottleneck"):
                rec["actions"].append({
                    "type": "split_node",
                    "description": f"Node {b['node_type']} has a P95 latency of {b['p95_ms']}ms; consider splitting it into parallel sub-nodes",
                    "expected_improvement": "50-70% latency reduction",
                })
                rec["actions"].append({
                    "type": "upgrade_model",
                    "description": "Consider a faster model or a longer timeout",
                    "expected_improvement": "20-40% latency reduction",
                })

            if b.get("is_problematic"):
                rec["actions"].append({
                    "type": "add_retry",
                    "description": f"Node failure rate is {b['error_rate']:.1%}; consider adding retry logic and input validation",
                    "expected_improvement": "failure rate below 5%",
                })

            if b.get("is_inefficient"):
                rec["actions"].append({
                    "type": "add_cache",
                    "description": "Node latency is consistently high; consider caching results to avoid repeated computation",
                    "expected_improvement": "30-60% latency reduction",
                })

            recommendations.append(rec)

        return recommendations

    def run_full_analysis(self, hours: int = 24) -> Dict[str, Any]:
        """Run the full analysis: detection plus recommendations."""
        bottlenecks = self.analyze_executions(hours=hours)
        recommendations = self.generate_recommendations(bottlenecks)
        return {
            "period_hours": hours,
            "bottlenecks_found": len([b for b in bottlenecks if b.get("is_bottleneck")]),
            "problematic_nodes": len([b for b in bottlenecks if b.get("is_problematic")]),
            "nodes_analyzed": len(bottlenecks),
            "bottlenecks": bottlenecks,
            "recommendations": recommendations,
        }


# Module-level singleton used by callers.
bottleneck_detector = BottleneckDetector()
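
# Minimal usage sketch: assumes the application's database is configured and
# reachable; the keys printed below come from run_full_analysis() above.
if __name__ == "__main__":
    import json

    report = bottleneck_detector.run_full_analysis(hours=24)
    print(json.dumps(
        {
            "period_hours": report["period_hours"],
            "nodes_analyzed": report["nodes_analyzed"],
            "bottlenecks_found": report["bottlenecks_found"],
            "problematic_nodes": report["problematic_nodes"],
        },
        indent=2,
    ))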