Files
aiagent/backend/app/services/knowledge_extractor.py
renjianbo ab1589921a fix: 修复35个安全与功能缺陷,补全知识进化/数字孪生/行为采集模块
## 安全修复 (12项)
- Webhook接口添加全局Token认证,过滤敏感请求头
- 修复JWT Base64 padding公式,防止签名验证绕过
- 数据库密码/飞书Token从源码移除,改为环境变量
- 工作流引擎添加路径遍历防护 (_resolve_safe_path)
- eval()添加模板长度上限检查
- 审批API添加认证依赖
- 前端v-html增强XSS转义,console.log仅开发模式输出
- 500错误不再暴露内部异常详情

## Agent运行时修复 (7项)
- 删除_inject_knowledge_context中未定义db变量的finally块
- 工具执行添加try/except保护,异常不崩溃Agent
- LLM重试计入budget计数器
- self_review异常时passed=False
- max_iterations截断标记success=False
- 工具参数JSON解析失败时记录警告日志
- run()开始时重置_llm_invocations计数器

## 配置与基础设施
- DEBUG默认False,SQL_ECHO独立配置项
- init_db()补全13个缺失模型导入
- 新增WEBHOOK_AUTH_TOKEN/SQL_ECHO配置项
- 新增.env.example模板文件

## 前端修复 (12项)
- 登录改用URLSearchParams替代FormData
- 401拦截器通过Pinia store统一清理状态
- SSE流超时从60s延长至300s
- final/error事件时清除streamTimeout
- localStorage聊天记录添加24h TTL
- safeParseArgCount替代模板中裸JSON.parse
- fetchUser 401时同时清除user对象

## 新增模块
- 知识进化: knowledge_extractor/retriever/tasks
- 数字孪生: shadow_executor/comparison模型
- 行为采集: behavior_middleware/collector/fingerprint_engine
- 代码审查: code_review_agent/document_review_agent
- 反馈学习: feedback_learner
- 瓶颈检测/优化引擎/成本估算/需求估算
- 速率限制器 (rate_limiter)
- Alembic迁移 015-020

## 文档
- 商业化落地计划
- 8篇docs文档 (架构/API/部署/开发/贡献等)
- Docker Compose生产配置

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-10 19:50:20 +08:00

230 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
知识提取器 - 从 Agent 执行日志中用 LLM 提取可复用知识
"""
from __future__ import annotations
import json
import logging
from typing import Any, Dict, List, Optional
from sqlalchemy import desc
from app.core.database import SessionLocal
logger = logging.getLogger(__name__)
# Prompt template for LLM-based knowledge extraction.
# Placeholders (filled via str.format in KnowledgeExtractor.extract_from_execution):
#   input_text / output_text        — truncated (2000-char) execution I/O
#   success                         — human-readable success marker
#   tool_chain                      — JSON dump of first 5 tool calls
#   iterations_used / tool_calls_made — run metadata counters
# Doubled braces {{ }} escape literal JSON braces for str.format.
# NOTE(review): some CJK punctuation appears stripped (e.g. "只输出 JSON不要其他文本",
# "一句话标题处理MySQL死锁的重试策略") — presumably lost in transit; confirm
# against the original prompt before editing this string.
KNOWLEDGE_EXTRACTION_PROMPT = """你是一个知识工程专家。请从以下 Agent 执行记录中提取可复用的经验知识。
执行输入: {input_text}
执行输出: {output_text}
是否成功: {success}
工具调用链: {tool_chain}
迭代次数: {iterations_used}
工具调用总次数: {tool_calls_made}
请提取为以下 JSON 格式(只输出 JSON不要其他文本
{{
"title": "一句话标题处理MySQL死锁的重试策略",
"category": "bug_fix / best_practice / workaround / optimization / insight",
"tags": ["tag1", "tag2"],
"situation": "什么场景下适用",
"solution": "具体解决方案或操作步骤",
"caveats": "注意事项或已知限制",
"confidence": 0.0-1.0
}}
如果这条执行记录没有值得沉淀的知识,返回:
{{"skip": true, "reason": "原因"}}
"""
class KnowledgeExtractor:
    """Extracts reusable knowledge entries from agent execution logs via an LLM.

    Pipeline: pull unprocessed successful execution logs, ask the LLM to distill
    each into a structured knowledge record, deduplicate by title, and persist
    new ``KnowledgeEntry`` rows, marking each log as processed.
    """

    def __init__(self, llm_model: str = "deepseek-v4-flash"):
        # Model identifier passed through to the agent-runtime LLM client.
        self.llm_model = llm_model

    def _sync_llm_call(self, prompt: str) -> str:
        """Invoke the LLM synchronously and return its text content.

        NOTE(review): ``asyncio.run`` raises ``RuntimeError`` when called from a
        thread that already has a running event loop, so this helper is only
        safe from synchronous contexts (cron/Celery workers) — confirm callers.
        """
        # Imports are function-scoped to avoid a circular import with the
        # agent runtime at module load time.
        from app.agent_runtime.core import _LLMClient
        from app.agent_runtime.schemas import AgentLLMConfig
        import asyncio as aio

        client = _LLMClient(AgentLLMConfig(
            provider="deepseek",
            model=self.llm_model,
            temperature=0.3,
            max_iterations=1,
        ))
        result = aio.run(client.chat(
            messages=[{"role": "user", "content": prompt}],
            tools=None,
            iteration=1,
        ))
        # The client may return a dict payload or a bare value; normalize to str.
        return result.get("content", "") if isinstance(result, dict) else str(result)

    def extract_from_execution(self, log: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract one knowledge dict from a single execution log (synchronous).

        Args:
            log: Plain-dict view of an execution log (``input_text``,
                ``output_text``, ``success``, ``tool_chain``, counters).

        Returns:
            A knowledge dict (title/category/tags/situation/solution/caveats/
            confidence), or ``None`` when the log is too short, the run failed
            with no tool activity, the LLM call or JSON parsing fails, the LLM
            marks the record as skippable, or confidence is below 0.3.
        """
        input_text = log.get("input_text", "") or ""
        output_text = log.get("output_text", "") or ""
        success = log.get("success", False)
        tool_chain = log.get("tool_chain") or []
        tool_calls_made = log.get("tool_calls_made", 0)
        iterations_used = log.get("iterations_used", 0)

        # Too little content to contain reusable knowledge.
        if len(input_text) < 20 or len(output_text) < 50:
            return None
        # A failed run that used no tools carries no actionable signal.
        if not success and not tool_chain:
            return None

        tool_chain_str = json.dumps(tool_chain[:5], ensure_ascii=False) if tool_chain else ""
        prompt = KNOWLEDGE_EXTRACTION_PROMPT.format(
            input_text=input_text[:2000],
            output_text=output_text[:2000],
            # BUG FIX: both ternary branches were the empty string, leaving the
            # prompt's "是否成功" field always blank. Render yes/no instead.
            success="是" if success else "否",
            tool_chain=tool_chain_str,
            iterations_used=iterations_used,
            tool_calls_made=tool_calls_made,
        )
        try:
            llm_output = self._sync_llm_call(prompt)
        except Exception as e:
            logger.warning("知识提取 LLM 调用失败: %s", e)
            return None
        try:
            # The model is asked for bare JSON but may wrap it in prose;
            # slice from the first "{" to the last "}" before parsing.
            json_start = llm_output.find("{")
            json_end = llm_output.rfind("}") + 1
            if json_start >= 0 and json_end > json_start:
                data = json.loads(llm_output[json_start:json_end])
            else:
                return None
        except json.JSONDecodeError:
            logger.warning("知识提取 JSON 解析失败: %s", llm_output[:200])
            return None
        if data.get("skip"):
            return None
        # ROBUSTNESS: the LLM may emit a non-numeric confidence (e.g. "high");
        # previously float() would raise out of this method uncaught.
        try:
            confidence = float(data.get("confidence", 0.5))
        except (TypeError, ValueError):
            confidence = 0.5
        if confidence < 0.3:
            return None
        return {
            "title": data.get("title", "未命名知识"),
            "category": data.get("category", "insight"),
            "tags": data.get("tags", []),
            "situation": data.get("situation", ""),
            "solution": data.get("solution", ""),
            "caveats": data.get("caveats", ""),
            "confidence": confidence,
        }

    def run_extraction_pipeline(self, limit: int = 10) -> int:
        """Run the extraction pipeline over unprocessed execution logs.

        Args:
            limit: Maximum number of logs to process in one run.

        Returns:
            Number of new knowledge entries persisted (0 on any failure).
        """
        # Function-scoped imports avoid circular model imports at load time.
        from app.models.agent_execution_log import AgentExecutionLog
        from app.models.knowledge_entry import KnowledgeEntry

        db = None
        extracted = 0
        try:
            db = SessionLocal()
            # Newest successful, not-yet-mined logs with non-null output.
            logs = (
                db.query(AgentExecutionLog)
                .filter(
                    AgentExecutionLog.success == True,  # noqa: E712 (SQLAlchemy expression)
                    AgentExecutionLog.knowledge_extracted == False,  # noqa: E712
                    AgentExecutionLog.output_text.isnot(None),
                )
                .order_by(desc(AgentExecutionLog.created_at))
                .limit(limit)
                .all()
            )
            if not logs:
                logger.info("知识提取管道: 没有待处理的新日志")
                return 0
            logger.info("知识提取管道: 找到 %d 条待处理日志", len(logs))
            for log_entry in logs:
                log_data = {
                    "input_text": log_entry.input_text,
                    "output_text": log_entry.output_text,
                    "success": log_entry.success,
                    "tool_chain": log_entry.tool_chain,
                    "iterations_used": log_entry.iterations_used,
                    "tool_calls_made": log_entry.tool_calls_made,
                }
                knowledge = self.extract_from_execution(log_data)
                if not knowledge:
                    # Nothing extractable — mark processed so it is not retried.
                    log_entry.knowledge_extracted = True
                    db.commit()
                    continue
                # Dedup by exact title match against existing entries.
                existing = (
                    db.query(KnowledgeEntry)
                    .filter(KnowledgeEntry.title == knowledge["title"])
                    .first()
                )
                if existing:
                    log_entry.knowledge_extracted = True
                    db.commit()
                    continue
                # Concatenated text used downstream to build the embedding.
                embedding_text = (
                    knowledge["title"] + "\n" +
                    knowledge["situation"] + "\n" +
                    knowledge["solution"] + "\n" +
                    knowledge["caveats"]
                )
                entry = KnowledgeEntry(
                    title=knowledge["title"],
                    category=knowledge["category"],
                    tags=knowledge["tags"],
                    situation=knowledge["situation"],
                    solution=knowledge["solution"],
                    caveats=knowledge["caveats"],
                    source_execution_ids=[str(log_entry.id)],
                    source_agent_name=log_entry.agent_name,
                    source_model=log_entry.model,
                    embedding_text=embedding_text,
                    extracted_by="llm_auto",
                    confidence=knowledge["confidence"],
                )
                db.add(entry)
                log_entry.knowledge_extracted = True
                # Per-log commit: a later failure does not roll back earlier wins.
                db.commit()
                extracted += 1
                logger.info(
                    "知识提取: %s -> %s (conf=%.2f)",
                    log_entry.agent_name, knowledge["title"], knowledge["confidence"],
                )
            return extracted
        except Exception as e:
            # Best-effort pipeline: log, roll back, and report zero extracted.
            logger.error("知识提取管道异常: %s", e)
            if db:
                try:
                    db.rollback()
                except Exception:
                    pass
            return 0
        finally:
            if db:
                try:
                    db.close()
                except Exception:
                    pass
# Module-level singleton with default model config — presumably shared by
# scheduled tasks / API endpoints importing this module; confirm call sites.
knowledge_extractor = KnowledgeExtractor()