230 lines
7.8 KiB
Python
230 lines
7.8 KiB
Python
|
|
"""
|
|||
|
|
知识提取器 - 从 Agent 执行日志中用 LLM 提取可复用知识
|
|||
|
|
"""
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import json
|
|||
|
|
import logging
|
|||
|
|
from typing import Any, Dict, List, Optional
|
|||
|
|
|
|||
|
|
from sqlalchemy import desc
|
|||
|
|
|
|||
|
|
from app.core.database import SessionLocal
|
|||
|
|
|
|||
|
|
# Module-level logger named after this module; every warning/info/error
# message emitted by the extractor below goes through it.
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
KNOWLEDGE_EXTRACTION_PROMPT = """你是一个知识工程专家。请从以下 Agent 执行记录中提取可复用的经验知识。
|
|||
|
|
|
|||
|
|
执行输入: {input_text}
|
|||
|
|
执行输出: {output_text}
|
|||
|
|
是否成功: {success}
|
|||
|
|
工具调用链: {tool_chain}
|
|||
|
|
迭代次数: {iterations_used}
|
|||
|
|
工具调用总次数: {tool_calls_made}
|
|||
|
|
|
|||
|
|
请提取为以下 JSON 格式(只输出 JSON,不要其他文本):
|
|||
|
|
{{
|
|||
|
|
"title": "一句话标题(如:处理MySQL死锁的重试策略)",
|
|||
|
|
"category": "bug_fix / best_practice / workaround / optimization / insight",
|
|||
|
|
"tags": ["tag1", "tag2"],
|
|||
|
|
"situation": "什么场景下适用",
|
|||
|
|
"solution": "具体解决方案或操作步骤",
|
|||
|
|
"caveats": "注意事项或已知限制",
|
|||
|
|
"confidence": 0.0-1.0
|
|||
|
|
}}
|
|||
|
|
|
|||
|
|
如果这条执行记录没有值得沉淀的知识,返回:
|
|||
|
|
{{"skip": true, "reason": "原因"}}
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
|
|||
|
|
class KnowledgeExtractor:
    """Extract reusable knowledge from agent execution logs using an LLM.

    ``extract_from_execution`` turns one execution record into a knowledge
    dict (or ``None``); ``run_extraction_pipeline`` applies it to unprocessed
    ``AgentExecutionLog`` rows and stores the results as ``KnowledgeEntry``
    rows.
    """

    # Fallbacks applied when the LLM omits a field or returns null for it.
    _DEFAULT_TITLE = "未命名知识"
    _DEFAULT_CATEGORY = "insight"
    _DEFAULT_CONFIDENCE = 0.5
    # Candidates whose confidence is below this value are discarded.
    _MIN_CONFIDENCE = 0.3

    def __init__(self, llm_model: str = "deepseek-v4-flash"):
        """Remember which model name to pass to the LLM client."""
        self.llm_model = llm_model

    def _sync_llm_call(self, prompt: str) -> str:
        """Send ``prompt`` as a single user message and return the reply text.

        The async client is driven with ``asyncio.run``, so this must be
        called from synchronous code: ``asyncio.run`` raises ``RuntimeError``
        when an event loop is already running in the current thread.

        Returns:
            The ``content`` of the reply when the client returns a dict
            (an empty string if the content is missing or null), otherwise
            ``str()`` of whatever the client returned.
        """
        # Imported lazily, as in the original code, so importing this module
        # does not pull in the agent runtime.
        from app.agent_runtime.core import _LLMClient
        from app.agent_runtime.schemas import AgentLLMConfig
        import asyncio as aio

        client = _LLMClient(AgentLLMConfig(
            provider="deepseek",
            model=self.llm_model,
            temperature=0.3,
            max_iterations=1,
        ))

        result = aio.run(client.chat(
            messages=[{"role": "user", "content": prompt}],
            tools=None,
            iteration=1,
        ))
        if isinstance(result, dict):
            # ``.get("content", "")`` would still yield None when the key is
            # present with a null value, so normalise with ``or``.
            return result.get("content") or ""
        return str(result)

    @classmethod
    def _parse_knowledge(cls, llm_output: Any) -> Optional[Dict[str, Any]]:
        """Parse the raw LLM reply into a normalised knowledge dict.

        The JSON object is taken as the span from the first ``{`` to the last
        ``}``, so text the model adds around the JSON is ignored.

        Returns:
            ``None`` when the reply is not a string, contains no JSON object,
            is a ``skip`` answer, or has a confidence below the minimum.
            Otherwise a dict with keys ``title``, ``category``, ``tags``,
            ``situation``, ``solution``, ``caveats`` and ``confidence``.

        Raises:
            json.JSONDecodeError: if the extracted span is not valid JSON.
        """
        if not isinstance(llm_output, str):
            return None

        json_start = llm_output.find("{")
        json_end = llm_output.rfind("}") + 1
        if json_start < 0 or json_end <= json_start:
            return None

        data = json.loads(llm_output[json_start:json_end])

        if data.get("skip"):
            return None

        # The model may return null or free text such as "high" here; fall
        # back to the default instead of letting float() raise.
        try:
            confidence = float(data.get("confidence", cls._DEFAULT_CONFIDENCE))
        except (TypeError, ValueError):
            confidence = cls._DEFAULT_CONFIDENCE
        if confidence < cls._MIN_CONFIDENCE:
            return None

        def text(key: str, default: str = "") -> str:
            # Callers concatenate these fields, so always hand back a str:
            # null -> default, list/dict/number -> its JSON representation.
            value = data.get(key)
            if value is None:
                return default
            if isinstance(value, str):
                return value
            return json.dumps(value, ensure_ascii=False)

        tags = data.get("tags")
        if isinstance(tags, str):
            tags = [tags]
        elif not isinstance(tags, list):
            tags = []

        return {
            "title": text("title", cls._DEFAULT_TITLE),
            "category": text("category", cls._DEFAULT_CATEGORY),
            "tags": tags,
            "situation": text("situation"),
            "solution": text("solution"),
            "caveats": text("caveats"),
            "confidence": confidence,
        }

    def extract_from_execution(self, log: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract knowledge from a single execution log record (synchronous).

        Args:
            log: Mapping read for the keys ``input_text``, ``output_text``,
                ``success``, ``tool_chain``, ``tool_calls_made`` and
                ``iterations_used``; missing keys fall back to empty values.

        Returns:
            A knowledge dict (see ``_parse_knowledge``), or ``None`` when the
            record is too short, failed without any tool calls, the LLM call
            fails, the reply cannot be parsed, the model answers ``skip``, or
            the confidence is below the minimum.
        """
        input_text = log.get("input_text", "") or ""
        output_text = log.get("output_text", "") or ""
        success = log.get("success", False)
        tool_chain = log.get("tool_chain") or []
        tool_calls_made = log.get("tool_calls_made", 0)
        iterations_used = log.get("iterations_used", 0)

        # Very short records are skipped before spending an LLM call on them.
        if len(input_text) < 20 or len(output_text) < 50:
            return None
        # A failed run with no tool calls is skipped as well.
        if not success and not tool_chain:
            return None

        # Only the first five tool calls are included in the prompt.
        tool_chain_str = json.dumps(tool_chain[:5], ensure_ascii=False) if tool_chain else "无"

        prompt = KNOWLEDGE_EXTRACTION_PROMPT.format(
            input_text=input_text[:2000],
            output_text=output_text[:2000],
            success="是" if success else "否",
            tool_chain=tool_chain_str,
            iterations_used=iterations_used,
            tool_calls_made=tool_calls_made,
        )

        try:
            llm_output = self._sync_llm_call(prompt)
        except Exception as e:
            # Best effort: an LLM failure means "no knowledge" for this record.
            logger.warning("知识提取 LLM 调用失败: %s", e)
            return None

        try:
            return self._parse_knowledge(llm_output)
        except json.JSONDecodeError:
            logger.warning("知识提取 JSON 解析失败: %s", llm_output[:200])
            return None

    def run_extraction_pipeline(self, limit: int = 10) -> int:
        """Extract knowledge from execution logs that are not yet processed.

        Selects up to ``limit`` logs, newest first, that succeeded, have a
        non-null ``output_text`` and are not flagged ``knowledge_extracted``.
        Every selected log is flagged as processed and committed on its own,
        whether or not it produced an entry. An entry is stored only when
        extraction succeeds and no ``KnowledgeEntry`` with the same title
        exists.

        NOTE(review): because ``extract_from_execution`` also returns ``None``
        when the LLM call fails, such logs are flagged as processed and are
        not retried - confirm this is intended.

        Returns:
            The number of knowledge entries committed by this run. If the run
            is interrupted by an error, entries committed before the error are
            still counted.
        """
        from app.models.agent_execution_log import AgentExecutionLog
        from app.models.knowledge_entry import KnowledgeEntry

        db = None
        extracted = 0

        try:
            db = SessionLocal()
            logs = (
                db.query(AgentExecutionLog)
                .filter(
                    # ``==`` is required here: it builds a SQL expression.
                    AgentExecutionLog.success == True,  # noqa: E712
                    AgentExecutionLog.knowledge_extracted == False,  # noqa: E712
                    AgentExecutionLog.output_text.isnot(None),
                )
                .order_by(desc(AgentExecutionLog.created_at))
                .limit(limit)
                .all()
            )

            if not logs:
                logger.info("知识提取管道: 没有待处理的新日志")
                return 0

            logger.info("知识提取管道: 找到 %d 条待处理日志", len(logs))

            for log_entry in logs:
                knowledge = self.extract_from_execution({
                    "input_text": log_entry.input_text,
                    "output_text": log_entry.output_text,
                    "success": log_entry.success,
                    "tool_chain": log_entry.tool_chain,
                    "iterations_used": log_entry.iterations_used,
                    "tool_calls_made": log_entry.tool_calls_made,
                })

                # Deduplicate by exact title match against stored entries.
                is_new = False
                if knowledge:
                    existing = (
                        db.query(KnowledgeEntry)
                        .filter(KnowledgeEntry.title == knowledge["title"])
                        .first()
                    )
                    is_new = not existing

                if is_new:
                    embedding_text = "\n".join([
                        knowledge["title"],
                        knowledge["situation"],
                        knowledge["solution"],
                        knowledge["caveats"],
                    ])
                    db.add(KnowledgeEntry(
                        title=knowledge["title"],
                        category=knowledge["category"],
                        tags=knowledge["tags"],
                        situation=knowledge["situation"],
                        solution=knowledge["solution"],
                        caveats=knowledge["caveats"],
                        source_execution_ids=[str(log_entry.id)],
                        source_agent_name=log_entry.agent_name,
                        source_model=log_entry.model,
                        embedding_text=embedding_text,
                        extracted_by="llm_auto",
                        confidence=knowledge["confidence"],
                    ))

                # The log is flagged in every case so it is not selected again.
                log_entry.knowledge_extracted = True
                db.commit()

                if is_new:
                    extracted += 1
                    logger.info(
                        "知识提取: %s -> %s (conf=%.2f)",
                        log_entry.agent_name, knowledge["title"], knowledge["confidence"],
                    )

            return extracted

        except Exception as e:
            # logger.exception keeps the traceback that logger.error dropped.
            logger.exception("知识提取管道异常: %s", e)
            if db:
                try:
                    db.rollback()
                except Exception:
                    logger.warning("知识提取管道回滚失败", exc_info=True)
            # Entries committed before the failure are already persisted, so
            # report them instead of a misleading 0.
            return extracted
        finally:
            if db:
                try:
                    db.close()
                except Exception:
                    logger.warning("知识提取管道关闭会话失败", exc_info=True)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Module-level instance built with the default model name, created once when
# this module is imported.
knowledge_extractor = KnowledgeExtractor()
|