""" 知识提取器 - 从 Agent 执行日志中用 LLM 提取可复用知识 """ from __future__ import annotations import json import logging from typing import Any, Dict, List, Optional from sqlalchemy import desc from app.core.database import SessionLocal logger = logging.getLogger(__name__) KNOWLEDGE_EXTRACTION_PROMPT = """你是一个知识工程专家。请从以下 Agent 执行记录中提取可复用的经验知识。 执行输入: {input_text} 执行输出: {output_text} 是否成功: {success} 工具调用链: {tool_chain} 迭代次数: {iterations_used} 工具调用总次数: {tool_calls_made} 请提取为以下 JSON 格式(只输出 JSON,不要其他文本): {{ "title": "一句话标题(如:处理MySQL死锁的重试策略)", "category": "bug_fix / best_practice / workaround / optimization / insight", "tags": ["tag1", "tag2"], "situation": "什么场景下适用", "solution": "具体解决方案或操作步骤", "caveats": "注意事项或已知限制", "confidence": 0.0-1.0 }} 如果这条执行记录没有值得沉淀的知识,返回: {{"skip": true, "reason": "原因"}} """ class KnowledgeExtractor: """从执行日志中提取知识(使用 LLM)""" def __init__(self, llm_model: str = "deepseek-v4-flash"): self.llm_model = llm_model def _sync_llm_call(self, prompt: str) -> str: """同步调用 LLM。""" from app.agent_runtime.core import _LLMClient from app.agent_runtime.schemas import AgentLLMConfig import asyncio as aio client = _LLMClient(AgentLLMConfig( provider="deepseek", model=self.llm_model, temperature=0.3, max_iterations=1, )) result = aio.run(client.chat( messages=[{"role": "user", "content": prompt}], tools=None, iteration=1, )) content = result.get("content", "") if isinstance(result, dict) else str(result) return content def extract_from_execution(self, log: Dict[str, Any]) -> Optional[Dict[str, Any]]: """从单条执行日志提取知识(同步)。""" input_text = log.get("input_text", "") or "" output_text = log.get("output_text", "") or "" success = log.get("success", False) tool_chain = log.get("tool_chain") or [] tool_calls_made = log.get("tool_calls_made", 0) iterations_used = log.get("iterations_used", 0) if len(input_text) < 20 or len(output_text) < 50: return None if not success and not tool_chain: return None tool_chain_str = json.dumps(tool_chain[:5], ensure_ascii=False) if tool_chain else "无" prompt = KNOWLEDGE_EXTRACTION_PROMPT.format( input_text=input_text[:2000], output_text=output_text[:2000], success="是" if success else "否", tool_chain=tool_chain_str, iterations_used=iterations_used, tool_calls_made=tool_calls_made, ) try: llm_output = self._sync_llm_call(prompt) except Exception as e: logger.warning("知识提取 LLM 调用失败: %s", e) return None try: json_start = llm_output.find("{") json_end = llm_output.rfind("}") + 1 if json_start >= 0 and json_end > json_start: data = json.loads(llm_output[json_start:json_end]) else: return None except json.JSONDecodeError: logger.warning("知识提取 JSON 解析失败: %s", llm_output[:200]) return None if data.get("skip"): return None confidence = float(data.get("confidence", 0.5)) if confidence < 0.3: return None return { "title": data.get("title", "未命名知识"), "category": data.get("category", "insight"), "tags": data.get("tags", []), "situation": data.get("situation", ""), "solution": data.get("solution", ""), "caveats": data.get("caveats", ""), "confidence": confidence, } def run_extraction_pipeline(self, limit: int = 10) -> int: """运行提取管道:从未处理的执行日志中提取知识。""" from app.models.agent_execution_log import AgentExecutionLog from app.models.knowledge_entry import KnowledgeEntry db = None extracted = 0 try: db = SessionLocal() logs = ( db.query(AgentExecutionLog) .filter( AgentExecutionLog.success == True, AgentExecutionLog.knowledge_extracted == False, AgentExecutionLog.output_text.isnot(None), ) .order_by(desc(AgentExecutionLog.created_at)) .limit(limit) .all() ) if not logs: logger.info("知识提取管道: 没有待处理的新日志") return 0 logger.info("知识提取管道: 找到 %d 条待处理日志", len(logs)) for log_entry in logs: log_data = { "input_text": log_entry.input_text, "output_text": log_entry.output_text, "success": log_entry.success, "tool_chain": log_entry.tool_chain, "iterations_used": log_entry.iterations_used, "tool_calls_made": log_entry.tool_calls_made, } knowledge = self.extract_from_execution(log_data) if not knowledge: log_entry.knowledge_extracted = True db.commit() continue existing = ( db.query(KnowledgeEntry) .filter(KnowledgeEntry.title == knowledge["title"]) .first() ) if existing: log_entry.knowledge_extracted = True db.commit() continue embedding_text = ( knowledge["title"] + "\n" + knowledge["situation"] + "\n" + knowledge["solution"] + "\n" + knowledge["caveats"] ) entry = KnowledgeEntry( title=knowledge["title"], category=knowledge["category"], tags=knowledge["tags"], situation=knowledge["situation"], solution=knowledge["solution"], caveats=knowledge["caveats"], source_execution_ids=[str(log_entry.id)], source_agent_name=log_entry.agent_name, source_model=log_entry.model, embedding_text=embedding_text, extracted_by="llm_auto", confidence=knowledge["confidence"], ) db.add(entry) log_entry.knowledge_extracted = True db.commit() extracted += 1 logger.info( "知识提取: %s -> %s (conf=%.2f)", log_entry.agent_name, knowledge["title"], knowledge["confidence"], ) return extracted except Exception as e: logger.error("知识提取管道异常: %s", e) if db: try: db.rollback() except Exception: pass return 0 finally: if db: try: db.close() except Exception: pass knowledge_extractor = KnowledgeExtractor()