backend/app/services/knowledge_extractor.py

"""
知识提取器 - 从 Agent 执行日志中用 LLM 提取可复用知识
"""
from __future__ import annotations

import json
import logging
from typing import Any, Dict, List, Optional

from sqlalchemy import desc

from app.core.database import SessionLocal

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Prompt template sent to the LLM by KnowledgeExtractor.extract_from_execution.
# It is filled with str.format(), so the named placeholders ({input_text},
# {output_text}, {success}, {tool_chain}, {iterations_used}, {tool_calls_made})
# are substituted and every literal brace of the JSON examples is doubled
# ({{ / }}). The template asks the model to answer with a single JSON object:
# either the knowledge fields or {"skip": true, "reason": ...}.
KNOWLEDGE_EXTRACTION_PROMPT = """你是一个知识工程专家。请从以下 Agent 执行记录中提取可复用的经验知识。

执行输入: {input_text}
执行输出: {output_text}
是否成功: {success}
工具调用链: {tool_chain}
迭代次数: {iterations_used}
工具调用总次数: {tool_calls_made}

请提取为以下 JSON 格式（只输出 JSON，不要其他文本）：
{{
    "title": "一句话标题（如：处理MySQL死锁的重试策略）",
    "category": "bug_fix / best_practice / workaround / optimization / insight",
    "tags": ["tag1", "tag2"],
    "situation": "什么场景下适用",
    "solution": "具体解决方案或操作步骤",
    "caveats": "注意事项或已知限制",
    "confidence": 0.0-1.0
}}

如果这条执行记录没有值得沉淀的知识，返回：
{{"skip": true, "reason": "原因"}}
"""


class KnowledgeExtractor:
    """Extract reusable knowledge from agent execution logs using an LLM.

    ``extract_from_execution`` turns one execution record into a normalized
    knowledge dict (or ``None``); ``run_extraction_pipeline`` applies it to
    unprocessed ``AgentExecutionLog`` rows and stores ``KnowledgeEntry`` rows.
    """

    # Confidence used when the LLM omits the field or returns an unusable value.
    _DEFAULT_CONFIDENCE = 0.5
    # Knowledge scored below this threshold is discarded.
    _MIN_CONFIDENCE = 0.3

    def __init__(self, llm_model: str = "deepseek-v4-flash"):
        """Store the model name passed to the LLM client on every call."""
        self.llm_model = llm_model

    def _sync_llm_call(self, prompt: str) -> str:
        """Send ``prompt`` as a single user message and return the reply text.

        The async ``client.chat`` coroutine is driven with ``asyncio.run``.
        ``asyncio.run`` refuses to start when the calling thread already runs
        an event loop, so in that case the call is executed on a short-lived
        worker thread that owns its own loop.

        Returns:
            The ``"content"`` value when the client returns a dict (empty
            string when it is missing or ``None``), otherwise ``str(result)``.
        """
        from app.agent_runtime.core import _LLMClient
        from app.agent_runtime.schemas import AgentLLMConfig
        import asyncio as aio
        from concurrent.futures import ThreadPoolExecutor

        client = _LLMClient(AgentLLMConfig(
            provider="deepseek",
            model=self.llm_model,
            temperature=0.3,
            max_iterations=1,
        ))

        def _run_chat() -> Any:
            # The coroutine is created here so it is never left un-awaited.
            return aio.run(client.chat(
                messages=[{"role": "user", "content": prompt}],
                tools=None,
                iteration=1,
            ))

        try:
            aio.get_running_loop()
        except RuntimeError:
            # No loop in this thread: the plain synchronous path.
            result = _run_chat()
        else:
            # Called from inside a running loop: run on a separate thread.
            with ThreadPoolExecutor(max_workers=1) as pool:
                result = pool.submit(_run_chat).result()

        if not isinstance(result, dict):
            return str(result)
        content = result.get("content")
        if content is None:
            # A present-but-null content must not reach the str-only parser.
            return ""
        return content if isinstance(content, str) else str(content)

    @staticmethod
    def _coerce_confidence(raw: Any, default: float = 0.5) -> float:
        """Convert an LLM-provided confidence to a float within [0.0, 1.0].

        Unparseable values (``None``, free text, NaN) yield ``default``;
        out-of-range numbers are clamped.
        """
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return default
        if value != value:  # NaN is the only value not equal to itself
            return default
        return min(1.0, max(0.0, value))

    @staticmethod
    def _as_text(value: Any, default: str = "") -> str:
        """Return ``value`` as a string, or ``default`` when it is null/blank.

        Lists and tuples (e.g. a list of steps) are joined with newlines.
        """
        if value is None:
            return default
        if isinstance(value, str):
            text = value
        elif isinstance(value, (list, tuple)):
            text = "\n".join(str(item) for item in value)
        else:
            text = str(value)
        return text if text.strip() else default

    @staticmethod
    def _as_tags(value: Any) -> List[str]:
        """Return ``value`` as a list of non-blank tag strings."""
        if value is None:
            return []
        if isinstance(value, (list, tuple, set)):
            items = list(value)
        else:
            # A bare scalar (typically one string) is treated as a single tag.
            items = [value]
        return [str(item) for item in items if item is not None and str(item).strip()]

    @staticmethod
    def _parse_json_object(llm_output: Any) -> Optional[Dict[str, Any]]:
        """Parse the outermost ``{...}`` span of the LLM reply as a JSON object.

        Returns ``None`` when the reply is not a string, contains no brace
        pair, is not valid JSON, or does not decode to a dict.
        """
        if not isinstance(llm_output, str) or not llm_output:
            return None
        json_start = llm_output.find("{")
        json_end = llm_output.rfind("}") + 1
        if json_start < 0 or json_end <= json_start:
            return None
        try:
            data = json.loads(llm_output[json_start:json_end])
        except json.JSONDecodeError:
            logger.warning("知识提取 JSON 解析失败: %s", llm_output[:200])
            return None
        return data if isinstance(data, dict) else None

    @classmethod
    def _normalize_knowledge(cls, data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Validate a parsed LLM answer and coerce every field to its stored type.

        Returns ``None`` when the answer is a skip marker or its confidence is
        below the threshold; otherwise a dict whose text fields are always
        ``str``, ``tags`` is a ``list[str]`` and ``confidence`` is a float.
        """
        if data.get("skip"):
            return None

        confidence = cls._coerce_confidence(data.get("confidence"), cls._DEFAULT_CONFIDENCE)
        if confidence < cls._MIN_CONFIDENCE:
            return None

        return {
            "title": cls._as_text(data.get("title"), "未命名知识"),
            "category": cls._as_text(data.get("category"), "insight"),
            "tags": cls._as_tags(data.get("tags")),
            "situation": cls._as_text(data.get("situation")),
            "solution": cls._as_text(data.get("solution")),
            "caveats": cls._as_text(data.get("caveats")),
            "confidence": confidence,
        }

    def extract_from_execution(self, log: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract knowledge from a single execution record (synchronous).

        Args:
            log: Mapping with the keys ``input_text``, ``output_text``,
                ``success``, ``tool_chain``, ``tool_calls_made`` and
                ``iterations_used``; missing keys fall back to empty values.

        Returns:
            A dict with ``title``, ``category``, ``tags``, ``situation``,
            ``solution``, ``caveats`` and ``confidence``, or ``None`` when the
            record is too short, failed without any tool calls, the LLM call
            or JSON parsing failed, the LLM asked to skip, or the confidence
            is below 0.3.
        """
        input_text = log.get("input_text", "") or ""
        output_text = log.get("output_text", "") or ""
        success = log.get("success", False)
        tool_chain = log.get("tool_chain") or []
        tool_calls_made = log.get("tool_calls_made", 0)
        iterations_used = log.get("iterations_used", 0)

        # Very short records carry too little context to be worth an LLM call.
        if len(input_text) < 20 or len(output_text) < 50:
            return None
        # A failed run without any tool calls is not sent to the LLM either.
        if not success and not tool_chain:
            return None

        if tool_chain:
            # Only the first five steps go into the prompt. Non-sequence values
            # are passed through whole instead of raising on the slice.
            head = tool_chain[:5] if isinstance(tool_chain, (list, tuple)) else tool_chain
            tool_chain_str = json.dumps(head, ensure_ascii=False, default=str)
        else:
            tool_chain_str = "无"

        prompt = KNOWLEDGE_EXTRACTION_PROMPT.format(
            input_text=input_text[:2000],
            output_text=output_text[:2000],
            success="是" if success else "否",
            tool_chain=tool_chain_str,
            iterations_used=iterations_used,
            tool_calls_made=tool_calls_made,
        )

        try:
            llm_output = self._sync_llm_call(prompt)
        except Exception as e:
            logger.warning("知识提取 LLM 调用失败: %s", e)
            return None

        data = self._parse_json_object(llm_output)
        if data is None:
            return None
        return self._normalize_knowledge(data)

    def _process_log_entry(self, db: Any, log_entry: Any, entry_model: Any) -> bool:
        """Extract and persist knowledge for one log row; commit the result.

        The row is flagged ``knowledge_extracted`` in every non-raising path
        (nothing extracted, duplicate title, or new entry stored).

        Returns:
            ``True`` when a new ``entry_model`` row was stored, else ``False``.
        """
        # Captured up front so the final log line does not have to touch the
        # ORM instance again after the commit.
        agent_name = log_entry.agent_name

        knowledge = self.extract_from_execution({
            "input_text": log_entry.input_text,
            "output_text": log_entry.output_text,
            "success": log_entry.success,
            "tool_chain": log_entry.tool_chain,
            "iterations_used": log_entry.iterations_used,
            "tool_calls_made": log_entry.tool_calls_made,
        })

        is_duplicate = False
        if knowledge:
            # Entries are de-duplicated by exact title match.
            is_duplicate = (
                db.query(entry_model)
                .filter(entry_model.title == knowledge["title"])
                .first()
            ) is not None

        if not knowledge or is_duplicate:
            log_entry.knowledge_extracted = True
            db.commit()
            return False

        embedding_text = "\n".join((
            knowledge["title"],
            knowledge["situation"],
            knowledge["solution"],
            knowledge["caveats"],
        ))

        db.add(entry_model(
            title=knowledge["title"],
            category=knowledge["category"],
            tags=knowledge["tags"],
            situation=knowledge["situation"],
            solution=knowledge["solution"],
            caveats=knowledge["caveats"],
            source_execution_ids=[str(log_entry.id)],
            source_agent_name=agent_name,
            source_model=log_entry.model,
            embedding_text=embedding_text,
            extracted_by="llm_auto",
            confidence=knowledge["confidence"],
        ))
        log_entry.knowledge_extracted = True
        db.commit()
        logger.info(
            "知识提取: %s -> %s (conf=%.2f)",
            agent_name, knowledge["title"], knowledge["confidence"],
        )
        return True

    def run_extraction_pipeline(self, limit: int = 10) -> int:
        """Run the pipeline over not-yet-processed successful execution logs.

        Selects up to ``limit`` logs (newest first) with ``success`` true,
        ``knowledge_extracted`` false and a non-null ``output_text``, and
        processes them one by one, committing after each. A failure on one
        log is logged and rolled back without aborting the rest of the batch.

        Returns:
            The number of new knowledge entries stored and committed, also
            when the run ends early because of an unexpected error.
        """
        from app.models.agent_execution_log import AgentExecutionLog
        from app.models.knowledge_entry import KnowledgeEntry

        db = None
        extracted = 0

        try:
            db = SessionLocal()
            logs = (
                db.query(AgentExecutionLog)
                .filter(
                    AgentExecutionLog.success == True,  # noqa: E712 (SQL expression)
                    AgentExecutionLog.knowledge_extracted == False,  # noqa: E712
                    AgentExecutionLog.output_text.isnot(None),
                )
                .order_by(desc(AgentExecutionLog.created_at))
                .limit(limit)
                .all()
            )

            if not logs:
                logger.info("知识提取管道: 没有待处理的新日志")
                return 0

            logger.info("知识提取管道: 找到 %d 条待处理日志", len(logs))

            for log_entry in logs:
                entry_id = None
                try:
                    entry_id = log_entry.id
                    if self._process_log_entry(db, log_entry, KnowledgeEntry):
                        extracted += 1
                except Exception:
                    # Isolate the failure: undo this entry's pending changes and
                    # keep going so one bad row cannot block the whole batch.
                    logger.exception("知识提取管道: 处理日志 %s 失败", entry_id)
                    db.rollback()

            return extracted

        except Exception as e:
            logger.exception("知识提取管道异常: %s", e)
            if db is not None:
                try:
                    db.rollback()
                except Exception as rollback_error:
                    logger.warning("知识提取管道: 回滚失败: %s", rollback_error)
            # Entries counted so far were already committed, so report them.
            return extracted
        finally:
            if db is not None:
                try:
                    db.close()
                except Exception as close_error:
                    logger.warning("知识提取管道: 关闭会话失败: %s", close_error)


# Module-level KnowledgeExtractor instance, created at import time with the
# default model name.
knowledge_extractor = KnowledgeExtractor()
-												fix: 修复35个安全与功能缺陷，补全知识进化/数字孪生/行为采集模块

## 安全修复 (12项)
- Webhook接口添加全局Token认证，过滤敏感请求头
- 修复JWT Base64 padding公式，防止签名验证绕过
- 数据库密码/飞书Token从源码移除，改为环境变量
- 工作流引擎添加路径遍历防护 (_resolve_safe_path)
- eval()添加模板长度上限检查
- 审批API添加认证依赖
- 前端v-html增强XSS转义，console.log仅开发模式输出
- 500错误不再暴露内部异常详情

## Agent运行时修复 (7项)
- 删除_inject_knowledge_context中未定义db变量的finally块
- 工具执行添加try/except保护，异常不崩溃Agent
- LLM重试计入budget计数器
- self_review异常时passed=False
- max_iterations截断标记success=False
- 工具参数JSON解析失败时记录警告日志
- run()开始时重置_llm_invocations计数器

## 配置与基础设施
- DEBUG默认False，SQL_ECHO独立配置项
- init_db()补全13个缺失模型导入
- 新增WEBHOOK_AUTH_TOKEN/SQL_ECHO配置项
- 新增.env.example模板文件

## 前端修复 (12项)
- 登录改用URLSearchParams替代FormData
- 401拦截器通过Pinia store统一清理状态
- SSE流超时从60s延长至300s
- final/error事件时清除streamTimeout
- localStorage聊天记录添加24h TTL
- safeParseArgCount替代模板中裸JSON.parse
- fetchUser 401时同时清除user对象

## 新增模块
- 知识进化: knowledge_extractor/retriever/tasks
- 数字孪生: shadow_executor/comparison模型
- 行为采集: behavior_middleware/collector/fingerprint_engine
- 代码审查: code_review_agent/document_review_agent
- 反馈学习: feedback_learner
- 瓶颈检测/优化引擎/成本估算/需求估算
- 速率限制器 (rate_limiter)
- Alembic迁移 015-020

## 文档
- 商业化落地计划
- 8篇docs文档 (架构/API/部署/开发/贡献等)
- Docker Compose生产配置

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

											
										
										
											2026-05-10 19:50:20 +08:00
+								"""
 								知识提取器 - 从 Agent 执行日志中用 LLM 提取可复用知识
 								"""
 								from __future__ import annotations
 								import json
 								import logging
 								from typing import Any, Dict, List, Optional
 								from sqlalchemy import desc
 								from app.core.database import SessionLocal
 								logger = logging.getLogger(__name__)
 								KNOWLEDGE_EXTRACTION_PROMPT = """你是一个知识工程专家。请从以下 Agent 执行记录中提取可复用的经验知识。
 								执行输入: {input_text}
 								执行输出: {output_text}
 								是否成功: {success}
 								工具调用链: {tool_chain}
 								迭代次数: {iterations_used}
 								工具调用总次数: {tool_calls_made}
 								请提取为以下 JSON 格式（只输出 JSON，不要其他文本）：
 								{{
 								    "title": "一句话标题（如：处理MySQL死锁的重试策略）",
 								    "category": "bug_fix / best_practice / workaround / optimization / insight",
 								    "tags": ["tag1", "tag2"],
 								    "situation": "什么场景下适用",
 								    "solution": "具体解决方案或操作步骤",
 								    "caveats": "注意事项或已知限制",
 								    "confidence": 0.0-1.0
 								}}
 								如果这条执行记录没有值得沉淀的知识，返回：
 								{{"skip": true, "reason": "原因"}}
 								"""
 								class KnowledgeExtractor:
 								    """从执行日志中提取知识（使用 LLM）"""
 								    def __init__(self, llm_model: str = "deepseek-v4-flash"):
 								        self.llm_model = llm_model
 								    def _sync_llm_call(self, prompt: str) -> str:
 								        """同步调用 LLM。"""
 								        from app.agent_runtime.core import _LLMClient
 								        from app.agent_runtime.schemas import AgentLLMConfig
 								        import asyncio as aio
 								        client = _LLMClient(AgentLLMConfig(
 								            provider="deepseek",
 								            model=self.llm_model,
 								            temperature=0.3,
 								            max_iterations=1,
 								        ))
 								        result = aio.run(client.chat(
 								            messages=[{"role": "user", "content": prompt}],
 								            tools=None,
 								            iteration=1,
 								        ))
 								        content = result.get("content", "") if isinstance(result, dict) else str(result)
 								        return content
 								    def extract_from_execution(self, log: Dict[str, Any]) -> Optional[Dict[str, Any]]:
 								        """从单条执行日志提取知识（同步）。"""
 								        input_text = log.get("input_text", "") or ""
 								        output_text = log.get("output_text", "") or ""
 								        success = log.get("success", False)
 								        tool_chain = log.get("tool_chain") or []
 								        tool_calls_made = log.get("tool_calls_made", 0)
 								        iterations_used = log.get("iterations_used", 0)
 								        if len(input_text) < 20 or len(output_text) < 50:
 								            return None
 								        if not success and not tool_chain:
 								            return None
 								        tool_chain_str = json.dumps(tool_chain[:5], ensure_ascii=False) if tool_chain else "无"
 								        prompt = KNOWLEDGE_EXTRACTION_PROMPT.format(
 								            input_text=input_text[:2000],
 								            output_text=output_text[:2000],
 								            success="是" if success else "否",
 								            tool_chain=tool_chain_str,
 								            iterations_used=iterations_used,
 								            tool_calls_made=tool_calls_made,
 								        )
 								        try:
 								            llm_output = self._sync_llm_call(prompt)
 								        except Exception as e:
 								            logger.warning("知识提取 LLM 调用失败: %s", e)
 								            return None
 								        try:
 								            json_start = llm_output.find("{")
 								            json_end = llm_output.rfind("}") + 1
 								            if json_start >= 0 and json_end > json_start:
 								                data = json.loads(llm_output[json_start:json_end])
 								            else:
 								                return None
 								        except json.JSONDecodeError:
 								            logger.warning("知识提取 JSON 解析失败: %s", llm_output[:200])
 								            return None
 								        if data.get("skip"):
 								            return None
 								        confidence = float(data.get("confidence", 0.5))
 								        if confidence < 0.3:
 								            return None
 								        return {
 								            "title": data.get("title", "未命名知识"),
 								            "category": data.get("category", "insight"),
 								            "tags": data.get("tags", []),
 								            "situation": data.get("situation", ""),
 								            "solution": data.get("solution", ""),
 								            "caveats": data.get("caveats", ""),
 								            "confidence": confidence,
 								        }
 								    def run_extraction_pipeline(self, limit: int = 10) -> int:
 								        """运行提取管道：从未处理的执行日志中提取知识。"""
 								        from app.models.agent_execution_log import AgentExecutionLog
 								        from app.models.knowledge_entry import KnowledgeEntry
 								        db = None
 								        extracted = 0
 								        try:
 								            db = SessionLocal()
 								            logs = (
 								                db.query(AgentExecutionLog)
 								                .filter(
 								                    AgentExecutionLog.success == True,
 								                    AgentExecutionLog.knowledge_extracted == False,
 								                    AgentExecutionLog.output_text.isnot(None),
 								                )
 								                .order_by(desc(AgentExecutionLog.created_at))
 								                .limit(limit)
 								                .all()
 								            )
 								            if not logs:
 								                logger.info("知识提取管道: 没有待处理的新日志")
 								                return 0
 								            logger.info("知识提取管道: 找到 %d 条待处理日志", len(logs))
 								            for log_entry in logs:
 								                log_data = {
 								                    "input_text": log_entry.input_text,
 								                    "output_text": log_entry.output_text,
 								                    "success": log_entry.success,
 								                    "tool_chain": log_entry.tool_chain,
 								                    "iterations_used": log_entry.iterations_used,
 								                    "tool_calls_made": log_entry.tool_calls_made,
 								                }
 								                knowledge = self.extract_from_execution(log_data)
 								                if not knowledge:
 								                    log_entry.knowledge_extracted = True
 								                    db.commit()
 								                    continue
 								                existing = (
 								                    db.query(KnowledgeEntry)
 								                    .filter(KnowledgeEntry.title == knowledge["title"])
 								                    .first()
 								                )
 								                if existing:
 								                    log_entry.knowledge_extracted = True
 								                    db.commit()
 								                    continue
 								                embedding_text = (
 								                    knowledge["title"] + "\n" +
 								                    knowledge["situation"] + "\n" +
 								                    knowledge["solution"] + "\n" +
 								                    knowledge["caveats"]
 								                )
 								                entry = KnowledgeEntry(
 								                    title=knowledge["title"],
 								                    category=knowledge["category"],
 								                    tags=knowledge["tags"],
 								                    situation=knowledge["situation"],
 								                    solution=knowledge["solution"],
 								                    caveats=knowledge["caveats"],
 								                    source_execution_ids=[str(log_entry.id)],
 								                    source_agent_name=log_entry.agent_name,
 								                    source_model=log_entry.model,
 								                    embedding_text=embedding_text,
 								                    extracted_by="llm_auto",
 								                    confidence=knowledge["confidence"],
 								                )
 								                db.add(entry)
 								                log_entry.knowledge_extracted = True
 								                db.commit()
 								                extracted += 1
 								                logger.info(
 								                    "知识提取: %s -> %s (conf=%.2f)",
 								                    log_entry.agent_name, knowledge["title"], knowledge["confidence"],
 								                )
 								            return extracted
 								        except Exception as e:
 								            logger.error("知识提取管道异常: %s", e)
 								            if db:
 								                try:
 								                    db.rollback()
 								                except Exception:
 								                    pass
 								            return 0
 								        finally:
 								            if db:
 								                try:
 								                    db.close()
 								                except Exception:
 								                    pass
 								knowledge_extractor = KnowledgeExtractor()