feat: Phase 1 - output quality verification + node-level auto-retry

- Add enterprise_review tool (35th builtin) for LLM-based quality assessment
- Add evaluator workflow node type for quality gating in DAG
- Add AgentRuntime built-in self-review with auto-correction loop
- Rewrite error_handler node from stub to real retry mechanism
- Add engine-level per-node retry with configurable max_retries/delay/on_exhausted
- Add AgentExtension model for extension tracking
- Enhance validation in agent_create_tool and tool_register_tool
- Update 全能助手 system prompt with self-evolution workflow
- Docs: 缺失能力.md and 解决缺失能力计划.md

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-04 22:05:28 +08:00
parent 244ed31274
commit b8b01254ca
13 changed files with 1821 additions and 60 deletions
--- a/backend/app/agent_runtime/core.py
+++ b/backend/app/agent_runtime/core.py
@@ -134,6 +134,7 @@ class AgentRuntime:
        tool_schemas = self.tool_manager.get_tool_schemas()
        has_tools = self.tool_manager.has_tools()
        steps: List[AgentStep] = []
+        _self_review_attempted = False  # 防止无限修正循环

        # 构建 LLM 调用回调（包装 on_llm_call，补充上下文）
        llm_callback_ctx = {"step_type": "think", "tool_name": None}
@@ -221,6 +222,33 @@ class AgentRuntime:
                # LLM 直接返回文本 → 结束
                self.context.add_assistant_message(content)
                final_text = content or "（模型未返回有效内容）"
+
+                # 输出质量自检（默认关闭，Agent 节点可开启）
+                if self.config.self_review_enabled and not _self_review_attempted:
+                    review = await self._self_review(final_text, task_context=user_input)
+                    steps.append(AgentStep(
+                        iteration=self.context.iteration,
+                        type="tool_result",
+                        content=f"self_review: score={review['score']:.2f} passed={review['passed']}",
+                        tool_name="self_review",
+                        tool_input={"content": final_text[:200]},
+                        tool_result=json.dumps(review, ensure_ascii=False),
+                    ))
+                    if review["passed"]:
+                        logger.info("self_review 通过 (%.2f >= %.2f)", review["score"], review["threshold"])
+                    else:
+                        logger.info("self_review 未通过 (%.2f < %.2f)，追加修正", review["score"], review["threshold"])
+                        _self_review_attempted = True
+                        # 追加修正提示
+                        fix_prompt = (
+                            f"你的上一个回答未通过质量检查（评分 {review['score']:.1f}/{review['threshold']}）。\n"
+                            f"问题：{'；'.join(review['issues'][:3])}\n"
+                            f"改进建议：{'；'.join(review['suggestions'][:3])}\n"
+                            "请修正你的回答，确保满足上述建议。"
+                        )
+                        self.context.add_user_message(fix_prompt)
+                        continue  # 回到 ReAct 循环，让 LLM 修正
+
                steps.append(AgentStep(
                    iteration=self.context.iteration,
                    type="final",
@@ -383,6 +411,7 @@ class AgentRuntime:
        tool_schemas = self.tool_manager.get_tool_schemas()
        has_tools = self.tool_manager.has_tools()
        steps: List[AgentStep] = []
+        _self_review_attempted = False

        llm_callback_ctx = {"step_type": "think", "tool_name": None}

@@ -458,6 +487,37 @@ class AgentRuntime:
                # LLM 直接返回文本 → 结束
                self.context.add_assistant_message(content)
                final_text = content or "（模型未返回有效内容）"
+
+                # 输出质量自检（默认关闭）
+                if self.config.self_review_enabled and not _self_review_attempted:
+                    review = await self._self_review(final_text, task_context=user_input)
+                    yield {
+                        "type": "tool_result",
+                        "content": f"self_review: score={review['score']:.2f} passed={review['passed']}",
+                        "tool_name": "self_review",
+                        "iteration": self.context.iteration,
+                        "session_id": self.context.session_id,
+                    }
+                    if review["passed"]:
+                        logger.info("self_review 通过 (%.2f >= %.2f)", review["score"], review["threshold"])
+                    else:
+                        logger.info("self_review 未通过 (%.2f < %.2f)，追加修正", review["score"], review["threshold"])
+                        _self_review_attempted = True
+                        yield {
+                            "type": "think",
+                            "content": f"自检未通过（{review['score']:.1f}），正在修正：{'；'.join(review['suggestions'][:2])}",
+                            "iteration": self.context.iteration,
+                            "session_id": self.context.session_id,
+                        }
+                        fix_prompt = (
+                            f"你的上一个回答未通过质量检查（评分 {review['score']:.1f}/{review['threshold']}）。\n"
+                            f"问题：{'；'.join(review['issues'][:3])}\n"
+                            f"改进建议：{'；'.join(review['suggestions'][:3])}\n"
+                            "请修正你的回答，确保满足上述建议。"
+                        )
+                        self.context.add_user_message(fix_prompt)
+                        continue  # 回到 ReAct 循环，让 LLM 修正
+
                yield {
                    "type": "final",
                    "content": final_text,
@@ -663,6 +723,88 @@ class AgentRuntime:
            if db:
                db.close()

+    async def _self_review(self, content: str, task_context: str = "") -> dict:
+        """Judge the quality of an answer with a separate LLM call.
+
+        Builds a judging prompt from fixed criteria, ``content`` (truncated to
+        8000 characters) and, when given, ``task_context`` (truncated to 2000
+        characters), asks the review model for a JSON verdict, and compares
+        the returned score against the configured threshold.
+
+        Args:
+            content: The answer text to review.
+            task_context: Optional description of the task being answered.
+
+        Returns:
+            A dict that always contains ``score``, ``passed``, ``threshold``,
+            ``issues``, ``suggestions`` and ``summary``. ``issues`` and
+            ``suggestions`` are always lists of strings. ``error`` is added
+            when the review itself failed.
+
+        This method does not raise: any failure is logged and reported as a
+        passing result, so a broken reviewer never blocks the answer.
+        """
+        criteria = (
+            "回答必须准确、完整、切题。"
+            "包含具体可执行的步骤或代码示例。"
+            "无明显事实错误或遗漏。"
+            "格式清晰，便于阅读。"
+        )
+        # Fallback pass mark used when the configured threshold cannot be read;
+        # it mirrors the "score >= 0.6" rule stated in the judge prompt below.
+        threshold = 0.6
+        # Verdict used whenever the judge output cannot be parsed into a dict.
+        unparsed = {
+            "score": 0.5,
+            "passed": False,
+            "issues": ["无法解析评审结果"],
+            "suggestions": [],
+            "summary": "",
+        }
+
+        def _as_str_list(value) -> list:
+            # Callers slice these fields and "；".join() them, so they must be
+            # lists of str no matter what shape the judge model returned.
+            if isinstance(value, str):
+                return [value] if value else []
+            if isinstance(value, (list, tuple)):
+                return [str(item) for item in value]
+            return []
+
+        try:
+            # Read the threshold first so the failure fallback below can still
+            # report it even when the LLM call itself fails.
+            # NOTE(review): callers read self_review_enabled from self.config,
+            # while the threshold is read from self.config.llm — confirm.
+            threshold = float(self.config.llm.self_review_threshold)
+
+            from app.agent_runtime.core import _LLMClient
+            from app.agent_runtime.schemas import AgentLLMConfig
+
+            review_config = AgentLLMConfig(
+                provider=getattr(self.config.llm, 'provider', 'deepseek'),
+                model="deepseek-v4-flash",
+                temperature=0.1,
+                max_tokens=800,
+                request_timeout=30.0,
+            )
+            # Reuse the agent's own credentials/endpoint when they are set.
+            if self.config.llm.api_key:
+                review_config.api_key = self.config.llm.api_key
+            if self.config.llm.base_url:
+                review_config.base_url = self.config.llm.base_url
+
+            client = _LLMClient(review_config)
+
+            judge_prompt = (
+                "你是严格的内容质量评审专家。请根据以下标准对内容进行评分。\n\n"
+                f"【评判标准】\n{criteria}\n\n"
+                f"【待评审内容】\n{content[:8000]}\n"
+            )
+            if task_context:
+                judge_prompt += f"\n【任务背景】\n{task_context[:2000]}\n"
+
+            judge_prompt += (
+                "\n请以 JSON 格式返回评审结果（严格只返回 JSON，不要任何其他文字）：\n"
+                '{"score": 0.75, "passed": true, "issues": ["问题1"], '
+                '"suggestions": ["建议1"], "summary": "一句话总结"}\n\n'
+                "评分规则：1.0完美 0.8良好 0.6基本满足 0.4大部分未满足 0.2完全不满足\n"
+                "score >= 0.6 时 passed=true，否则 passed=false\n"
+            )
+
+            messages = [{"role": "user", "content": judge_prompt}]
+            resp = await client.chat(messages=messages, tools=None, iteration=0)
+            # The response may be an object with .content, a dict, or something else.
+            judge_text = getattr(resp, 'content', '') or (
+                resp.get('content', '') if isinstance(resp, dict) else str(resp)
+            )
+
+            # Strip a surrounding Markdown code fence, if any, before parsing.
+            judge_clean = judge_text.strip()
+            if judge_clean.startswith("```"):
+                lines = judge_clean.split("\n")
+                judge_clean = "\n".join(lines[1:-1] if lines[-1].strip() == "```" else lines[1:])
+            try:
+                result = json.loads(judge_clean)
+            except json.JSONDecodeError:
+                # Fall back to the first flat {...} object that mentions "score".
+                import re as _sr_re
+                m = _sr_re.search(r'\{[^{}]*"score"\s*:\s*[\d.]+[^{}]*\}', judge_text)
+                try:
+                    result = json.loads(m.group()) if m else None
+                except json.JSONDecodeError:
+                    result = None
+            # Valid JSON that is not an object (list, number, ...) is unusable too.
+            if not isinstance(result, dict):
+                result = unparsed
+
+            score = float(result.get("score", 0.5))
+            # The model's own "passed" flag is ignored; the configured threshold decides.
+            passed = score >= threshold
+
+            return {
+                "score": score,
+                "passed": passed,
+                "threshold": threshold,
+                "issues": _as_str_list(result.get("issues", [])),
+                "suggestions": _as_str_list(result.get("suggestions", [])),
+                "summary": result.get("summary", ""),
+            }
+        except Exception as e:
+            # Best effort: a failing reviewer must not fail the agent run. The
+            # fallback carries the same keys as a normal result because callers
+            # index review["threshold"] unconditionally.
+            logger.warning("self_review 执行失败: %s", e)
+            return {
+                "score": 0.5,
+                "passed": True,
+                "threshold": threshold,
+                "issues": [],
+                "suggestions": [],
+                "summary": "",
+                "error": str(e),
+            }
+
    @staticmethod
    def _extract_tool_calls(response: Any) -> List[Dict[str, Any]]:
        """从 LLM 响应中提取工具调用列表。"""