feat: Phase 1 - output quality verification + node-level auto-retry

- Add enterprise_review tool (35th builtin) for LLM-based quality assessment
- Add evaluator workflow node type for quality gating in DAG
- Add AgentRuntime built-in self-review with auto-correction loop
- Rewrite error_handler node from stub to real retry mechanism
- Add engine-level per-node retry with configurable max_retries/delay/on_exhausted
- Add AgentExtension model for extension tracking
- Enhance validation in agent_create_tool and tool_register_tool
- Update 全能助手 system prompt with self-evolution workflow
- Docs: 缺失能力.md and 解决缺失能力计划.md

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
renjianbo
2026-05-04 22:05:28 +08:00
parent 244ed31274
commit b8b01254ca
13 changed files with 1821 additions and 60 deletions

View File

@@ -134,6 +134,7 @@ class AgentRuntime:
tool_schemas = self.tool_manager.get_tool_schemas()
has_tools = self.tool_manager.has_tools()
steps: List[AgentStep] = []
_self_review_attempted = False # 防止无限修正循环
# 构建 LLM 调用回调(包装 on_llm_call,补充上下文)
llm_callback_ctx = {"step_type": "think", "tool_name": None}
@@ -221,6 +222,33 @@ class AgentRuntime:
# LLM 直接返回文本 → 结束
self.context.add_assistant_message(content)
final_text = content or "(模型未返回有效内容)"
# 输出质量自检(默认关闭,Agent 节点可开启)
if self.config.self_review_enabled and not _self_review_attempted:
review = await self._self_review(final_text, task_context=user_input)
steps.append(AgentStep(
iteration=self.context.iteration,
type="tool_result",
content=f"self_review: score={review['score']:.2f} passed={review['passed']}",
tool_name="self_review",
tool_input={"content": final_text[:200]},
tool_result=json.dumps(review, ensure_ascii=False),
))
if review["passed"]:
logger.info("self_review 通过 (%.2f >= %.2f)", review["score"], review["threshold"])
else:
logger.info("self_review 未通过 (%.2f < %.2f),追加修正", review["score"], review["threshold"])
_self_review_attempted = True
# 追加修正提示
fix_prompt = (
f"你的上一个回答未通过质量检查(评分 {review['score']:.1f}/{review['threshold']})。\n"
f"问题:{''.join(review['issues'][:3])}\n"
f"改进建议:{''.join(review['suggestions'][:3])}\n"
"请修正你的回答,确保满足上述建议。"
)
self.context.add_user_message(fix_prompt)
continue # 回到 ReAct 循环,让 LLM 修正
steps.append(AgentStep(
iteration=self.context.iteration,
type="final",
@@ -383,6 +411,7 @@ class AgentRuntime:
tool_schemas = self.tool_manager.get_tool_schemas()
has_tools = self.tool_manager.has_tools()
steps: List[AgentStep] = []
_self_review_attempted = False
llm_callback_ctx = {"step_type": "think", "tool_name": None}
@@ -458,6 +487,37 @@ class AgentRuntime:
# LLM 直接返回文本 → 结束
self.context.add_assistant_message(content)
final_text = content or "(模型未返回有效内容)"
# 输出质量自检(默认关闭)
if self.config.self_review_enabled and not _self_review_attempted:
review = await self._self_review(final_text, task_context=user_input)
yield {
"type": "tool_result",
"content": f"self_review: score={review['score']:.2f} passed={review['passed']}",
"tool_name": "self_review",
"iteration": self.context.iteration,
"session_id": self.context.session_id,
}
if review["passed"]:
logger.info("self_review 通过 (%.2f >= %.2f)", review["score"], review["threshold"])
else:
logger.info("self_review 未通过 (%.2f < %.2f),追加修正", review["score"], review["threshold"])
_self_review_attempted = True
yield {
"type": "think",
"content": f"自检未通过({review['score']:.1f}),正在修正:{''.join(review['suggestions'][:2])}",
"iteration": self.context.iteration,
"session_id": self.context.session_id,
}
fix_prompt = (
f"你的上一个回答未通过质量检查(评分 {review['score']:.1f}/{review['threshold']})。\n"
f"问题:{''.join(review['issues'][:3])}\n"
f"改进建议:{''.join(review['suggestions'][:3])}\n"
"请修正你的回答,确保满足上述建议。"
)
self.context.add_user_message(fix_prompt)
continue # 回到 ReAct 循环,让 LLM 修正
yield {
"type": "final",
"content": final_text,
@@ -663,6 +723,88 @@ class AgentRuntime:
if db:
db.close()
async def _self_review(self, content: str, task_context: str = "") -> dict:
    """Ask a lightweight LLM judge to grade an answer and report the verdict.

    The judge is prompted to reply with a JSON object; the pass/fail decision
    is recomputed locally as ``score >= threshold`` rather than trusting the
    judge's own ``passed`` field.

    Args:
        content: The answer to be reviewed (only the first 8000 characters
            are sent to the judge).
        task_context: Optional description of the original task (only the
            first 2000 characters are sent).

    Returns:
        A dict that ALWAYS contains ``score`` (float), ``passed`` (bool),
        ``threshold`` (float), ``issues`` (list of str), ``suggestions``
        (list of str) and ``summary`` (str). When the review itself could
        not be carried out the dict additionally contains ``error`` and
        ``passed`` is ``True`` (fail-open), so a broken reviewer never
        blocks the answer.

    Never raises: every exception is logged and converted to the fail-open
    result described above.
    """
    # Local import: the original block did not rely on a file-level ``re``.
    import re

    # Single definition of the verdict used whenever the judge's reply
    # cannot be interpreted.
    parse_failure = {
        "score": 0.5,
        "passed": False,
        "issues": ["无法解析评审结果"],
        "suggestions": [],
        "summary": "",
    }

    def _as_text_list(value) -> list:
        """Coerce a judge-supplied field into a list of strings.

        Callers slice these lists and ``''.join`` them, so a bare string
        (which would be sliced per character) or non-string items (which
        would make ``join`` raise) must be normalised here.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value else []
        if isinstance(value, (list, tuple)):
            return [str(item) for item in value if item is not None]
        return [str(value)]

    def _parse_verdict(raw_text: str) -> dict:
        """Extract the verdict object from the judge's raw reply."""
        cleaned = raw_text.strip()
        if cleaned.startswith("```"):
            # Drop the opening fence line and, if present, the closing one.
            fence_lines = cleaned.split("\n")
            if fence_lines[-1].strip() == "```":
                fence_lines = fence_lines[1:-1]
            else:
                fence_lines = fence_lines[1:]
            cleaned = "\n".join(fence_lines)
        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError:
            # Fall back to the first flat {...} object that mentions "score".
            match = re.search(
                r'\{[^{}]*"score"\s*:\s*[\d.]+[^{}]*\}', raw_text, re.DOTALL
            )
            if not match:
                return dict(parse_failure)
            try:
                parsed = json.loads(match.group())
            except json.JSONDecodeError:
                return dict(parse_failure)
        # Valid JSON that is not an object (e.g. a list) is not a verdict.
        if not isinstance(parsed, dict):
            return dict(parse_failure)
        return parsed

    criteria = (
        "回答必须准确、完整、切题。"
        "包含具体可执行的步骤或代码示例。"
        "无明显事实错误或遗漏。"
        "格式清晰,便于阅读。"
    )

    # Used only if the configured threshold cannot be read; mirrors the
    # pass mark stated in the judge prompt below ("score >= 0.6").
    threshold = 0.6
    try:
        from app.agent_runtime.core import _LLMClient
        from app.agent_runtime.schemas import AgentLLMConfig

        # NOTE(review): the enable flag is read from ``self.config`` by the
        # callers while the threshold is read from ``self.config.llm`` —
        # confirm this is the intended location.
        threshold = float(self.config.llm.self_review_threshold)

        # NOTE(review): the model name is hard-coded while the provider is
        # taken from the agent's config — confirm this pairing is valid for
        # non-default providers.
        review_config = AgentLLMConfig(
            provider=getattr(self.config.llm, 'provider', 'deepseek'),
            model="deepseek-v4-flash",
            temperature=0.1,
            max_tokens=800,
            request_timeout=30.0,
        )
        # Reuse the agent's credentials / endpoint when they are set.
        if self.config.llm.api_key:
            review_config.api_key = self.config.llm.api_key
        if self.config.llm.base_url:
            review_config.base_url = self.config.llm.base_url
        client = _LLMClient(review_config)

        judge_prompt = (
            "你是严格的内容质量评审专家。请根据以下标准对内容进行评分。\n\n"
            f"【评判标准】\n{criteria}\n\n"
            f"【待评审内容】\n{content[:8000]}\n"
        )
        if task_context:
            judge_prompt += f"\n【任务背景】\n{task_context[:2000]}\n"
        judge_prompt += (
            "\n请以 JSON 格式返回评审结果(严格只返回 JSON不要任何其他文字\n"
            '{"score": 0.75, "passed": true, "issues": ["问题1"], '
            '"suggestions": ["建议1"], "summary": "一句话总结"}\n\n'
            "评分规则1.0完美 0.8良好 0.6基本满足 0.4大部分未满足 0.2完全不满足\n"
            "score >= 0.6 时 passed=true否则 passed=false\n"
        )

        messages = [{"role": "user", "content": judge_prompt}]
        resp = await client.chat(messages=messages, tools=None, iteration=0)
        # The response may be an object with ``.content``, a dict, or
        # something else entirely; fall back to its string form.
        judge_text = getattr(resp, 'content', '') or (
            resp.get('content', '') if isinstance(resp, dict) else str(resp)
        )

        verdict = _parse_verdict(judge_text)

        # A missing, null or non-numeric score counts as the neutral 0.5
        # instead of aborting the whole review.
        try:
            score = float(verdict.get("score", 0.5))
        except (TypeError, ValueError):
            score = 0.5

        summary = verdict.get("summary", "")
        return {
            "score": score,
            "passed": score >= threshold,
            "threshold": threshold,
            "issues": _as_text_list(verdict.get("issues")),
            "suggestions": _as_text_list(verdict.get("suggestions")),
            "summary": summary if isinstance(summary, str) else str(summary),
        }
    except Exception as e:
        # Deliberate best-effort: a failing reviewer must not block the
        # answer. The result carries the same keys as the success path
        # because callers index ``threshold`` unconditionally.
        logger.warning("self_review 执行失败: %s", e)
        return {
            "score": 0.5,
            "passed": True,
            "threshold": threshold,
            "issues": [],
            "suggestions": [],
            "summary": "",
            "error": str(e),
        }
@staticmethod
def _extract_tool_calls(response: Any) -> List[Dict[str, Any]]:
"""从 LLM 响应中提取工具调用列表。"""