feat: Phase 1 - output quality verification + node-level auto-retry
- Add enterprise_review tool (35th builtin) for LLM-based quality assessment
- Add evaluator workflow node type for quality gating in DAG
- Add AgentRuntime built-in self-review with auto-correction loop
- Rewrite error_handler node from stub to real retry mechanism
- Add engine-level per-node retry with configurable max_retries/delay/on_exhausted
- Add AgentExtension model for extension tracking
- Enhance validation in agent_create_tool and tool_register_tool
- Update 全能助手 system prompt with self-evolution workflow
- Docs: 缺失能力.md and 解决缺失能力计划.md

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -134,6 +134,7 @@ class AgentRuntime:
|
||||
tool_schemas = self.tool_manager.get_tool_schemas()
|
||||
has_tools = self.tool_manager.has_tools()
|
||||
steps: List[AgentStep] = []
|
||||
_self_review_attempted = False # 防止无限修正循环
|
||||
|
||||
# 构建 LLM 调用回调(包装 on_llm_call,补充上下文)
|
||||
llm_callback_ctx = {"step_type": "think", "tool_name": None}
|
||||
@@ -221,6 +222,33 @@ class AgentRuntime:
|
||||
# LLM 直接返回文本 → 结束
|
||||
self.context.add_assistant_message(content)
|
||||
final_text = content or "(模型未返回有效内容)"
|
||||
|
||||
# 输出质量自检(默认关闭,Agent 节点可开启)
|
||||
if self.config.self_review_enabled and not _self_review_attempted:
|
||||
review = await self._self_review(final_text, task_context=user_input)
|
||||
steps.append(AgentStep(
|
||||
iteration=self.context.iteration,
|
||||
type="tool_result",
|
||||
content=f"self_review: score={review['score']:.2f} passed={review['passed']}",
|
||||
tool_name="self_review",
|
||||
tool_input={"content": final_text[:200]},
|
||||
tool_result=json.dumps(review, ensure_ascii=False),
|
||||
))
|
||||
if review["passed"]:
|
||||
logger.info("self_review 通过 (%.2f >= %.2f)", review["score"], review["threshold"])
|
||||
else:
|
||||
logger.info("self_review 未通过 (%.2f < %.2f),追加修正", review["score"], review["threshold"])
|
||||
_self_review_attempted = True
|
||||
# 追加修正提示
|
||||
fix_prompt = (
|
||||
f"你的上一个回答未通过质量检查(评分 {review['score']:.1f}/{review['threshold']})。\n"
|
||||
f"问题:{';'.join(review['issues'][:3])}\n"
|
||||
f"改进建议:{';'.join(review['suggestions'][:3])}\n"
|
||||
"请修正你的回答,确保满足上述建议。"
|
||||
)
|
||||
self.context.add_user_message(fix_prompt)
|
||||
continue # 回到 ReAct 循环,让 LLM 修正
|
||||
|
||||
steps.append(AgentStep(
|
||||
iteration=self.context.iteration,
|
||||
type="final",
|
||||
@@ -383,6 +411,7 @@ class AgentRuntime:
|
||||
tool_schemas = self.tool_manager.get_tool_schemas()
|
||||
has_tools = self.tool_manager.has_tools()
|
||||
steps: List[AgentStep] = []
|
||||
_self_review_attempted = False
|
||||
|
||||
llm_callback_ctx = {"step_type": "think", "tool_name": None}
|
||||
|
||||
@@ -458,6 +487,37 @@ class AgentRuntime:
|
||||
# LLM 直接返回文本 → 结束
|
||||
self.context.add_assistant_message(content)
|
||||
final_text = content or "(模型未返回有效内容)"
|
||||
|
||||
# 输出质量自检(默认关闭)
|
||||
if self.config.self_review_enabled and not _self_review_attempted:
|
||||
review = await self._self_review(final_text, task_context=user_input)
|
||||
yield {
|
||||
"type": "tool_result",
|
||||
"content": f"self_review: score={review['score']:.2f} passed={review['passed']}",
|
||||
"tool_name": "self_review",
|
||||
"iteration": self.context.iteration,
|
||||
"session_id": self.context.session_id,
|
||||
}
|
||||
if review["passed"]:
|
||||
logger.info("self_review 通过 (%.2f >= %.2f)", review["score"], review["threshold"])
|
||||
else:
|
||||
logger.info("self_review 未通过 (%.2f < %.2f),追加修正", review["score"], review["threshold"])
|
||||
_self_review_attempted = True
|
||||
yield {
|
||||
"type": "think",
|
||||
"content": f"自检未通过({review['score']:.1f}),正在修正:{';'.join(review['suggestions'][:2])}",
|
||||
"iteration": self.context.iteration,
|
||||
"session_id": self.context.session_id,
|
||||
}
|
||||
fix_prompt = (
|
||||
f"你的上一个回答未通过质量检查(评分 {review['score']:.1f}/{review['threshold']})。\n"
|
||||
f"问题:{';'.join(review['issues'][:3])}\n"
|
||||
f"改进建议:{';'.join(review['suggestions'][:3])}\n"
|
||||
"请修正你的回答,确保满足上述建议。"
|
||||
)
|
||||
self.context.add_user_message(fix_prompt)
|
||||
continue # 回到 ReAct 循环,让 LLM 修正
|
||||
|
||||
yield {
|
||||
"type": "final",
|
||||
"content": final_text,
|
||||
@@ -663,6 +723,88 @@ class AgentRuntime:
|
||||
if db:
|
||||
db.close()
|
||||
|
||||
async def _self_review(self, content: str, task_context: str = "") -> dict:
    """Grade a final answer with a lightweight judge LLM call.

    A separate LLM client is built (model fixed to "deepseek-v4-flash",
    temperature 0.1, 800 max tokens, 30 s timeout) that reuses the provider,
    API key and base URL of the agent's own LLM config. The judge is asked to
    return a JSON verdict; only its ``score`` is trusted -- ``passed`` is
    recomputed locally against the configured threshold.

    Args:
        content: The answer text to review (only the first 8000 characters
            are sent to the judge).
        task_context: Optional task description given to the judge as
            background (only the first 2000 characters are sent).

    Returns:
        A dict that always contains ``score``, ``passed``, ``threshold``,
        ``issues`` (list of str), ``suggestions`` (list of str) and
        ``summary``. When the review itself fails, the result is a
        best-effort pass (``passed=True``, ``score=0.5``) with an extra
        ``error`` key holding the exception text.

    Raises:
        Nothing is propagated from the review itself: every exception is
        logged as a warning and converted into the best-effort pass described
        above.
    """
    # Resolve the threshold before anything can fail so that the error
    # fallback can report it too (callers index review["threshold"]).
    # NOTE(review): the enable flag is read from self.config by the callers,
    # while the threshold was read from self.config.llm -- both locations are
    # checked here; confirm which one the schema actually defines.
    threshold = getattr(getattr(self.config, "llm", None), "self_review_threshold", None)
    if threshold is None:
        threshold = getattr(self.config, "self_review_threshold", None)
    if threshold is None:
        threshold = 0.6  # same pass mark the judge prompt announces

    def _as_text_list(value) -> list:
        """Coerce a judge-supplied field into a list of strings."""
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value else []
        if isinstance(value, (list, tuple)):
            return [item if isinstance(item, str) else str(item) for item in value]
        return [str(value)]

    criteria = (
        "回答必须准确、完整、切题。"
        "包含具体可执行的步骤或代码示例。"
        "无明显事实错误或遗漏。"
        "格式清晰,便于阅读。"
    )
    try:
        from app.agent_runtime.core import _LLMClient
        from app.agent_runtime.schemas import AgentLLMConfig

        review_config = AgentLLMConfig(
            provider=getattr(self.config.llm, 'provider', 'deepseek'),
            model="deepseek-v4-flash",
            temperature=0.1,
            max_tokens=800,
            request_timeout=30.0,
        )
        # Reuse the agent's credentials / endpoint when they are set.
        if self.config.llm.api_key:
            review_config.api_key = self.config.llm.api_key
        if self.config.llm.base_url:
            review_config.base_url = self.config.llm.base_url

        client = _LLMClient(review_config)

        judge_prompt = (
            "你是严格的内容质量评审专家。请根据以下标准对内容进行评分。\n\n"
            f"【评判标准】\n{criteria}\n\n"
            f"【待评审内容】\n{content[:8000]}\n"
        )
        if task_context:
            judge_prompt += f"\n【任务背景】\n{task_context[:2000]}\n"

        judge_prompt += (
            "\n请以 JSON 格式返回评审结果(严格只返回 JSON,不要任何其他文字):\n"
            '{"score": 0.75, "passed": true, "issues": ["问题1"], '
            '"suggestions": ["建议1"], "summary": "一句话总结"}\n\n'
            "评分规则:1.0完美 0.8良好 0.6基本满足 0.4大部分未满足 0.2完全不满足\n"
            "score >= 0.6 时 passed=true,否则 passed=false\n"
        )

        messages = [{"role": "user", "content": judge_prompt}]
        resp = await client.chat(messages=messages, tools=None, iteration=0)
        # The response may be an object with .content, a dict, or something
        # else entirely; fall back to its string form as a last resort.
        judge_text = getattr(resp, 'content', '') or (
            resp.get('content', '') if isinstance(resp, dict) else str(resp)
        )

        # Verdict used whenever the judge output cannot be understood.
        unparsed = {"score": 0.5, "passed": False, "issues": ["无法解析评审结果"], "suggestions": [], "summary": ""}

        # First attempt: the whole reply (minus an optional ``` fence) is JSON.
        judge_clean = judge_text.strip()
        if judge_clean.startswith("```"):
            lines = judge_clean.split("\n")
            judge_clean = "\n".join(lines[1:-1] if lines[-1].strip() == "```" else lines[1:])
        try:
            result = json.loads(judge_clean)
        except json.JSONDecodeError:
            result = None

        # Second attempt: pull the first flat {... "score": N ...} object out
        # of surrounding prose. Also taken when the reply was valid JSON but
        # not an object (e.g. a bare list), which has no .get().
        if not isinstance(result, dict):
            import re as _sr_re
            result = None
            m = _sr_re.search(r'\{[^{}]*"score"\s*:\s*[\d.]+[^{}]*\}', judge_text, _sr_re.DOTALL)
            if m:
                try:
                    result = json.loads(m.group())
                except json.JSONDecodeError:
                    result = None
        if not isinstance(result, dict):
            result = unparsed

        score = float(result.get("score", 0.5))
        # The judge's own "passed" flag is ignored; the configured threshold
        # is authoritative.
        passed = score >= threshold

        return {
            "score": score,
            "passed": passed,
            "threshold": threshold,
            # Callers slice these and ';'.join() them, so they must be
            # lists of strings.
            "issues": _as_text_list(result.get("issues", [])),
            "suggestions": _as_text_list(result.get("suggestions", [])),
            "summary": result.get("summary", ""),
        }
    except Exception as e:
        # Best-effort: a broken reviewer must never block the answer. The
        # fallback carries the same keys as a normal result because callers
        # read review["threshold"] on the passed branch.
        logger.warning("self_review 执行失败: %s", e)
        return {
            "score": 0.5,
            "passed": True,
            "threshold": threshold,
            "issues": [],
            "suggestions": [],
            "summary": "",
            "error": str(e),
        }
|
||||
|
||||
@staticmethod
|
||||
def _extract_tool_calls(response: Any) -> List[Dict[str, Any]]:
|
||||
"""从 LLM 响应中提取工具调用列表。"""
|
||||
|
||||
Reference in New Issue
Block a user