feat: Phase 4 - LLM/Agent fallback chain, cross-agent knowledge sharing, async agent execution

- 4.1 Fallback chain: LLM fallback_llm config in AgentLLMConfig, retry with alternate model on API failure; Agent fallback_agent in DAG nodes
- 4.2 Knowledge sharing: GlobalKnowledge model with embedding-based semantic search, auto-extraction of tool names as tags after execution
- 4.3 Async execution: execute_agent_task fully implemented with AgentRuntime, scheduler dual-path for workflow/non-workflow agents

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-05 00:27:54 +08:00
parent 7e00b027d4
commit 592bca4f39
7 changed files with 461 additions and 70 deletions
--- a/backend/app/services/workflow_engine.py
+++ b/backend/app/services/workflow_engine.py
@@ -1956,6 +1956,38 @@ class WorkflowEngine:
                        )
                    return result
                except Exception as e:
+                    # fallback_agent 降级：主 Agent 失败时尝试备用 Agent
+                    node_data = node.get("data", {}) or {}
+                    fallback_agent_id = node_data.get("fallback_agent", "")
+                    if fallback_agent_id and str(fallback_agent_id) != str(node_data.get("agent_id", "")):
+                        if self.logger:
+                            self.logger.warn(
+                                "Agent 节点 %s 失败，降级到 fallback_agent: %s",
+                                node_id, fallback_agent_id,
+                            )
+                        try:
+                            fb_node_data = {**node_data, "agent_id": fallback_agent_id}
+                            fb_node_data.pop("fallback_agent", None)
+                            result = await run_agent_node(
+                                node_data=fb_node_data,
+                                input_data=input_data,
+                                execution_logger=self.logger,
+                                user_id=self.trusted_model_config_user_id,
+                                on_tool_executed=_agent_on_tool,
+                                on_llm_invocation=_on_agent_llm,
+                                budget_limits={
+                                    "max_llm_invocations": self._cap_llm,
+                                    "max_tool_calls": self._cap_tool,
+                                },
+                            )
+                            if self.logger:
+                                self.logger.info("fallback_agent %s 执行成功", fallback_agent_id)
+                            return result
+                        except Exception as fb_e:
+                            if self.logger:
+                                self.logger.error("fallback_agent %s 也失败: %s", fallback_agent_id, fb_e)
+                            logger.error(f"fallback_agent 执行失败: {fb_e}", exc_info=True)
+
                    if self.logger:
                        duration = int((time.time() - start_time) * 1000)
                        self.logger.log_node_error(node_id, node_type, e, duration)