feat(graph_engine): Support pausing workflow graph executions (#26585)

Signed-off-by: -LAN- <laipz8200@outlook.com>
2025-10-19 21:33:41 +08:00
parent 9a5f214623
commit 578247ffbc
112 changed files with 3766 additions and 2415 deletions
--- a/api/core/app/apps/common/graph_runtime_state_support.py
+++ b/api/core/app/apps/common/graph_runtime_state_support.py
@@ -0,0 +1,71 @@
+"""Shared helpers for managing GraphRuntimeState across task pipelines."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from core.workflow.runtime import GraphRuntimeState
+
+if TYPE_CHECKING:
+    from core.app.task_pipeline.based_generate_task_pipeline import BasedGenerateTaskPipeline
+
+
+class GraphRuntimeStateSupport:
+    """
+    Mixin bundling the GraphRuntimeState lookups that task pipelines have in common.
+
+    Subclasses are expected to provide:
+      * `_base_task_pipeline` – its `queue_manager.graph_runtime_state` is the fallback source of the state.
+      * `_graph_runtime_state` – attribute used as the local cache for the runtime state.
+    """
+
+    _base_task_pipeline: BasedGenerateTaskPipeline
+    _graph_runtime_state: GraphRuntimeState | None = None
+
+    def _ensure_graph_runtime_initialized(
+        self,
+        graph_runtime_state: GraphRuntimeState | None = None,
+    ) -> GraphRuntimeState:
+        """Return the active runtime state (delegates to `_resolve_graph_runtime_state`), raising if absent."""
+        return self._resolve_graph_runtime_state(graph_runtime_state)
+
+    def _extract_workflow_run_id(self, graph_runtime_state: GraphRuntimeState) -> str:
+        """
+        Return the workflow execution id held in the state's system variables, as a string.
+
+        Raises:
+            ValueError: if the system variables or their `workflow_execution_id` are missing or empty.
+        """
+        system_variables = graph_runtime_state.variable_pool.system_variables
+        execution_id = system_variables.workflow_execution_id if system_variables else None
+        if not execution_id:
+            raise ValueError("workflow_execution_id missing from runtime state")
+        return str(execution_id)
+
+    def _resolve_graph_runtime_state(
+        self,
+        graph_runtime_state: GraphRuntimeState | None = None,
+    ) -> GraphRuntimeState:
+        """
+        Return the runtime state to use, caching it on the instance.
+
+        Lookup order: the explicit argument, then the local cache, then the queue manager.
+
+        Raises:
+            ValueError: if none of the three sources holds a runtime state.
+        """
+        if graph_runtime_state is not None:
+            # An explicitly supplied state always wins and replaces the cache.
+            self._graph_runtime_state = graph_runtime_state
+            return graph_runtime_state
+
+        cached = self._graph_runtime_state
+        if cached is not None:
+            return cached
+
+        candidate = self._base_task_pipeline.queue_manager.graph_runtime_state
+        if candidate is None:
+            raise ValueError("graph runtime state not initialized.")
+
+        self._graph_runtime_state = candidate
+        return candidate
--- a/api/core/app/apps/common/workflow_response_converter.py
+++ b/api/core/app/apps/common/workflow_response_converter.py
@@ -1,9 +1,8 @@
 import time
 from collections.abc import Mapping, Sequence
-from datetime import UTC, datetime
-from typing import Any, Union
-
-from sqlalchemy.orm import Session
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, NewType, Union

 from core.app.entities.app_invoke_entities import AdvancedChatAppGenerateEntity, WorkflowAppGenerateEntity
 from core.app.entities.queue_entities import (
@@ -39,16 +38,36 @@ from core.plugin.impl.datasource import PluginDatasourceManager
 from core.tools.entities.tool_entities import ToolProviderType
 from core.tools.tool_manager import ToolManager
 from core.variables.segments import ArrayFileSegment, FileSegment, Segment
-from core.workflow.entities import WorkflowExecution, WorkflowNodeExecution
-from core.workflow.enums import NodeType, WorkflowNodeExecutionStatus
+from core.workflow.enums import (
+    NodeType,
+    SystemVariableKey,
+    WorkflowExecutionStatus,
+    WorkflowNodeExecutionMetadataKey,
+    WorkflowNodeExecutionStatus,
+)
+from core.workflow.runtime import GraphRuntimeState
+from core.workflow.system_variable import SystemVariable
+from core.workflow.workflow_entry import WorkflowEntry
 from core.workflow.workflow_type_encoder import WorkflowRuntimeTypeConverter
 from libs.datetime_utils import naive_utc_now
-from models import (
-    Account,
-    EndUser,
-)
+from models import Account, EndUser
 from services.variable_truncator import VariableTruncator

+NodeExecutionId = NewType("NodeExecutionId", str)
+
+
+@dataclass(slots=True)
+class _NodeSnapshot:
+    """Node metadata captured from a node-start event and cached in memory for that node's later events."""
+
+    title: str  # copied from the start event's node_title
+    index: int  # copied from the start event's node_run_index
+    start_at: datetime  # copied from the start event's start_at
+    iteration_id: str = ""
+    """Empty string means the node is not executing inside an iteration."""
+    loop_id: str = ""
+    """Empty string means the node is not executing inside a loop."""
+

 class WorkflowResponseConverter:
    def __init__(
@@ -56,37 +75,151 @@ class WorkflowResponseConverter:
        *,
        application_generate_entity: Union[AdvancedChatAppGenerateEntity, WorkflowAppGenerateEntity],
        user: Union[Account, EndUser],
+        system_variables: SystemVariable,
    ):
        self._application_generate_entity = application_generate_entity
        self._user = user
+        self._system_variables = system_variables
+        self._workflow_inputs = self._prepare_workflow_inputs()
        self._truncator = VariableTruncator.default()
+        self._node_snapshots: dict[NodeExecutionId, _NodeSnapshot] = {}
+        self._workflow_execution_id: str | None = None
+        self._workflow_started_at: datetime | None = None
+
+    # ------------------------------------------------------------------
+    # Workflow lifecycle helpers
+    # ------------------------------------------------------------------
+    def _prepare_workflow_inputs(self) -> Mapping[str, Any]:
+        """Build the workflow inputs: user inputs plus system variables under `sys.<name>` keys."""
+        inputs = dict(self._application_generate_entity.inputs)
+        for field_name, value in self._system_variables.to_dict().items():
+            # TODO(@future-refactor): store system variables separately from user inputs so we don't
+            # need to flatten `sys.*` entries into the input payload just for rerun/export tooling.
+            if field_name == SystemVariableKey.CONVERSATION_ID:
+                # Conversation IDs are session-scoped; omit them so reruns are not pinned to a prior conversation.
+                continue
+            inputs[f"sys.{field_name}"] = value
+        handled = WorkflowEntry.handle_special_values(inputs)
+        return dict(handled or {})  # a falsy handler result becomes an empty dict
+
+    def _ensure_workflow_run_id(self, workflow_run_id: str | None = None) -> str:
+        """Return the memoized workflow run id, seeding it first when an id is passed; raise ValueError if unset."""
+        if workflow_run_id is not None:
+            self._workflow_execution_id = workflow_run_id
+        if not self._workflow_execution_id:  # None or empty string
+            raise ValueError("workflow_run_id missing before streaming workflow events")
+        return self._workflow_execution_id
+
+    # ------------------------------------------------------------------
+    # Node snapshot helpers
+    # ------------------------------------------------------------------
+    def _store_snapshot(self, event: QueueNodeStartedEvent) -> _NodeSnapshot:
+        """Build a snapshot from the start event, cache it under the node execution id, and return it."""
+        captured = _NodeSnapshot(
+            title=event.node_title,
+            index=event.node_run_index,
+            start_at=event.start_at,
+            iteration_id=event.in_iteration_id or "",
+            loop_id=event.in_loop_id or "",
+        )
+        self._node_snapshots[NodeExecutionId(event.node_execution_id)] = captured
+        return captured
+
+    def _get_snapshot(self, node_execution_id: str) -> _NodeSnapshot | None:
+        return self._node_snapshots.get(NodeExecutionId(node_execution_id))  # stays cached; None if never stored
+
+    def _pop_snapshot(self, node_execution_id: str) -> _NodeSnapshot | None:
+        return self._node_snapshots.pop(NodeExecutionId(node_execution_id), None)  # removes entry; None if absent
+
+    @staticmethod
+    def _merge_metadata(
+        base_metadata: Mapping[WorkflowNodeExecutionMetadataKey, Any] | None,
+        snapshot: _NodeSnapshot | None,
+    ) -> Mapping[WorkflowNodeExecutionMetadataKey, Any] | None:
+        """Overlay the snapshot's non-empty iteration/loop ids onto a copy of `base_metadata`."""
+        if not (base_metadata or snapshot):
+            # Nothing to merge: hand the falsy input (None or an empty mapping) back untouched.
+            return base_metadata
+
+        # Work on a fresh dict so the caller's mapping is never mutated.
+        merged: dict[WorkflowNodeExecutionMetadataKey, Any] = dict(base_metadata or {})
+        if snapshot and snapshot.iteration_id:
+            merged[WorkflowNodeExecutionMetadataKey.ITERATION_ID] = snapshot.iteration_id
+        if snapshot and snapshot.loop_id:
+            merged[WorkflowNodeExecutionMetadataKey.LOOP_ID] = snapshot.loop_id
+
+        # An empty merge result is reported as None rather than {}.
+        return merged or None
+
+    def _truncate_mapping(
+        self,
+        mapping: Mapping[str, Any] | None,
+    ) -> tuple[Mapping[str, Any] | None, bool]:
+        """Normalise special values in `mapping`, then truncate it; return `(mapping_or_None, was_truncated)`."""
+        if mapping is None:
+            return None, False
+        if not mapping:
+            return {}, False  # empty input: skip normalisation and truncation
+
+        normalized = WorkflowEntry.handle_special_values(dict(mapping))
+        if normalized is None:
+            return None, False
+        truncated, is_truncated = self._truncator.truncate_variable_mapping(dict(normalized))
+        return truncated, is_truncated
+
+    @staticmethod
+    def _encode_outputs(outputs: Mapping[str, Any] | None) -> Mapping[str, Any] | None:
+        """Convert `outputs` to a JSON-encodable mapping with WorkflowRuntimeTypeConverter; None passes through."""
+        if outputs is None:
+            return None
+        return WorkflowRuntimeTypeConverter().to_json_encodable(outputs)

    def workflow_start_to_stream_response(
        self,
        *,
        task_id: str,
-        workflow_execution: WorkflowExecution,
+        workflow_run_id: str,
+        workflow_id: str,
    ) -> WorkflowStartStreamResponse:
+        run_id = self._ensure_workflow_run_id(workflow_run_id)
+        started_at = naive_utc_now()
+        self._workflow_started_at = started_at
+
        return WorkflowStartStreamResponse(
            task_id=task_id,
-            workflow_run_id=workflow_execution.id_,
+            workflow_run_id=run_id,
            data=WorkflowStartStreamResponse.Data(
-                id=workflow_execution.id_,
-                workflow_id=workflow_execution.workflow_id,
-                inputs=workflow_execution.inputs,
-                created_at=int(workflow_execution.started_at.timestamp()),
+                id=run_id,
+                workflow_id=workflow_id,
+                inputs=self._workflow_inputs,
+                created_at=int(started_at.timestamp()),
            ),
        )

    def workflow_finish_to_stream_response(
        self,
        *,
-        session: Session,
        task_id: str,
-        workflow_execution: WorkflowExecution,
+        workflow_id: str,
+        status: WorkflowExecutionStatus,
+        graph_runtime_state: GraphRuntimeState,
+        error: str | None = None,
+        exceptions_count: int = 0,
    ) -> WorkflowFinishStreamResponse:
-        created_by = None
+        run_id = self._ensure_workflow_run_id()
+        started_at = self._workflow_started_at
+        if started_at is None:
+            raise ValueError(
+                "workflow_finish_to_stream_response called before workflow_start_to_stream_response",
+            )

+        finished_at = naive_utc_now()
+        elapsed_time = (finished_at - started_at).total_seconds()
+
+        outputs_mapping = graph_runtime_state.outputs or {}
+        encoded_outputs = WorkflowRuntimeTypeConverter().to_json_encodable(outputs_mapping)
+
+        created_by: Mapping[str, object] | None
        user = self._user
        if isinstance(user, Account):
            created_by = {
@@ -94,38 +227,29 @@ class WorkflowResponseConverter:
                "name": user.name,
                "email": user.email,
            }
-        elif isinstance(user, EndUser):
+        else:
            created_by = {
                "id": user.id,
                "user": user.session_id,
            }
-        else:
-            raise NotImplementedError(f"User type not supported: {type(user)}")
-
-        # Handle the case where finished_at is None by using current time as default
-        finished_at_timestamp = (
-            int(workflow_execution.finished_at.timestamp())
-            if workflow_execution.finished_at
-            else int(datetime.now(UTC).timestamp())
-        )

        return WorkflowFinishStreamResponse(
            task_id=task_id,
-            workflow_run_id=workflow_execution.id_,
+            workflow_run_id=run_id,
            data=WorkflowFinishStreamResponse.Data(
-                id=workflow_execution.id_,
-                workflow_id=workflow_execution.workflow_id,
-                status=workflow_execution.status,
-                outputs=WorkflowRuntimeTypeConverter().to_json_encodable(workflow_execution.outputs),
-                error=workflow_execution.error_message,
-                elapsed_time=workflow_execution.elapsed_time,
-                total_tokens=workflow_execution.total_tokens,
-                total_steps=workflow_execution.total_steps,
+                id=run_id,
+                workflow_id=workflow_id,
+                status=status.value,
+                outputs=encoded_outputs,
+                error=error,
+                elapsed_time=elapsed_time,
+                total_tokens=graph_runtime_state.total_tokens,
+                total_steps=graph_runtime_state.node_run_steps,
                created_by=created_by,
-                created_at=int(workflow_execution.started_at.timestamp()),
-                finished_at=finished_at_timestamp,
-                files=self.fetch_files_from_node_outputs(workflow_execution.outputs),
-                exceptions_count=workflow_execution.exceptions_count,
+                created_at=int(started_at.timestamp()),
+                finished_at=int(finished_at.timestamp()),
+                files=self.fetch_files_from_node_outputs(outputs_mapping),
+                exceptions_count=exceptions_count,
            ),
        )

@@ -134,38 +258,28 @@ class WorkflowResponseConverter:
        *,
        event: QueueNodeStartedEvent,
        task_id: str,
-        workflow_node_execution: WorkflowNodeExecution,
    ) -> NodeStartStreamResponse | None:
-        if workflow_node_execution.node_type in {NodeType.ITERATION, NodeType.LOOP}:
-            return None
-        if not workflow_node_execution.workflow_execution_id:
+        if event.node_type in {NodeType.ITERATION, NodeType.LOOP}:
            return None
+        run_id = self._ensure_workflow_run_id()
+        snapshot = self._store_snapshot(event)

        response = NodeStartStreamResponse(
            task_id=task_id,
-            workflow_run_id=workflow_node_execution.workflow_execution_id,
+            workflow_run_id=run_id,
            data=NodeStartStreamResponse.Data(
-                id=workflow_node_execution.id,
-                node_id=workflow_node_execution.node_id,
-                node_type=workflow_node_execution.node_type,
-                title=workflow_node_execution.title,
-                index=workflow_node_execution.index,
-                predecessor_node_id=workflow_node_execution.predecessor_node_id,
-                inputs=workflow_node_execution.get_response_inputs(),
-                inputs_truncated=workflow_node_execution.inputs_truncated,
-                created_at=int(workflow_node_execution.created_at.timestamp()),
-                parallel_id=event.parallel_id,
-                parallel_start_node_id=event.parallel_start_node_id,
-                parent_parallel_id=event.parent_parallel_id,
-                parent_parallel_start_node_id=event.parent_parallel_start_node_id,
+                id=event.node_execution_id,
+                node_id=event.node_id,
+                node_type=event.node_type,
+                title=snapshot.title,
+                index=snapshot.index,
+                created_at=int(snapshot.start_at.timestamp()),
                iteration_id=event.in_iteration_id,
                loop_id=event.in_loop_id,
-                parallel_run_id=event.parallel_mode_run_id,
                agent_strategy=event.agent_strategy,
            ),
        )

-        # extras logic
        if event.node_type == NodeType.TOOL:
            response.data.extras["icon"] = ToolManager.get_tool_icon(
                tenant_id=self._application_generate_entity.app_config.tenant_id,
@@ -189,41 +303,54 @@ class WorkflowResponseConverter:
        *,
        event: QueueNodeSucceededEvent | QueueNodeFailedEvent | QueueNodeExceptionEvent,
        task_id: str,
-        workflow_node_execution: WorkflowNodeExecution,
    ) -> NodeFinishStreamResponse | None:
-        if workflow_node_execution.node_type in {NodeType.ITERATION, NodeType.LOOP}:
-            return None
-        if not workflow_node_execution.workflow_execution_id:
-            return None
-        if not workflow_node_execution.finished_at:
+        if event.node_type in {NodeType.ITERATION, NodeType.LOOP}:
            return None
+        run_id = self._ensure_workflow_run_id()
+        snapshot = self._pop_snapshot(event.node_execution_id)

-        json_converter = WorkflowRuntimeTypeConverter()
+        start_at = snapshot.start_at if snapshot else event.start_at
+        finished_at = naive_utc_now()
+        elapsed_time = (finished_at - start_at).total_seconds()
+
+        inputs, inputs_truncated = self._truncate_mapping(event.inputs)
+        process_data, process_data_truncated = self._truncate_mapping(event.process_data)
+        encoded_outputs = self._encode_outputs(event.outputs)
+        outputs, outputs_truncated = self._truncate_mapping(encoded_outputs)
+        metadata = self._merge_metadata(event.execution_metadata, snapshot)
+
+        if isinstance(event, QueueNodeSucceededEvent):
+            status = WorkflowNodeExecutionStatus.SUCCEEDED.value
+            error_message = event.error
+        elif isinstance(event, QueueNodeFailedEvent):
+            status = WorkflowNodeExecutionStatus.FAILED.value
+            error_message = event.error
+        else:
+            status = WorkflowNodeExecutionStatus.EXCEPTION.value
+            error_message = event.error

        return NodeFinishStreamResponse(
            task_id=task_id,
-            workflow_run_id=workflow_node_execution.workflow_execution_id,
+            workflow_run_id=run_id,
            data=NodeFinishStreamResponse.Data(
-                id=workflow_node_execution.id,
-                node_id=workflow_node_execution.node_id,
-                node_type=workflow_node_execution.node_type,
-                index=workflow_node_execution.index,
-                title=workflow_node_execution.title,
-                predecessor_node_id=workflow_node_execution.predecessor_node_id,
-                inputs=workflow_node_execution.get_response_inputs(),
-                inputs_truncated=workflow_node_execution.inputs_truncated,
-                process_data=workflow_node_execution.get_response_process_data(),
-                process_data_truncated=workflow_node_execution.process_data_truncated,
-                outputs=json_converter.to_json_encodable(workflow_node_execution.get_response_outputs()),
-                outputs_truncated=workflow_node_execution.outputs_truncated,
-                status=workflow_node_execution.status,
-                error=workflow_node_execution.error,
-                elapsed_time=workflow_node_execution.elapsed_time,
-                execution_metadata=workflow_node_execution.metadata,
-                created_at=int(workflow_node_execution.created_at.timestamp()),
-                finished_at=int(workflow_node_execution.finished_at.timestamp()),
-                files=self.fetch_files_from_node_outputs(workflow_node_execution.outputs or {}),
-                parallel_id=event.parallel_id,
+                id=event.node_execution_id,
+                node_id=event.node_id,
+                node_type=event.node_type,
+                index=snapshot.index if snapshot else 0,
+                title=snapshot.title if snapshot else "",
+                inputs=inputs,
+                inputs_truncated=inputs_truncated,
+                process_data=process_data,
+                process_data_truncated=process_data_truncated,
+                outputs=outputs,
+                outputs_truncated=outputs_truncated,
+                status=status,
+                error=error_message,
+                elapsed_time=elapsed_time,
+                execution_metadata=metadata,
+                created_at=int(start_at.timestamp()),
+                finished_at=int(finished_at.timestamp()),
+                files=self.fetch_files_from_node_outputs(event.outputs or {}),
                iteration_id=event.in_iteration_id,
                loop_id=event.in_loop_id,
            ),
@@ -234,44 +361,45 @@ class WorkflowResponseConverter:
        *,
        event: QueueNodeRetryEvent,
        task_id: str,
-        workflow_node_execution: WorkflowNodeExecution,
-    ) -> Union[NodeRetryStreamResponse, NodeFinishStreamResponse] | None:
-        if workflow_node_execution.node_type in {NodeType.ITERATION, NodeType.LOOP}:
-            return None
-        if not workflow_node_execution.workflow_execution_id:
-            return None
-        if not workflow_node_execution.finished_at:
+    ) -> NodeRetryStreamResponse | None:
+        if event.node_type in {NodeType.ITERATION, NodeType.LOOP}:
            return None
+        run_id = self._ensure_workflow_run_id()

-        json_converter = WorkflowRuntimeTypeConverter()
+        snapshot = self._get_snapshot(event.node_execution_id)
+        if snapshot is None:
+            raise AssertionError("node retry event arrived without a stored snapshot")
+        finished_at = naive_utc_now()
+        elapsed_time = (finished_at - event.start_at).total_seconds()
+
+        inputs, inputs_truncated = self._truncate_mapping(event.inputs)
+        process_data, process_data_truncated = self._truncate_mapping(event.process_data)
+        encoded_outputs = self._encode_outputs(event.outputs)
+        outputs, outputs_truncated = self._truncate_mapping(encoded_outputs)
+        metadata = self._merge_metadata(event.execution_metadata, snapshot)

        return NodeRetryStreamResponse(
            task_id=task_id,
-            workflow_run_id=workflow_node_execution.workflow_execution_id,
+            workflow_run_id=run_id,
            data=NodeRetryStreamResponse.Data(
-                id=workflow_node_execution.id,
-                node_id=workflow_node_execution.node_id,
-                node_type=workflow_node_execution.node_type,
-                index=workflow_node_execution.index,
-                title=workflow_node_execution.title,
-                predecessor_node_id=workflow_node_execution.predecessor_node_id,
-                inputs=workflow_node_execution.get_response_inputs(),
-                inputs_truncated=workflow_node_execution.inputs_truncated,
-                process_data=workflow_node_execution.get_response_process_data(),
-                process_data_truncated=workflow_node_execution.process_data_truncated,
-                outputs=json_converter.to_json_encodable(workflow_node_execution.get_response_outputs()),
-                outputs_truncated=workflow_node_execution.outputs_truncated,
-                status=workflow_node_execution.status,
-                error=workflow_node_execution.error,
-                elapsed_time=workflow_node_execution.elapsed_time,
-                execution_metadata=workflow_node_execution.metadata,
-                created_at=int(workflow_node_execution.created_at.timestamp()),
-                finished_at=int(workflow_node_execution.finished_at.timestamp()),
-                files=self.fetch_files_from_node_outputs(workflow_node_execution.outputs or {}),
-                parallel_id=event.parallel_id,
-                parallel_start_node_id=event.parallel_start_node_id,
-                parent_parallel_id=event.parent_parallel_id,
-                parent_parallel_start_node_id=event.parent_parallel_start_node_id,
+                id=event.node_execution_id,
+                node_id=event.node_id,
+                node_type=event.node_type,
+                index=snapshot.index,
+                title=snapshot.title,
+                inputs=inputs,
+                inputs_truncated=inputs_truncated,
+                process_data=process_data,
+                process_data_truncated=process_data_truncated,
+                outputs=outputs,
+                outputs_truncated=outputs_truncated,
+                status=WorkflowNodeExecutionStatus.RETRY.value,
+                error=event.error,
+                elapsed_time=elapsed_time,
+                execution_metadata=metadata,
+                created_at=int(snapshot.start_at.timestamp()),
+                finished_at=int(finished_at.timestamp()),
+                files=self.fetch_files_from_node_outputs(event.outputs or {}),
                iteration_id=event.in_iteration_id,
                loop_id=event.in_loop_id,
                retry_index=event.retry_index,
@@ -379,8 +507,6 @@ class WorkflowResponseConverter:
                inputs=new_inputs,
                inputs_truncated=truncated,
                metadata=event.metadata or {},
-                parallel_id=event.parallel_id,
-                parallel_start_node_id=event.parallel_start_node_id,
            ),
        )

@@ -405,9 +531,6 @@ class WorkflowResponseConverter:
                pre_loop_output={},
                created_at=int(time.time()),
                extras={},
-                parallel_id=event.parallel_id,
-                parallel_start_node_id=event.parallel_start_node_id,
-                parallel_mode_run_id=event.parallel_mode_run_id,
            ),
        )

@@ -446,8 +569,6 @@ class WorkflowResponseConverter:
                execution_metadata=event.metadata,
                finished_at=int(time.time()),
                steps=event.steps,
-                parallel_id=event.parallel_id,
-                parallel_start_node_id=event.parallel_start_node_id,
            ),
        )