feat(workflow_cycle_manager): Removes redundant repository methods and adds caching (#22597)

Signed-off-by: -LAN- <laipz8200@outlook.com>
This commit is contained in:
-LAN-
2025-07-18 09:26:05 +08:00
committed by GitHub
parent 3826b57424
commit b88dd17fc1
7 changed files with 294 additions and 535 deletions

View File

@@ -1,4 +1,4 @@
from typing import Optional, Protocol
from typing import Protocol
from core.workflow.entities.workflow_execution import WorkflowExecution
@@ -28,15 +28,3 @@ class WorkflowExecutionRepository(Protocol):
execution: The WorkflowExecution instance to save or update
"""
...
def get(self, execution_id: str) -> Optional[WorkflowExecution]:
"""
Retrieve a WorkflowExecution by its ID.
Args:
execution_id: The workflow execution ID
Returns:
The WorkflowExecution instance if found, None otherwise
"""
...

View File

@@ -39,18 +39,6 @@ class WorkflowNodeExecutionRepository(Protocol):
"""
...
def get_by_node_execution_id(self, node_execution_id: str) -> Optional[WorkflowNodeExecution]:
"""
Retrieve a NodeExecution by its node_execution_id.
Args:
node_execution_id: The node execution ID
Returns:
The NodeExecution instance if found, None otherwise
"""
...
def get_by_workflow_run(
self,
workflow_run_id: str,
@@ -69,24 +57,3 @@ class WorkflowNodeExecutionRepository(Protocol):
A list of NodeExecution instances
"""
...
def get_running_executions(self, workflow_run_id: str) -> Sequence[WorkflowNodeExecution]:
"""
Retrieve all running NodeExecution instances for a specific workflow run.
Args:
workflow_run_id: The workflow run ID
Returns:
A list of running NodeExecution instances
"""
...
def clear(self) -> None:
"""
Clear all NodeExecution records based on implementation-specific criteria.
This method is intended to be used for bulk deletion operations, such as removing
all records associated with a specific app_id and tenant_id in multi-tenant implementations.
"""
...

View File

@@ -55,24 +55,15 @@ class WorkflowCycleManager:
self._workflow_execution_repository = workflow_execution_repository
self._workflow_node_execution_repository = workflow_node_execution_repository
# Initialize caches for workflow execution cycle
# These caches avoid redundant repository calls during a single workflow execution
self._workflow_execution_cache: dict[str, WorkflowExecution] = {}
self._node_execution_cache: dict[str, WorkflowNodeExecution] = {}
def handle_workflow_run_start(self) -> WorkflowExecution:
inputs = {**self._application_generate_entity.inputs}
inputs = self._prepare_workflow_inputs()
execution_id = self._get_or_generate_execution_id()
# Iterate over SystemVariable fields using Pydantic's model_fields
if self._workflow_system_variables:
for field_name, value in self._workflow_system_variables.to_dict().items():
if field_name == SystemVariableKey.CONVERSATION_ID:
continue
inputs[f"sys.{field_name}"] = value
# handle special values
inputs = dict(WorkflowEntry.handle_special_values(inputs) or {})
# init workflow run
# TODO: This workflow_run_id should always not be None, maybe we can use a more elegant way to handle this
execution_id = str(
self._workflow_system_variables.workflow_execution_id if self._workflow_system_variables else None
) or str(uuid4())
execution = WorkflowExecution.new(
id_=execution_id,
workflow_id=self._workflow_info.workflow_id,
@@ -83,9 +74,7 @@ class WorkflowCycleManager:
started_at=datetime.now(UTC).replace(tzinfo=None),
)
self._workflow_execution_repository.save(execution)
return execution
return self._save_and_cache_workflow_execution(execution)
def handle_workflow_run_success(
self,
@@ -99,23 +88,15 @@ class WorkflowCycleManager:
) -> WorkflowExecution:
workflow_execution = self._get_workflow_execution_or_raise_error(workflow_run_id)
# outputs = WorkflowEntry.handle_special_values(outputs)
self._update_workflow_execution_completion(
workflow_execution,
status=WorkflowExecutionStatus.SUCCEEDED,
outputs=outputs,
total_tokens=total_tokens,
total_steps=total_steps,
)
workflow_execution.status = WorkflowExecutionStatus.SUCCEEDED
workflow_execution.outputs = outputs or {}
workflow_execution.total_tokens = total_tokens
workflow_execution.total_steps = total_steps
workflow_execution.finished_at = datetime.now(UTC).replace(tzinfo=None)
if trace_manager:
trace_manager.add_trace_task(
TraceTask(
TraceTaskName.WORKFLOW_TRACE,
workflow_execution=workflow_execution,
conversation_id=conversation_id,
user_id=trace_manager.user_id,
)
)
self._add_trace_task_if_needed(trace_manager, workflow_execution, conversation_id)
self._workflow_execution_repository.save(workflow_execution)
return workflow_execution
@@ -132,24 +113,17 @@ class WorkflowCycleManager:
trace_manager: Optional[TraceQueueManager] = None,
) -> WorkflowExecution:
execution = self._get_workflow_execution_or_raise_error(workflow_run_id)
# outputs = WorkflowEntry.handle_special_values(dict(outputs) if outputs else None)
execution.status = WorkflowExecutionStatus.PARTIAL_SUCCEEDED
execution.outputs = outputs or {}
execution.total_tokens = total_tokens
execution.total_steps = total_steps
execution.finished_at = datetime.now(UTC).replace(tzinfo=None)
execution.exceptions_count = exceptions_count
self._update_workflow_execution_completion(
execution,
status=WorkflowExecutionStatus.PARTIAL_SUCCEEDED,
outputs=outputs,
total_tokens=total_tokens,
total_steps=total_steps,
exceptions_count=exceptions_count,
)
if trace_manager:
trace_manager.add_trace_task(
TraceTask(
TraceTaskName.WORKFLOW_TRACE,
workflow_execution=execution,
conversation_id=conversation_id,
user_id=trace_manager.user_id,
)
)
self._add_trace_task_if_needed(trace_manager, execution, conversation_id)
self._workflow_execution_repository.save(execution)
return execution
@@ -169,39 +143,18 @@ class WorkflowCycleManager:
workflow_execution = self._get_workflow_execution_or_raise_error(workflow_run_id)
now = naive_utc_now()
workflow_execution.status = WorkflowExecutionStatus(status.value)
workflow_execution.error_message = error_message
workflow_execution.total_tokens = total_tokens
workflow_execution.total_steps = total_steps
workflow_execution.finished_at = now
workflow_execution.exceptions_count = exceptions_count
# Use the instance repository to find running executions for a workflow run
running_node_executions = self._workflow_node_execution_repository.get_running_executions(
workflow_run_id=workflow_execution.id_
self._update_workflow_execution_completion(
workflow_execution,
status=status,
total_tokens=total_tokens,
total_steps=total_steps,
error_message=error_message,
exceptions_count=exceptions_count,
finished_at=now,
)
# Update the domain models
for node_execution in running_node_executions:
if node_execution.node_execution_id:
# Update the domain model
node_execution.status = WorkflowNodeExecutionStatus.FAILED
node_execution.error = error_message
node_execution.finished_at = now
node_execution.elapsed_time = (now - node_execution.created_at).total_seconds()
# Update the repository with the domain model
self._workflow_node_execution_repository.save(node_execution)
if trace_manager:
trace_manager.add_trace_task(
TraceTask(
TraceTaskName.WORKFLOW_TRACE,
workflow_execution=workflow_execution,
conversation_id=conversation_id,
user_id=trace_manager.user_id,
)
)
self._fail_running_node_executions(workflow_execution.id_, error_message, now)
self._add_trace_task_if_needed(trace_manager, workflow_execution, conversation_id)
self._workflow_execution_repository.save(workflow_execution)
return workflow_execution
@@ -214,8 +167,198 @@ class WorkflowCycleManager:
) -> WorkflowNodeExecution:
workflow_execution = self._get_workflow_execution_or_raise_error(workflow_execution_id)
# Create a domain model
created_at = datetime.now(UTC).replace(tzinfo=None)
domain_execution = self._create_node_execution_from_event(
workflow_execution=workflow_execution,
event=event,
status=WorkflowNodeExecutionStatus.RUNNING,
)
return self._save_and_cache_node_execution(domain_execution)
def handle_workflow_node_execution_success(self, *, event: QueueNodeSucceededEvent) -> WorkflowNodeExecution:
    """Finalize the cached node execution as SUCCEEDED and persist it.

    :param event: queue node succeeded event carrying the node_execution_id
    :return: the updated node execution
    """
    execution = self._get_node_execution_from_cache(event.node_execution_id)
    self._update_node_execution_completion(
        execution,
        event=event,
        status=WorkflowNodeExecutionStatus.SUCCEEDED,
    )
    self._workflow_node_execution_repository.save(execution)
    return execution
def handle_workflow_node_execution_failed(
    self,
    *,
    event: QueueNodeFailedEvent
    | QueueNodeInIterationFailedEvent
    | QueueNodeInLoopFailedEvent
    | QueueNodeExceptionEvent,
) -> WorkflowNodeExecution:
    """
    Workflow node execution failed

    :param event: queue node failed event
    :return:
    """
    execution = self._get_node_execution_from_cache(event.node_execution_id)
    # Exception events finish with EXCEPTION status; everything else is a plain failure.
    if isinstance(event, QueueNodeExceptionEvent):
        final_status = WorkflowNodeExecutionStatus.EXCEPTION
    else:
        final_status = WorkflowNodeExecutionStatus.FAILED
    self._update_node_execution_completion(
        execution,
        event=event,
        status=final_status,
        error=event.error,
        handle_special_values=True,
    )
    self._workflow_node_execution_repository.save(execution)
    return execution
def handle_workflow_node_execution_retried(
    self, *, workflow_execution_id: str, event: QueueNodeRetryEvent
) -> WorkflowNodeExecution:
    """Create, populate, and persist a RETRY-status node execution from a retry event.

    :param workflow_execution_id: id of the owning workflow execution (must be cached)
    :param event: queue node retry event
    :return: the newly created node execution
    """
    run = self._get_workflow_execution_or_raise_error(workflow_execution_id)
    retried = self._create_node_execution_from_event(
        workflow_execution=run,
        event=event,
        status=WorkflowNodeExecutionStatus.RETRY,
        error=event.error,
        created_at=event.start_at,
    )
    # Inputs are sanitized; outputs are taken verbatim from the event.
    retried.update_from_mapping(
        inputs=WorkflowEntry.handle_special_values(event.inputs),
        outputs=event.outputs,
        metadata=self._merge_event_metadata(event),
    )
    return self._save_and_cache_node_execution(retried)
def _get_workflow_execution_or_raise_error(self, id: str, /) -> WorkflowExecution:
    """Return the cached WorkflowExecution for *id*, raising when it is unknown.

    :raises WorkflowRunNotFoundError: if no execution with *id* is cached
    """
    cached = self._workflow_execution_cache.get(id)
    if cached is not None:
        return cached
    raise WorkflowRunNotFoundError(id)
def _prepare_workflow_inputs(self) -> dict[str, Any]:
    """Merge application inputs with ``sys.*`` system variables.

    The conversation id is deliberately excluded, and special values are
    normalized before the merged mapping is returned.
    """
    merged = dict(self._application_generate_entity.inputs)
    sys_vars = self._workflow_system_variables
    if sys_vars:
        merged.update(
            {
                f"sys.{name}": value
                for name, value in sys_vars.to_dict().items()
                if name != SystemVariableKey.CONVERSATION_ID
            }
        )
    return dict(WorkflowEntry.handle_special_values(merged) or {})
def _get_or_generate_execution_id(self) -> str:
"""Get execution ID from system variables or generate a new one."""
if self._workflow_system_variables and self._workflow_system_variables.workflow_execution_id:
return str(self._workflow_system_variables.workflow_execution_id)
return str(uuid4())
def _save_and_cache_workflow_execution(self, execution: WorkflowExecution) -> WorkflowExecution:
    """Persist *execution*, then remember it for later cache lookups, and return it."""
    # Save first so the cache never holds an execution the repository rejected.
    self._workflow_execution_repository.save(execution)
    cache = self._workflow_execution_cache
    cache[execution.id_] = execution
    return execution
def _save_and_cache_node_execution(self, execution: WorkflowNodeExecution) -> WorkflowNodeExecution:
    """Persist a node execution; cache it only when it carries a node_execution_id."""
    self._workflow_node_execution_repository.save(execution)
    cache_key = execution.node_execution_id
    if cache_key:
        self._node_execution_cache[cache_key] = execution
    return execution
def _get_node_execution_from_cache(self, node_execution_id: str) -> WorkflowNodeExecution:
    """Look up a cached node execution, raising ValueError when it is missing."""
    cached = self._node_execution_cache.get(node_execution_id)
    if cached:
        return cached
    raise ValueError(f"Domain node execution not found: {node_execution_id}")
def _update_workflow_execution_completion(
    self,
    execution: WorkflowExecution,
    *,
    status: WorkflowExecutionStatus,
    total_tokens: int,
    total_steps: int,
    outputs: Mapping[str, Any] | None = None,
    error_message: Optional[str] = None,
    exceptions_count: int = 0,
    finished_at: Optional[datetime] = None,
) -> None:
    """Stamp completion data onto *execution* in place.

    A missing ``finished_at`` is filled with the current naive-UTC time,
    a missing ``outputs`` becomes an empty mapping, and ``error_message``
    is written only when non-empty.
    """
    execution.status = status
    execution.outputs = outputs if outputs else {}
    execution.total_tokens = total_tokens
    execution.total_steps = total_steps
    execution.finished_at = finished_at if finished_at else naive_utc_now()
    execution.exceptions_count = exceptions_count
    if error_message:
        execution.error_message = error_message
def _add_trace_task_if_needed(
    self,
    trace_manager: Optional[TraceQueueManager],
    workflow_execution: WorkflowExecution,
    conversation_id: Optional[str],
) -> None:
    """Queue a WORKFLOW_TRACE task on *trace_manager*, if one was supplied."""
    if not trace_manager:
        return
    task = TraceTask(
        TraceTaskName.WORKFLOW_TRACE,
        workflow_execution=workflow_execution,
        conversation_id=conversation_id,
        user_id=trace_manager.user_id,
    )
    trace_manager.add_trace_task(task)
def _fail_running_node_executions(
    self,
    workflow_execution_id: str,
    error_message: str,
    now: datetime,
) -> None:
    """Mark every cached RUNNING node execution of the workflow as FAILED and persist it.

    :param workflow_execution_id: workflow execution whose nodes should be failed
    :param error_message: error recorded on each failed node execution
    :param now: timestamp used as finished_at and for elapsed-time computation
    """
    for node_exec in self._node_execution_cache.values():
        if node_exec.workflow_execution_id != workflow_execution_id:
            continue
        if node_exec.status != WorkflowNodeExecutionStatus.RUNNING:
            continue
        # Only executions with a node_execution_id are tracked/persisted.
        if not node_exec.node_execution_id:
            continue
        node_exec.status = WorkflowNodeExecutionStatus.FAILED
        node_exec.error = error_message
        node_exec.finished_at = now
        node_exec.elapsed_time = (now - node_exec.created_at).total_seconds()
        self._workflow_node_execution_repository.save(node_exec)
def _create_node_execution_from_event(
self,
*,
workflow_execution: WorkflowExecution,
event: Union[QueueNodeStartedEvent, QueueNodeRetryEvent],
status: WorkflowNodeExecutionStatus,
error: Optional[str] = None,
created_at: Optional[datetime] = None,
) -> WorkflowNodeExecution:
"""Create a node execution from an event."""
now = datetime.now(UTC).replace(tzinfo=None)
created_at = created_at or now
metadata = {
WorkflowNodeExecutionMetadataKey.PARALLEL_MODE_RUN_ID: event.parallel_mode_run_id,
WorkflowNodeExecutionMetadataKey.ITERATION_ID: event.in_iteration_id,
@@ -232,152 +375,76 @@ class WorkflowCycleManager:
node_id=event.node_id,
node_type=event.node_type,
title=event.node_data.title,
status=WorkflowNodeExecutionStatus.RUNNING,
status=status,
metadata=metadata,
created_at=created_at,
error=error,
)
# Use the instance repository to save the domain model
self._workflow_node_execution_repository.save(domain_execution)
if status == WorkflowNodeExecutionStatus.RETRY:
domain_execution.finished_at = now
domain_execution.elapsed_time = (now - created_at).total_seconds()
return domain_execution
def handle_workflow_node_execution_success(self, *, event: QueueNodeSucceededEvent) -> WorkflowNodeExecution:
# Get the domain model from repository
domain_execution = self._workflow_node_execution_repository.get_by_node_execution_id(event.node_execution_id)
if not domain_execution:
raise ValueError(f"Domain node execution not found: {event.node_execution_id}")
# Process data
inputs = event.inputs
process_data = event.process_data
outputs = event.outputs
# Convert metadata keys to strings
execution_metadata_dict = {}
if event.execution_metadata:
for key, value in event.execution_metadata.items():
execution_metadata_dict[key] = value
finished_at = datetime.now(UTC).replace(tzinfo=None)
elapsed_time = (finished_at - event.start_at).total_seconds()
# Update domain model
domain_execution.status = WorkflowNodeExecutionStatus.SUCCEEDED
domain_execution.update_from_mapping(
inputs=inputs, process_data=process_data, outputs=outputs, metadata=execution_metadata_dict
)
domain_execution.finished_at = finished_at
domain_execution.elapsed_time = elapsed_time
# Update the repository with the domain model
self._workflow_node_execution_repository.save(domain_execution)
return domain_execution
def handle_workflow_node_execution_failed(
def _update_node_execution_completion(
self,
domain_execution: WorkflowNodeExecution,
*,
event: QueueNodeFailedEvent
| QueueNodeInIterationFailedEvent
| QueueNodeInLoopFailedEvent
| QueueNodeExceptionEvent,
) -> WorkflowNodeExecution:
"""
Workflow node execution failed
:param event: queue node failed event
:return:
"""
# Get the domain model from repository
domain_execution = self._workflow_node_execution_repository.get_by_node_execution_id(event.node_execution_id)
if not domain_execution:
raise ValueError(f"Domain node execution not found: {event.node_execution_id}")
# Process data
inputs = WorkflowEntry.handle_special_values(event.inputs)
process_data = WorkflowEntry.handle_special_values(event.process_data)
outputs = event.outputs
# Convert metadata keys to strings
execution_metadata_dict = {}
if event.execution_metadata:
for key, value in event.execution_metadata.items():
execution_metadata_dict[key] = value
event: Union[
QueueNodeSucceededEvent,
QueueNodeFailedEvent,
QueueNodeInIterationFailedEvent,
QueueNodeInLoopFailedEvent,
QueueNodeExceptionEvent,
],
status: WorkflowNodeExecutionStatus,
error: Optional[str] = None,
handle_special_values: bool = False,
) -> None:
"""Update node execution with completion data."""
finished_at = datetime.now(UTC).replace(tzinfo=None)
elapsed_time = (finished_at - event.start_at).total_seconds()
# Process data
if handle_special_values:
inputs = WorkflowEntry.handle_special_values(event.inputs)
process_data = WorkflowEntry.handle_special_values(event.process_data)
else:
inputs = event.inputs
process_data = event.process_data
outputs = event.outputs
# Convert metadata
execution_metadata_dict: dict[WorkflowNodeExecutionMetadataKey, Any] = {}
if event.execution_metadata:
execution_metadata_dict.update(event.execution_metadata)
# Update domain model
domain_execution.status = (
WorkflowNodeExecutionStatus.FAILED
if not isinstance(event, QueueNodeExceptionEvent)
else WorkflowNodeExecutionStatus.EXCEPTION
)
domain_execution.error = event.error
domain_execution.status = status
domain_execution.update_from_mapping(
inputs=inputs, process_data=process_data, outputs=outputs, metadata=execution_metadata_dict
inputs=inputs,
process_data=process_data,
outputs=outputs,
metadata=execution_metadata_dict,
)
domain_execution.finished_at = finished_at
domain_execution.elapsed_time = elapsed_time
# Update the repository with the domain model
self._workflow_node_execution_repository.save(domain_execution)
if error:
domain_execution.error = error
return domain_execution
def handle_workflow_node_execution_retried(
self, *, workflow_execution_id: str, event: QueueNodeRetryEvent
) -> WorkflowNodeExecution:
workflow_execution = self._get_workflow_execution_or_raise_error(workflow_execution_id)
created_at = event.start_at
finished_at = datetime.now(UTC).replace(tzinfo=None)
elapsed_time = (finished_at - created_at).total_seconds()
inputs = WorkflowEntry.handle_special_values(event.inputs)
outputs = event.outputs
# Convert metadata keys to strings
def _merge_event_metadata(self, event: QueueNodeRetryEvent) -> dict[WorkflowNodeExecutionMetadataKey, str | None]:
"""Merge event metadata with origin metadata."""
origin_metadata = {
WorkflowNodeExecutionMetadataKey.ITERATION_ID: event.in_iteration_id,
WorkflowNodeExecutionMetadataKey.PARALLEL_MODE_RUN_ID: event.parallel_mode_run_id,
WorkflowNodeExecutionMetadataKey.LOOP_ID: event.in_loop_id,
}
# Convert execution metadata keys to strings
execution_metadata_dict: dict[WorkflowNodeExecutionMetadataKey, str | None] = {}
if event.execution_metadata:
for key, value in event.execution_metadata.items():
execution_metadata_dict[key] = value
execution_metadata_dict.update(event.execution_metadata)
merged_metadata = {**execution_metadata_dict, **origin_metadata} if execution_metadata_dict else origin_metadata
# Create a domain model
domain_execution = WorkflowNodeExecution(
id=str(uuid4()),
workflow_id=workflow_execution.workflow_id,
workflow_execution_id=workflow_execution.id_,
predecessor_node_id=event.predecessor_node_id,
node_execution_id=event.node_execution_id,
node_id=event.node_id,
node_type=event.node_type,
title=event.node_data.title,
status=WorkflowNodeExecutionStatus.RETRY,
created_at=created_at,
finished_at=finished_at,
elapsed_time=elapsed_time,
error=event.error,
index=event.node_run_index,
)
# Update with mappings
domain_execution.update_from_mapping(inputs=inputs, outputs=outputs, metadata=merged_metadata)
# Use the instance repository to save the domain model
self._workflow_node_execution_repository.save(domain_execution)
return domain_execution
def _get_workflow_execution_or_raise_error(self, id: str, /) -> WorkflowExecution:
execution = self._workflow_execution_repository.get(id)
if not execution:
raise WorkflowRunNotFoundError(id)
return execution
return {**execution_metadata_dict, **origin_metadata} if execution_metadata_dict else origin_metadata