refactor: consume events after pause/abort and improve API clarity (#28328)

Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
2025-11-18 19:04:11 +08:00
parent 68526c09fc
commit 6efdc94661
7 changed files with 261 additions and 183 deletions
--- a/api/core/workflow/graph_engine/graph_engine.py
+++ b/api/core/workflow/graph_engine/graph_engine.py
@@ -192,7 +192,6 @@ class GraphEngine:
        self._dispatcher = Dispatcher(
            event_queue=self._event_queue,
            event_handler=self._event_handler_registry,
-            event_collector=self._event_manager,
            execution_coordinator=self._execution_coordinator,
            event_emitter=self._event_manager,
        )
--- a/api/core/workflow/graph_engine/orchestration/dispatcher.py
+++ b/api/core/workflow/graph_engine/orchestration/dispatcher.py
@@ -43,7 +43,6 @@ class Dispatcher:
        self,
        event_queue: queue.Queue[GraphNodeEventBase],
        event_handler: "EventHandler",
-        event_collector: EventManager,
        execution_coordinator: ExecutionCoordinator,
        event_emitter: EventManager | None = None,
    ) -> None:
@@ -53,13 +52,11 @@ class Dispatcher:
        Args:
            event_queue: Queue of events from workers
            event_handler: Event handler registry for processing events
-            event_collector: Event manager for collecting unhandled events
            execution_coordinator: Coordinator for execution flow
            event_emitter: Optional event manager to signal completion
        """
        self._event_queue = event_queue
        self._event_handler = event_handler
-        self._event_collector = event_collector
        self._execution_coordinator = execution_coordinator
        self._event_emitter = event_emitter

@@ -86,37 +83,31 @@ class Dispatcher:
    def _dispatcher_loop(self) -> None:
        """Main dispatcher loop."""
        try:
+            self._process_commands()
            while not self._stop_event.is_set():
-                commands_checked = False
-                should_check_commands = False
-                should_break = False
+                if (
+                    self._execution_coordinator.aborted
+                    or self._execution_coordinator.paused
+                    or self._execution_coordinator.execution_complete
+                ):
+                    break

-                if self._execution_coordinator.is_execution_complete():
-                    should_check_commands = True
-                    should_break = True
-                else:
-                    # Check for scaling
-                    self._execution_coordinator.check_scaling()
+                self._execution_coordinator.check_scaling()
+                try:
+                    event = self._event_queue.get(timeout=0.1)
+                    self._event_handler.dispatch(event)
+                    self._event_queue.task_done()
+                    self._process_commands(event)
+                except queue.Empty:
+                    time.sleep(0.1)

-                    # Process events
-                    try:
-                        event = self._event_queue.get(timeout=0.1)
-                        # Route to the event handler
-                        self._event_handler.dispatch(event)
-                        should_check_commands = self._should_check_commands(event)
-                        self._event_queue.task_done()
-                    except queue.Empty:
-                        # Process commands even when no new events arrive so abort requests are not missed
-                        should_check_commands = True
-                        time.sleep(0.1)
-
-                if should_check_commands and not commands_checked:
-                    self._execution_coordinator.check_commands()
-                    commands_checked = True
-
-                if should_break:
-                    if not commands_checked:
-                        self._execution_coordinator.check_commands()
+            self._process_commands()
+            while True:
+                try:
+                    event = self._event_queue.get(block=False)
+                    self._event_handler.dispatch(event)
+                    self._event_queue.task_done()
+                except queue.Empty:
                    break

        except Exception as e:
@@ -129,6 +120,6 @@ class Dispatcher:
            if self._event_emitter:
                self._event_emitter.mark_complete()

-    def _should_check_commands(self, event: GraphNodeEventBase) -> bool:
-        """Return True if the event represents a node completion."""
-        return isinstance(event, self._COMMAND_TRIGGER_EVENTS)
+    def _process_commands(self, event: GraphNodeEventBase | None = None):
+        if event is None or isinstance(event, self._COMMAND_TRIGGER_EVENTS):
+            self._execution_coordinator.process_commands()
--- a/api/core/workflow/graph_engine/orchestration/execution_coordinator.py
+++ b/api/core/workflow/graph_engine/orchestration/execution_coordinator.py
@@ -40,7 +40,7 @@ class ExecutionCoordinator:
        self._command_processor = command_processor
        self._worker_pool = worker_pool

-    def check_commands(self) -> None:
+    def process_commands(self) -> None:
        """Process any pending commands."""
        self._command_processor.process_commands()

@@ -48,24 +48,16 @@ class ExecutionCoordinator:
        """Check and perform worker scaling if needed."""
        self._worker_pool.check_and_scale()

-    def is_execution_complete(self) -> bool:
-        """
-        Check if execution is complete.
-
-        Returns:
-            True if execution is complete
-        """
-        # Treat paused, aborted, or failed executions as terminal states
-        if self._graph_execution.is_paused:
-            return True
-
-        if self._graph_execution.aborted or self._graph_execution.has_error:
-            return True
-
+    @property
+    def execution_complete(self):
        return self._state_manager.is_execution_complete()

    @property
-    def is_paused(self) -> bool:
+    def aborted(self):
+        return self._graph_execution.aborted or self._graph_execution.has_error
+
+    @property
+    def paused(self) -> bool:
        """Expose whether the underlying graph execution is paused."""
        return self._graph_execution.is_paused