Enhanced GraphEngine Pause Handling (#28196)

This commit: 

1. Convert `pause_reason` to `pause_reasons` in `GraphExecution` and relevant classes. Change the field from a scalar value to a list that can contain multiple `PauseReason` objects, ensuring all pause events are properly captured.
2. Introduce a new `WorkflowPauseReason` model to record reasons associated with a specific `WorkflowPause`.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
QuantumGhost
2025-11-26 19:59:34 +08:00
committed by GitHub
parent b353a126d8
commit 1c1f124891
24 changed files with 275 additions and 185 deletions

View File

@@ -319,7 +319,7 @@ class TestPauseStatePersistenceLayerTestContainers:
# Create pause event
event = GraphRunPausedEvent(
reason=SchedulingPause(message="test pause"),
reasons=[SchedulingPause(message="test pause")],
outputs={"intermediate": "result"},
)
@@ -381,7 +381,7 @@ class TestPauseStatePersistenceLayerTestContainers:
command_channel = _TestCommandChannelImpl()
layer.initialize(graph_runtime_state, command_channel)
event = GraphRunPausedEvent(reason=SchedulingPause(message="test pause"))
event = GraphRunPausedEvent(reasons=[SchedulingPause(message="test pause")])
# Act - Save pause state
layer.on_event(event)
@@ -390,6 +390,7 @@ class TestPauseStatePersistenceLayerTestContainers:
pause_entity = self.workflow_run_service._workflow_run_repo.get_workflow_pause(self.test_workflow_run_id)
assert pause_entity is not None
assert pause_entity.workflow_execution_id == self.test_workflow_run_id
assert pause_entity.get_pause_reasons() == event.reasons
state_bytes = pause_entity.get_state()
resumption_context = WorkflowResumptionContext.loads(state_bytes.decode())
@@ -414,7 +415,7 @@ class TestPauseStatePersistenceLayerTestContainers:
command_channel = _TestCommandChannelImpl()
layer.initialize(graph_runtime_state, command_channel)
event = GraphRunPausedEvent(reason=SchedulingPause(message="test pause"))
event = GraphRunPausedEvent(reasons=[SchedulingPause(message="test pause")])
# Act
layer.on_event(event)
@@ -448,7 +449,7 @@ class TestPauseStatePersistenceLayerTestContainers:
command_channel = _TestCommandChannelImpl()
layer.initialize(graph_runtime_state, command_channel)
event = GraphRunPausedEvent(reason=SchedulingPause(message="test pause"))
event = GraphRunPausedEvent(reasons=[SchedulingPause(message="test pause")])
# Act
layer.on_event(event)
@@ -514,7 +515,7 @@ class TestPauseStatePersistenceLayerTestContainers:
command_channel = _TestCommandChannelImpl()
layer.initialize(graph_runtime_state, command_channel)
event = GraphRunPausedEvent(reason=SchedulingPause(message="test pause"))
event = GraphRunPausedEvent(reasons=[SchedulingPause(message="test pause")])
# Act
layer.on_event(event)
@@ -570,7 +571,7 @@ class TestPauseStatePersistenceLayerTestContainers:
layer = self._create_pause_state_persistence_layer()
# Don't initialize - graph_runtime_state should not be set
event = GraphRunPausedEvent(reason=SchedulingPause(message="test pause"))
event = GraphRunPausedEvent(reasons=[SchedulingPause(message="test pause")])
# Act & Assert - Should raise AttributeError
with pytest.raises(AttributeError):

View File

@@ -334,12 +334,14 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=test_state,
pause_reasons=[],
)
# Assert - Pause state created
assert pause_entity is not None
assert pause_entity.id is not None
assert pause_entity.workflow_execution_id == workflow_run.id
assert list(pause_entity.get_pause_reasons()) == []
# Convert both to strings for comparison
retrieved_state = pause_entity.get_state()
if isinstance(retrieved_state, bytes):
@@ -366,6 +368,7 @@ class TestWorkflowPauseIntegration:
if isinstance(retrieved_state, bytes):
retrieved_state = retrieved_state.decode()
assert retrieved_state == test_state
assert list(retrieved_entity.get_pause_reasons()) == []
# Act - Resume workflow
resumed_entity = repository.resume_workflow_pause(
@@ -402,6 +405,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=test_state,
pause_reasons=[],
)
assert pause_entity is not None
@@ -432,6 +436,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=test_state,
pause_reasons=[],
)
@pytest.mark.parametrize("test_case", resume_workflow_success_cases(), ids=lambda tc: tc.name)
@@ -449,6 +454,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=test_state,
pause_reasons=[],
)
self.session.refresh(workflow_run)
@@ -480,6 +486,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=test_state,
pause_reasons=[],
)
self.session.refresh(workflow_run)
@@ -503,6 +510,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=test_state,
pause_reasons=[],
)
pause_model = self.session.get(WorkflowPauseModel, pause_entity.id)
pause_model.resumed_at = naive_utc_now()
@@ -530,6 +538,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=nonexistent_id,
state_owner_user_id=self.test_user_id,
state=test_state,
pause_reasons=[],
)
def test_resume_nonexistent_workflow_run(self):
@@ -543,6 +552,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=test_state,
pause_reasons=[],
)
nonexistent_id = str(uuid.uuid4())
@@ -570,6 +580,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=test_state,
pause_reasons=[],
)
# Manually adjust timestamps for testing
@@ -648,6 +659,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=test_state,
pause_reasons=[],
)
pause_entities.append(pause_entity)
@@ -750,6 +762,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run1.id,
state_owner_user_id=self.test_user_id,
state=test_state,
pause_reasons=[],
)
# Try to access pause from tenant 2 using tenant 1's repository
@@ -762,6 +775,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run2.id,
state_owner_user_id=account2.id,
state=test_state,
pause_reasons=[],
)
# Assert - Both pauses should exist and be separate
@@ -782,6 +796,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=test_state,
pause_reasons=[],
)
# Verify pause is properly scoped
@@ -802,6 +817,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=test_state,
pause_reasons=[],
)
# Assert - Verify file was uploaded to storage
@@ -828,9 +844,7 @@ class TestWorkflowPauseIntegration:
repository = self._get_workflow_run_repository()
pause_entity = repository.create_workflow_pause(
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=test_state,
workflow_run_id=workflow_run.id, state_owner_user_id=self.test_user_id, state=test_state, pause_reasons=[]
)
# Get file info before deletion
@@ -868,6 +882,7 @@ class TestWorkflowPauseIntegration:
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=large_state_json,
pause_reasons=[],
)
# Assert
@@ -902,9 +917,7 @@ class TestWorkflowPauseIntegration:
# Pause
pause_entity = repository.create_workflow_pause(
workflow_run_id=workflow_run.id,
state_owner_user_id=self.test_user_id,
state=state,
workflow_run_id=workflow_run.id, state_owner_user_id=self.test_user_id, state=state, pause_reasons=[]
)
assert pause_entity is not None