feat: knowledge pipeline (#25360)

Signed-off-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: twwu <twwu@dify.ai>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com>
Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
Co-authored-by: lyzno1 <yuanyouhuilyz@gmail.com>
Co-authored-by: quicksand <quicksandzn@gmail.com>
Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: lyzno1 <92089059+lyzno1@users.noreply.github.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: nite-knite <nkCoding@gmail.com>
Co-authored-by: Hanqing Zhao <sherry9277@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry <xh001x@hotmail.com>
This commit is contained in:
-LAN-
2025-09-18 12:49:10 +08:00
committed by GitHub
parent 7dadb33003
commit 85cda47c70
1772 changed files with 102407 additions and 31710 deletions

View File

@@ -9,11 +9,8 @@ from typing import Union
from sqlalchemy.engine import Engine
from sqlalchemy.orm import sessionmaker
from core.workflow.entities.workflow_execution import (
WorkflowExecution,
WorkflowExecutionStatus,
WorkflowType,
)
from core.workflow.entities import WorkflowExecution
from core.workflow.enums import WorkflowExecutionStatus, WorkflowType
from core.workflow.repositories.workflow_execution_repository import WorkflowExecutionRepository
from core.workflow.workflow_type_encoder import WorkflowRuntimeTypeConverter
from libs.helper import extract_tenant_id
@@ -203,5 +200,4 @@ class SQLAlchemyWorkflowExecutionRepository(WorkflowExecutionRepository):
session.commit()
# Update the in-memory cache for faster subsequent lookups
logger.debug("Updating cache for execution_id: %s", db_model.id)
self._execution_cache[db_model.id] = db_model

View File

@@ -2,10 +2,12 @@
SQLAlchemy implementation of the WorkflowNodeExecutionRepository.
"""
import dataclasses
import json
import logging
from collections.abc import Sequence
from typing import Union
from collections.abc import Callable, Mapping, Sequence
from concurrent.futures import ThreadPoolExecutor
from typing import Any, TypeVar, Union
import psycopg2.errors
from sqlalchemy import UnaryExpression, asc, desc, select
@@ -14,15 +16,13 @@ from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import sessionmaker
from tenacity import before_sleep_log, retry, retry_if_exception, stop_after_attempt
from configs import dify_config
from core.model_runtime.utils.encoders import jsonable_encoder
from core.workflow.entities.workflow_node_execution import (
WorkflowNodeExecution,
WorkflowNodeExecutionMetadataKey,
WorkflowNodeExecutionStatus,
)
from core.workflow.nodes.enums import NodeType
from core.workflow.entities import WorkflowNodeExecution
from core.workflow.enums import NodeType, WorkflowNodeExecutionMetadataKey, WorkflowNodeExecutionStatus
from core.workflow.repositories.workflow_node_execution_repository import OrderConfig, WorkflowNodeExecutionRepository
from core.workflow.workflow_type_encoder import WorkflowRuntimeTypeConverter
from extensions.ext_storage import storage
from libs.helper import extract_tenant_id
from libs.uuid_utils import uuidv7
from models import (
@@ -32,10 +32,22 @@ from models import (
WorkflowNodeExecutionModel,
WorkflowNodeExecutionTriggeredFrom,
)
from models.enums import ExecutionOffLoadType
from models.model import UploadFile
from models.workflow import WorkflowNodeExecutionOffload
from services.file_service import FileService
from services.variable_truncator import VariableTruncator
logger = logging.getLogger(__name__)
@dataclasses.dataclass(frozen=True)
class _InputsOutputsTruncationResult:
    """Result of truncating an oversized inputs/outputs/process_data mapping.

    Produced by ``_truncate_and_upload`` when a value exceeded the configured
    truncation limits: the truncated mapping is kept inline on the execution
    row, while the full JSON content is offloaded to an uploaded file.
    """

    # Truncated mapping stored inline on the execution row (used for API responses).
    truncated_value: Mapping[str, Any]
    # Uploaded file holding the full, untruncated JSON content.
    file: UploadFile
    # Offload record linking the node execution to the uploaded file.
    offload: WorkflowNodeExecutionOffload
class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository):
"""
SQLAlchemy implementation of the WorkflowNodeExecutionRepository interface.
@@ -86,6 +98,7 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
# Extract user context
self._triggered_from = triggered_from
self._creator_user_id = user.id
self._user = user # Store the user object directly
# Determine user role based on user type
self._creator_user_role = CreatorUserRole.ACCOUNT if isinstance(user, Account) else CreatorUserRole.END_USER
@@ -94,17 +107,30 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
# Key: node_execution_id, Value: WorkflowNodeExecution (DB model)
self._node_execution_cache: dict[str, WorkflowNodeExecutionModel] = {}
# Initialize FileService for handling offloaded data
self._file_service = FileService(session_factory)
def _create_truncator(self) -> VariableTruncator:
    """Build a VariableTruncator configured from the workflow truncation limits."""
    limits = {
        "max_size_bytes": dify_config.WORKFLOW_VARIABLE_TRUNCATION_MAX_SIZE,
        "array_element_limit": dify_config.WORKFLOW_VARIABLE_TRUNCATION_ARRAY_LENGTH,
        "string_length_limit": dify_config.WORKFLOW_VARIABLE_TRUNCATION_STRING_LENGTH,
    }
    return VariableTruncator(**limits)
def _to_domain_model(self, db_model: WorkflowNodeExecutionModel) -> WorkflowNodeExecution:
"""
Convert a database model to a domain model.
This requires the offload_data, and correspond inputs_file and outputs_file are preloaded.
Args:
db_model: The database model to convert
db_model: The database model to convert. It must have `offload_data`
and the corresponding `inputs_file` and `outputs_file` preloaded.
Returns:
The domain model
"""
# Parse JSON fields
# Parse JSON fields - these might be truncated versions
inputs = db_model.inputs_dict
process_data = db_model.process_data_dict
outputs = db_model.outputs_dict
@@ -113,7 +139,7 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
# Convert status to domain enum
status = WorkflowNodeExecutionStatus(db_model.status)
return WorkflowNodeExecution(
domain_model = WorkflowNodeExecution(
id=db_model.id,
node_execution_id=db_model.node_execution_id,
workflow_id=db_model.workflow_id,
@@ -134,15 +160,52 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
finished_at=db_model.finished_at,
)
def to_db_model(self, domain_model: WorkflowNodeExecution) -> WorkflowNodeExecutionModel:
if not db_model.offload_data:
return domain_model
offload_data = db_model.offload_data
# Store truncated versions for API responses
# TODO: consider load content concurrently.
input_offload = _find_first(offload_data, _filter_by_offload_type(ExecutionOffLoadType.INPUTS))
if input_offload is not None:
assert input_offload.file is not None
domain_model.inputs = self._load_file(input_offload.file)
domain_model.set_truncated_inputs(inputs)
outputs_offload = _find_first(offload_data, _filter_by_offload_type(ExecutionOffLoadType.OUTPUTS))
if outputs_offload is not None:
assert outputs_offload.file is not None
domain_model.outputs = self._load_file(outputs_offload.file)
domain_model.set_truncated_outputs(outputs)
process_data_offload = _find_first(offload_data, _filter_by_offload_type(ExecutionOffLoadType.PROCESS_DATA))
if process_data_offload is not None:
assert process_data_offload.file is not None
domain_model.process_data = self._load_file(process_data_offload.file)
domain_model.set_truncated_process_data(process_data)
return domain_model
def _load_file(self, file: UploadFile) -> Mapping[str, Any]:
    """Load an offloaded JSON payload from storage and deserialize it.

    Reads the raw bytes at the upload file's storage key and parses them as JSON.
    """
    raw = storage.load(file.key)
    parsed = json.loads(raw)
    return parsed
@staticmethod
def _json_encode(values: Mapping[str, Any]) -> str:
    """Serialize *values* to a JSON string, converting workflow runtime types first."""
    encodable = WorkflowRuntimeTypeConverter().to_json_encodable(values)
    return json.dumps(encodable)
def _to_db_model(self, domain_model: WorkflowNodeExecution) -> WorkflowNodeExecutionModel:
"""
Convert a domain model to a database model.
Convert a domain model to a database model. This copies the inputs /
process_data / outputs from domain model directly without applying truncation.
Args:
domain_model: The domain model to convert
Returns:
The database model
The database model, without setting inputs, process_data and outputs fields.
"""
# Use values from constructor if provided
if not self._triggered_from:
@@ -152,7 +215,9 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
if not self._creator_user_role:
raise ValueError("created_by_role is required in repository constructor")
json_converter = WorkflowRuntimeTypeConverter()
converter = WorkflowRuntimeTypeConverter()
# json_converter = WorkflowRuntimeTypeConverter()
db_model = WorkflowNodeExecutionModel()
db_model.id = domain_model.id
db_model.tenant_id = self._tenant_id
@@ -168,16 +233,21 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
db_model.node_type = domain_model.node_type
db_model.title = domain_model.title
db_model.inputs = (
json.dumps(json_converter.to_json_encodable(domain_model.inputs)) if domain_model.inputs else None
_deterministic_json_dump(converter.to_json_encodable(domain_model.inputs))
if domain_model.inputs is not None
else None
)
db_model.process_data = (
json.dumps(json_converter.to_json_encodable(domain_model.process_data))
if domain_model.process_data
_deterministic_json_dump(converter.to_json_encodable(domain_model.process_data))
if domain_model.process_data is not None
else None
)
db_model.outputs = (
json.dumps(json_converter.to_json_encodable(domain_model.outputs)) if domain_model.outputs else None
_deterministic_json_dump(converter.to_json_encodable(domain_model.outputs))
if domain_model.outputs is not None
else None
)
# inputs, process_data and outputs are handled below
db_model.status = domain_model.status
db_model.error = domain_model.error
db_model.elapsed_time = domain_model.elapsed_time
@@ -188,6 +258,7 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
db_model.created_by_role = self._creator_user_role
db_model.created_by = self._creator_user_id
db_model.finished_at = domain_model.finished_at
return db_model
def _is_duplicate_key_error(self, exception: BaseException) -> bool:
@@ -203,22 +274,78 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
db_model.id = new_id
execution.id = new_id
def save(self, execution: WorkflowNodeExecution):
def _truncate_and_upload(
    self,
    values: Mapping[str, Any] | None,
    execution_id: str,
    type_: ExecutionOffLoadType,
) -> _InputsOutputsTruncationResult | None:
    """Truncate *values* if oversized and offload the full content to a file.

    Args:
        values: The inputs/outputs/process_data mapping, or None.
        execution_id: ID of the node execution the data belongs to.
        type_: Which field (inputs / outputs / process_data) is being offloaded.

    Returns:
        The truncation result (truncated mapping, upload file, offload record),
        or None when *values* is None or no truncation was necessary.
    """
    if values is None:
        return None
    converter = WorkflowRuntimeTypeConverter()
    json_encodable_value = converter.to_json_encodable(values)
    truncator = self._create_truncator()
    truncated_values, truncated = truncator.truncate_variable_mapping(json_encodable_value)
    # Nothing exceeded the configured limits; caller keeps the original value inline.
    if not truncated:
        return None
    # Serialize the FULL (untruncated) value deterministically for offloaded storage.
    value_json = _deterministic_json_dump(json_encodable_value)
    assert value_json is not None, "value_json should be not None here."
    suffix = type_.value
    upload_file = self._file_service.upload_file(
        filename=f"node_execution_{execution_id}_{suffix}.json",
        content=value_json.encode("utf-8"),
        mimetype="application/json",
        user=self._user,
    )
    # Offload record ties the execution row to the uploaded full-content file.
    offload = WorkflowNodeExecutionOffload(
        id=uuidv7(),
        tenant_id=self._tenant_id,
        app_id=self._app_id,
        node_execution_id=execution_id,
        type_=type_,
        file_id=upload_file.id,
    )
    return _InputsOutputsTruncationResult(
        truncated_value=truncated_values,
        file=upload_file,
        offload=offload,
    )
def save(self, execution: WorkflowNodeExecution) -> None:
"""
Save or update a NodeExecution domain entity to the database.
This method serves as a domain-to-database adapter that:
1. Converts the domain entity to its database representation
2. Checks for existing records and updates or inserts accordingly
3. Maintains proper multi-tenancy by including tenant context during conversion
4. Updates the in-memory cache for faster subsequent lookups
5. Handles duplicate key conflicts by retrying with a new UUID v7
3. Handles truncation and offloading of large inputs/outputs
4. Persists the database model using SQLAlchemy's merge operation
5. Maintains proper multi-tenancy by including tenant context during conversion
6. Updates the in-memory cache for faster subsequent lookups
The method handles both creating new records and updating existing ones through
SQLAlchemy's merge operation.
Args:
execution: The NodeExecution domain entity to persist
"""
# NOTE: As per the implementation of `WorkflowCycleManager`,
# the `save` method is invoked multiple times during the node's execution lifecycle, including:
#
# - When the node starts execution
# - When the node retries execution
# - When the node completes execution (either successfully or with failure)
#
# Only the final invocation will have `inputs` and `outputs` populated.
#
# This simplifies the logic for saving offloaded variables but introduces a tight coupling
# between this module and `WorkflowCycleManager`.
# Convert domain model to database model using tenant context and other attributes
db_model = self.to_db_model(execution)
db_model = self._to_db_model(execution)
# Use tenacity for retry logic with duplicate key handling
@retry(
@@ -245,7 +372,6 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
# Update the in-memory cache after successful save
if db_model.node_execution_id:
logger.debug("Updating cache for node_execution_id: %s", db_model.node_execution_id)
self._node_execution_cache[db_model.node_execution_id] = db_model
except Exception:
@@ -276,14 +402,83 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
session.commit()
# Update the in-memory cache for faster subsequent lookups
# Only cache if we have a node_execution_id to use as the cache key
if db_model.node_execution_id:
self._node_execution_cache[db_model.node_execution_id] = db_model
def save_execution_data(self, execution: WorkflowNodeExecution):
domain_model = execution
with self._session_factory(expire_on_commit=False) as session:
query = WorkflowNodeExecutionModel.preload_offload_data(select(WorkflowNodeExecutionModel)).where(
WorkflowNodeExecutionModel.id == domain_model.id
)
db_model: WorkflowNodeExecutionModel | None = session.execute(query).scalars().first()
if db_model is not None:
offload_data = db_model.offload_data
else:
db_model = self._to_db_model(domain_model)
offload_data = []
offload_data = db_model.offload_data
if domain_model.inputs is not None:
result = self._truncate_and_upload(
domain_model.inputs,
domain_model.id,
ExecutionOffLoadType.INPUTS,
)
if result is not None:
db_model.inputs = self._json_encode(result.truncated_value)
domain_model.set_truncated_inputs(result.truncated_value)
offload_data = _replace_or_append_offload(offload_data, result.offload)
else:
db_model.inputs = self._json_encode(domain_model.inputs)
if domain_model.outputs is not None:
result = self._truncate_and_upload(
domain_model.outputs,
domain_model.id,
ExecutionOffLoadType.OUTPUTS,
)
if result is not None:
db_model.outputs = self._json_encode(result.truncated_value)
domain_model.set_truncated_outputs(result.truncated_value)
offload_data = _replace_or_append_offload(offload_data, result.offload)
else:
db_model.outputs = self._json_encode(domain_model.outputs)
if domain_model.process_data is not None:
result = self._truncate_and_upload(
domain_model.process_data,
domain_model.id,
ExecutionOffLoadType.PROCESS_DATA,
)
if result is not None:
db_model.process_data = self._json_encode(result.truncated_value)
domain_model.set_truncated_process_data(result.truncated_value)
offload_data = _replace_or_append_offload(offload_data, result.offload)
else:
db_model.process_data = self._json_encode(domain_model.process_data)
db_model.offload_data = offload_data
with self._session_factory() as session, session.begin():
session.merge(db_model)
session.flush()
def get_db_models_by_workflow_run(
self,
workflow_run_id: str,
order_config: OrderConfig | None = None,
triggered_from: WorkflowNodeExecutionTriggeredFrom = WorkflowNodeExecutionTriggeredFrom.WORKFLOW_RUN,
) -> Sequence[WorkflowNodeExecutionModel]:
"""
Retrieve all WorkflowNodeExecution database models for a specific workflow run.
The returned models have `offload_data` preloaded, along with the associated
`inputs_file` and `outputs_file` data.
This method directly returns database models without converting to domain models,
which is useful when you need to access database-specific fields like triggered_from.
It also updates the in-memory cache with the retrieved models.
@@ -298,10 +493,11 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
A list of WorkflowNodeExecution database models
"""
with self._session_factory() as session:
stmt = select(WorkflowNodeExecutionModel).where(
stmt = WorkflowNodeExecutionModel.preload_offload_data_and_files(select(WorkflowNodeExecutionModel))
stmt = stmt.where(
WorkflowNodeExecutionModel.workflow_run_id == workflow_run_id,
WorkflowNodeExecutionModel.tenant_id == self._tenant_id,
WorkflowNodeExecutionModel.triggered_from == WorkflowNodeExecutionTriggeredFrom.WORKFLOW_RUN,
WorkflowNodeExecutionModel.triggered_from == triggered_from,
)
if self._app_id:
@@ -335,6 +531,7 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
self,
workflow_run_id: str,
order_config: OrderConfig | None = None,
triggered_from: WorkflowNodeExecutionTriggeredFrom = WorkflowNodeExecutionTriggeredFrom.WORKFLOW_RUN,
) -> Sequence[WorkflowNodeExecution]:
"""
Retrieve all NodeExecution instances for a specific workflow run.
@@ -352,12 +549,48 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
A list of NodeExecution instances
"""
# Get the database models using the new method
db_models = self.get_db_models_by_workflow_run(workflow_run_id, order_config)
db_models = self.get_db_models_by_workflow_run(workflow_run_id, order_config, triggered_from)
# Convert database models to domain models
domain_models = []
for model in db_models:
domain_model = self._to_domain_model(model)
domain_models.append(domain_model)
with ThreadPoolExecutor(max_workers=10) as executor:
domain_models = executor.map(self._to_domain_model, db_models, timeout=30)
return domain_models
return list(domain_models)
def _deterministic_json_dump(value: Mapping[str, Any]) -> str:
return json.dumps(value, sort_keys=True)
# Generic element type used by the module-level sequence helpers.
_T = TypeVar("_T")
def _find_first(seq: Sequence[_T], pred: Callable[[_T], bool]) -> _T | None:
    """Return the first element of *seq* satisfying *pred*, or None if none does.

    Args:
        seq: The sequence to search.
        pred: Predicate applied to each element.

    Returns:
        The first matching element, or None when no element matches.
    """
    # Short-circuit on the first match instead of materializing the whole
    # filtered list (the original built a full list just to take element 0).
    return next((item for item in seq if pred(item)), None)
def _filter_by_offload_type(offload_type: ExecutionOffLoadType) -> Callable[[WorkflowNodeExecutionOffload], bool]:
    """Build a predicate that is true for offload records of *offload_type*."""

    def matches(candidate: WorkflowNodeExecutionOffload) -> bool:
        return candidate.type_ == offload_type

    return matches
def _replace_or_append_offload(
    seq: list[WorkflowNodeExecutionOffload], elem: WorkflowNodeExecutionOffload
) -> list[WorkflowNodeExecutionOffload]:
    """Return a new list with at most one offload record per type.

    All existing entries whose ``type_`` equals ``elem.type_`` are dropped and
    *elem* is appended at the end; entries of other types are kept in order.
    The input list is not mutated.

    Args:
        seq: The existing offload records.
        elem: The new offload record to insert.

    Returns:
        A new list containing the entries of *seq* with a different type,
        followed by *elem*.
    """
    # NOTE: the previous docstring referenced a nonexistent `eq_func` parameter;
    # replacement is (and always was) keyed on the record's `type_` attribute.
    kept = [record for record in seq if record.type_ != elem.type_]
    kept.append(elem)
    return kept