feat: knowledge pipeline (#25360)

Signed-off-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: twwu <twwu@dify.ai>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com>
Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
Co-authored-by: lyzno1 <yuanyouhuilyz@gmail.com>
Co-authored-by: quicksand <quicksandzn@gmail.com>
Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: lyzno1 <92089059+lyzno1@users.noreply.github.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: nite-knite <nkCoding@gmail.com>
Co-authored-by: Hanqing Zhao <sherry9277@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry <xh001x@hotmail.com>
This commit is contained in:
-LAN-
2025-09-18 12:49:10 +08:00
committed by GitHub
parent 7dadb33003
commit 85cda47c70
1772 changed files with 102407 additions and 31710 deletions

View File

@@ -9,11 +9,8 @@ from typing import Union
from sqlalchemy.engine import Engine
from sqlalchemy.orm import sessionmaker
from core.workflow.entities.workflow_execution import (
WorkflowExecution,
WorkflowExecutionStatus,
WorkflowType,
)
from core.workflow.entities import WorkflowExecution
from core.workflow.enums import WorkflowExecutionStatus, WorkflowType
from core.workflow.repositories.workflow_execution_repository import WorkflowExecutionRepository
from core.workflow.workflow_type_encoder import WorkflowRuntimeTypeConverter
from libs.helper import extract_tenant_id
@@ -203,5 +200,4 @@ class SQLAlchemyWorkflowExecutionRepository(WorkflowExecutionRepository):
session.commit()
# Update the in-memory cache for faster subsequent lookups
logger.debug("Updating cache for execution_id: %s", db_model.id)
self._execution_cache[db_model.id] = db_model

View File

@@ -2,10 +2,12 @@
SQLAlchemy implementation of the WorkflowNodeExecutionRepository.
"""
import dataclasses
import json
import logging
from collections.abc import Sequence
from typing import Union
from collections.abc import Callable, Mapping, Sequence
from concurrent.futures import ThreadPoolExecutor
from typing import Any, TypeVar, Union
import psycopg2.errors
from sqlalchemy import UnaryExpression, asc, desc, select
@@ -14,15 +16,13 @@ from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import sessionmaker
from tenacity import before_sleep_log, retry, retry_if_exception, stop_after_attempt
from configs import dify_config
from core.model_runtime.utils.encoders import jsonable_encoder
from core.workflow.entities.workflow_node_execution import (
WorkflowNodeExecution,
WorkflowNodeExecutionMetadataKey,
WorkflowNodeExecutionStatus,
)
from core.workflow.nodes.enums import NodeType
from core.workflow.entities import WorkflowNodeExecution
from core.workflow.enums import NodeType, WorkflowNodeExecutionMetadataKey, WorkflowNodeExecutionStatus
from core.workflow.repositories.workflow_node_execution_repository import OrderConfig, WorkflowNodeExecutionRepository
from core.workflow.workflow_type_encoder import WorkflowRuntimeTypeConverter
from extensions.ext_storage import storage
from libs.helper import extract_tenant_id
from libs.uuid_utils import uuidv7
from models import (
@@ -32,10 +32,22 @@ from models import (
WorkflowNodeExecutionModel,
WorkflowNodeExecutionTriggeredFrom,
)
from models.enums import ExecutionOffLoadType
from models.model import UploadFile
from models.workflow import WorkflowNodeExecutionOffload
from services.file_service import FileService
from services.variable_truncator import VariableTruncator
logger = logging.getLogger(__name__)
@dataclasses.dataclass(frozen=True)
class _InputsOutputsTruncationResult:
    """Result of truncating an oversized inputs/outputs/process_data mapping.

    Produced by ``_truncate_and_upload`` when a value exceeded the configured
    truncation limits: the truncated mapping is kept inline on the execution
    row, while the full JSON content is offloaded to an uploaded file.
    """

    # Truncated mapping stored inline on the execution row (used for API responses).
    truncated_value: Mapping[str, Any]
    # Uploaded file holding the full, untruncated JSON content.
    file: UploadFile
    # Offload record linking the node execution to the uploaded file.
    offload: WorkflowNodeExecutionOffload
class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository):
"""
SQLAlchemy implementation of the WorkflowNodeExecutionRepository interface.
@@ -86,6 +98,7 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
# Extract user context
self._triggered_from = triggered_from
self._creator_user_id = user.id
self._user = user # Store the user object directly
# Determine user role based on user type
self._creator_user_role = CreatorUserRole.ACCOUNT if isinstance(user, Account) else CreatorUserRole.END_USER
@@ -94,17 +107,30 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
# Key: node_execution_id, Value: WorkflowNodeExecution (DB model)
self._node_execution_cache: dict[str, WorkflowNodeExecutionModel] = {}
# Initialize FileService for handling offloaded data
self._file_service = FileService(session_factory)
def _create_truncator(self) -> VariableTruncator:
    """Build a VariableTruncator configured from the workflow truncation limits."""
    limits = {
        "max_size_bytes": dify_config.WORKFLOW_VARIABLE_TRUNCATION_MAX_SIZE,
        "array_element_limit": dify_config.WORKFLOW_VARIABLE_TRUNCATION_ARRAY_LENGTH,
        "string_length_limit": dify_config.WORKFLOW_VARIABLE_TRUNCATION_STRING_LENGTH,
    }
    return VariableTruncator(**limits)
def _to_domain_model(self, db_model: WorkflowNodeExecutionModel) -> WorkflowNodeExecution:
"""
Convert a database model to a domain model.
This requires the offload_data, and correspond inputs_file and outputs_file are preloaded.
Args:
db_model: The database model to convert
db_model: The database model to convert. It must have `offload_data`
and the corresponding `inputs_file` and `outputs_file` preloaded.
Returns:
The domain model
"""
# Parse JSON fields
# Parse JSON fields - these might be truncated versions
inputs = db_model.inputs_dict
process_data = db_model.process_data_dict
outputs = db_model.outputs_dict
@@ -113,7 +139,7 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
# Convert status to domain enum
status = WorkflowNodeExecutionStatus(db_model.status)
return WorkflowNodeExecution(
domain_model = WorkflowNodeExecution(
id=db_model.id,
node_execution_id=db_model.node_execution_id,
workflow_id=db_model.workflow_id,
@@ -134,15 +160,52 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
finished_at=db_model.finished_at,
)
def to_db_model(self, domain_model: WorkflowNodeExecution) -> WorkflowNodeExecutionModel:
if not db_model.offload_data:
return domain_model
offload_data = db_model.offload_data
# Store truncated versions for API responses
# TODO: consider load content concurrently.
input_offload = _find_first(offload_data, _filter_by_offload_type(ExecutionOffLoadType.INPUTS))
if input_offload is not None:
assert input_offload.file is not None
domain_model.inputs = self._load_file(input_offload.file)
domain_model.set_truncated_inputs(inputs)
outputs_offload = _find_first(offload_data, _filter_by_offload_type(ExecutionOffLoadType.OUTPUTS))
if outputs_offload is not None:
assert outputs_offload.file is not None
domain_model.outputs = self._load_file(outputs_offload.file)
domain_model.set_truncated_outputs(outputs)
process_data_offload = _find_first(offload_data, _filter_by_offload_type(ExecutionOffLoadType.PROCESS_DATA))
if process_data_offload is not None:
assert process_data_offload.file is not None
domain_model.process_data = self._load_file(process_data_offload.file)
domain_model.set_truncated_process_data(process_data)
return domain_model
def _load_file(self, file: UploadFile) -> Mapping[str, Any]:
    """Load an offloaded JSON payload from storage and deserialize it.

    Reads the raw bytes at the upload file's storage key and parses them as JSON.
    """
    raw = storage.load(file.key)
    parsed = json.loads(raw)
    return parsed
@staticmethod
def _json_encode(values: Mapping[str, Any]) -> str:
    """Serialize *values* to a JSON string, converting workflow runtime types first."""
    encodable = WorkflowRuntimeTypeConverter().to_json_encodable(values)
    return json.dumps(encodable)
def _to_db_model(self, domain_model: WorkflowNodeExecution) -> WorkflowNodeExecutionModel:
"""
Convert a domain model to a database model.
Convert a domain model to a database model. This copies the inputs /
process_data / outputs from domain model directly without applying truncation.
Args:
domain_model: The domain model to convert
Returns:
The database model
The database model, without setting inputs, process_data and outputs fields.
"""
# Use values from constructor if provided
if not self._triggered_from:
@@ -152,7 +215,9 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
if not self._creator_user_role:
raise ValueError("created_by_role is required in repository constructor")
json_converter = WorkflowRuntimeTypeConverter()
converter = WorkflowRuntimeTypeConverter()
# json_converter = WorkflowRuntimeTypeConverter()
db_model = WorkflowNodeExecutionModel()
db_model.id = domain_model.id
db_model.tenant_id = self._tenant_id
@@ -168,16 +233,21 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
db_model.node_type = domain_model.node_type
db_model.title = domain_model.title
db_model.inputs = (
json.dumps(json_converter.to_json_encodable(domain_model.inputs)) if domain_model.inputs else None
_deterministic_json_dump(converter.to_json_encodable(domain_model.inputs))
if domain_model.inputs is not None
else None
)
db_model.process_data = (
json.dumps(json_converter.to_json_encodable(domain_model.process_data))
if domain_model.process_data
_deterministic_json_dump(converter.to_json_encodable(domain_model.process_data))
if domain_model.process_data is not None
else None
)
db_model.outputs = (
json.dumps(json_converter.to_json_encodable(domain_model.outputs)) if domain_model.outputs else None
_deterministic_json_dump(converter.to_json_encodable(domain_model.outputs))
if domain_model.outputs is not None
else None
)
# inputs, process_data and outputs are handled below
db_model.status = domain_model.status
db_model.error = domain_model.error
db_model.elapsed_time = domain_model.elapsed_time
@@ -188,6 +258,7 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
db_model.created_by_role = self._creator_user_role
db_model.created_by = self._creator_user_id
db_model.finished_at = domain_model.finished_at
return db_model
def _is_duplicate_key_error(self, exception: BaseException) -> bool:
@@ -203,22 +274,78 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
db_model.id = new_id
execution.id = new_id
def save(self, execution: WorkflowNodeExecution):
def _truncate_and_upload(
    self,
    values: Mapping[str, Any] | None,
    execution_id: str,
    type_: ExecutionOffLoadType,
) -> _InputsOutputsTruncationResult | None:
    """Truncate *values* if oversized and offload the full content to a file.

    Args:
        values: The inputs/outputs/process_data mapping, or None.
        execution_id: ID of the node execution the data belongs to.
        type_: Which field (inputs / outputs / process_data) is being offloaded.

    Returns:
        The truncation result (truncated mapping, upload file, offload record),
        or None when *values* is None or no truncation was necessary.
    """
    if values is None:
        return None
    converter = WorkflowRuntimeTypeConverter()
    json_encodable_value = converter.to_json_encodable(values)
    truncator = self._create_truncator()
    truncated_values, truncated = truncator.truncate_variable_mapping(json_encodable_value)
    # Nothing exceeded the configured limits; caller keeps the original value inline.
    if not truncated:
        return None
    # Serialize the FULL (untruncated) value deterministically for offloaded storage.
    value_json = _deterministic_json_dump(json_encodable_value)
    assert value_json is not None, "value_json should be not None here."
    suffix = type_.value
    upload_file = self._file_service.upload_file(
        filename=f"node_execution_{execution_id}_{suffix}.json",
        content=value_json.encode("utf-8"),
        mimetype="application/json",
        user=self._user,
    )
    # Offload record ties the execution row to the uploaded full-content file.
    offload = WorkflowNodeExecutionOffload(
        id=uuidv7(),
        tenant_id=self._tenant_id,
        app_id=self._app_id,
        node_execution_id=execution_id,
        type_=type_,
        file_id=upload_file.id,
    )
    return _InputsOutputsTruncationResult(
        truncated_value=truncated_values,
        file=upload_file,
        offload=offload,
    )
def save(self, execution: WorkflowNodeExecution) -> None:
"""
Save or update a NodeExecution domain entity to the database.
This method serves as a domain-to-database adapter that:
1. Converts the domain entity to its database representation
2. Checks for existing records and updates or inserts accordingly
3. Maintains proper multi-tenancy by including tenant context during conversion
4. Updates the in-memory cache for faster subsequent lookups
5. Handles duplicate key conflicts by retrying with a new UUID v7
3. Handles truncation and offloading of large inputs/outputs
4. Persists the database model using SQLAlchemy's merge operation
5. Maintains proper multi-tenancy by including tenant context during conversion
6. Updates the in-memory cache for faster subsequent lookups
The method handles both creating new records and updating existing ones through
SQLAlchemy's merge operation.
Args:
execution: The NodeExecution domain entity to persist
"""
# NOTE: As per the implementation of `WorkflowCycleManager`,
# the `save` method is invoked multiple times during the node's execution lifecycle, including:
#
# - When the node starts execution
# - When the node retries execution
# - When the node completes execution (either successfully or with failure)
#
# Only the final invocation will have `inputs` and `outputs` populated.
#
# This simplifies the logic for saving offloaded variables but introduces a tight coupling
# between this module and `WorkflowCycleManager`.
# Convert domain model to database model using tenant context and other attributes
db_model = self.to_db_model(execution)
db_model = self._to_db_model(execution)
# Use tenacity for retry logic with duplicate key handling
@retry(
@@ -245,7 +372,6 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
# Update the in-memory cache after successful save
if db_model.node_execution_id:
logger.debug("Updating cache for node_execution_id: %s", db_model.node_execution_id)
self._node_execution_cache[db_model.node_execution_id] = db_model
except Exception:
@@ -276,14 +402,83 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
session.commit()
# Update the in-memory cache for faster subsequent lookups
# Only cache if we have a node_execution_id to use as the cache key
if db_model.node_execution_id:
self._node_execution_cache[db_model.node_execution_id] = db_model
def save_execution_data(self, execution: WorkflowNodeExecution):
domain_model = execution
with self._session_factory(expire_on_commit=False) as session:
query = WorkflowNodeExecutionModel.preload_offload_data(select(WorkflowNodeExecutionModel)).where(
WorkflowNodeExecutionModel.id == domain_model.id
)
db_model: WorkflowNodeExecutionModel | None = session.execute(query).scalars().first()
if db_model is not None:
offload_data = db_model.offload_data
else:
db_model = self._to_db_model(domain_model)
offload_data = []
offload_data = db_model.offload_data
if domain_model.inputs is not None:
result = self._truncate_and_upload(
domain_model.inputs,
domain_model.id,
ExecutionOffLoadType.INPUTS,
)
if result is not None:
db_model.inputs = self._json_encode(result.truncated_value)
domain_model.set_truncated_inputs(result.truncated_value)
offload_data = _replace_or_append_offload(offload_data, result.offload)
else:
db_model.inputs = self._json_encode(domain_model.inputs)
if domain_model.outputs is not None:
result = self._truncate_and_upload(
domain_model.outputs,
domain_model.id,
ExecutionOffLoadType.OUTPUTS,
)
if result is not None:
db_model.outputs = self._json_encode(result.truncated_value)
domain_model.set_truncated_outputs(result.truncated_value)
offload_data = _replace_or_append_offload(offload_data, result.offload)
else:
db_model.outputs = self._json_encode(domain_model.outputs)
if domain_model.process_data is not None:
result = self._truncate_and_upload(
domain_model.process_data,
domain_model.id,
ExecutionOffLoadType.PROCESS_DATA,
)
if result is not None:
db_model.process_data = self._json_encode(result.truncated_value)
domain_model.set_truncated_process_data(result.truncated_value)
offload_data = _replace_or_append_offload(offload_data, result.offload)
else:
db_model.process_data = self._json_encode(domain_model.process_data)
db_model.offload_data = offload_data
with self._session_factory() as session, session.begin():
session.merge(db_model)
session.flush()
def get_db_models_by_workflow_run(
self,
workflow_run_id: str,
order_config: OrderConfig | None = None,
triggered_from: WorkflowNodeExecutionTriggeredFrom = WorkflowNodeExecutionTriggeredFrom.WORKFLOW_RUN,
) -> Sequence[WorkflowNodeExecutionModel]:
"""
Retrieve all WorkflowNodeExecution database models for a specific workflow run.
The returned models have `offload_data` preloaded, along with the associated
`inputs_file` and `outputs_file` data.
This method directly returns database models without converting to domain models,
which is useful when you need to access database-specific fields like triggered_from.
It also updates the in-memory cache with the retrieved models.
@@ -298,10 +493,11 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
A list of WorkflowNodeExecution database models
"""
with self._session_factory() as session:
stmt = select(WorkflowNodeExecutionModel).where(
stmt = WorkflowNodeExecutionModel.preload_offload_data_and_files(select(WorkflowNodeExecutionModel))
stmt = stmt.where(
WorkflowNodeExecutionModel.workflow_run_id == workflow_run_id,
WorkflowNodeExecutionModel.tenant_id == self._tenant_id,
WorkflowNodeExecutionModel.triggered_from == WorkflowNodeExecutionTriggeredFrom.WORKFLOW_RUN,
WorkflowNodeExecutionModel.triggered_from == triggered_from,
)
if self._app_id:
@@ -335,6 +531,7 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
self,
workflow_run_id: str,
order_config: OrderConfig | None = None,
triggered_from: WorkflowNodeExecutionTriggeredFrom = WorkflowNodeExecutionTriggeredFrom.WORKFLOW_RUN,
) -> Sequence[WorkflowNodeExecution]:
"""
Retrieve all NodeExecution instances for a specific workflow run.
@@ -352,12 +549,48 @@ class SQLAlchemyWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository)
A list of NodeExecution instances
"""
# Get the database models using the new method
db_models = self.get_db_models_by_workflow_run(workflow_run_id, order_config)
db_models = self.get_db_models_by_workflow_run(workflow_run_id, order_config, triggered_from)
# Convert database models to domain models
domain_models = []
for model in db_models:
domain_model = self._to_domain_model(model)
domain_models.append(domain_model)
with ThreadPoolExecutor(max_workers=10) as executor:
domain_models = executor.map(self._to_domain_model, db_models, timeout=30)
return domain_models
return list(domain_models)
def _deterministic_json_dump(value: Mapping[str, Any]) -> str:
return json.dumps(value, sort_keys=True)
# Generic element type used by the module-level sequence helpers.
_T = TypeVar("_T")
def _find_first(seq: Sequence[_T], pred: Callable[[_T], bool]) -> _T | None:
    """Return the first element of *seq* satisfying *pred*, or None if none does.

    Args:
        seq: The sequence to search.
        pred: Predicate applied to each element.

    Returns:
        The first matching element, or None when no element matches.
    """
    # Short-circuit on the first match instead of materializing the whole
    # filtered list (the original built a full list just to take element 0).
    return next((item for item in seq if pred(item)), None)
def _filter_by_offload_type(offload_type: ExecutionOffLoadType) -> Callable[[WorkflowNodeExecutionOffload], bool]:
    """Build a predicate that is true for offload records of *offload_type*."""

    def matches(candidate: WorkflowNodeExecutionOffload) -> bool:
        return candidate.type_ == offload_type

    return matches
def _replace_or_append_offload(
    seq: list[WorkflowNodeExecutionOffload], elem: WorkflowNodeExecutionOffload
) -> list[WorkflowNodeExecutionOffload]:
    """Return a new list with at most one offload record per type.

    All existing entries whose ``type_`` equals ``elem.type_`` are dropped and
    *elem* is appended at the end; entries of other types are kept in order.
    The input list is not mutated.

    Args:
        seq: The existing offload records.
        elem: The new offload record to insert.

    Returns:
        A new list containing the entries of *seq* with a different type,
        followed by *elem*.
    """
    # NOTE: the previous docstring referenced a nonexistent `eq_func` parameter;
    # replacement is (and always was) keyed on the record's `type_` attribute.
    kept = [record for record in seq if record.type_ != elem.type_]
    kept.append(elem)
    return kept