feat: knowledge pipeline (#25360)

Signed-off-by: -LAN- <laipz8200@outlook.com> Co-authored-by: twwu <twwu@dify.ai> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> Co-authored-by: jyong <718720800@qq.com> Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com> Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com> Co-authored-by: lyzno1 <yuanyouhuilyz@gmail.com> Co-authored-by: quicksand <quicksandzn@gmail.com> Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com> Co-authored-by: lyzno1 <92089059+lyzno1@users.noreply.github.com> Co-authored-by: zxhlyh <jasonapring2015@outlook.com> Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: nite-knite <nkCoding@gmail.com> Co-authored-by: Hanqing Zhao <sherry9277@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry <xh001x@hotmail.com>
2025-09-18 12:49:10 +08:00
parent 7dadb33003
commit 85cda47c70
1772 changed files with 102407 additions and 31710 deletions
--- a/api/core/workflow/entities/__init__.py
+++ b/api/core/workflow/entities/__init__.py
@@ -0,0 +1,18 @@
+from .agent import AgentNodeStrategyInit
+from .graph_init_params import GraphInitParams
+from .graph_runtime_state import GraphRuntimeState
+from .run_condition import RunCondition
+from .variable_pool import VariablePool, VariableValue
+from .workflow_execution import WorkflowExecution
+from .workflow_node_execution import WorkflowNodeExecution
+
+__all__ = [  # public API of the package: exactly the names imported from the sibling modules above
+    "AgentNodeStrategyInit",
+    "GraphInitParams",
+    "GraphRuntimeState",
+    "RunCondition",
+    "VariablePool",
+    "VariableValue",
+    "WorkflowExecution",
+    "WorkflowNodeExecution",
+]
--- a/api/core/workflow/entities/agent.py
+++ b/api/core/workflow/entities/agent.py
@@ -0,0 +1,8 @@
+from pydantic import BaseModel
+
+
+class AgentNodeStrategyInit(BaseModel):
+    """Agent node strategy initialization data."""
+
+    name: str  # required: no default is declared
+    icon: str | None = None  # optional: defaults to None when omitted
--- a/api/core/workflow/entities/graph_init_params.py
+++ b/api/core/workflow/entities/graph_init_params.py
@@ -0,0 +1,20 @@
+from collections.abc import Mapping
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class GraphInitParams(BaseModel):
+    """Graph init params; every field is required (each one is declared with ``Field(...)``)."""
+    tenant_id: str = Field(..., description="tenant / workspace id")
+    app_id: str = Field(..., description="app id")
+    workflow_id: str = Field(..., description="workflow id")
+    graph_config: Mapping[str, Any] = Field(..., description="graph config")
+    user_id: str = Field(..., description="user id")
+    user_from: str = Field(
+        ..., description="user from, account or end-user"
+    )  # Should be UserFrom enum: 'account' | 'end-user'
+    invoke_from: str = Field(
+        ..., description="invoke from, service-api, web-app, explore or debugger"
+    )  # Should be InvokeFrom enum: 'service-api' | 'web-app' | 'explore' | 'debugger'
+    call_depth: int = Field(..., description="call depth")
--- a/api/core/workflow/entities/graph_runtime_state.py
+++ b/api/core/workflow/entities/graph_runtime_state.py
@@ -0,0 +1,160 @@
+from copy import deepcopy
+
+from pydantic import BaseModel, PrivateAttr
+
+from core.model_runtime.entities.llm_entities import LLMUsage
+
+from .variable_pool import VariablePool
+
+
+class GraphRuntimeState(BaseModel):
+    """Graph runtime state held in private attributes and exposed through validating, copying accessors."""
+
+    # Private attributes to prevent direct modification
+    _variable_pool: VariablePool = PrivateAttr()
+    _start_at: float = PrivateAttr()
+    _total_tokens: int = PrivateAttr(default=0)
+    _llm_usage: LLMUsage = PrivateAttr(default_factory=LLMUsage.empty_usage)
+    _outputs: dict[str, object] = PrivateAttr(default_factory=dict[str, object])
+    _node_run_steps: int = PrivateAttr(default=0)
+    _ready_queue_json: str = PrivateAttr()
+    _graph_execution_json: str = PrivateAttr()
+    _response_coordinator_json: str = PrivateAttr()
+
+    def __init__(
+        self,
+        *,
+        variable_pool: VariablePool,
+        start_at: float,
+        total_tokens: int = 0,
+        llm_usage: LLMUsage | None = None,
+        outputs: dict[str, object] | None = None,
+        node_run_steps: int = 0,
+        ready_queue_json: str = "",
+        graph_execution_json: str = "",
+        response_coordinator_json: str = "",
+        **kwargs: object,
+    ) -> None:
+        """Initialize the state; raises ValueError if total_tokens or node_run_steps is negative."""
+        super().__init__(**kwargs)
+
+        # Initialize private attributes with validation
+        self._variable_pool = variable_pool
+
+        self._start_at = start_at
+
+        if total_tokens < 0:
+            raise ValueError("total_tokens must be non-negative")
+        self._total_tokens = total_tokens
+
+        # Copy, as the llm_usage setter does, so the caller's object is never aliased
+        self._llm_usage = LLMUsage.empty_usage() if llm_usage is None else llm_usage.model_copy()
+
+        # Deep-copy so later changes to the caller's dict do not reach this state
+        self._outputs = {} if outputs is None else deepcopy(outputs)
+
+        if node_run_steps < 0:
+            raise ValueError("node_run_steps must be non-negative")
+        self._node_run_steps = node_run_steps
+
+        self._ready_queue_json = ready_queue_json
+        self._graph_execution_json = graph_execution_json
+        self._response_coordinator_json = response_coordinator_json
+
+    @property
+    def variable_pool(self) -> VariablePool:
+        """Get the variable pool (the stored instance itself, not a copy)."""
+        return self._variable_pool
+
+    @property
+    def start_at(self) -> float:
+        """Get the start time."""
+        return self._start_at
+
+    @start_at.setter
+    def start_at(self, value: float) -> None:
+        """Set the start time."""
+        self._start_at = value
+
+    @property
+    def total_tokens(self) -> int:
+        """Get the total tokens count."""
+        return self._total_tokens
+
+    @total_tokens.setter
+    def total_tokens(self, value: int) -> None:
+        """Set the total tokens count; raises ValueError if negative."""
+        if value < 0:
+            raise ValueError("total_tokens must be non-negative")
+        self._total_tokens = value
+
+    @property
+    def llm_usage(self) -> LLMUsage:
+        """Get the LLM usage info."""
+        # Return a copy to prevent external modification
+        return self._llm_usage.model_copy()
+
+    @llm_usage.setter
+    def llm_usage(self, value: LLMUsage) -> None:
+        """Set the LLM usage info (a copy of ``value`` is stored)."""
+        self._llm_usage = value.model_copy()
+
+    @property
+    def outputs(self) -> dict[str, object]:
+        """Get a deep copy of the outputs dictionary."""
+        return deepcopy(self._outputs)
+
+    @outputs.setter
+    def outputs(self, value: dict[str, object]) -> None:
+        """Replace the outputs dictionary with a deep copy of ``value``."""
+        self._outputs = deepcopy(value)
+
+    def set_output(self, key: str, value: object) -> None:
+        """Set a single output value (stored as a deep copy)."""
+        self._outputs[key] = deepcopy(value)
+
+    def get_output(self, key: str, default: object = None) -> object:
+        """Get a deep copy of a single output value, or of ``default`` if the key is missing."""
+        return deepcopy(self._outputs.get(key, default))
+
+    def update_outputs(self, updates: dict[str, object]) -> None:
+        """Update multiple output values, deep-copying each one."""
+        for key, value in updates.items():
+            self._outputs[key] = deepcopy(value)
+
+    @property
+    def node_run_steps(self) -> int:
+        """Get the node run steps count."""
+        return self._node_run_steps
+
+    @node_run_steps.setter
+    def node_run_steps(self, value: int) -> None:
+        """Set the node run steps count; raises ValueError if negative."""
+        if value < 0:
+            raise ValueError("node_run_steps must be non-negative")
+        self._node_run_steps = value
+
+    def increment_node_run_steps(self) -> None:
+        """Increment the node run steps by 1."""
+        self._node_run_steps += 1
+
+    def add_tokens(self, tokens: int) -> None:
+        """Add tokens to the total count; raises ValueError if ``tokens`` is negative."""
+        if tokens < 0:
+            raise ValueError("tokens must be non-negative")
+        self._total_tokens += tokens
+
+    @property
+    def ready_queue_json(self) -> str:
+        """Get the serialized ready queue state (read-only: no setter is defined)."""
+        return self._ready_queue_json
+
+    @property
+    def graph_execution_json(self) -> str:
+        """Get the serialized graph execution state (read-only: no setter is defined)."""
+        return self._graph_execution_json
+
+    @property
+    def response_coordinator_json(self) -> str:
+        """Get the serialized response coordinator state (read-only: no setter is defined)."""
+        return self._response_coordinator_json
--- a/api/core/workflow/entities/node_entities.py
+++ b/api/core/workflow/entities/node_entities.py
@@ -1,34 +0,0 @@
-from collections.abc import Mapping
-from typing import Any
-
-from pydantic import BaseModel
-
-from core.model_runtime.entities.llm_entities import LLMUsage
-from core.workflow.entities.workflow_node_execution import WorkflowNodeExecutionMetadataKey, WorkflowNodeExecutionStatus
-
-
-class NodeRunResult(BaseModel):
-    """
-    Node Run Result.
-    """
-
-    status: WorkflowNodeExecutionStatus = WorkflowNodeExecutionStatus.RUNNING
-
-    inputs: Mapping[str, Any] | None = None  # node inputs
-    process_data: Mapping[str, Any] | None = None  # process data
-    outputs: Mapping[str, Any] | None = None  # node outputs
-    metadata: Mapping[WorkflowNodeExecutionMetadataKey, Any] | None = None  # node metadata
-    llm_usage: LLMUsage | None = None  # llm usage
-
-    edge_source_handle: str | None = None  # source handle id of node with multiple branches
-
-    error: str | None = None  # error message if status is failed
-    error_type: str | None = None  # error type if status is failed
-
-    # single step node run retry
-    retry_index: int = 0
-
-
-class AgentNodeStrategyInit(BaseModel):
-    name: str
-    icon: str | None = None
--- a/api/core/workflow/entities/run_condition.py
+++ b/api/core/workflow/entities/run_condition.py
@@ -0,0 +1,21 @@
+import hashlib
+from typing import Literal
+
+from pydantic import BaseModel
+
+from core.workflow.utils.condition.entities import Condition
+
+
+class RunCondition(BaseModel):
+    type: Literal["branch_identify", "condition"]
+    """condition type"""
+
+    branch_identify: str | None = None
+    """branch identify like: sourceHandle, required when type is branch_identify"""
+
+    conditions: list[Condition] | None = None
+    """conditions to run the node, required when type is condition"""
+
+    @property
+    def hash(self) -> str:  # SHA-256 hex digest of this model's JSON dump
+        return hashlib.sha256(self.model_dump_json().encode()).hexdigest()
--- a/api/core/workflow/entities/variable_entities.py
+++ b/api/core/workflow/entities/variable_entities.py
@@ -1,12 +0,0 @@
-from collections.abc import Sequence
-
-from pydantic import BaseModel
-
-
-class VariableSelector(BaseModel):
-    """
-    Variable Selector.
-    """
-
-    variable: str
-    value_selector: Sequence[str]
--- a/api/core/workflow/entities/variable_pool.py
+++ b/api/core/workflow/entities/variable_pool.py
@@ -9,12 +9,17 @@ from core.file import File, FileAttribute, file_manager
 from core.variables import Segment, SegmentGroup, Variable
 from core.variables.consts import SELECTORS_LENGTH
 from core.variables.segments import FileSegment, ObjectSegment
-from core.variables.variables import VariableUnion
-from core.workflow.constants import CONVERSATION_VARIABLE_NODE_ID, ENVIRONMENT_VARIABLE_NODE_ID, SYSTEM_VARIABLE_NODE_ID
+from core.variables.variables import RAGPipelineVariableInput, VariableUnion
+from core.workflow.constants import (
+    CONVERSATION_VARIABLE_NODE_ID,
+    ENVIRONMENT_VARIABLE_NODE_ID,
+    RAG_PIPELINE_VARIABLE_NODE_ID,
+    SYSTEM_VARIABLE_NODE_ID,
+)
 from core.workflow.system_variable import SystemVariable
 from factories import variable_factory

-VariableValue = Union[str, int, float, dict, list, File]
+VariableValue = Union[str, int, float, dict[str, object], list[object], File]

 VARIABLE_PATTERN = re.compile(r"\{\{#([a-zA-Z0-9_]{1,50}(?:\.[a-zA-Z_][a-zA-Z0-9_]{0,29}){1,10})#\}\}")

@@ -40,10 +45,14 @@ class VariablePool(BaseModel):
    )
    environment_variables: Sequence[VariableUnion] = Field(
        description="Environment variables.",
-        default_factory=list,
+        default_factory=list[VariableUnion],
    )
    conversation_variables: Sequence[VariableUnion] = Field(
        description="Conversation variables.",
+        default_factory=list[VariableUnion],
+    )
+    rag_pipeline_variables: list[RAGPipelineVariableInput] = Field(
+        description="RAG pipeline variables.",
        default_factory=list,
    )

@@ -56,6 +65,16 @@ class VariablePool(BaseModel):
        # Add conversation variables to the variable pool
        for var in self.conversation_variables:
            self.add((CONVERSATION_VARIABLE_NODE_ID, var.name), var)
+        # Add rag pipeline variables to the variable pool
+        if self.rag_pipeline_variables:
+            rag_pipeline_variables_map: defaultdict[Any, dict[Any, Any]] = defaultdict(dict)
+            for rag_var in self.rag_pipeline_variables:
+                node_id = rag_var.variable.belong_to_node_id
+                key = rag_var.variable.variable
+                value = rag_var.value
+                rag_pipeline_variables_map[node_id][key] = value
+            for key, value in rag_pipeline_variables_map.items():
+                self.add((RAG_PIPELINE_VARIABLE_NODE_ID, key), value)

    def add(self, selector: Sequence[str], value: Any, /):
        """
@@ -191,7 +210,7 @@ class VariablePool(BaseModel):

    def convert_template(self, template: str, /):
        parts = VARIABLE_PATTERN.split(template)
-        segments = []
+        segments: list[Segment] = []
        for part in filter(lambda x: x, parts):
            if "." in part and (variable := self.get(part.split("."))):
                segments.append(variable)
--- a/api/core/workflow/entities/workflow_execution.py
+++ b/api/core/workflow/entities/workflow_execution.py
@@ -7,31 +7,14 @@ implementation details like tenant_id, app_id, etc.

 from collections.abc import Mapping
 from datetime import datetime
-from enum import StrEnum
 from typing import Any

 from pydantic import BaseModel, Field

+from core.workflow.enums import WorkflowExecutionStatus, WorkflowType
 from libs.datetime_utils import naive_utc_now


-class WorkflowType(StrEnum):
-    """
-    Workflow Type Enum for domain layer
-    """
-
-    WORKFLOW = "workflow"
-    CHAT = "chat"
-
-
-class WorkflowExecutionStatus(StrEnum):
-    RUNNING = "running"
-    SUCCEEDED = "succeeded"
-    FAILED = "failed"
-    STOPPED = "stopped"
-    PARTIAL_SUCCEEDED = "partial-succeeded"
-
-
 class WorkflowExecution(BaseModel):
    """
    Domain model for workflow execution based on WorkflowRun but without
--- a/api/core/workflow/entities/workflow_node_execution.py
+++ b/api/core/workflow/entities/workflow_node_execution.py
@@ -8,49 +8,11 @@ and don't contain implementation details like tenant_id, app_id, etc.

 from collections.abc import Mapping
 from datetime import datetime
-from enum import StrEnum
 from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, PrivateAttr

-from core.workflow.nodes.enums import NodeType
-
-
-class WorkflowNodeExecutionMetadataKey(StrEnum):
-    """
-    Node Run Metadata Key.
-    """
-
-    TOTAL_TOKENS = "total_tokens"
-    TOTAL_PRICE = "total_price"
-    CURRENCY = "currency"
-    TOOL_INFO = "tool_info"
-    AGENT_LOG = "agent_log"
-    ITERATION_ID = "iteration_id"
-    ITERATION_INDEX = "iteration_index"
-    LOOP_ID = "loop_id"
-    LOOP_INDEX = "loop_index"
-    PARALLEL_ID = "parallel_id"
-    PARALLEL_START_NODE_ID = "parallel_start_node_id"
-    PARENT_PARALLEL_ID = "parent_parallel_id"
-    PARENT_PARALLEL_START_NODE_ID = "parent_parallel_start_node_id"
-    PARALLEL_MODE_RUN_ID = "parallel_mode_run_id"
-    ITERATION_DURATION_MAP = "iteration_duration_map"  # single iteration duration if iteration node runs
-    LOOP_DURATION_MAP = "loop_duration_map"  # single loop duration if loop node runs
-    ERROR_STRATEGY = "error_strategy"  # node in continue on error mode return the field
-    LOOP_VARIABLE_MAP = "loop_variable_map"  # single loop variable output
-
-
-class WorkflowNodeExecutionStatus(StrEnum):
-    """
-    Node Execution Status Enum.
-    """
-
-    RUNNING = "running"
-    SUCCEEDED = "succeeded"
-    FAILED = "failed"
-    EXCEPTION = "exception"
-    RETRY = "retry"
+from core.workflow.enums import NodeType, WorkflowNodeExecutionMetadataKey, WorkflowNodeExecutionStatus


 class WorkflowNodeExecution(BaseModel):
@@ -90,6 +52,7 @@ class WorkflowNodeExecution(BaseModel):
    title: str  # Display title of the node

    # Execution data
+    # The `inputs` and `outputs` fields hold the full content
    inputs: Mapping[str, Any] | None = None  # Input variables used by this node
    process_data: Mapping[str, Any] | None = None  # Intermediate processing data
    outputs: Mapping[str, Any] | None = None  # Output variables produced by this node
@@ -106,6 +69,58 @@ class WorkflowNodeExecution(BaseModel):
    created_at: datetime  # When execution started
    finished_at: datetime | None = None  # When execution completed

+    _truncated_inputs: Mapping[str, Any] | None = PrivateAttr(None)
+    _truncated_outputs: Mapping[str, Any] | None = PrivateAttr(None)
+    _truncated_process_data: Mapping[str, Any] | None = PrivateAttr(None)
+
+    def get_truncated_inputs(self) -> Mapping[str, Any] | None:  # None unless truncated inputs were set
+        return self._truncated_inputs
+
+    def get_truncated_outputs(self) -> Mapping[str, Any] | None:  # None unless truncated outputs were set
+        return self._truncated_outputs
+
+    def get_truncated_process_data(self) -> Mapping[str, Any] | None:  # None unless truncated data was set
+        return self._truncated_process_data
+
+    def set_truncated_inputs(self, truncated_inputs: Mapping[str, Any] | None):
+        self._truncated_inputs = truncated_inputs  # storing None makes inputs_truncated False again
+
+    def set_truncated_outputs(self, truncated_outputs: Mapping[str, Any] | None):
+        self._truncated_outputs = truncated_outputs  # storing None makes outputs_truncated False again
+
+    def set_truncated_process_data(self, truncated_process_data: Mapping[str, Any] | None):
+        self._truncated_process_data = truncated_process_data  # None makes process_data_truncated False again
+
+    def get_response_inputs(self) -> Mapping[str, Any] | None:
+        """Return the truncated inputs whenever one was set (even an empty mapping), else the full ``inputs``."""
+        if self._truncated_inputs is not None:  # identity test, not truthiness: must agree with inputs_truncated
+            return self._truncated_inputs
+        return self.inputs
+
+    @property
+    def inputs_truncated(self):  # True once truncated inputs are stored, even an empty mapping
+        return self._truncated_inputs is not None
+
+    @property
+    def outputs_truncated(self):  # True once truncated outputs are stored, even an empty mapping
+        return self._truncated_outputs is not None
+
+    @property
+    def process_data_truncated(self):  # True once truncated process data is stored, even an empty mapping
+        return self._truncated_process_data is not None
+
+    def get_response_outputs(self) -> Mapping[str, Any] | None:
+        """Return the truncated outputs when one was set, otherwise the full ``outputs``."""
+        if self._truncated_outputs is None:
+            return self.outputs
+        return self._truncated_outputs
+
+    def get_response_process_data(self) -> Mapping[str, Any] | None:
+        """Return the truncated process data when one was set, otherwise the full ``process_data``."""
+        if self._truncated_process_data is None:
+            return self.process_data
+        return self._truncated_process_data
+
    def update_from_mapping(
        self,
        inputs: Mapping[str, Any] | None = None,