feat: knowledge pipeline (#25360)

Signed-off-by: -LAN- <laipz8200@outlook.com> Co-authored-by: twwu <twwu@dify.ai> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> Co-authored-by: jyong <718720800@qq.com> Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com> Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com> Co-authored-by: lyzno1 <yuanyouhuilyz@gmail.com> Co-authored-by: quicksand <quicksandzn@gmail.com> Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com> Co-authored-by: lyzno1 <92089059+lyzno1@users.noreply.github.com> Co-authored-by: zxhlyh <jasonapring2015@outlook.com> Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: nite-knite <nkCoding@gmail.com> Co-authored-by: Hanqing Zhao <sherry9277@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry <xh001x@hotmail.com>
2025-09-18 12:49:10 +08:00
parent 7dadb33003
commit 85cda47c70
1772 changed files with 102407 additions and 31710 deletions
--- a/api/core/rag/extractor/entity/extract_setting.py
+++ b/api/core/rag/extractor/entity/extract_setting.py
@@ -9,6 +9,7 @@ class NotionInfo(BaseModel):
    Notion import info.
    """

+    credential_id: str | None = None
    notion_workspace_id: str
    notion_obj_id: str
    notion_page_type: str
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -171,6 +171,7 @@ class ExtractProcessor:
                notion_page_type=extract_setting.notion_info.notion_page_type,
                document_model=extract_setting.notion_info.document,
                tenant_id=extract_setting.notion_info.tenant_id,
+                credential_id=extract_setting.notion_info.credential_id,
            )
            return extractor.extract()
        elif extract_setting.datasource_type == DatasourceType.WEBSITE.value:
--- a/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py
+++ b/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py
@@ -15,7 +15,14 @@ class FirecrawlWebExtractor(BaseExtractor):
        only_main_content: Only return the main content of the page excluding headers, navs, footers, etc.
    """

-    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
+    def __init__(
+        self,
+        url: str,
+        job_id: str,
+        tenant_id: str,
+        mode: str = "crawl",
+        only_main_content: bool = True,
+    ):
        """Initialize with url, api_key, base_url and mode."""
        self._url = url
        self.job_id = job_id
--- a/api/core/rag/extractor/jina_reader_extractor.py
+++ b/api/core/rag/extractor/jina_reader_extractor.py
@@ -8,7 +8,14 @@ class JinaReaderWebExtractor(BaseExtractor):
    Crawl and scrape websites and return content in clean llm-ready markdown.
    """

-    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
+    def __init__(
+        self,
+        url: str,
+        job_id: str,
+        tenant_id: str,
+        mode: str = "crawl",
+        only_main_content: bool = False,
+    ):
        """Initialize with url, api_key, base_url and mode."""
        self._url = url
        self.job_id = job_id
--- a/api/core/rag/extractor/notion_extractor.py
+++ b/api/core/rag/extractor/notion_extractor.py
@@ -4,14 +4,13 @@ import operator
 from typing import Any, cast

 import requests
-from sqlalchemy import select

 from configs import dify_config
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
 from extensions.ext_database import db
 from models.dataset import Document as DocumentModel
-from models.source import DataSourceOauthBinding
+from services.datasource_provider_service import DatasourceProviderService

 logger = logging.getLogger(__name__)

@@ -38,16 +37,18 @@ class NotionExtractor(BaseExtractor):
        tenant_id: str,
        document_model: DocumentModel | None = None,
        notion_access_token: str | None = None,
+        credential_id: str | None = None,
    ):
        self._notion_access_token = None
        self._document_model = document_model
        self._notion_workspace_id = notion_workspace_id
        self._notion_obj_id = notion_obj_id
        self._notion_page_type = notion_page_type
+        self._credential_id = credential_id
        if notion_access_token:
            self._notion_access_token = notion_access_token
        else:
-            self._notion_access_token = self._get_access_token(tenant_id, self._notion_workspace_id)
+            self._notion_access_token = self._get_access_token(tenant_id, self._credential_id)
            if not self._notion_access_token:
                integration_token = dify_config.NOTION_INTEGRATION_TOKEN
                if integration_token is None:
@@ -368,18 +369,18 @@ class NotionExtractor(BaseExtractor):
        return cast(str, data["last_edited_time"])

    @classmethod
-    def _get_access_token(cls, tenant_id: str, notion_workspace_id: str) -> str:
-        stmt = select(DataSourceOauthBinding).where(
-            DataSourceOauthBinding.tenant_id == tenant_id,
-            DataSourceOauthBinding.provider == "notion",
-            DataSourceOauthBinding.disabled == False,
-            DataSourceOauthBinding.source_info["workspace_id"] == f'"{notion_workspace_id}"',
+    def _get_access_token(cls, tenant_id: str, credential_id: str | None) -> str:
+        # Look up the stored Notion credential for this tenant by credential_id.
+        if not credential_id:
+            raise Exception(f"No credential id found for tenant {tenant_id}")
+        datasource_provider_service = DatasourceProviderService()
+        credential = datasource_provider_service.get_datasource_credentials(
+            tenant_id=tenant_id,
+            credential_id=credential_id,
+            provider="notion_datasource",
+            plugin_id="langgenius/notion_datasource",
        )
-        data_source_binding = db.session.scalar(stmt)
+        if not credential:
+            raise Exception(f"No notion credential found for tenant {tenant_id} and credential {credential_id}")

-        if not data_source_binding:
-            raise Exception(
-                f"No notion data source binding found for tenant {tenant_id} and notion workspace {notion_workspace_id}"
-            )
-
-        return data_source_binding.access_token
+        return cast(str, credential["integration_secret"])
--- a/api/core/rag/extractor/watercrawl/extractor.py
+++ b/api/core/rag/extractor/watercrawl/extractor.py
@@ -16,7 +16,14 @@ class WaterCrawlWebExtractor(BaseExtractor):
        only_main_content: Only return the main content of the page excluding headers, navs, footers, etc.
    """

-    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
+    def __init__(
+        self,
+        url: str,
+        job_id: str,
+        tenant_id: str,
+        mode: str = "crawl",
+        only_main_content: bool = True,
+    ):
        """Initialize with url, api_key, base_url and mode."""
        self._url = url
        self.job_id = job_id