feat: knowledge pipeline (#25360)

Signed-off-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: twwu <twwu@dify.ai>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com>
Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
Co-authored-by: lyzno1 <yuanyouhuilyz@gmail.com>
Co-authored-by: quicksand <quicksandzn@gmail.com>
Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: lyzno1 <92089059+lyzno1@users.noreply.github.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: nite-knite <nkCoding@gmail.com>
Co-authored-by: Hanqing Zhao <sherry9277@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry <xh001x@hotmail.com>
This commit is contained in:
-LAN-
2025-09-18 12:49:10 +08:00
committed by GitHub
parent 7dadb33003
commit 85cda47c70
1772 changed files with 102407 additions and 31710 deletions

View File

@@ -9,6 +9,7 @@ class NotionInfo(BaseModel):
Notion import info.
"""
credential_id: str | None = None
notion_workspace_id: str
notion_obj_id: str
notion_page_type: str

View File

@@ -171,6 +171,7 @@ class ExtractProcessor:
notion_page_type=extract_setting.notion_info.notion_page_type,
document_model=extract_setting.notion_info.document,
tenant_id=extract_setting.notion_info.tenant_id,
credential_id=extract_setting.notion_info.credential_id,
)
return extractor.extract()
elif extract_setting.datasource_type == DatasourceType.WEBSITE.value:

View File

@@ -15,7 +15,14 @@ class FirecrawlWebExtractor(BaseExtractor):
only_main_content: Only return the main content of the page excluding headers, navs, footers, etc.
"""
def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
def __init__(
self,
url: str,
job_id: str,
tenant_id: str,
mode: str = "crawl",
only_main_content: bool = True,
):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id

View File

@@ -8,7 +8,14 @@ class JinaReaderWebExtractor(BaseExtractor):
Crawl and scrape websites and return content in clean llm-ready markdown.
"""
def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
def __init__(
self,
url: str,
job_id: str,
tenant_id: str,
mode: str = "crawl",
only_main_content: bool = False,
):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id

View File

@@ -4,14 +4,13 @@ import operator
from typing import Any, cast
import requests
from sqlalchemy import select
from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_database import db
from models.dataset import Document as DocumentModel
from models.source import DataSourceOauthBinding
from services.datasource_provider_service import DatasourceProviderService
logger = logging.getLogger(__name__)
@@ -38,16 +37,18 @@ class NotionExtractor(BaseExtractor):
tenant_id: str,
document_model: DocumentModel | None = None,
notion_access_token: str | None = None,
credential_id: str | None = None,
):
self._notion_access_token = None
self._document_model = document_model
self._notion_workspace_id = notion_workspace_id
self._notion_obj_id = notion_obj_id
self._notion_page_type = notion_page_type
self._credential_id = credential_id
if notion_access_token:
self._notion_access_token = notion_access_token
else:
self._notion_access_token = self._get_access_token(tenant_id, self._notion_workspace_id)
self._notion_access_token = self._get_access_token(tenant_id, self._credential_id)
if not self._notion_access_token:
integration_token = dify_config.NOTION_INTEGRATION_TOKEN
if integration_token is None:
@@ -368,18 +369,18 @@ class NotionExtractor(BaseExtractor):
return cast(str, data["last_edited_time"])
@classmethod
def _get_access_token(cls, tenant_id: str, notion_workspace_id: str) -> str:
stmt = select(DataSourceOauthBinding).where(
DataSourceOauthBinding.tenant_id == tenant_id,
DataSourceOauthBinding.provider == "notion",
DataSourceOauthBinding.disabled == False,
DataSourceOauthBinding.source_info["workspace_id"] == f'"{notion_workspace_id}"',
def _get_access_token(cls, tenant_id: str, credential_id: str | None) -> str:
# get credential from tenant_id and credential_id
if not credential_id:
raise Exception(f"No credential id found for tenant {tenant_id}")
datasource_provider_service = DatasourceProviderService()
credential = datasource_provider_service.get_datasource_credentials(
tenant_id=tenant_id,
credential_id=credential_id,
provider="notion_datasource",
plugin_id="langgenius/notion_datasource",
)
data_source_binding = db.session.scalar(stmt)
if not credential:
raise Exception(f"No notion credential found for tenant {tenant_id} and credential {credential_id}")
if not data_source_binding:
raise Exception(
f"No notion data source binding found for tenant {tenant_id} and notion workspace {notion_workspace_id}"
)
return data_source_binding.access_token
return cast(str, credential["integration_secret"])

View File

@@ -16,7 +16,14 @@ class WaterCrawlWebExtractor(BaseExtractor):
only_main_content: Only return the main content of the page excluding headers, navs, footers, etc.
"""
def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
def __init__(
self,
url: str,
job_id: str,
tenant_id: str,
mode: str = "crawl",
only_main_content: bool = True,
):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id