feat: knowledge pipeline (#25360)

Signed-off-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: twwu <twwu@dify.ai>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com>
Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
Co-authored-by: lyzno1 <yuanyouhuilyz@gmail.com>
Co-authored-by: quicksand <quicksandzn@gmail.com>
Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: lyzno1 <92089059+lyzno1@users.noreply.github.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: nite-knite <nkCoding@gmail.com>
Co-authored-by: Hanqing Zhao <sherry9277@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry <xh001x@hotmail.com>
-LAN- authored on 2025-09-18 12:49:10 +08:00, committed by GitHub
parent 7dadb33003
commit 85cda47c70
1772 changed files with 102407 additions and 31710 deletions

View File

@@ -29,10 +29,10 @@ class Jieba(BaseKeyword):
        with redis_client.lock(lock_name, timeout=600):
            keyword_table_handler = JiebaKeywordTableHandler()
            keyword_table = self._get_dataset_keyword_table()
+            keyword_number = self.dataset.keyword_number or self._config.max_keywords_per_chunk
            for text in texts:
-                keywords = keyword_table_handler.extract_keywords(
-                    text.page_content, self._config.max_keywords_per_chunk
-                )
+                keywords = keyword_table_handler.extract_keywords(text.page_content, keyword_number)
                if text.metadata is not None:
                    self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords))
                    keyword_table = self._add_text_to_keyword_table(
@@ -50,18 +50,15 @@ class Jieba(BaseKeyword):
            keyword_table = self._get_dataset_keyword_table()
            keywords_list = kwargs.get("keywords_list")
+            keyword_number = self.dataset.keyword_number or self._config.max_keywords_per_chunk
            for i in range(len(texts)):
                text = texts[i]
                if keywords_list:
                    keywords = keywords_list[i]
                    if not keywords:
-                        keywords = keyword_table_handler.extract_keywords(
-                            text.page_content, self._config.max_keywords_per_chunk
-                        )
+                        keywords = keyword_table_handler.extract_keywords(text.page_content, keyword_number)
                else:
-                    keywords = keyword_table_handler.extract_keywords(
-                        text.page_content, self._config.max_keywords_per_chunk
-                    )
+                    keywords = keyword_table_handler.extract_keywords(text.page_content, keyword_number)
                if text.metadata is not None:
                    self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords))
                    keyword_table = self._add_text_to_keyword_table(
@@ -238,7 +235,9 @@ class Jieba(BaseKeyword):
                    keyword_table or {}, segment.index_node_id, pre_segment_data["keywords"]
                )
            else:
-                keywords = keyword_table_handler.extract_keywords(segment.content, self._config.max_keywords_per_chunk)
+                keyword_number = self.dataset.keyword_number or self._config.max_keywords_per_chunk
+                keywords = keyword_table_handler.extract_keywords(segment.content, keyword_number)
                segment.keywords = list(keywords)
            keyword_table = self._add_text_to_keyword_table(
                keyword_table or {}, segment.index_node_id, list(keywords)
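
All three hunks introduce the same fallback: a per-dataset keyword_number overrides the global cap when set. A minimal sketch of the precedence, with hypothetical stand-ins for the real models:

class DatasetStub:
    keyword_number = 15  # per-dataset override; None means "use the global default"

class ConfigStub:
    max_keywords_per_chunk = 10

dataset, config = DatasetStub(), ConfigStub()
keyword_number = dataset.keyword_number or config.max_keywords_per_chunk
assert keyword_number == 15  # with keyword_number = None this falls back to 10
# note: `or` also treats 0 as unset, so a dataset cannot request zero keywords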

View File

@@ -0,0 +1,38 @@
from collections.abc import Mapping
from enum import Enum
from typing import Any

from pydantic import BaseModel, Field


class DatasourceStreamEvent(Enum):
    """
    Datasource Stream event
    """

    PROCESSING = "datasource_processing"
    COMPLETED = "datasource_completed"
    ERROR = "datasource_error"


class BaseDatasourceEvent(BaseModel):
    pass


class DatasourceErrorEvent(BaseDatasourceEvent):
    event: str = DatasourceStreamEvent.ERROR.value
    error: str = Field(..., description="error message")


class DatasourceCompletedEvent(BaseDatasourceEvent):
    event: str = DatasourceStreamEvent.COMPLETED.value
    data: Mapping[str, Any] | list = Field(..., description="result")
    total: int | None = Field(default=0, description="total")
    completed: int | None = Field(default=0, description="completed")
    time_consuming: float | None = Field(default=0.0, description="time consuming")


class DatasourceProcessingEvent(BaseDatasourceEvent):
    event: str = DatasourceStreamEvent.PROCESSING.value
    total: int | None = Field(..., description="total")
    completed: int | None = Field(..., description="completed")
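
A brief usage sketch, not part of the commit, showing how these events might be serialized for a streaming response (model_dump_json is standard Pydantic v2):

def demo_stream():
    # illustrative only: one JSON line per datasource event
    yield DatasourceProcessingEvent(total=10, completed=3).model_dump_json()
    yield DatasourceCompletedEvent(data={"pages": []}, total=10, completed=10, time_consuming=1.5).model_dump_json()

for line in demo_stream():
    print(line)  # e.g. {"event":"datasource_processing","total":10,"completed":3}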

View File

@@ -9,6 +9,7 @@ class NotionInfo(BaseModel):
Notion import info.
"""
credential_id: str | None = None
notion_workspace_id: str
notion_obj_id: str
notion_page_type: str

View File

@@ -171,6 +171,7 @@ class ExtractProcessor:
notion_page_type=extract_setting.notion_info.notion_page_type,
document_model=extract_setting.notion_info.document,
tenant_id=extract_setting.notion_info.tenant_id,
credential_id=extract_setting.notion_info.credential_id,
)
return extractor.extract()
elif extract_setting.datasource_type == DatasourceType.WEBSITE.value:

View File

@@ -15,7 +15,14 @@ class FirecrawlWebExtractor(BaseExtractor):
only_main_content: Only return the main content of the page excluding headers, navs, footers, etc.
"""
-    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
+    def __init__(
+        self,
+        url: str,
+        job_id: str,
+        tenant_id: str,
+        mode: str = "crawl",
+        only_main_content: bool = True,
+    ):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id

View File

@@ -8,7 +8,14 @@ class JinaReaderWebExtractor(BaseExtractor):
Crawl and scrape websites and return content in clean llm-ready markdown.
"""
-    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
+    def __init__(
+        self,
+        url: str,
+        job_id: str,
+        tenant_id: str,
+        mode: str = "crawl",
+        only_main_content: bool = False,
+    ):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id

View File

@@ -4,14 +4,13 @@ import operator
from typing import Any, cast

import requests
-from sqlalchemy import select

from configs import dify_config
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_database import db
from models.dataset import Document as DocumentModel
-from models.source import DataSourceOauthBinding
+from services.datasource_provider_service import DatasourceProviderService

logger = logging.getLogger(__name__)
@@ -38,16 +37,18 @@ class NotionExtractor(BaseExtractor):
        tenant_id: str,
        document_model: DocumentModel | None = None,
        notion_access_token: str | None = None,
+        credential_id: str | None = None,
    ):
        self._notion_access_token = None
        self._document_model = document_model
        self._notion_workspace_id = notion_workspace_id
        self._notion_obj_id = notion_obj_id
        self._notion_page_type = notion_page_type
+        self._credential_id = credential_id
        if notion_access_token:
            self._notion_access_token = notion_access_token
        else:
-            self._notion_access_token = self._get_access_token(tenant_id, self._notion_workspace_id)
+            self._notion_access_token = self._get_access_token(tenant_id, self._credential_id)
            if not self._notion_access_token:
                integration_token = dify_config.NOTION_INTEGRATION_TOKEN
                if integration_token is None:
@@ -368,18 +369,18 @@ class NotionExtractor(BaseExtractor):
        return cast(str, data["last_edited_time"])

    @classmethod
-    def _get_access_token(cls, tenant_id: str, notion_workspace_id: str) -> str:
-        stmt = select(DataSourceOauthBinding).where(
-            DataSourceOauthBinding.tenant_id == tenant_id,
-            DataSourceOauthBinding.provider == "notion",
-            DataSourceOauthBinding.disabled == False,
-            DataSourceOauthBinding.source_info["workspace_id"] == f'"{notion_workspace_id}"',
-        )
-        data_source_binding = db.session.scalar(stmt)
-        if not data_source_binding:
-            raise Exception(
-                f"No notion data source binding found for tenant {tenant_id} and notion workspace {notion_workspace_id}"
-            )
-        return data_source_binding.access_token
+    def _get_access_token(cls, tenant_id: str, credential_id: str | None) -> str:
+        # get credential from tenant_id and credential_id
+        if not credential_id:
+            raise Exception(f"No credential id found for tenant {tenant_id}")
+        datasource_provider_service = DatasourceProviderService()
+        credential = datasource_provider_service.get_datasource_credentials(
+            tenant_id=tenant_id,
+            credential_id=credential_id,
+            provider="notion_datasource",
+            plugin_id="langgenius/notion_datasource",
+        )
+        if not credential:
+            raise Exception(f"No notion credential found for tenant {tenant_id} and credential {credential_id}")
+        return cast(str, credential["integration_secret"])
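
A hedged sketch of the new lookup path, assuming hypothetical tenant and credential ids; it mirrors the _get_access_token call above:

service = DatasourceProviderService()
credential = service.get_datasource_credentials(
    tenant_id="tenant-123",  # hypothetical id
    credential_id="cred-456",  # hypothetical id
    provider="notion_datasource",
    plugin_id="langgenius/notion_datasource",
)
access_token = credential["integration_secret"] if credential else None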

View File

@@ -16,7 +16,14 @@ class WaterCrawlWebExtractor(BaseExtractor):
only_main_content: Only return the main content of the page excluding headers, navs, footers, etc.
"""
-    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
+    def __init__(
+        self,
+        url: str,
+        job_id: str,
+        tenant_id: str,
+        mode: str = "crawl",
+        only_main_content: bool = True,
+    ):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id

View File

@@ -13,3 +13,5 @@ class MetadataDataSource(StrEnum):
upload_file = "file_upload"
website_crawl = "website"
notion_import = "notion"
local_file = "file_upload"
online_document = "online_document"
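
Note that local_file reuses the upload_file value, so Python's enum machinery treats it as an alias of the existing member rather than a new one; a short self-contained check of that behavior:

from enum import StrEnum

class MetadataDataSource(StrEnum):
    upload_file = "file_upload"
    local_file = "file_upload"  # duplicate value: becomes an alias of upload_file

assert MetadataDataSource.local_file is MetadataDataSource.upload_file
assert MetadataDataSource("file_upload") is MetadataDataSource.upload_file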

View File

@@ -1,9 +1,10 @@
"""Abstract interface for document loader implementations."""
from abc import ABC, abstractmethod
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any, Optional

from configs import dify_config
-from core.model_manager import ModelInstance
from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.models.document import Document
from core.rag.splitter.fixed_text_splitter import (
@@ -12,6 +13,10 @@ from core.rag.splitter.fixed_text_splitter import (
)
from core.rag.splitter.text_splitter import TextSplitter
from models.dataset import Dataset, DatasetProcessRule
+from models.dataset import Document as DatasetDocument
+
+if TYPE_CHECKING:
+    from core.model_manager import ModelInstance
class BaseIndexProcessor(ABC):
@@ -33,6 +38,14 @@ class BaseIndexProcessor(ABC):
def clean(self, dataset: Dataset, node_ids: list[str] | None, with_keywords: bool = True, **kwargs):
raise NotImplementedError
+    @abstractmethod
+    def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
+        raise NotImplementedError
+
+    @abstractmethod
+    def format_preview(self, chunks: Any) -> Mapping[str, Any]:
+        raise NotImplementedError
@abstractmethod
def retrieve(
self,
@@ -51,7 +64,7 @@ class BaseIndexProcessor(ABC):
max_tokens: int,
chunk_overlap: int,
separator: str,
-        embedding_model_instance: ModelInstance | None,
+        embedding_model_instance: Optional["ModelInstance"],
) -> TextSplitter:
"""
Get the NodeParser object according to the processing rule.
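
Moving ModelInstance behind TYPE_CHECKING, together with the quoted Optional["ModelInstance"] annotation, keeps the import out of the runtime path, which is the usual way to break an import cycle; the pattern in isolation (build_splitter is a hypothetical name):

from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # resolved by type checkers only; never imported at runtime
    from core.model_manager import ModelInstance

def build_splitter(embedding_model_instance: Optional["ModelInstance"]) -> None:
    # the string annotation defers resolution, so no runtime import is needed
    ...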

View File

@@ -1,18 +1,23 @@
"""Paragraph index processor."""
import uuid
from collections.abc import Mapping
from typing import Any
from core.rag.cleaner.clean_processor import CleanProcessor
from core.rag.datasource.keyword.keyword_factory import Keyword
from core.rag.datasource.retrieval_service import RetrievalService
from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.docstore.dataset_docstore import DatasetDocumentStore
from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.extractor.extract_processor import ExtractProcessor
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_base import BaseIndexProcessor
from core.rag.models.document import Document
from core.tools.utils.text_processing_utils import remove_leading_symbols
from libs import helper
from models.dataset import Dataset, DatasetProcessRule
from models.dataset import Document as DatasetDocument
from services.entities.knowledge_entities.knowledge_entities import Rule
@@ -126,3 +131,38 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
doc = Document(page_content=result.page_content, metadata=metadata)
docs.append(doc)
return docs
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
if isinstance(chunks, list):
documents = []
for content in chunks:
metadata = {
"dataset_id": dataset.id,
"document_id": document.id,
"doc_id": str(uuid.uuid4()),
"doc_hash": helper.generate_text_hash(content),
}
doc = Document(page_content=content, metadata=metadata)
documents.append(doc)
if documents:
# save node to document segment
doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
# add document segments
doc_store.add_documents(docs=documents, save_child=False)
if dataset.indexing_technique == "high_quality":
vector = Vector(dataset)
vector.create(documents)
elif dataset.indexing_technique == "economy":
keyword = Keyword(dataset)
keyword.add_texts(documents)
else:
raise ValueError("Chunks is not a list")
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
if isinstance(chunks, list):
preview = []
for content in chunks:
preview.append({"content": content})
return {"chunk_structure": IndexType.PARAGRAPH_INDEX, "preview": preview, "total_segments": len(chunks)}
else:
raise ValueError("Chunks is not a list")
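
For this processor, chunks is expected to be a plain list of strings; a hypothetical call, assuming dataset and document are already-loaded ORM objects:

processor = ParagraphIndexProcessor()
processor.index(dataset, document, chunks=["First paragraph ...", "Second paragraph ..."])
processor.format_preview(["First paragraph ..."])
# -> {"chunk_structure": IndexType.PARAGRAPH_INDEX,
#     "preview": [{"content": "First paragraph ..."}],
#     "total_segments": 1}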

View File

@@ -1,19 +1,25 @@
"""Paragraph index processor."""
import json
import uuid
from collections.abc import Mapping
from typing import Any
from configs import dify_config
from core.model_manager import ModelInstance
from core.rag.cleaner.clean_processor import CleanProcessor
from core.rag.datasource.retrieval_service import RetrievalService
from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.docstore.dataset_docstore import DatasetDocumentStore
from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.extractor.extract_processor import ExtractProcessor
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_base import BaseIndexProcessor
-from core.rag.models.document import ChildDocument, Document
+from core.rag.models.document import ChildDocument, Document, ParentChildStructureChunk
from extensions.ext_database import db
from libs import helper
-from models.dataset import ChildChunk, Dataset, DocumentSegment
+from models.dataset import ChildChunk, Dataset, DatasetProcessRule, DocumentSegment
+from models.dataset import Document as DatasetDocument
from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule
@@ -216,3 +222,65 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
child_document.page_content = child_page_content
child_nodes.append(child_document)
return child_nodes
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
parent_childs = ParentChildStructureChunk(**chunks)
documents = []
for parent_child in parent_childs.parent_child_chunks:
metadata = {
"dataset_id": dataset.id,
"document_id": document.id,
"doc_id": str(uuid.uuid4()),
"doc_hash": helper.generate_text_hash(parent_child.parent_content),
}
child_documents = []
for child in parent_child.child_contents:
child_metadata = {
"dataset_id": dataset.id,
"document_id": document.id,
"doc_id": str(uuid.uuid4()),
"doc_hash": helper.generate_text_hash(child),
}
child_documents.append(ChildDocument(page_content=child, metadata=child_metadata))
doc = Document(page_content=parent_child.parent_content, metadata=metadata, children=child_documents)
documents.append(doc)
if documents:
# update document parent mode
dataset_process_rule = DatasetProcessRule(
dataset_id=dataset.id,
mode="hierarchical",
rules=json.dumps(
{
"parent_mode": parent_childs.parent_mode,
}
),
created_by=document.created_by,
)
db.session.add(dataset_process_rule)
db.session.flush()
document.dataset_process_rule_id = dataset_process_rule.id
db.session.commit()
# save node to document segment
doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
# add document segments
doc_store.add_documents(docs=documents, save_child=True)
if dataset.indexing_technique == "high_quality":
all_child_documents = []
for doc in documents:
if doc.children:
all_child_documents.extend(doc.children)
if all_child_documents:
vector = Vector(dataset)
vector.create(all_child_documents)
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
parent_childs = ParentChildStructureChunk(**chunks)
preview = []
for parent_child in parent_childs.parent_child_chunks:
preview.append({"content": parent_child.parent_content, "child_chunks": parent_child.child_contents})
return {
"chunk_structure": IndexType.PARENT_CHILD_INDEX,
"parent_mode": parent_childs.parent_mode,
"preview": preview,
"total_segments": len(parent_childs.parent_child_chunks),
}
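
Both methods validate chunks via ParentChildStructureChunk(**chunks), so the payload is a mapping shaped like the model; an illustrative example:

chunks = {
    "parent_mode": "paragraph",
    "parent_child_chunks": [
        {
            "parent_content": "A parent passage covering one topic ...",
            "child_contents": ["first child chunk ...", "second child chunk ..."],
        },
    ],
}
parsed = ParentChildStructureChunk(**chunks)
assert parsed.parent_child_chunks[0].child_contents == ["first child chunk ...", "second child chunk ..."]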

View File

@@ -4,6 +4,8 @@ import logging
import re
import threading
import uuid
from collections.abc import Mapping
from typing import Any
import pandas as pd
from flask import Flask, current_app
@@ -13,13 +15,16 @@ from core.llm_generator.llm_generator import LLMGenerator
from core.rag.cleaner.clean_processor import CleanProcessor
from core.rag.datasource.retrieval_service import RetrievalService
from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.docstore.dataset_docstore import DatasetDocumentStore
from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.extractor.extract_processor import ExtractProcessor
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_base import BaseIndexProcessor
-from core.rag.models.document import Document
+from core.rag.models.document import Document, QAStructureChunk
from core.tools.utils.text_processing_utils import remove_leading_symbols
from libs import helper
from models.dataset import Dataset
+from models.dataset import Document as DatasetDocument
from services.entities.knowledge_entities.knowledge_entities import Rule
logger = logging.getLogger(__name__)
@@ -162,6 +167,40 @@ class QAIndexProcessor(BaseIndexProcessor):
docs.append(doc)
return docs
def index(self, dataset: Dataset, document: DatasetDocument, chunks: Any):
qa_chunks = QAStructureChunk(**chunks)
documents = []
for qa_chunk in qa_chunks.qa_chunks:
metadata = {
"dataset_id": dataset.id,
"document_id": document.id,
"doc_id": str(uuid.uuid4()),
"doc_hash": helper.generate_text_hash(qa_chunk.question),
"answer": qa_chunk.answer,
}
doc = Document(page_content=qa_chunk.question, metadata=metadata)
documents.append(doc)
if documents:
# save node to document segment
doc_store = DatasetDocumentStore(dataset=dataset, user_id=document.created_by, document_id=document.id)
doc_store.add_documents(docs=documents, save_child=False)
if dataset.indexing_technique == "high_quality":
vector = Vector(dataset)
vector.create(documents)
else:
raise ValueError("Indexing technique must be high quality.")
def format_preview(self, chunks: Any) -> Mapping[str, Any]:
qa_chunks = QAStructureChunk(**chunks)
preview = []
for qa_chunk in qa_chunks.qa_chunks:
preview.append({"question": qa_chunk.question, "answer": qa_chunk.answer})
return {
"chunk_structure": IndexType.QA_INDEX,
"qa_preview": preview,
"total_segments": len(qa_chunks.qa_chunks),
}
def _format_qa_document(self, flask_app: Flask, tenant_id: str, document_node, all_qa_documents, document_language):
format_documents = []
if document_node.page_content is None or not document_node.page_content.strip():
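
Here chunks must deserialize into QAStructureChunk, i.e. a mapping with a qa_chunks list of question/answer pairs; an illustrative payload and preview call:

chunks = {
    "qa_chunks": [
        {"question": "What does the knowledge pipeline index?", "answer": "Dataset chunks."},
    ],
}
QAIndexProcessor().format_preview(chunks)
# -> {"chunk_structure": IndexType.QA_INDEX,
#     "qa_preview": [{"question": "...", "answer": "..."}],
#     "total_segments": 1}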

View File

@@ -35,6 +35,49 @@ class Document(BaseModel):
children: list[ChildDocument] | None = None
class GeneralStructureChunk(BaseModel):
    """
    General Structure Chunk.
    """

    general_chunks: list[str]


class ParentChildChunk(BaseModel):
    """
    Parent Child Chunk.
    """

    parent_content: str
    child_contents: list[str]


class ParentChildStructureChunk(BaseModel):
    """
    Parent Child Structure Chunk.
    """

    parent_child_chunks: list[ParentChildChunk]
    parent_mode: str = "paragraph"


class QAChunk(BaseModel):
    """
    QA Chunk.
    """

    question: str
    answer: str


class QAStructureChunk(BaseModel):
    """
    QAStructureChunk.
    """

    qa_chunks: list[QAChunk]
class BaseDocumentTransformer(ABC):
"""Abstract base class for document transformation systems.

View File

@@ -5,6 +5,7 @@ class RetrievalMethod(Enum):
SEMANTIC_SEARCH = "semantic_search"
FULL_TEXT_SEARCH = "full_text_search"
HYBRID_SEARCH = "hybrid_search"
KEYWORD_SEARCH = "keyword_search"
@staticmethod
def is_support_semantic_search(retrieval_method: str) -> bool: