Feat/support parent child chunk (#12092)

2024-12-25 19:49:07 +08:00
parent 017d7538ae
commit 9231fdbf4c
54 changed files with 2578 additions and 808 deletions
--- a/api/core/rag/index_processor/constant/index_type.py
+++ b/api/core/rag/index_processor/constant/index_type.py
@@ -1,8 +1,7 @@
 from enum import Enum


-class IndexType(Enum):
+class IndexType(str, Enum):
    PARAGRAPH_INDEX = "text_model"
    QA_INDEX = "qa_model"
-    PARENT_CHILD_INDEX = "parent_child_index"
-    SUMMARY_INDEX = "summary_index"
+    PARENT_CHILD_INDEX = "hierarchical_model"
--- a/api/core/rag/index_processor/index_processor_base.py
+++ b/api/core/rag/index_processor/index_processor_base.py
@@ -27,10 +27,10 @@ class BaseIndexProcessor(ABC):
        raise NotImplementedError

    @abstractmethod
-    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True):
+    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True, **kwargs):
        raise NotImplementedError

-    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True):
+    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True, **kwargs):
        raise NotImplementedError

    @abstractmethod
@@ -45,26 +45,29 @@ class BaseIndexProcessor(ABC):
    ) -> list[Document]:
        raise NotImplementedError

-    def _get_splitter(self, processing_rule: dict, embedding_model_instance: Optional[ModelInstance]) -> TextSplitter:
+    def _get_splitter(
+        self,
+        processing_rule_mode: str,
+        max_tokens: int,
+        chunk_overlap: int,
+        separator: str,
+        embedding_model_instance: Optional[ModelInstance],
+    ) -> TextSplitter:
        """
        Get the NodeParser object according to the processing rule.
        """
-        character_splitter: TextSplitter
-        if processing_rule["mode"] == "custom":
+        if processing_rule_mode in ["custom", "hierarchical"]:
            # The user-defined segmentation rule
-            rules = processing_rule["rules"]
-            segmentation = rules["segmentation"]
            max_segmentation_tokens_length = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
-            if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > max_segmentation_tokens_length:
+            if max_tokens < 50 or max_tokens > max_segmentation_tokens_length:
                raise ValueError(f"Custom segment length should be between 50 and {max_segmentation_tokens_length}.")

-            separator = segmentation["separator"]
            if separator:
                separator = separator.replace("\\n", "\n")

            character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
-                chunk_size=segmentation["max_tokens"],
-                chunk_overlap=segmentation.get("chunk_overlap", 0) or 0,
+                chunk_size=max_tokens,
+                chunk_overlap=chunk_overlap,
                fixed_separator=separator,
                separators=["\n\n", "。", ". ", " ", ""],
                embedding_model_instance=embedding_model_instance,
--- a/api/core/rag/index_processor/index_processor_factory.py
+++ b/api/core/rag/index_processor/index_processor_factory.py
@@ -3,6 +3,7 @@
 from core.rag.index_processor.constant.index_type import IndexType
 from core.rag.index_processor.index_processor_base import BaseIndexProcessor
 from core.rag.index_processor.processor.paragraph_index_processor import ParagraphIndexProcessor
+from core.rag.index_processor.processor.parent_child_index_processor import ParentChildIndexProcessor
 from core.rag.index_processor.processor.qa_index_processor import QAIndexProcessor


@@ -18,9 +19,11 @@ class IndexProcessorFactory:
        if not self._index_type:
            raise ValueError("Index type must be specified.")

-        if self._index_type == IndexType.PARAGRAPH_INDEX.value:
+        if self._index_type == IndexType.PARAGRAPH_INDEX:
            return ParagraphIndexProcessor()
-        elif self._index_type == IndexType.QA_INDEX.value:
+        elif self._index_type == IndexType.QA_INDEX:
            return QAIndexProcessor()
+        elif self._index_type == IndexType.PARENT_CHILD_INDEX:
+            return ParentChildIndexProcessor()
        else:
            raise ValueError(f"Index type {self._index_type} is not supported.")
--- a/api/core/rag/index_processor/processor/paragraph_index_processor.py
+++ b/api/core/rag/index_processor/processor/paragraph_index_processor.py
@@ -13,21 +13,34 @@ from core.rag.index_processor.index_processor_base import BaseIndexProcessor
 from core.rag.models.document import Document
 from core.tools.utils.text_processing_utils import remove_leading_symbols
 from libs import helper
-from models.dataset import Dataset
+from models.dataset import Dataset, DatasetProcessRule
+from services.entities.knowledge_entities.knowledge_entities import Rule


 class ParagraphIndexProcessor(BaseIndexProcessor):
    def extract(self, extract_setting: ExtractSetting, **kwargs) -> list[Document]:
        text_docs = ExtractProcessor.extract(
-            extract_setting=extract_setting, is_automatic=kwargs.get("process_rule_mode") == "automatic"
+            extract_setting=extract_setting,
+            is_automatic=(
+                kwargs.get("process_rule_mode") == "automatic" or kwargs.get("process_rule_mode") == "hierarchical"
+            ),
        )

        return text_docs

    def transform(self, documents: list[Document], **kwargs) -> list[Document]:
+        process_rule = kwargs.get("process_rule")
+        if process_rule.get("mode") == "automatic":
+            automatic_rule = DatasetProcessRule.AUTOMATIC_RULES
+            rules = Rule(**automatic_rule)
+        else:
+            rules = Rule(**process_rule.get("rules"))
        # Split the text documents into nodes.
        splitter = self._get_splitter(
-            processing_rule=kwargs.get("process_rule", {}),
+            processing_rule_mode=process_rule.get("mode"),
+            max_tokens=rules.segmentation.max_tokens,
+            chunk_overlap=rules.segmentation.chunk_overlap,
+            separator=rules.segmentation.separator,
            embedding_model_instance=kwargs.get("embedding_model_instance"),
        )
        all_documents = []
@@ -53,15 +66,19 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
            all_documents.extend(split_documents)
        return all_documents

-    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True):
+    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True, **kwargs):
        if dataset.indexing_technique == "high_quality":
            vector = Vector(dataset)
            vector.create(documents)
        if with_keywords:
+            keywords_list = kwargs.get("keywords_list")
            keyword = Keyword(dataset)
-            keyword.create(documents)
+            if keywords_list and len(keywords_list) > 0:
+                keyword.add_texts(documents, keywords_list=keywords_list)
+            else:
+                keyword.add_texts(documents)

-    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True):
+    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True, **kwargs):
        if dataset.indexing_technique == "high_quality":
            vector = Vector(dataset)
            if node_ids:
--- a/api/core/rag/index_processor/processor/parent_child_index_processor.py
+++ b/api/core/rag/index_processor/processor/parent_child_index_processor.py
@@ -0,0 +1,189 @@
+"""Parent-child index processor."""
+
+import uuid
+from typing import Optional
+
+from core.model_manager import ModelInstance
+from core.rag.cleaner.clean_processor import CleanProcessor
+from core.rag.datasource.retrieval_service import RetrievalService
+from core.rag.datasource.vdb.vector_factory import Vector
+from core.rag.extractor.entity.extract_setting import ExtractSetting
+from core.rag.extractor.extract_processor import ExtractProcessor
+from core.rag.index_processor.index_processor_base import BaseIndexProcessor
+from core.rag.models.document import ChildDocument, Document
+from extensions.ext_database import db
+from libs import helper
+from models.dataset import ChildChunk, Dataset, DocumentSegment
+from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule
+
+
+class ParentChildIndexProcessor(BaseIndexProcessor):
+    def extract(self, extract_setting: ExtractSetting, **kwargs) -> list[Document]:
+        """
+        Extract raw text documents from the source described by ``extract_setting``.
+
+        :param extract_setting: source description handed to ``ExtractProcessor.extract``.
+        :param kwargs: may carry ``process_rule_mode``; both "automatic" and
+            "hierarchical" turn on the extractor's automatic mode.
+        :return: the documents returned by ``ExtractProcessor.extract``.
+        """
+        rule_mode = kwargs.get("process_rule_mode")
+        use_automatic = rule_mode in ("automatic", "hierarchical")
+        return ExtractProcessor.extract(extract_setting=extract_setting, is_automatic=use_automatic)
+
+    def transform(self, documents: list[Document], **kwargs) -> list[Document]:
+        """
+        Split extracted documents into parent chunks that carry their child chunks.
+
+        In ``ParentMode.PARAGRAPH`` every input document is cleaned, split into
+        parent chunks, and each non-empty parent is split again into children.
+        In ``ParentMode.FULL_DOC`` the inputs are joined with newlines into one
+        parent document whose children are split from the joined text.
+
+        :param documents: documents produced by ``extract``.
+        :param kwargs: requires ``process_rule`` (a dict with "mode" and "rules");
+            ``embedding_model_instance`` is optional and forwarded to the splitters.
+        :return: parent documents with ``children`` populated; empty when there
+            is nothing to split.
+        :raises ValueError: if ``process_rule`` or its "rules" entry is missing.
+        """
+        process_rule = kwargs.get("process_rule")
+        if not process_rule:
+            raise ValueError("No process rule found.")
+        rules_config = process_rule.get("rules")
+        if rules_config is None:
+            raise ValueError("No rules found in process rule.")
+        rules = Rule(**rules_config)
+        rule_mode = process_rule.get("mode")
+        embedding_model_instance = kwargs.get("embedding_model_instance")
+        all_documents = []
+        if rules.parent_mode == ParentMode.PARAGRAPH:
+            # Split the text documents into parent nodes.
+            splitter = self._get_splitter(
+                processing_rule_mode=rule_mode,
+                max_tokens=rules.segmentation.max_tokens,
+                chunk_overlap=rules.segmentation.chunk_overlap,
+                separator=rules.segmentation.separator,
+                embedding_model_instance=embedding_model_instance,
+            )
+            for document in documents:
+                # Clean the document text before splitting it.
+                document.page_content = CleanProcessor.clean(document.page_content, process_rule)
+                for document_node in splitter.split_documents([document]):
+                    if not document_node.page_content.strip():
+                        continue
+                    # The hash is taken from the content as split, before the
+                    # leading terminator is removed below.
+                    document_node.metadata["doc_id"] = str(uuid.uuid4())
+                    document_node.metadata["doc_hash"] = helper.generate_text_hash(document_node.page_content)
+                    # Drop a leading sentence terminator left behind by the splitter.
+                    page_content = document_node.page_content
+                    if page_content.startswith((".", "。")):
+                        page_content = page_content[1:].strip()
+                    if not page_content:
+                        continue
+                    document_node.page_content = page_content
+                    # Split the parent node into its child nodes.
+                    document_node.children = self._split_child_nodes(
+                        document_node, rules, rule_mode, embedding_model_instance
+                    )
+                    all_documents.append(document_node)
+        elif rules.parent_mode == ParentMode.FULL_DOC:
+            if not documents:
+                # Nothing to merge; avoids indexing documents[0] on an empty list.
+                return all_documents
+            page_content = "\n".join([document.page_content for document in documents])
+            document = Document(page_content=page_content, metadata=documents[0].metadata)
+            # Children are split before the parent's own doc_id / doc_hash are assigned.
+            document.children = self._split_child_nodes(document, rules, rule_mode, embedding_model_instance)
+            document.metadata["doc_id"] = str(uuid.uuid4())
+            document.metadata["doc_hash"] = helper.generate_text_hash(document.page_content)
+            all_documents.append(document)
+
+        return all_documents
+
+    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True, **kwargs):
+        """
+        Write the child chunks of each parent document to the dataset's vector store.
+
+        Nothing happens unless ``dataset.indexing_technique`` is "high_quality".
+        Only the children are passed to ``vector.create``; ``with_keywords`` and
+        ``kwargs`` are accepted for interface compatibility and are not used here.
+
+        :param dataset: dataset whose vector store receives the child chunks.
+        :param documents: parent documents; those without children are skipped.
+        """
+        if dataset.indexing_technique != "high_quality":
+            return
+        vector = Vector(dataset)
+        for parent in documents:
+            children = parent.children
+            if not children:
+                continue
+            vector.create([Document(**child.model_dump()) for child in children])
+
+    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True, **kwargs):
+        """
+        Remove child-chunk vectors (and optionally their ChildChunk rows) for a dataset.
+
+        Nothing happens unless ``dataset.indexing_technique`` is "high_quality".
+
+        :param dataset: dataset to clean.
+        :param node_ids: index node ids of the parent segments (not of the child
+            chunks); when empty or None the whole vector collection is deleted.
+        :param with_keywords: accepted for interface compatibility; not used here.
+        :param kwargs: ``delete_child_chunks`` (truthy) also deletes the matching
+            ChildChunk rows and commits the session.
+        """
+        if dataset.indexing_technique != "high_quality":
+            return
+
+        remove_rows = kwargs.get("delete_child_chunks") or False
+        vector = Vector(dataset)
+
+        if not node_ids:
+            # No segment filter: drop every vector, then every child row of the dataset.
+            vector.delete()
+            if remove_rows:
+                db.session.query(ChildChunk).filter(ChildChunk.dataset_id == dataset.id).delete()
+                db.session.commit()
+            return
+
+        # Resolve the child chunks that belong to the given parent segments.
+        rows = (
+            db.session.query(ChildChunk.index_node_id)
+            .join(DocumentSegment, ChildChunk.segment_id == DocumentSegment.id)
+            .filter(
+                DocumentSegment.dataset_id == dataset.id,
+                DocumentSegment.index_node_id.in_(node_ids),
+                ChildChunk.dataset_id == dataset.id,
+            )
+            .all()
+        )
+        child_node_ids = [row[0] for row in rows]
+        vector.delete_by_ids(child_node_ids)
+        if remove_rows:
+            db.session.query(ChildChunk).filter(
+                ChildChunk.dataset_id == dataset.id, ChildChunk.index_node_id.in_(child_node_ids)
+            ).delete()
+            db.session.commit()
+
+    def retrieve(
+        self,
+        retrieval_method: str,
+        query: str,
+        dataset: Dataset,
+        top_k: int,
+        score_threshold: float,
+        reranking_model: dict,
+    ) -> list[Document]:
+        """
+        Query the dataset through RetrievalService and keep hits above the threshold.
+
+        :param retrieval_method: retrieval method name forwarded to RetrievalService.
+        :param query: search text.
+        :param dataset: dataset to search; only its id is used.
+        :param top_k: forwarded to RetrievalService.
+        :param score_threshold: forwarded to RetrievalService and applied again
+            here as a strict lower bound.
+        :param reranking_model: forwarded to RetrievalService.
+        :return: documents whose score is strictly greater than ``score_threshold``.
+        """
+        hits = RetrievalService.retrieve(
+            retrieval_method=retrieval_method,
+            dataset_id=dataset.id,
+            query=query,
+            top_k=top_k,
+            score_threshold=score_threshold,
+            reranking_model=reranking_model,
+        )
+        docs = []
+        for hit in hits:
+            # The score is recorded on every hit's metadata, including hits dropped below.
+            hit_metadata = hit.metadata
+            hit_metadata["score"] = hit.score
+            if not (hit.score > score_threshold):
+                continue
+            docs.append(Document(page_content=hit.page_content, metadata=hit_metadata))
+        return docs
+
+    def _split_child_nodes(
+        self,
+        document_node: Document,
+        rules: Rule,
+        process_rule_mode: str,
+        embedding_model_instance: Optional[ModelInstance],
+    ) -> list[ChildDocument]:
+        """
+        Split one parent document into child documents using the sub-chunk rules.
+
+        :param document_node: parent document; its metadata is passed to each child.
+        :param rules: parsed rules; ``rules.subchunk_segmentation`` configures the splitter.
+        :param process_rule_mode: processing mode forwarded to ``_get_splitter``.
+        :param embedding_model_instance: optional model instance forwarded to the splitter.
+        :return: non-empty child documents, each with its own ``doc_id`` and ``doc_hash``.
+        """
+        segmentation = rules.subchunk_segmentation
+        child_splitter = self._get_splitter(
+            processing_rule_mode=process_rule_mode,
+            max_tokens=segmentation.max_tokens,
+            chunk_overlap=segmentation.chunk_overlap,
+            separator=segmentation.separator,
+            embedding_model_instance=embedding_model_instance,
+        )
+        child_nodes = []
+        for piece in child_splitter.split_documents([document_node]):
+            if not piece.page_content.strip():
+                continue
+            node_id = str(uuid.uuid4())
+            # The hash is taken from the content as split, before the leading terminator is removed.
+            content_hash = helper.generate_text_hash(piece.page_content)
+            child = ChildDocument(page_content=piece.page_content, metadata=document_node.metadata)
+            child.metadata["doc_id"] = node_id
+            child.metadata["doc_hash"] = content_hash
+            # Drop a leading sentence terminator left behind by the splitter.
+            text = child.page_content
+            if text.startswith((".", "。")):
+                text = text[1:].strip()
+            if text:
+                child.page_content = text
+                child_nodes.append(child)
+        return child_nodes
--- a/api/core/rag/index_processor/processor/qa_index_processor.py
+++ b/api/core/rag/index_processor/processor/qa_index_processor.py
@@ -21,18 +21,28 @@ from core.rag.models.document import Document
 from core.tools.utils.text_processing_utils import remove_leading_symbols
 from libs import helper
 from models.dataset import Dataset
+from services.entities.knowledge_entities.knowledge_entities import Rule


 class QAIndexProcessor(BaseIndexProcessor):
    def extract(self, extract_setting: ExtractSetting, **kwargs) -> list[Document]:
        text_docs = ExtractProcessor.extract(
-            extract_setting=extract_setting, is_automatic=kwargs.get("process_rule_mode") == "automatic"
+            extract_setting=extract_setting,
+            is_automatic=(
+                kwargs.get("process_rule_mode") == "automatic" or kwargs.get("process_rule_mode") == "hierarchical"
+            ),
        )
        return text_docs

    def transform(self, documents: list[Document], **kwargs) -> list[Document]:
+        preview = kwargs.get("preview")
+        process_rule = kwargs.get("process_rule")
+        rules = Rule(**process_rule.get("rules"))
        splitter = self._get_splitter(
-            processing_rule=kwargs.get("process_rule") or {},
+            processing_rule_mode=process_rule.get("mode"),
+            max_tokens=rules.segmentation.max_tokens,
+            chunk_overlap=rules.segmentation.chunk_overlap,
+            separator=rules.segmentation.separator,
            embedding_model_instance=kwargs.get("embedding_model_instance"),
        )

@@ -59,24 +69,33 @@ class QAIndexProcessor(BaseIndexProcessor):
                    document_node.page_content = remove_leading_symbols(page_content)
                    split_documents.append(document_node)
            all_documents.extend(split_documents)
-        for i in range(0, len(all_documents), 10):
-            threads = []
-            sub_documents = all_documents[i : i + 10]
-            for doc in sub_documents:
-                document_format_thread = threading.Thread(
-                    target=self._format_qa_document,
-                    kwargs={
-                        "flask_app": current_app._get_current_object(),  # type: ignore
-                        "tenant_id": kwargs.get("tenant_id"),
-                        "document_node": doc,
-                        "all_qa_documents": all_qa_documents,
-                        "document_language": kwargs.get("doc_language", "English"),
-                    },
-                )
-                threads.append(document_format_thread)
-                document_format_thread.start()
-            for thread in threads:
-                thread.join()
+        if preview:
+            self._format_qa_document(
+                current_app._get_current_object(),
+                kwargs.get("tenant_id"),
+                all_documents[0],
+                all_qa_documents,
+                kwargs.get("doc_language", "English"),
+            )
+        else:
+            for i in range(0, len(all_documents), 10):
+                threads = []
+                sub_documents = all_documents[i : i + 10]
+                for doc in sub_documents:
+                    document_format_thread = threading.Thread(
+                        target=self._format_qa_document,
+                        kwargs={
+                            "flask_app": current_app._get_current_object(),
+                            "tenant_id": kwargs.get("tenant_id"),
+                            "document_node": doc,
+                            "all_qa_documents": all_qa_documents,
+                            "document_language": kwargs.get("doc_language", "English"),
+                        },
+                    )
+                    threads.append(document_format_thread)
+                    document_format_thread.start()
+                for thread in threads:
+                    thread.join()
        return all_qa_documents

    def format_by_template(self, file: FileStorage, **kwargs) -> list[Document]:
@@ -98,12 +117,12 @@ class QAIndexProcessor(BaseIndexProcessor):
            raise ValueError(str(e))
        return text_docs

-    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True):
+    def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True, **kwargs):
        if dataset.indexing_technique == "high_quality":
            vector = Vector(dataset)
            vector.create(documents)

-    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True):
+    def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True, **kwargs):
        vector = Vector(dataset)
        if node_ids:
            vector.delete_by_ids(node_ids)