fix: Fix parent child retrieval issues (#12206)

Co-authored-by: NFish <douxc512@gmail.com> Co-authored-by: nite-knite <nkCoding@gmail.com>
2025-01-02 16:07:21 +08:00
parent 68757950ce
commit 09d759d196
34 changed files with 446 additions and 387 deletions
--- a/web/app/(commonLayout)/datasets/template/template.en.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.en.mdx
@@ -52,6 +52,15 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
          - <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
          - <code>economy</code> Economy: Build using inverted index of keyword table index
      </Property>
+      <Property name='doc_form' type='string' key='doc_form'>
+        Format of indexed content
+          - <code>text_model</code> Text documents are directly embedded; <code>economy</code> mode defaults to using this form
+          - <code>hierarchical_model</code> Parent-child mode
+          - <code>qa_model</code> Q&A mode: Generates Q&A pairs for segmented documents and then embeds the questions
+      </Property>
+      <Property name='doc_language' type='string' key='doc_language'>
+        In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
+      </Property>
      <Property name='process_rule' type='object' key='process_rule'>
        Processing rules
          - <code>mode</code> (string) Cleaning, segmentation mode, automatic / custom
@@ -65,6 +74,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
            - <code>segmentation</code> (object) Segmentation rules
              - <code>separator</code> Custom segment identifier, currently only allows one delimiter to be set. Default is \n
              - <code>max_tokens</code> Maximum length (token) defaults to 1000
+            - <code>parent_mode</code> Retrieval mode of parent chunks: <code>full-doc</code> full text retrieval / <code>paragraph</code> paragraph retrieval
+            - <code>subchunk_segmentation</code> (object) Child chunk rules
+              - <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
+              - <code>max_tokens</code> Maximum length (tokens); must be shorter than the length of the parent chunk
      </Property>
    </Properties>
  </Col>
@@ -155,6 +168,13 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
          - <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
          - <code>economy</code> Economy: Build using inverted index of keyword table index

+        - <code>doc_form</code> Format of indexed content
+          - <code>text_model</code> Text documents are directly embedded; <code>economy</code> mode defaults to using this form
+          - <code>hierarchical_model</code> Parent-child mode
+          - <code>qa_model</code> Q&A mode: Generates Q&A pairs for segmented documents and then embeds the questions
+
+        - <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
+
        - <code>process_rule</code> Processing rules
          - <code>mode</code> (string) Cleaning, segmentation mode, automatic / custom
          - <code>rules</code> (object) Custom rules (in automatic mode, this field is empty)
@@ -167,6 +187,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
            - <code>segmentation</code> (object) Segmentation rules
              - <code>separator</code> Custom segment identifier, currently only allows one delimiter to be set. Default is \n
              - <code>max_tokens</code> Maximum length (token) defaults to 1000
+            - <code>parent_mode</code> Retrieval mode of parent chunks: <code>full-doc</code> full text retrieval / <code>paragraph</code> paragraph retrieval
+            - <code>subchunk_segmentation</code> (object) Child chunk rules
+              - <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
+              - <code>max_tokens</code> Maximum length (tokens); must be shorter than the length of the parent chunk
      </Property>
      <Property name='file' type='multipart/form-data' key='file'>
        Files that need to be uploaded.
@@ -449,6 +473,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
            - <code>segmentation</code> (object) Segmentation rules
              - <code>separator</code> Custom segment identifier, currently only allows one delimiter to be set. Default is \n
              - <code>max_tokens</code> Maximum length (token) defaults to 1000
+            - <code>parent_mode</code> Retrieval mode of parent chunks: <code>full-doc</code> full text retrieval / <code>paragraph</code> paragraph retrieval
+            - <code>subchunk_segmentation</code> (object) Child chunk rules
+              - <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
+              - <code>max_tokens</code> Maximum length (tokens); must be shorter than the length of the parent chunk
      </Property>
    </Properties>
  </Col>
@@ -546,6 +574,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
            - <code>segmentation</code> (object) Segmentation rules
              - <code>separator</code> Custom segment identifier, currently only allows one delimiter to be set. Default is \n
              - <code>max_tokens</code> Maximum length (token) defaults to 1000
+            - <code>parent_mode</code> Retrieval mode of parent chunks: <code>full-doc</code> full text retrieval / <code>paragraph</code> paragraph retrieval
+            - <code>subchunk_segmentation</code> (object) Child chunk rules
+              - <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
+              - <code>max_tokens</code> Maximum length (tokens); must be shorter than the length of the parent chunk
      </Property>
    </Properties>
  </Col>
@@ -984,7 +1016,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
 <Heading
  url='/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}'
  method='POST'
-  title='Update a Chunk in a Document '
+  title='Update a Chunk in a Document'
  name='#update_segment'
 />
 <Row>
@@ -1009,6 +1041,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from
        - <code>answer</code> (text) Answer content, passed if the knowledge is in Q&A mode (optional)
        - <code>keywords</code> (list) Keyword (optional)
        - <code>enabled</code> (bool) False / true (optional)
+        - <code>regenerate_child_chunks</code> (bool) Whether to regenerate child chunks (optional)
      </Property>
    </Properties>
  </Col>