Feat/support parent child chunk (#12092)
This commit is contained in:
@@ -7,7 +7,7 @@ from core.model_manager import ModelManager
|
||||
from core.model_runtime.entities.model_entities import ModelType
|
||||
from core.rag.models.document import Document
|
||||
from extensions.ext_database import db
|
||||
from models.dataset import Dataset, DocumentSegment
|
||||
from models.dataset import ChildChunk, Dataset, DocumentSegment
|
||||
|
||||
|
||||
class DatasetDocumentStore:
|
||||
@@ -60,7 +60,7 @@ class DatasetDocumentStore:
|
||||
|
||||
return output
|
||||
|
||||
def add_documents(self, docs: Sequence[Document], allow_update: bool = True) -> None:
|
||||
def add_documents(self, docs: Sequence[Document], allow_update: bool = True, save_child: bool = False) -> None:
|
||||
max_position = (
|
||||
db.session.query(func.max(DocumentSegment.position))
|
||||
.filter(DocumentSegment.document_id == self._document_id)
|
||||
@@ -120,6 +120,23 @@ class DatasetDocumentStore:
|
||||
segment_document.answer = doc.metadata.pop("answer", "")
|
||||
|
||||
db.session.add(segment_document)
|
||||
db.session.flush()
|
||||
if save_child:
|
||||
for postion, child in enumerate(doc.children, start=1):
|
||||
child_segment = ChildChunk(
|
||||
tenant_id=self._dataset.tenant_id,
|
||||
dataset_id=self._dataset.id,
|
||||
document_id=self._document_id,
|
||||
segment_id=segment_document.id,
|
||||
position=postion,
|
||||
index_node_id=child.metadata["doc_id"],
|
||||
index_node_hash=child.metadata["doc_hash"],
|
||||
content=child.page_content,
|
||||
word_count=len(child.page_content),
|
||||
type="automatic",
|
||||
created_by=self._user_id,
|
||||
)
|
||||
db.session.add(child_segment)
|
||||
else:
|
||||
segment_document.content = doc.page_content
|
||||
if doc.metadata.get("answer"):
|
||||
@@ -127,6 +144,30 @@ class DatasetDocumentStore:
|
||||
segment_document.index_node_hash = doc.metadata["doc_hash"]
|
||||
segment_document.word_count = len(doc.page_content)
|
||||
segment_document.tokens = tokens
|
||||
if save_child and doc.children:
|
||||
# delete the existing child chunks
|
||||
db.session.query(ChildChunk).filter(
|
||||
ChildChunk.tenant_id == self._dataset.tenant_id,
|
||||
ChildChunk.dataset_id == self._dataset.id,
|
||||
ChildChunk.document_id == self._document_id,
|
||||
ChildChunk.segment_id == segment_document.id,
|
||||
).delete()
|
||||
# add new child chunks
|
||||
for position, child in enumerate(doc.children, start=1):
|
||||
child_segment = ChildChunk(
|
||||
tenant_id=self._dataset.tenant_id,
|
||||
dataset_id=self._dataset.id,
|
||||
document_id=self._document_id,
|
||||
segment_id=segment_document.id,
|
||||
position=position,
|
||||
index_node_id=child.metadata["doc_id"],
|
||||
index_node_hash=child.metadata["doc_hash"],
|
||||
content=child.page_content,
|
||||
word_count=len(child.page_content),
|
||||
type="automatic",
|
||||
created_by=self._user_id,
|
||||
)
|
||||
db.session.add(child_segment)
|
||||
|
||||
db.session.commit()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user