Feat/dify rag (#2528)

Co-authored-by: jyong <jyong@dify.ai>
Jyong
2024-02-22 23:31:57 +08:00
committed by GitHub
parent 97fe817186
commit 6c4e6bf1d6
119 changed files with 3181 additions and 5892 deletions

View File

@@ -4,10 +4,10 @@ import time
 import click
 from celery import shared_task
-from langchain.schema import Document
 from werkzeug.exceptions import NotFound
 
-from core.index.index import IndexBuilder
+from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
+from core.rag.models.document import Document
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from models.dataset import Document as DatasetDocument
@@ -60,15 +60,9 @@ def add_document_to_index_task(dataset_document_id: str):
     if not dataset:
         raise Exception('Document has no dataset')
 
-    # save vector index
-    index = IndexBuilder.get_index(dataset, 'high_quality')
-    if index:
-        index.add_texts(documents)
-
-    # save keyword index
-    index = IndexBuilder.get_index(dataset, 'economy')
-    if index:
-        index.add_texts(documents)
+    index_type = dataset.doc_form
+    index_processor = IndexProcessorFactory(index_type).init_index_processor()
+    index_processor.load(dataset, documents)
 
     end_at = time.perf_counter()
     logging.info(

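The hunk above is the template for most files in this commit: two hard-coded IndexBuilder lookups ('high_quality' for the vector store, 'economy' for the keyword index) collapse into one processor chosen by the dataset's doc_form. A minimal sketch of the shape this factory presumably has; the class and doc_form value below are illustrative assumptions, not code from this commit:

    # Sketch only: inferred from the call sites in this diff, not the actual
    # core.rag implementation.
    class ParagraphIndexProcessor:
        def load(self, dataset, documents, with_keywords=True):
            ...  # write to the vector store and, optionally, the keyword index

        def clean(self, dataset, node_ids, with_keywords=True):
            ...  # delete the given nodes; node_ids=None wipes the whole dataset

    class IndexProcessorFactory:
        def __init__(self, index_type):
            self._index_type = index_type

        def init_index_processor(self):
            # Dispatch on doc_form; a 'qa_model' dataset would presumably get
            # its own processor class here.
            return ParagraphIndexProcessor()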
View File

@@ -5,7 +5,7 @@ import click
 from celery import shared_task
 from langchain.schema import Document
 
-from core.index.index import IndexBuilder
+from core.rag.datasource.vdb.vector_factory import Vector
 from models.dataset import Dataset
 from services.dataset_service import DatasetCollectionBindingService
@@ -48,9 +48,9 @@ def add_annotation_to_index_task(annotation_id: str, question: str, tenant_id: s
             "doc_id": annotation_id
         }
     )
-    index = IndexBuilder.get_index(dataset, 'high_quality')
-    if index:
-        index.add_texts([document])
+    vector = Vector(dataset, attributes=['doc_id', 'annotation_id', 'app_id'])
+    vector.create([document], duplicate_check=True)
 
     end_at = time.perf_counter()
     logging.info(
         click.style(

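Annotation indexing now goes through a Vector facade bound to the dataset rather than an IndexBuilder lookup. Judging from this hunk, attributes declares which metadata fields the store must keep filterable, and duplicate_check=True skips documents that are already present. A hedged usage sketch; the helper name is ours:

    from langchain.schema import Document

    from core.rag.datasource.vdb.vector_factory import Vector


    def index_annotation(dataset, annotation_id, app_id, question):
        # One retrievable document per annotation, keyed by its id.
        document = Document(
            page_content=question,
            metadata={'annotation_id': annotation_id, 'app_id': app_id, 'doc_id': annotation_id},
        )
        vector = Vector(dataset, attributes=['doc_id', 'annotation_id', 'app_id'])
        vector.create([document], duplicate_check=True)  # no-op for docs already indexed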
View File

@@ -6,7 +6,7 @@ from celery import shared_task
 from langchain.schema import Document
 from werkzeug.exceptions import NotFound
 
-from core.index.index import IndexBuilder
+from core.rag.datasource.vdb.vector_factory import Vector
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from models.dataset import Dataset
@@ -79,9 +79,8 @@ def batch_import_annotations_task(job_id: str, content_list: list[dict], app_id:
         collection_binding_id=dataset_collection_binding.id
     )
-    index = IndexBuilder.get_index(dataset, 'high_quality')
-    if index:
-        index.add_texts(documents)
+    vector = Vector(dataset, attributes=['doc_id', 'annotation_id', 'app_id'])
+    vector.create(documents, duplicate_check=True)
 
     db.session.commit()
     redis_client.setex(indexing_cache_key, 600, 'completed')

View File

@@ -4,7 +4,7 @@ import time
 import click
 from celery import shared_task
 
-from core.index.index import IndexBuilder
+from core.rag.datasource.vdb.vector_factory import Vector
 from models.dataset import Dataset
 from services.dataset_service import DatasetCollectionBindingService
@@ -30,12 +30,11 @@ def delete_annotation_index_task(annotation_id: str, app_id: str, tenant_id: str
         collection_binding_id=dataset_collection_binding.id
     )
-    vector_index = IndexBuilder.get_default_high_quality_index(dataset)
-    if vector_index:
-        try:
-            vector_index.delete_by_metadata_field('annotation_id', annotation_id)
-        except Exception:
-            logging.exception("Delete annotation index failed when annotation deleted.")
+    try:
+        vector = Vector(dataset, attributes=['doc_id', 'annotation_id', 'app_id'])
+        vector.delete_by_metadata_field('annotation_id', annotation_id)
+    except Exception:
+        logging.exception("Delete annotation index failed when annotation deleted.")
 
     end_at = time.perf_counter()
     logging.info(
         click.style('App annotations index deleted : {} latency: {}'.format(app_id, end_at - start_at),

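Deletion follows the same pattern: instead of asking IndexBuilder whether a high-quality index exists, the task always builds a Vector and filters by metadata. A sketch of the idiom, assuming delete_by_metadata_field behaves as its name and this call site suggest; the helper name is ours:

    import logging

    from core.rag.datasource.vdb.vector_factory import Vector


    def drop_annotation_vectors(dataset, annotation_id):
        # Mirrors the task above: cleanup failures are logged rather than
        # raised, so the database-side delete still proceeds.
        vector = Vector(dataset, attributes=['doc_id', 'annotation_id', 'app_id'])
        try:
            vector.delete_by_metadata_field('annotation_id', annotation_id)
        except Exception:
            logging.exception('Delete annotation index failed when annotation deleted.')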
View File

@@ -5,7 +5,7 @@ import click
 from celery import shared_task
 from werkzeug.exceptions import NotFound
 
-from core.index.index import IndexBuilder
+from core.rag.datasource.vdb.vector_factory import Vector
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from models.dataset import Dataset
@@ -48,12 +48,11 @@ def disable_annotation_reply_task(job_id: str, app_id: str, tenant_id: str):
         collection_binding_id=app_annotation_setting.collection_binding_id
     )
-    vector_index = IndexBuilder.get_default_high_quality_index(dataset)
-    if vector_index:
-        try:
-            vector_index.delete_by_metadata_field('app_id', app_id)
-        except Exception:
-            logging.exception("Delete doc index failed when dataset deleted.")
+    try:
+        vector = Vector(dataset, attributes=['doc_id', 'annotation_id', 'app_id'])
+        vector.delete_by_metadata_field('app_id', app_id)
+    except Exception:
+        logging.exception("Delete annotation index failed when annotation deleted.")
 
     redis_client.setex(disable_app_annotation_job_key, 600, 'completed')
 
     # delete annotation setting

View File

@@ -7,7 +7,7 @@ from celery import shared_task
 from langchain.schema import Document
 from werkzeug.exceptions import NotFound
 
-from core.index.index import IndexBuilder
+from core.rag.datasource.vdb.vector_factory import Vector
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from models.dataset import Dataset
@@ -81,15 +81,15 @@ def enable_annotation_reply_task(job_id: str, app_id: str, user_id: str, tenant_
             }
         )
         documents.append(document)
-    index = IndexBuilder.get_index(dataset, 'high_quality')
-    if index:
-        try:
-            index.delete_by_metadata_field('app_id', app_id)
-        except Exception as e:
-            logging.info(
-                click.style('Delete annotation index error: {}'.format(str(e)),
-                            fg='red'))
-        index.add_texts(documents)
+    vector = Vector(dataset, attributes=['doc_id', 'annotation_id', 'app_id'])
+    try:
+        vector.delete_by_metadata_field('app_id', app_id)
+    except Exception as e:
+        logging.info(
+            click.style('Delete annotation index error: {}'.format(str(e)),
+                        fg='red'))
+    vector.add_texts(documents)
 
     db.session.commit()
     redis_client.setex(enable_app_annotation_job_key, 600, 'completed')
     end_at = time.perf_counter()

View File

@@ -5,7 +5,7 @@ import click
 from celery import shared_task
 from langchain.schema import Document
 
-from core.index.index import IndexBuilder
+from core.rag.datasource.vdb.vector_factory import Vector
 from models.dataset import Dataset
 from services.dataset_service import DatasetCollectionBindingService
@@ -49,10 +49,9 @@ def update_annotation_to_index_task(annotation_id: str, question: str, tenant_id
             "doc_id": annotation_id
         }
     )
-    index = IndexBuilder.get_index(dataset, 'high_quality')
-    if index:
-        index.delete_by_metadata_field('annotation_id', annotation_id)
-        index.add_texts([document])
+    vector = Vector(dataset, attributes=['doc_id', 'annotation_id', 'app_id'])
+    vector.delete_by_metadata_field('annotation_id', annotation_id)
+    vector.add_texts([document])
 
     end_at = time.perf_counter()
     logging.info(
         click.style(

View File

@@ -4,7 +4,7 @@ import time
 import click
 from celery import shared_task
 
-from core.index.index import IndexBuilder
+from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
 from extensions.ext_database import db
 from models.dataset import (
     AppDatasetJoin,
@@ -18,7 +18,7 @@ from models.dataset import (
 @shared_task(queue='dataset')
 def clean_dataset_task(dataset_id: str, tenant_id: str, indexing_technique: str,
-                       index_struct: str, collection_binding_id: str):
+                       index_struct: str, collection_binding_id: str, doc_form: str):
     """
     Clean dataset when dataset deleted.
     :param dataset_id: dataset id
@@ -26,6 +26,7 @@ def clean_dataset_task(dataset_id: str, tenant_id: str, indexing_technique: str,
     :param indexing_technique: indexing technique
     :param index_struct: index struct dict
     :param collection_binding_id: collection binding id
+    :param doc_form: dataset form
 
     Usage: clean_dataset_task.delay(dataset_id, tenant_id, indexing_technique, index_struct)
     """
@@ -38,26 +39,14 @@ def clean_dataset_task(dataset_id: str, tenant_id: str, indexing_technique: str,
         tenant_id=tenant_id,
         indexing_technique=indexing_technique,
         index_struct=index_struct,
-        collection_binding_id=collection_binding_id
+        collection_binding_id=collection_binding_id,
+        doc_form=doc_form
     )
     documents = db.session.query(Document).filter(Document.dataset_id == dataset_id).all()
     segments = db.session.query(DocumentSegment).filter(DocumentSegment.dataset_id == dataset_id).all()
-    kw_index = IndexBuilder.get_index(dataset, 'economy')
 
-    # delete from vector index
-    if dataset.indexing_technique == 'high_quality':
-        vector_index = IndexBuilder.get_default_high_quality_index(dataset)
-        try:
-            vector_index.delete_by_group_id(dataset.id)
-        except Exception:
-            logging.exception("Delete doc index failed when dataset deleted.")
-
-    # delete from keyword index
-    try:
-        kw_index.delete()
-    except Exception:
-        logging.exception("Delete nodes index failed when dataset deleted.")
+    index_processor = IndexProcessorFactory(doc_form).init_index_processor()
+    index_processor.clean(dataset, None)
 
     for document in documents:
         db.session.delete(document)

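Note the calling convention: index_processor.clean(dataset, None) replaces both the delete_by_group_id call and kw_index.delete(), so None for the node ids plausibly means "purge everything for this dataset". A speculative sketch of what clean may look like behind the interface; Keyword is an assumed keyword-index facade (by analogy with Vector), and the import path for it is a guess:

    import logging

    from core.rag.datasource.keyword.keyword_factory import Keyword  # assumed path
    from core.rag.datasource.vdb.vector_factory import Vector


    def clean(dataset, node_ids, with_keywords=True):
        # Speculative: shape inferred from the call sites in this commit.
        if dataset.indexing_technique == 'high_quality':
            vector = Vector(dataset)
            try:
                if node_ids is None:
                    vector.delete()              # drop the whole collection
                else:
                    vector.delete_by_ids(node_ids)
            except Exception:
                logging.exception('Delete vector index failed when dataset deleted.')
        if with_keywords:
            keyword = Keyword(dataset)
            if node_ids is None:
                keyword.delete()
            else:
                keyword.delete_by_ids(node_ids)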
View File

@@ -4,17 +4,18 @@ import time
 import click
 from celery import shared_task
 
-from core.index.index import IndexBuilder
+from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
 from extensions.ext_database import db
 from models.dataset import Dataset, DocumentSegment
 
 
 @shared_task(queue='dataset')
-def clean_document_task(document_id: str, dataset_id: str):
+def clean_document_task(document_id: str, dataset_id: str, doc_form: str):
     """
     Clean document when document deleted.
     :param document_id: document id
     :param dataset_id: dataset id
+    :param doc_form: doc_form
 
     Usage: clean_document_task.delay(document_id, dataset_id)
     """
@@ -27,21 +28,12 @@ def clean_document_task(document_id: str, dataset_id: str):
     if not dataset:
         raise Exception('Document has no dataset')
 
-    vector_index = IndexBuilder.get_index(dataset, 'high_quality')
-    kw_index = IndexBuilder.get_index(dataset, 'economy')
 
     segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
     # check segment is exist
     if segments:
         index_node_ids = [segment.index_node_id for segment in segments]
-        # delete from vector index
-        if vector_index:
-            vector_index.delete_by_document_id(document_id)
-
-        # delete from keyword index
-        if index_node_ids:
-            kw_index.delete_by_ids(index_node_ids)
+        index_processor = IndexProcessorFactory(doc_form).init_index_processor()
+        index_processor.clean(dataset, index_node_ids)
 
         for segment in segments:
             db.session.delete(segment)

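clean_document_task (and clean_dataset_task above) now takes doc_form as an explicit argument instead of reading it from the database inside the task. The likely reason: by the time the async worker runs, the Document and Dataset rows may already be gone, so the caller has to capture the form first. A hypothetical caller sketch; the import path for the task is assumed:

    from extensions.ext_database import db
    from tasks.clean_document_task import clean_document_task  # assumed path


    def delete_document(document):
        # Capture everything before the row disappears from the session.
        document_id = document.id
        dataset_id = document.dataset_id
        doc_form = document.doc_form
        db.session.delete(document)
        db.session.commit()
        clean_document_task.delay(document_id, dataset_id, doc_form)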
View File

@@ -4,7 +4,7 @@ import time
 import click
 from celery import shared_task
 
-from core.index.index import IndexBuilder
+from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
 from extensions.ext_database import db
 from models.dataset import Dataset, Document, DocumentSegment
@@ -26,9 +26,8 @@ def clean_notion_document_task(document_ids: list[str], dataset_id: str):
     if not dataset:
         raise Exception('Document has no dataset')
-    vector_index = IndexBuilder.get_index(dataset, 'high_quality')
-    kw_index = IndexBuilder.get_index(dataset, 'economy')
+    index_type = dataset.doc_form
+    index_processor = IndexProcessorFactory(index_type).init_index_processor()
 
     for document_id in document_ids:
         document = db.session.query(Document).filter(
             Document.id == document_id
@@ -38,13 +37,7 @@ def clean_notion_document_task(document_ids: list[str], dataset_id: str):
         segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
         index_node_ids = [segment.index_node_id for segment in segments]
 
-        # delete from vector index
-        if vector_index:
-            vector_index.delete_by_document_id(document_id)
-
-        # delete from keyword index
-        if index_node_ids:
-            kw_index.delete_by_ids(index_node_ids)
+        index_processor.clean(dataset, index_node_ids)
 
         for segment in segments:
             db.session.delete(segment)

View File

@@ -5,10 +5,10 @@ from typing import Optional
 
 import click
 from celery import shared_task
-from langchain.schema import Document
 from werkzeug.exceptions import NotFound
 
-from core.index.index import IndexBuilder
+from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
+from core.rag.models.document import Document
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from models.dataset import DocumentSegment
@@ -68,18 +68,9 @@ def create_segment_to_index_task(segment_id: str, keywords: Optional[list[str]]
             logging.info(click.style('Segment {} document status is invalid, pass.'.format(segment.id), fg='cyan'))
             return
 
-        # save vector index
-        index = IndexBuilder.get_index(dataset, 'high_quality')
-        if index:
-            index.add_texts([document], duplicate_check=True)
-
-        # save keyword index
-        index = IndexBuilder.get_index(dataset, 'economy')
-        if index:
-            if keywords and len(keywords) > 0:
-                index.create_segment_keywords(segment.index_node_id, keywords)
-            else:
-                index.add_texts([document])
+        index_type = dataset.doc_form
+        index_processor = IndexProcessorFactory(index_type).init_index_processor()
+        index_processor.load(dataset, [document])
 
         # update segment to completed
         update_params = {

View File

@@ -3,9 +3,9 @@ import time
 
 import click
 from celery import shared_task
-from langchain.schema import Document
 
-from core.index.index import IndexBuilder
+from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
+from core.rag.models.document import Document
 from extensions.ext_database import db
 from models.dataset import Dataset, DocumentSegment
 from models.dataset import Document as DatasetDocument
@@ -29,10 +29,10 @@ def deal_dataset_vector_index_task(dataset_id: str, action: str):
     if not dataset:
         raise Exception('Dataset not found')
+    index_type = dataset.doc_form
+    index_processor = IndexProcessorFactory(index_type).init_index_processor()
     if action == "remove":
-        index = IndexBuilder.get_index(dataset, 'high_quality', ignore_high_quality_check=True)
-        index.delete_by_group_id(dataset.id)
+        index_processor.clean(dataset, None, with_keywords=False)
     elif action == "add":
         dataset_documents = db.session.query(DatasetDocument).filter(
             DatasetDocument.dataset_id == dataset_id,
@@ -42,8 +42,6 @@ def deal_dataset_vector_index_task(dataset_id: str, action: str):
         ).all()
 
         if dataset_documents:
-            # save vector index
-            index = IndexBuilder.get_index(dataset, 'high_quality', ignore_high_quality_check=False)
             documents = []
             for dataset_document in dataset_documents:
                 # delete from vector index
@@ -65,7 +63,7 @@ def deal_dataset_vector_index_task(dataset_id: str, action: str):
                 documents.append(document)
 
             # save vector index
-            index.create(documents)
+            index_processor.load(dataset, documents, with_keywords=False)
 
     end_at = time.perf_counter()
     logging.info(

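deal_dataset_vector_index_task only migrates the vector side when a dataset is toggled between indexing techniques, hence with_keywords=False on both clean and load; the economy-mode keyword index is left untouched. A condensed sketch of the flow above, with an illustrative helper name:

    from core.rag.index_processor.index_processor_factory import IndexProcessorFactory


    def toggle_dataset_vector_index(dataset, action, documents):
        # with_keywords=False scopes both calls to the vector store only.
        index_processor = IndexProcessorFactory(dataset.doc_form).init_index_processor()
        if action == 'remove':
            index_processor.clean(dataset, None, with_keywords=False)
        elif action == 'add':
            index_processor.load(dataset, documents, with_keywords=False)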
View File

@@ -4,7 +4,7 @@ import time
 import click
 from celery import shared_task
 
-from core.index.index import IndexBuilder
+from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from models.dataset import Dataset, Document
@@ -39,15 +39,9 @@ def delete_segment_from_index_task(segment_id: str, index_node_id: str, dataset_
            logging.info(click.style('Segment {} document status is invalid, pass.'.format(segment_id), fg='cyan'))
            return
 
-        vector_index = IndexBuilder.get_index(dataset, 'high_quality')
-        kw_index = IndexBuilder.get_index(dataset, 'economy')
-
-        # delete from vector index
-        if vector_index:
-            vector_index.delete_by_ids([index_node_id])
-
-        # delete from keyword index
-        kw_index.delete_by_ids([index_node_id])
+        index_type = dataset_document.doc_form
+        index_processor = IndexProcessorFactory(index_type).init_index_processor()
+        index_processor.clean(dataset, [index_node_id])
 
         end_at = time.perf_counter()
         logging.info(click.style('Segment deleted from index: {} latency: {}'.format(segment_id, end_at - start_at), fg='green'))

View File

@@ -5,7 +5,7 @@ import click
 from celery import shared_task
 from werkzeug.exceptions import NotFound
 
-from core.index.index import IndexBuilder
+from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from models.dataset import DocumentSegment
@@ -48,15 +48,9 @@ def disable_segment_from_index_task(segment_id: str):
            logging.info(click.style('Segment {} document status is invalid, pass.'.format(segment.id), fg='cyan'))
            return
 
-        vector_index = IndexBuilder.get_index(dataset, 'high_quality')
-        kw_index = IndexBuilder.get_index(dataset, 'economy')
-
-        # delete from vector index
-        if vector_index:
-            vector_index.delete_by_ids([segment.index_node_id])
-
-        # delete from keyword index
-        kw_index.delete_by_ids([segment.index_node_id])
+        index_type = dataset_document.doc_form
+        index_processor = IndexProcessorFactory(index_type).init_index_processor()
+        index_processor.clean(dataset, [segment.index_node_id])
 
         end_at = time.perf_counter()
         logging.info(click.style('Segment removed from index: {} latency: {}'.format(segment.id, end_at - start_at), fg='green'))

View File

@@ -6,9 +6,9 @@ import click
 from celery import shared_task
 from werkzeug.exceptions import NotFound
 
-from core.data_loader.loader.notion import NotionLoader
-from core.index.index import IndexBuilder
 from core.indexing_runner import DocumentIsPausedException, IndexingRunner
+from core.rag.extractor.notion_extractor import NotionExtractor
+from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
 from extensions.ext_database import db
 from models.dataset import Dataset, Document, DocumentSegment
 from models.source import DataSourceBinding
@@ -54,11 +54,11 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
     if not data_source_binding:
         raise ValueError('Data source binding not found.')
 
-    loader = NotionLoader(
-        notion_access_token=data_source_binding.access_token,
+    loader = NotionExtractor(
         notion_workspace_id=workspace_id,
         notion_obj_id=page_id,
-        notion_page_type=page_type
+        notion_page_type=page_type,
+        notion_access_token=data_source_binding.access_token
     )
 
     last_edited_time = loader.get_notion_last_edited_time()
@@ -74,20 +74,14 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
         dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
 
         if not dataset:
             raise Exception('Dataset not found')
 
-        vector_index = IndexBuilder.get_index(dataset, 'high_quality')
-        kw_index = IndexBuilder.get_index(dataset, 'economy')
+        index_type = document.doc_form
+        index_processor = IndexProcessorFactory(index_type).init_index_processor()
 
         segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
         index_node_ids = [segment.index_node_id for segment in segments]
 
-        # delete from vector index
-        if vector_index:
-            vector_index.delete_by_document_id(document_id)
-
-        # delete from keyword index
-        if index_node_ids:
-            kw_index.delete_by_ids(index_node_ids)
+        index_processor.clean(dataset, index_node_ids)
 
         for segment in segments:
             db.session.delete(segment)

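Besides the index swap, this file replaces NotionLoader with NotionExtractor; the constructor keyword set is the same apart from ordering, and the last-edited check keeps working because the extractor preserves get_notion_last_edited_time(). A hedged sketch of that check; the helper name is ours:

    from core.rag.extractor.notion_extractor import NotionExtractor


    def notion_page_changed(data_source_binding, workspace_id, page_id, page_type, last_known):
        # True when the Notion page was edited since it was last indexed.
        extractor = NotionExtractor(
            notion_workspace_id=workspace_id,
            notion_obj_id=page_id,
            notion_page_type=page_type,
            notion_access_token=data_source_binding.access_token,
        )
        return extractor.get_notion_last_edited_time() != last_known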
View File

@@ -6,8 +6,8 @@ import click
 from celery import shared_task
 from werkzeug.exceptions import NotFound
 
-from core.index.index import IndexBuilder
 from core.indexing_runner import DocumentIsPausedException, IndexingRunner
+from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
 from extensions.ext_database import db
 from models.dataset import Dataset, Document, DocumentSegment
@@ -42,19 +42,14 @@ def document_indexing_update_task(dataset_id: str, document_id: str):
     if not dataset:
         raise Exception('Dataset not found')
 
-    vector_index = IndexBuilder.get_index(dataset, 'high_quality')
-    kw_index = IndexBuilder.get_index(dataset, 'economy')
+    index_type = document.doc_form
+    index_processor = IndexProcessorFactory(index_type).init_index_processor()
 
     segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
     index_node_ids = [segment.index_node_id for segment in segments]
 
-    # delete from vector index
-    if vector_index:
-        vector_index.delete_by_ids(index_node_ids)
-
-    # delete from keyword index
-    if index_node_ids:
-        kw_index.delete_by_ids(index_node_ids)
+    index_processor.clean(dataset, index_node_ids)
 
     for segment in segments:
         db.session.delete(segment)

View File

@@ -4,10 +4,10 @@ import time
 
 import click
 from celery import shared_task
-from langchain.schema import Document
 from werkzeug.exceptions import NotFound
 
-from core.index.index import IndexBuilder
+from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
+from core.rag.models.document import Document
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from models.dataset import DocumentSegment
@@ -60,15 +60,9 @@ def enable_segment_to_index_task(segment_id: str):
            logging.info(click.style('Segment {} document status is invalid, pass.'.format(segment.id), fg='cyan'))
            return
 
+        index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor()
+
         # save vector index
-        index = IndexBuilder.get_index(dataset, 'high_quality')
-        if index:
-            index.add_texts([document], duplicate_check=True)
-
-        # save keyword index
-        index = IndexBuilder.get_index(dataset, 'economy')
-        if index:
-            index.add_texts([document])
+        index_processor.load(dataset, [document])
 
        end_at = time.perf_counter()
        logging.info(click.style('Segment enabled to index: {} latency: {}'.format(segment.id, end_at - start_at), fg='green'))

View File

@@ -5,7 +5,7 @@ import click
 from celery import shared_task
 from werkzeug.exceptions import NotFound
 
-from core.index.index import IndexBuilder
+from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from models.dataset import Document, DocumentSegment
@@ -37,18 +37,12 @@ def remove_document_from_index_task(document_id: str):
     if not dataset:
         raise Exception('Document has no dataset')
 
-    vector_index = IndexBuilder.get_index(dataset, 'high_quality')
-    kw_index = IndexBuilder.get_index(dataset, 'economy')
+    index_processor = IndexProcessorFactory(document.doc_form).init_index_processor()
 
-    # delete from vector index
-    if vector_index:
-        vector_index.delete_by_document_id(document.id)
-
-    # delete from keyword index
     segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document.id).all()
     index_node_ids = [segment.index_node_id for segment in segments]
-    if index_node_ids:
-        kw_index.delete_by_ids(index_node_ids)
+    index_processor.clean(dataset, index_node_ids)
 
     end_at = time.perf_counter()
     logging.info(

View File

@@ -1,114 +0,0 @@
-import datetime
-import logging
-import time
-from typing import Optional
-
-import click
-from celery import shared_task
-from langchain.schema import Document
-from werkzeug.exceptions import NotFound
-
-from core.index.index import IndexBuilder
-from extensions.ext_database import db
-from extensions.ext_redis import redis_client
-from models.dataset import DocumentSegment
-
-
-@shared_task(queue='dataset')
-def update_segment_index_task(segment_id: str, keywords: Optional[list[str]] = None):
-    """
-    Async update segment index
-    :param segment_id:
-    :param keywords:
-    Usage: update_segment_index_task.delay(segment_id)
-    """
-    logging.info(click.style('Start update segment index: {}'.format(segment_id), fg='green'))
-    start_at = time.perf_counter()
-
-    segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_id).first()
-    if not segment:
-        raise NotFound('Segment not found')
-
-    if segment.status != 'updating':
-        return
-
-    indexing_cache_key = 'segment_{}_indexing'.format(segment.id)
-
-    try:
-        dataset = segment.dataset
-
-        if not dataset:
-            logging.info(click.style('Segment {} has no dataset, pass.'.format(segment.id), fg='cyan'))
-            return
-
-        dataset_document = segment.document
-
-        if not dataset_document:
-            logging.info(click.style('Segment {} has no document, pass.'.format(segment.id), fg='cyan'))
-            return
-
-        if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != 'completed':
-            logging.info(click.style('Segment {} document status is invalid, pass.'.format(segment.id), fg='cyan'))
-            return
-
-        # update segment status to indexing
-        update_params = {
-            DocumentSegment.status: "indexing",
-            DocumentSegment.indexing_at: datetime.datetime.utcnow()
-        }
-        DocumentSegment.query.filter_by(id=segment.id).update(update_params)
-        db.session.commit()
-
-        vector_index = IndexBuilder.get_index(dataset, 'high_quality')
-        kw_index = IndexBuilder.get_index(dataset, 'economy')
-
-        # delete from vector index
-        if vector_index:
-            vector_index.delete_by_ids([segment.index_node_id])
-
-        # delete from keyword index
-        kw_index.delete_by_ids([segment.index_node_id])
-
-        # add new index
-        document = Document(
-            page_content=segment.content,
-            metadata={
-                "doc_id": segment.index_node_id,
-                "doc_hash": segment.index_node_hash,
-                "document_id": segment.document_id,
-                "dataset_id": segment.dataset_id,
-            }
-        )
-
-        # save vector index
-        index = IndexBuilder.get_index(dataset, 'high_quality')
-        if index:
-            index.add_texts([document], duplicate_check=True)
-
-        # save keyword index
-        index = IndexBuilder.get_index(dataset, 'economy')
-        if index:
-            if keywords and len(keywords) > 0:
-                index.create_segment_keywords(segment.index_node_id, keywords)
-            else:
-                index.add_texts([document])
-
-        # update segment to completed
-        update_params = {
-            DocumentSegment.status: "completed",
-            DocumentSegment.completed_at: datetime.datetime.utcnow()
-        }
-        DocumentSegment.query.filter_by(id=segment.id).update(update_params)
-        db.session.commit()
-
-        end_at = time.perf_counter()
-        logging.info(click.style('Segment update index: {} latency: {}'.format(segment.id, end_at - start_at), fg='green'))
-    except Exception as e:
-        logging.exception("update segment index failed")
-        segment.enabled = False
-        segment.disabled_at = datetime.datetime.utcnow()
-        segment.status = 'error'
-        segment.error = str(e)
-        db.session.commit()
-    finally:
-        redis_client.delete(indexing_cache_key)

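This deleted task (and update_segment_keyword_index_task below) wired IndexBuilder directly for the delete-then-reinsert cycle. The commit removes both outright; presumably the same cycle now runs through the processor interface wherever segments are updated, along these lines (a sketch, not code from this commit):

    from core.rag.index_processor.index_processor_factory import IndexProcessorFactory


    def reindex_segment(dataset, dataset_document, segment, document):
        # One clean + load covers both the vector and keyword indexes.
        index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor()
        index_processor.clean(dataset, [segment.index_node_id])
        index_processor.load(dataset, [document])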
View File

@@ -1,68 +0,0 @@
-import datetime
-import logging
-import time
-
-import click
-from celery import shared_task
-from werkzeug.exceptions import NotFound
-
-from core.index.index import IndexBuilder
-from extensions.ext_database import db
-from extensions.ext_redis import redis_client
-from models.dataset import DocumentSegment
-
-
-@shared_task(queue='dataset')
-def update_segment_keyword_index_task(segment_id: str):
-    """
-    Async update segment index
-    :param segment_id:
-    Usage: update_segment_keyword_index_task.delay(segment_id)
-    """
-    logging.info(click.style('Start update segment keyword index: {}'.format(segment_id), fg='green'))
-    start_at = time.perf_counter()
-
-    segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_id).first()
-    if not segment:
-        raise NotFound('Segment not found')
-
-    indexing_cache_key = 'segment_{}_indexing'.format(segment.id)
-
-    try:
-        dataset = segment.dataset
-
-        if not dataset:
-            logging.info(click.style('Segment {} has no dataset, pass.'.format(segment.id), fg='cyan'))
-            return
-
-        dataset_document = segment.document
-
-        if not dataset_document:
-            logging.info(click.style('Segment {} has no document, pass.'.format(segment.id), fg='cyan'))
-            return
-
-        if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != 'completed':
-            logging.info(click.style('Segment {} document status is invalid, pass.'.format(segment.id), fg='cyan'))
-            return
-
-        kw_index = IndexBuilder.get_index(dataset, 'economy')
-
-        # delete from keyword index
-        kw_index.delete_by_ids([segment.index_node_id])
-
-        # save keyword index
-        index = IndexBuilder.get_index(dataset, 'economy')
-        if index:
-            index.update_segment_keywords_index(segment.index_node_id, segment.keywords)
-
-        end_at = time.perf_counter()
-        logging.info(click.style('Segment update index: {} latency: {}'.format(segment.id, end_at - start_at), fg='green'))
-    except Exception as e:
-        logging.exception("update segment index failed")
-        segment.enabled = False
-        segment.disabled_at = datetime.datetime.utcnow()
-        segment.status = 'error'
-        segment.error = str(e)
-        db.session.commit()
-    finally:
-        redis_client.delete(indexing_cache_key)