Fix variable typo (#8084)
This commit is contained in:
@@ -8,5 +8,5 @@ class UnstructuredNonAsciiCharsCleaner(BaseCleaner):
|
||||
"""clean document content."""
|
||||
from unstructured.cleaners.core import clean_non_ascii_chars
|
||||
|
||||
# Returns "This text containsnon-ascii characters!"
|
||||
# Returns "This text contains non-ascii characters!"
|
||||
return clean_non_ascii_chars(content)
|
||||
|
||||
@@ -7,7 +7,7 @@ from core.rag.data_post_processor.data_post_processor import DataPostProcessor
|
||||
from core.rag.datasource.keyword.keyword_factory import Keyword
|
||||
from core.rag.datasource.vdb.vector_factory import Vector
|
||||
from core.rag.rerank.constants.rerank_mode import RerankMode
|
||||
from core.rag.retrieval.retrival_methods import RetrievalMethod
|
||||
from core.rag.retrieval.retrieval_methods import RetrievalMethod
|
||||
from extensions.ext_database import db
|
||||
from models.dataset import Dataset
|
||||
|
||||
@@ -26,7 +26,7 @@ default_retrieval_model = {
|
||||
class RetrievalService:
|
||||
|
||||
@classmethod
|
||||
def retrieve(cls, retrival_method: str, dataset_id: str, query: str,
|
||||
def retrieve(cls, retrieval_method: str, dataset_id: str, query: str,
|
||||
top_k: int, score_threshold: Optional[float] = .0,
|
||||
reranking_model: Optional[dict] = None, reranking_mode: Optional[str] = 'reranking_model',
|
||||
weights: Optional[dict] = None):
|
||||
@@ -39,7 +39,7 @@ class RetrievalService:
|
||||
threads = []
|
||||
exceptions = []
|
||||
# retrieval_model source with keyword
|
||||
if retrival_method == 'keyword_search':
|
||||
if retrieval_method == 'keyword_search':
|
||||
keyword_thread = threading.Thread(target=RetrievalService.keyword_search, kwargs={
|
||||
'flask_app': current_app._get_current_object(),
|
||||
'dataset_id': dataset_id,
|
||||
@@ -51,7 +51,7 @@ class RetrievalService:
|
||||
threads.append(keyword_thread)
|
||||
keyword_thread.start()
|
||||
# retrieval_model source with semantic
|
||||
if RetrievalMethod.is_support_semantic_search(retrival_method):
|
||||
if RetrievalMethod.is_support_semantic_search(retrieval_method):
|
||||
embedding_thread = threading.Thread(target=RetrievalService.embedding_search, kwargs={
|
||||
'flask_app': current_app._get_current_object(),
|
||||
'dataset_id': dataset_id,
|
||||
@@ -60,19 +60,19 @@ class RetrievalService:
|
||||
'score_threshold': score_threshold,
|
||||
'reranking_model': reranking_model,
|
||||
'all_documents': all_documents,
|
||||
'retrival_method': retrival_method,
|
||||
'retrieval_method': retrieval_method,
|
||||
'exceptions': exceptions,
|
||||
})
|
||||
threads.append(embedding_thread)
|
||||
embedding_thread.start()
|
||||
|
||||
# retrieval source with full text
|
||||
if RetrievalMethod.is_support_fulltext_search(retrival_method):
|
||||
if RetrievalMethod.is_support_fulltext_search(retrieval_method):
|
||||
full_text_index_thread = threading.Thread(target=RetrievalService.full_text_index_search, kwargs={
|
||||
'flask_app': current_app._get_current_object(),
|
||||
'dataset_id': dataset_id,
|
||||
'query': query,
|
||||
'retrival_method': retrival_method,
|
||||
'retrieval_method': retrieval_method,
|
||||
'score_threshold': score_threshold,
|
||||
'top_k': top_k,
|
||||
'reranking_model': reranking_model,
|
||||
@@ -89,7 +89,7 @@ class RetrievalService:
|
||||
exception_message = ';\n'.join(exceptions)
|
||||
raise Exception(exception_message)
|
||||
|
||||
if retrival_method == RetrievalMethod.HYBRID_SEARCH.value:
|
||||
if retrieval_method == RetrievalMethod.HYBRID_SEARCH.value:
|
||||
data_post_processor = DataPostProcessor(str(dataset.tenant_id), reranking_mode,
|
||||
reranking_model, weights, False)
|
||||
all_documents = data_post_processor.invoke(
|
||||
@@ -124,7 +124,7 @@ class RetrievalService:
|
||||
@classmethod
|
||||
def embedding_search(cls, flask_app: Flask, dataset_id: str, query: str,
|
||||
top_k: int, score_threshold: Optional[float], reranking_model: Optional[dict],
|
||||
all_documents: list, retrival_method: str, exceptions: list):
|
||||
all_documents: list, retrieval_method: str, exceptions: list):
|
||||
with flask_app.app_context():
|
||||
try:
|
||||
dataset = db.session.query(Dataset).filter(
|
||||
@@ -146,7 +146,7 @@ class RetrievalService:
|
||||
)
|
||||
|
||||
if documents:
|
||||
if reranking_model and reranking_model.get('reranking_model_name') and reranking_model.get('reranking_provider_name') and retrival_method == RetrievalMethod.SEMANTIC_SEARCH.value:
|
||||
if reranking_model and reranking_model.get('reranking_model_name') and reranking_model.get('reranking_provider_name') and retrieval_method == RetrievalMethod.SEMANTIC_SEARCH.value:
|
||||
data_post_processor = DataPostProcessor(str(dataset.tenant_id),
|
||||
RerankMode.RERANKING_MODEL.value,
|
||||
reranking_model, None, False)
|
||||
@@ -164,7 +164,7 @@ class RetrievalService:
|
||||
@classmethod
|
||||
def full_text_index_search(cls, flask_app: Flask, dataset_id: str, query: str,
|
||||
top_k: int, score_threshold: Optional[float], reranking_model: Optional[dict],
|
||||
all_documents: list, retrival_method: str, exceptions: list):
|
||||
all_documents: list, retrieval_method: str, exceptions: list):
|
||||
with flask_app.app_context():
|
||||
try:
|
||||
dataset = db.session.query(Dataset).filter(
|
||||
@@ -180,7 +180,7 @@ class RetrievalService:
|
||||
top_k=top_k
|
||||
)
|
||||
if documents:
|
||||
if reranking_model and reranking_model.get('reranking_model_name') and reranking_model.get('reranking_provider_name') and retrival_method == RetrievalMethod.FULL_TEXT_SEARCH.value:
|
||||
if reranking_model and reranking_model.get('reranking_model_name') and reranking_model.get('reranking_provider_name') and retrieval_method == RetrievalMethod.FULL_TEXT_SEARCH.value:
|
||||
data_post_processor = DataPostProcessor(str(dataset.tenant_id),
|
||||
RerankMode.RERANKING_MODEL.value,
|
||||
reranking_model, None, False)
|
||||
|
||||
@@ -275,10 +275,10 @@ class NotionExtractor(BaseExtractor):
|
||||
data = res.json()
|
||||
# get table headers text
|
||||
table_header_cell_texts = []
|
||||
tabel_header_cells = data["results"][0]['table_row']['cells']
|
||||
for tabel_header_cell in tabel_header_cells:
|
||||
if tabel_header_cell:
|
||||
for table_header_cell_text in tabel_header_cell:
|
||||
table_header_cells = data["results"][0]['table_row']['cells']
|
||||
for table_header_cell in table_header_cells:
|
||||
if table_header_cell:
|
||||
for table_header_cell_text in table_header_cell:
|
||||
text = table_header_cell_text["text"]["content"]
|
||||
table_header_cell_texts.append(text)
|
||||
else:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
from collections.abc import Iterator
|
||||
from typing import Optional
|
||||
|
||||
from core.rag.extractor.blod.blod import Blob
|
||||
from core.rag.extractor.blob.blob import Blob
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
from extensions.ext_storage import storage
|
||||
|
||||
@@ -34,7 +34,7 @@ class BaseIndexProcessor(ABC):
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def retrieve(self, retrival_method: str, query: str, dataset: Dataset, top_k: int,
|
||||
def retrieve(self, retrieval_method: str, query: str, dataset: Dataset, top_k: int,
|
||||
score_threshold: float, reranking_model: dict) -> list[Document]:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
|
||||
hash = helper.generate_text_hash(document_node.page_content)
|
||||
document_node.metadata['doc_id'] = doc_id
|
||||
document_node.metadata['doc_hash'] = hash
|
||||
# delete Spliter character
|
||||
# delete Splitter character
|
||||
page_content = document_node.page_content
|
||||
if page_content.startswith(".") or page_content.startswith("。"):
|
||||
page_content = page_content[1:].strip()
|
||||
@@ -76,10 +76,10 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
|
||||
else:
|
||||
keyword.delete()
|
||||
|
||||
def retrieve(self, retrival_method: str, query: str, dataset: Dataset, top_k: int,
|
||||
def retrieve(self, retrieval_method: str, query: str, dataset: Dataset, top_k: int,
|
||||
score_threshold: float, reranking_model: dict) -> list[Document]:
|
||||
# Set search parameters.
|
||||
results = RetrievalService.retrieve(retrival_method=retrival_method, dataset_id=dataset.id, query=query,
|
||||
results = RetrievalService.retrieve(retrieval_method=retrieval_method, dataset_id=dataset.id, query=query,
|
||||
top_k=top_k, score_threshold=score_threshold,
|
||||
reranking_model=reranking_model)
|
||||
# Organize results.
|
||||
|
||||
@@ -50,7 +50,7 @@ class QAIndexProcessor(BaseIndexProcessor):
|
||||
hash = helper.generate_text_hash(document_node.page_content)
|
||||
document_node.metadata['doc_id'] = doc_id
|
||||
document_node.metadata['doc_hash'] = hash
|
||||
# delete Spliter character
|
||||
# delete Splitter character
|
||||
page_content = document_node.page_content
|
||||
if page_content.startswith(".") or page_content.startswith("。"):
|
||||
page_content = page_content[1:]
|
||||
@@ -107,10 +107,10 @@ class QAIndexProcessor(BaseIndexProcessor):
|
||||
else:
|
||||
vector.delete()
|
||||
|
||||
def retrieve(self, retrival_method: str, query: str, dataset: Dataset, top_k: int,
|
||||
def retrieve(self, retrieval_method: str, query: str, dataset: Dataset, top_k: int,
|
||||
score_threshold: float, reranking_model: dict):
|
||||
# Set search parameters.
|
||||
results = RetrievalService.retrieve(retrival_method=retrival_method, dataset_id=dataset.id, query=query,
|
||||
results = RetrievalService.retrieve(retrieval_method=retrieval_method, dataset_id=dataset.id, query=query,
|
||||
top_k=top_k, score_threshold=score_threshold,
|
||||
reranking_model=reranking_model)
|
||||
# Organize results.
|
||||
|
||||
@@ -21,7 +21,7 @@ from core.rag.data_post_processor.data_post_processor import DataPostProcessor
|
||||
from core.rag.datasource.keyword.jieba.jieba_keyword_table_handler import JiebaKeywordTableHandler
|
||||
from core.rag.datasource.retrieval_service import RetrievalService
|
||||
from core.rag.models.document import Document
|
||||
from core.rag.retrieval.retrival_methods import RetrievalMethod
|
||||
from core.rag.retrieval.retrieval_methods import RetrievalMethod
|
||||
from core.rag.retrieval.router.multi_dataset_function_call_router import FunctionCallMultiDatasetRouter
|
||||
from core.rag.retrieval.router.multi_dataset_react_route import ReactMultiDatasetRouter
|
||||
from core.tools.tool.dataset_retriever.dataset_multi_retriever_tool import DatasetMultiRetrieverTool
|
||||
@@ -261,9 +261,9 @@ class DatasetRetrieval:
|
||||
top_k = retrieval_model_config['top_k']
|
||||
# get retrieval method
|
||||
if dataset.indexing_technique == "economy":
|
||||
retrival_method = 'keyword_search'
|
||||
retrieval_method = 'keyword_search'
|
||||
else:
|
||||
retrival_method = retrieval_model_config['search_method']
|
||||
retrieval_method = retrieval_model_config['search_method']
|
||||
# get reranking model
|
||||
reranking_model = retrieval_model_config['reranking_model'] \
|
||||
if retrieval_model_config['reranking_enable'] else None
|
||||
@@ -275,7 +275,7 @@ class DatasetRetrieval:
|
||||
|
||||
with measure_time() as timer:
|
||||
results = RetrievalService.retrieve(
|
||||
retrival_method=retrival_method, dataset_id=dataset.id,
|
||||
retrieval_method=retrieval_method, dataset_id=dataset.id,
|
||||
query=query,
|
||||
top_k=top_k, score_threshold=score_threshold,
|
||||
reranking_model=reranking_model,
|
||||
@@ -285,7 +285,7 @@ class DatasetRetrieval:
|
||||
self._on_query(query, [dataset_id], app_id, user_from, user_id)
|
||||
|
||||
if results:
|
||||
self._on_retrival_end(results, message_id, timer)
|
||||
self._on_retrieval_end(results, message_id, timer)
|
||||
|
||||
return results
|
||||
return []
|
||||
@@ -347,14 +347,14 @@ class DatasetRetrieval:
|
||||
self._on_query(query, dataset_ids, app_id, user_from, user_id)
|
||||
|
||||
if all_documents:
|
||||
self._on_retrival_end(all_documents, message_id, timer)
|
||||
self._on_retrieval_end(all_documents, message_id, timer)
|
||||
|
||||
return all_documents
|
||||
|
||||
def _on_retrival_end(
|
||||
def _on_retrieval_end(
|
||||
self, documents: list[Document], message_id: Optional[str] = None, timer: Optional[dict] = None
|
||||
) -> None:
|
||||
"""Handle retrival end."""
|
||||
"""Handle retrieval end."""
|
||||
for document in documents:
|
||||
query = db.session.query(DocumentSegment).filter(
|
||||
DocumentSegment.index_node_id == document.metadata['doc_id']
|
||||
@@ -419,7 +419,7 @@ class DatasetRetrieval:
|
||||
|
||||
if dataset.indexing_technique == "economy":
|
||||
# use keyword table query
|
||||
documents = RetrievalService.retrieve(retrival_method='keyword_search',
|
||||
documents = RetrievalService.retrieve(retrieval_method='keyword_search',
|
||||
dataset_id=dataset.id,
|
||||
query=query,
|
||||
top_k=top_k
|
||||
@@ -429,7 +429,7 @@ class DatasetRetrieval:
|
||||
else:
|
||||
if top_k > 0:
|
||||
# retrieval source
|
||||
documents = RetrievalService.retrieve(retrival_method=retrieval_model['search_method'],
|
||||
documents = RetrievalService.retrieve(retrieval_method=retrieval_model['search_method'],
|
||||
dataset_id=dataset.id,
|
||||
query=query,
|
||||
top_k=top_k,
|
||||
|
||||
Reference in New Issue
Block a user