Fix variable typo (#8084)

This commit is contained in:
Nam Vu
2024-09-08 12:14:11 +07:00
committed by GitHub
parent b1918dae5e
commit 2d7954c7da
215 changed files with 599 additions and 597 deletions

View File

@@ -8,5 +8,5 @@ class UnstructuredNonAsciiCharsCleaner(BaseCleaner):
"""clean document content."""
from unstructured.cleaners.core import clean_non_ascii_chars
# Returns "This text containsnon-ascii characters!"
# Returns "This text contains non-ascii characters!"
return clean_non_ascii_chars(content)

View File

@@ -7,7 +7,7 @@ from core.rag.data_post_processor.data_post_processor import DataPostProcessor
from core.rag.datasource.keyword.keyword_factory import Keyword
from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.rerank.constants.rerank_mode import RerankMode
from core.rag.retrieval.retrival_methods import RetrievalMethod
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_database import db
from models.dataset import Dataset
@@ -26,7 +26,7 @@ default_retrieval_model = {
class RetrievalService:
@classmethod
def retrieve(cls, retrival_method: str, dataset_id: str, query: str,
def retrieve(cls, retrieval_method: str, dataset_id: str, query: str,
top_k: int, score_threshold: Optional[float] = .0,
reranking_model: Optional[dict] = None, reranking_mode: Optional[str] = 'reranking_model',
weights: Optional[dict] = None):
@@ -39,7 +39,7 @@ class RetrievalService:
threads = []
exceptions = []
# retrieval_model source with keyword
if retrival_method == 'keyword_search':
if retrieval_method == 'keyword_search':
keyword_thread = threading.Thread(target=RetrievalService.keyword_search, kwargs={
'flask_app': current_app._get_current_object(),
'dataset_id': dataset_id,
@@ -51,7 +51,7 @@ class RetrievalService:
threads.append(keyword_thread)
keyword_thread.start()
# retrieval_model source with semantic
if RetrievalMethod.is_support_semantic_search(retrival_method):
if RetrievalMethod.is_support_semantic_search(retrieval_method):
embedding_thread = threading.Thread(target=RetrievalService.embedding_search, kwargs={
'flask_app': current_app._get_current_object(),
'dataset_id': dataset_id,
@@ -60,19 +60,19 @@ class RetrievalService:
'score_threshold': score_threshold,
'reranking_model': reranking_model,
'all_documents': all_documents,
'retrival_method': retrival_method,
'retrieval_method': retrieval_method,
'exceptions': exceptions,
})
threads.append(embedding_thread)
embedding_thread.start()
# retrieval source with full text
if RetrievalMethod.is_support_fulltext_search(retrival_method):
if RetrievalMethod.is_support_fulltext_search(retrieval_method):
full_text_index_thread = threading.Thread(target=RetrievalService.full_text_index_search, kwargs={
'flask_app': current_app._get_current_object(),
'dataset_id': dataset_id,
'query': query,
'retrival_method': retrival_method,
'retrieval_method': retrieval_method,
'score_threshold': score_threshold,
'top_k': top_k,
'reranking_model': reranking_model,
@@ -89,7 +89,7 @@ class RetrievalService:
exception_message = ';\n'.join(exceptions)
raise Exception(exception_message)
if retrival_method == RetrievalMethod.HYBRID_SEARCH.value:
if retrieval_method == RetrievalMethod.HYBRID_SEARCH.value:
data_post_processor = DataPostProcessor(str(dataset.tenant_id), reranking_mode,
reranking_model, weights, False)
all_documents = data_post_processor.invoke(
@@ -124,7 +124,7 @@ class RetrievalService:
@classmethod
def embedding_search(cls, flask_app: Flask, dataset_id: str, query: str,
top_k: int, score_threshold: Optional[float], reranking_model: Optional[dict],
all_documents: list, retrival_method: str, exceptions: list):
all_documents: list, retrieval_method: str, exceptions: list):
with flask_app.app_context():
try:
dataset = db.session.query(Dataset).filter(
@@ -146,7 +146,7 @@ class RetrievalService:
)
if documents:
if reranking_model and reranking_model.get('reranking_model_name') and reranking_model.get('reranking_provider_name') and retrival_method == RetrievalMethod.SEMANTIC_SEARCH.value:
if reranking_model and reranking_model.get('reranking_model_name') and reranking_model.get('reranking_provider_name') and retrieval_method == RetrievalMethod.SEMANTIC_SEARCH.value:
data_post_processor = DataPostProcessor(str(dataset.tenant_id),
RerankMode.RERANKING_MODEL.value,
reranking_model, None, False)
@@ -164,7 +164,7 @@ class RetrievalService:
@classmethod
def full_text_index_search(cls, flask_app: Flask, dataset_id: str, query: str,
top_k: int, score_threshold: Optional[float], reranking_model: Optional[dict],
all_documents: list, retrival_method: str, exceptions: list):
all_documents: list, retrieval_method: str, exceptions: list):
with flask_app.app_context():
try:
dataset = db.session.query(Dataset).filter(
@@ -180,7 +180,7 @@ class RetrievalService:
top_k=top_k
)
if documents:
if reranking_model and reranking_model.get('reranking_model_name') and reranking_model.get('reranking_provider_name') and retrival_method == RetrievalMethod.FULL_TEXT_SEARCH.value:
if reranking_model and reranking_model.get('reranking_model_name') and reranking_model.get('reranking_provider_name') and retrieval_method == RetrievalMethod.FULL_TEXT_SEARCH.value:
data_post_processor = DataPostProcessor(str(dataset.tenant_id),
RerankMode.RERANKING_MODEL.value,
reranking_model, None, False)

View File

@@ -275,10 +275,10 @@ class NotionExtractor(BaseExtractor):
data = res.json()
# get table headers text
table_header_cell_texts = []
tabel_header_cells = data["results"][0]['table_row']['cells']
for tabel_header_cell in tabel_header_cells:
if tabel_header_cell:
for table_header_cell_text in tabel_header_cell:
table_header_cells = data["results"][0]['table_row']['cells']
for table_header_cell in table_header_cells:
if table_header_cell:
for table_header_cell_text in table_header_cell:
text = table_header_cell_text["text"]["content"]
table_header_cell_texts.append(text)
else:

View File

@@ -2,7 +2,7 @@
from collections.abc import Iterator
from typing import Optional
from core.rag.extractor.blod.blod import Blob
from core.rag.extractor.blob.blob import Blob
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_storage import storage

View File

@@ -34,7 +34,7 @@ class BaseIndexProcessor(ABC):
raise NotImplementedError
@abstractmethod
def retrieve(self, retrival_method: str, query: str, dataset: Dataset, top_k: int,
def retrieve(self, retrieval_method: str, query: str, dataset: Dataset, top_k: int,
score_threshold: float, reranking_model: dict) -> list[Document]:
raise NotImplementedError

View File

@@ -42,7 +42,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
hash = helper.generate_text_hash(document_node.page_content)
document_node.metadata['doc_id'] = doc_id
document_node.metadata['doc_hash'] = hash
# delete Spliter character
# delete Splitter character
page_content = document_node.page_content
if page_content.startswith(".") or page_content.startswith("。"):
page_content = page_content[1:].strip()
@@ -76,10 +76,10 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
else:
keyword.delete()
def retrieve(self, retrival_method: str, query: str, dataset: Dataset, top_k: int,
def retrieve(self, retrieval_method: str, query: str, dataset: Dataset, top_k: int,
score_threshold: float, reranking_model: dict) -> list[Document]:
# Set search parameters.
results = RetrievalService.retrieve(retrival_method=retrival_method, dataset_id=dataset.id, query=query,
results = RetrievalService.retrieve(retrieval_method=retrieval_method, dataset_id=dataset.id, query=query,
top_k=top_k, score_threshold=score_threshold,
reranking_model=reranking_model)
# Organize results.

View File

@@ -50,7 +50,7 @@ class QAIndexProcessor(BaseIndexProcessor):
hash = helper.generate_text_hash(document_node.page_content)
document_node.metadata['doc_id'] = doc_id
document_node.metadata['doc_hash'] = hash
# delete Spliter character
# delete Splitter character
page_content = document_node.page_content
if page_content.startswith(".") or page_content.startswith("。"):
page_content = page_content[1:]
@@ -107,10 +107,10 @@ class QAIndexProcessor(BaseIndexProcessor):
else:
vector.delete()
def retrieve(self, retrival_method: str, query: str, dataset: Dataset, top_k: int,
def retrieve(self, retrieval_method: str, query: str, dataset: Dataset, top_k: int,
score_threshold: float, reranking_model: dict):
# Set search parameters.
results = RetrievalService.retrieve(retrival_method=retrival_method, dataset_id=dataset.id, query=query,
results = RetrievalService.retrieve(retrieval_method=retrieval_method, dataset_id=dataset.id, query=query,
top_k=top_k, score_threshold=score_threshold,
reranking_model=reranking_model)
# Organize results.

View File

@@ -21,7 +21,7 @@ from core.rag.data_post_processor.data_post_processor import DataPostProcessor
from core.rag.datasource.keyword.jieba.jieba_keyword_table_handler import JiebaKeywordTableHandler
from core.rag.datasource.retrieval_service import RetrievalService
from core.rag.models.document import Document
from core.rag.retrieval.retrival_methods import RetrievalMethod
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from core.rag.retrieval.router.multi_dataset_function_call_router import FunctionCallMultiDatasetRouter
from core.rag.retrieval.router.multi_dataset_react_route import ReactMultiDatasetRouter
from core.tools.tool.dataset_retriever.dataset_multi_retriever_tool import DatasetMultiRetrieverTool
@@ -261,9 +261,9 @@ class DatasetRetrieval:
top_k = retrieval_model_config['top_k']
# get retrieval method
if dataset.indexing_technique == "economy":
retrival_method = 'keyword_search'
retrieval_method = 'keyword_search'
else:
retrival_method = retrieval_model_config['search_method']
retrieval_method = retrieval_model_config['search_method']
# get reranking model
reranking_model = retrieval_model_config['reranking_model'] \
if retrieval_model_config['reranking_enable'] else None
@@ -275,7 +275,7 @@ class DatasetRetrieval:
with measure_time() as timer:
results = RetrievalService.retrieve(
retrival_method=retrival_method, dataset_id=dataset.id,
retrieval_method=retrieval_method, dataset_id=dataset.id,
query=query,
top_k=top_k, score_threshold=score_threshold,
reranking_model=reranking_model,
@@ -285,7 +285,7 @@ class DatasetRetrieval:
self._on_query(query, [dataset_id], app_id, user_from, user_id)
if results:
self._on_retrival_end(results, message_id, timer)
self._on_retrieval_end(results, message_id, timer)
return results
return []
@@ -347,14 +347,14 @@ class DatasetRetrieval:
self._on_query(query, dataset_ids, app_id, user_from, user_id)
if all_documents:
self._on_retrival_end(all_documents, message_id, timer)
self._on_retrieval_end(all_documents, message_id, timer)
return all_documents
def _on_retrival_end(
def _on_retrieval_end(
self, documents: list[Document], message_id: Optional[str] = None, timer: Optional[dict] = None
) -> None:
"""Handle retrival end."""
"""Handle retrieval end."""
for document in documents:
query = db.session.query(DocumentSegment).filter(
DocumentSegment.index_node_id == document.metadata['doc_id']
@@ -419,7 +419,7 @@ class DatasetRetrieval:
if dataset.indexing_technique == "economy":
# use keyword table query
documents = RetrievalService.retrieve(retrival_method='keyword_search',
documents = RetrievalService.retrieve(retrieval_method='keyword_search',
dataset_id=dataset.id,
query=query,
top_k=top_k
@@ -429,7 +429,7 @@ class DatasetRetrieval:
else:
if top_k > 0:
# retrieval source
documents = RetrievalService.retrieve(retrival_method=retrieval_model['search_method'],
documents = RetrievalService.retrieve(retrieval_method=retrieval_model['search_method'],
dataset_id=dataset.id,
query=query,
top_k=top_k,