delete the deprecated method (#5612)

This commit is contained in:
Jyong
2024-06-26 12:51:50 +08:00
committed by GitHub
parent af6e3869ff
commit 43335b5c87
19 changed files with 24 additions and 87 deletions

View File

@@ -70,22 +70,6 @@ class Jieba(BaseKeyword):
self._save_dataset_keyword_table(keyword_table)
def delete_by_document_id(self, document_id: str):
    """Drop every keyword-table entry that belongs to one document.

    Looks up the document's segments within this dataset, collects their
    ``index_node_id`` values, removes those ids from the dataset keyword
    table and saves the table back. The whole read-modify-write sequence runs
    while holding the ``keyword_indexing_lock_<dataset id>`` lock obtained
    from ``redis_client`` (``timeout=600``).

    :param document_id: id of the document whose segments are removed.
    """
    indexing_lock_name = f'keyword_indexing_lock_{self.dataset.id}'
    with redis_client.lock(indexing_lock_name, timeout=600):
        # Resolve the document to the node ids stored in the keyword table.
        segment_query = db.session.query(DocumentSegment).filter(
            DocumentSegment.dataset_id == self.dataset.id,
            DocumentSegment.document_id == document_id
        )
        node_ids = [row.index_node_id for row in segment_query.all()]

        current_table = self._get_dataset_keyword_table()
        pruned_table = self._delete_ids_from_keyword_table(current_table, node_ids)
        self._save_dataset_keyword_table(pruned_table)
def search(
self, query: str,
**kwargs: Any
@@ -104,6 +88,7 @@ class Jieba(BaseKeyword):
).first()
if segment:
documents.append(Document(
page_content=segment.content,
metadata={

View File

@@ -28,10 +28,6 @@ class BaseKeyword(ABC):
def delete_by_ids(self, ids: list[str]) -> None:
    """Remove the entries identified by ``ids``.

    The base implementation only raises; subclasses are expected to supply
    the real behavior.

    :param ids: identifiers of the entries to remove.
    :raises NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()
@abstractmethod
def delete_by_document_id(self, document_id: str) -> None:
    """Remove every entry associated with ``document_id``.

    Declared abstract; the base body only raises.

    :param document_id: id of the document whose entries are removed.
    :raises NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()
def delete(self) -> None:
    """Remove everything managed by this object.

    The base implementation only raises; subclasses are expected to supply
    the real behavior.

    :raises NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()

View File

@@ -39,9 +39,6 @@ class Keyword:
def delete_by_ids(self, ids: list[str]) -> None:
    """Forward the removal of ``ids`` to the wrapped keyword processor.

    :param ids: identifiers handed unchanged to the processor's
        ``delete_by_ids``.
    """
    processor = self._keyword_processor
    processor.delete_by_ids(ids)
def delete_by_document_id(self, document_id: str) -> None:
    """Forward the per-document removal to the wrapped keyword processor.

    :param document_id: document id handed unchanged to the processor's
        ``delete_by_document_id``.
    """
    processor = self._keyword_processor
    processor.delete_by_document_id(document_id)
def delete(self) -> None:
    """Forward the full removal to the wrapped keyword processor's ``delete``."""
    processor = self._keyword_processor
    processor.delete()

View File

@@ -100,12 +100,6 @@ class MilvusVector(BaseVector):
raise e
return pks
def delete_by_document_id(self, document_id: str):
    """Delete all stored entries whose ``document_id`` metadata matches.

    The primary keys are resolved through ``get_ids_by_metadata_field``; when
    that lookup yields nothing the client is not called at all.

    :param document_id: value matched against the ``document_id`` metadata field.
    """
    matching_pks = self.get_ids_by_metadata_field('document_id', document_id)
    if not matching_pks:
        return
    self._client.delete(collection_name=self._collection_name, pks=matching_pks)
def get_ids_by_metadata_field(self, key: str, value: str):
result = self._client.query(collection_name=self._collection_name,
filter=f'metadata["{key}"] == "{value}"',

View File

@@ -87,11 +87,6 @@ class OpenSearchVector(BaseVector):
helpers.bulk(self._client, actions)
def delete_by_document_id(self, document_id: str):
    """Delete all stored entries whose ``document_id`` metadata matches.

    Ids are resolved through ``get_ids_by_metadata_field`` and then handed to
    ``delete_by_ids``; nothing is deleted when the lookup comes back empty.

    :param document_id: value matched against the ``document_id`` metadata field.
    """
    matching_ids = self.get_ids_by_metadata_field('document_id', document_id)
    if not matching_ids:
        return
    self.delete_by_ids(matching_ids)
def get_ids_by_metadata_field(self, key: str, value: str):
query = {"query": {"term": {f"{Field.METADATA_KEY.value}.{key}": value}}}
response = self._client.search(index=self._collection_name.lower(), body=query)

View File

@@ -156,13 +156,6 @@ class OracleVector(BaseVector):
# idss.append(record[0])
# return idss
#def delete_by_document_id(self, document_id: str):
# ids = self.get_ids_by_metadata_field('doc_id', document_id)
# if len(ids)>0:
# with self._get_cursor() as cur:
# cur.execute(f"delete FROM {self.table_name} d WHERE d.meta.doc_id in '%s'" % ("','".join(ids),))
def delete_by_ids(self, ids: list[str]) -> None:
    """Delete the rows of ``self.table_name`` whose ``id`` is in ``ids``.

    The ids are passed as positional bind variables (``:1``, ``:2``, ...)
    instead of being formatted into the statement text. Formatting
    ``tuple(ids)`` produced ``IN ('x',)`` for a single id and ``IN ()`` for
    an empty list -- neither is valid SQL -- and left the quoting of the id
    values to Python's ``repr``.

    :param ids: ids of the rows to delete; an empty list is a no-op.
    """
    if not ids:
        # Nothing to delete, and an empty IN list cannot be expressed in SQL.
        return
    # NOTE(review): assumes the cursor from _get_cursor() accepts Oracle-style
    # positional binds with a sequence of values -- confirm against the driver.
    placeholders = ", ".join(f":{position}" for position in range(1, len(ids) + 1))
    with self._get_cursor() as cur:
        cur.execute(
            f"DELETE FROM {self.table_name} WHERE id IN ({placeholders})",
            list(ids),
        )

View File

@@ -130,14 +130,6 @@ class PGVectoRS(BaseVector):
return pks
def delete_by_document_id(self, document_id: str):
    """Delete all rows whose ``document_id`` metadata matches.

    Ids are resolved through ``get_ids_by_metadata_field``; when any are found
    a single ``DELETE ... WHERE id = ANY(:ids)`` statement is executed in a
    fresh session and committed. Nothing is executed for an empty lookup.

    :param document_id: value matched against the ``document_id`` metadata field.
    """
    matching_ids = self.get_ids_by_metadata_field('document_id', document_id)
    if not matching_ids:
        return
    with Session(self._client) as session:
        delete_statement = sql_text(f"DELETE FROM {self._collection_name} WHERE id = ANY(:ids)")
        session.execute(delete_statement, {'ids': matching_ids})
        session.commit()
def get_ids_by_metadata_field(self, key: str, value: str):
result = None
with Session(self._client) as session:

View File

@@ -151,11 +151,6 @@ class RelytVector(BaseVector):
return ids
def delete_by_document_id(self, document_id: str):
    """Delete all stored entries whose ``document_id`` metadata matches.

    Ids are resolved through ``get_ids_by_metadata_field`` and then handed to
    ``delete_by_uuids``; nothing is deleted when the lookup comes back empty.

    :param document_id: value matched against the ``document_id`` metadata field.
    """
    matching_ids = self.get_ids_by_metadata_field('document_id', document_id)
    if not matching_ids:
        return
    self.delete_by_uuids(matching_ids)
def get_ids_by_metadata_field(self, key: str, value: str):
result = None
with Session(self.client) as session:

View File

@@ -161,11 +161,6 @@ class TiDBVector(BaseVector):
print("Delete operation failed:", str(e))
return False
def delete_by_document_id(self, document_id: str):
    """Delete all stored entries whose ``document_id`` metadata matches.

    Ids are resolved through ``get_ids_by_metadata_field`` and then handed to
    ``_delete_by_ids``; its return value is not propagated. Nothing is deleted
    when the lookup comes back empty.

    :param document_id: value matched against the ``document_id`` metadata field.
    """
    matching_ids = self.get_ids_by_metadata_field('document_id', document_id)
    if not matching_ids:
        return
    self._delete_by_ids(matching_ids)
def get_ids_by_metadata_field(self, key: str, value: str):
with Session(self._engine) as session:
select_statement = sql_text(

View File

@@ -31,9 +31,6 @@ class BaseVector(ABC):
def delete_by_ids(self, ids: list[str]) -> None:
    """Remove the entries identified by ``ids``.

    The base implementation only raises; subclasses are expected to supply
    the real behavior.

    :param ids: identifiers of the entries to remove.
    :raises NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()
def delete_by_document_id(self, document_id: str):
    """Remove every entry associated with ``document_id``.

    The base implementation only raises; subclasses are expected to supply
    the real behavior.

    :param document_id: id of the document whose entries are removed.
    :raises NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()
def get_ids_by_metadata_field(self, key: str, value: str):
    """Look up the ids of entries whose metadata field ``key`` equals ``value``.

    The base implementation only raises; subclasses are expected to supply
    the real lookup.

    :param key: name of the metadata field to match on.
    :param value: value the metadata field must have.
    :raises NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()

View File

@@ -1,4 +1,5 @@
"""Abstract interface for document loader implementations."""
import os
from typing import Optional
import pandas as pd
@@ -29,8 +30,15 @@ class ExcelExtractor(BaseExtractor):
def extract(self) -> list[Document]:
""" Load from Excel file in xls or xlsx format using Pandas."""
documents = []
# Determine the file extension
file_extension = os.path.splitext(self._file_path)[-1].lower()
# Read each worksheet of an Excel file using Pandas
excel_file = pd.ExcelFile(self._file_path)
if file_extension == '.xlsx':
excel_file = pd.ExcelFile(self._file_path, engine='openpyxl')
elif file_extension == '.xls':
excel_file = pd.ExcelFile(self._file_path, engine='xlrd')
else:
raise ValueError(f"Unsupported file extension: {file_extension}")
for sheet_name in excel_file.sheet_names:
df: pd.DataFrame = excel_file.parse(sheet_name=sheet_name)