remove .value (#26633)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -488,9 +488,9 @@ class ClickzettaVector(BaseVector):
|
||||
create_table_sql = f"""
|
||||
CREATE TABLE IF NOT EXISTS {self._config.schema_name}.{self._table_name} (
|
||||
id STRING NOT NULL COMMENT 'Unique document identifier',
|
||||
{Field.CONTENT_KEY.value} STRING NOT NULL COMMENT 'Document text content for search and retrieval',
|
||||
{Field.METADATA_KEY.value} JSON COMMENT 'Document metadata including source, type, and other attributes',
|
||||
{Field.VECTOR.value} VECTOR(FLOAT, {dimension}) NOT NULL COMMENT
|
||||
{Field.CONTENT_KEY} STRING NOT NULL COMMENT 'Document text content for search and retrieval',
|
||||
{Field.METADATA_KEY} JSON COMMENT 'Document metadata including source, type, and other attributes',
|
||||
{Field.VECTOR} VECTOR(FLOAT, {dimension}) NOT NULL COMMENT
|
||||
'High-dimensional embedding vector for semantic similarity search',
|
||||
PRIMARY KEY (id)
|
||||
) COMMENT 'Dify RAG knowledge base vector storage table for document embeddings and content'
|
||||
@@ -519,15 +519,15 @@ class ClickzettaVector(BaseVector):
|
||||
existing_indexes = cursor.fetchall()
|
||||
for idx in existing_indexes:
|
||||
# Check if vector index already exists on the embedding column
|
||||
if Field.VECTOR.value in str(idx).lower():
|
||||
logger.info("Vector index already exists on column %s", Field.VECTOR.value)
|
||||
if Field.VECTOR in str(idx).lower():
|
||||
logger.info("Vector index already exists on column %s", Field.VECTOR)
|
||||
return
|
||||
except (RuntimeError, ValueError) as e:
|
||||
logger.warning("Failed to check existing indexes: %s", e)
|
||||
|
||||
index_sql = f"""
|
||||
CREATE VECTOR INDEX IF NOT EXISTS {index_name}
|
||||
ON TABLE {self._config.schema_name}.{self._table_name}({Field.VECTOR.value})
|
||||
ON TABLE {self._config.schema_name}.{self._table_name}({Field.VECTOR})
|
||||
PROPERTIES (
|
||||
"distance.function" = "{self._config.vector_distance_function}",
|
||||
"scalar.type" = "f32",
|
||||
@@ -560,17 +560,17 @@ class ClickzettaVector(BaseVector):
|
||||
# More precise check: look for inverted index specifically on the content column
|
||||
if (
|
||||
"inverted" in idx_str
|
||||
and Field.CONTENT_KEY.value.lower() in idx_str
|
||||
and Field.CONTENT_KEY.lower() in idx_str
|
||||
and (index_name.lower() in idx_str or f"idx_{self._table_name}_text" in idx_str)
|
||||
):
|
||||
logger.info("Inverted index already exists on column %s: %s", Field.CONTENT_KEY.value, idx)
|
||||
logger.info("Inverted index already exists on column %s: %s", Field.CONTENT_KEY, idx)
|
||||
return
|
||||
except (RuntimeError, ValueError) as e:
|
||||
logger.warning("Failed to check existing indexes: %s", e)
|
||||
|
||||
index_sql = f"""
|
||||
CREATE INVERTED INDEX IF NOT EXISTS {index_name}
|
||||
ON TABLE {self._config.schema_name}.{self._table_name} ({Field.CONTENT_KEY.value})
|
||||
ON TABLE {self._config.schema_name}.{self._table_name} ({Field.CONTENT_KEY})
|
||||
PROPERTIES (
|
||||
"analyzer" = "{self._config.analyzer_type}",
|
||||
"mode" = "{self._config.analyzer_mode}"
|
||||
@@ -588,13 +588,13 @@ class ClickzettaVector(BaseVector):
|
||||
or "with the same type" in error_msg
|
||||
or "cannot create inverted index" in error_msg
|
||||
) and "already has index" in error_msg:
|
||||
logger.info("Inverted index already exists on column %s", Field.CONTENT_KEY.value)
|
||||
logger.info("Inverted index already exists on column %s", Field.CONTENT_KEY)
|
||||
# Try to get the existing index name for logging
|
||||
try:
|
||||
cursor.execute(f"SHOW INDEX FROM {self._config.schema_name}.{self._table_name}")
|
||||
existing_indexes = cursor.fetchall()
|
||||
for idx in existing_indexes:
|
||||
if "inverted" in str(idx).lower() and Field.CONTENT_KEY.value.lower() in str(idx).lower():
|
||||
if "inverted" in str(idx).lower() and Field.CONTENT_KEY.lower() in str(idx).lower():
|
||||
logger.info("Found existing inverted index: %s", idx)
|
||||
break
|
||||
except (RuntimeError, ValueError):
|
||||
@@ -669,7 +669,7 @@ class ClickzettaVector(BaseVector):
|
||||
|
||||
# Use parameterized INSERT with executemany for better performance and security
|
||||
# Cast JSON and VECTOR in SQL, pass raw data as parameters
|
||||
columns = f"id, {Field.CONTENT_KEY.value}, {Field.METADATA_KEY.value}, {Field.VECTOR.value}"
|
||||
columns = f"id, {Field.CONTENT_KEY}, {Field.METADATA_KEY}, {Field.VECTOR}"
|
||||
insert_sql = (
|
||||
f"INSERT INTO {self._config.schema_name}.{self._table_name} ({columns}) "
|
||||
f"VALUES (?, ?, CAST(? AS JSON), CAST(? AS VECTOR({vector_dimension})))"
|
||||
@@ -767,7 +767,7 @@ class ClickzettaVector(BaseVector):
|
||||
# Use json_extract_string function for ClickZetta compatibility
|
||||
sql = (
|
||||
f"DELETE FROM {self._config.schema_name}.{self._table_name} "
|
||||
f"WHERE json_extract_string({Field.METADATA_KEY.value}, '$.{key}') = ?"
|
||||
f"WHERE json_extract_string({Field.METADATA_KEY}, '$.{key}') = ?"
|
||||
)
|
||||
cursor.execute(sql, binding_params=[value])
|
||||
|
||||
@@ -795,9 +795,7 @@ class ClickzettaVector(BaseVector):
|
||||
safe_doc_ids = [str(id).replace("'", "''") for id in document_ids_filter]
|
||||
doc_ids_str = ",".join(f"'{id}'" for id in safe_doc_ids)
|
||||
# Use json_extract_string function for ClickZetta compatibility
|
||||
filter_clauses.append(
|
||||
f"json_extract_string({Field.METADATA_KEY.value}, '$.document_id') IN ({doc_ids_str})"
|
||||
)
|
||||
filter_clauses.append(f"json_extract_string({Field.METADATA_KEY}, '$.document_id') IN ({doc_ids_str})")
|
||||
|
||||
# No need for dataset_id filter since each dataset has its own table
|
||||
|
||||
@@ -808,23 +806,21 @@ class ClickzettaVector(BaseVector):
|
||||
distance_func = "COSINE_DISTANCE"
|
||||
if score_threshold > 0:
|
||||
query_vector_str = f"CAST('[{self._format_vector_simple(query_vector)}]' AS VECTOR({vector_dimension}))"
|
||||
filter_clauses.append(
|
||||
f"{distance_func}({Field.VECTOR.value}, {query_vector_str}) < {2 - score_threshold}"
|
||||
)
|
||||
filter_clauses.append(f"{distance_func}({Field.VECTOR}, {query_vector_str}) < {2 - score_threshold}")
|
||||
else:
|
||||
# For L2 distance, smaller is better
|
||||
distance_func = "L2_DISTANCE"
|
||||
if score_threshold > 0:
|
||||
query_vector_str = f"CAST('[{self._format_vector_simple(query_vector)}]' AS VECTOR({vector_dimension}))"
|
||||
filter_clauses.append(f"{distance_func}({Field.VECTOR.value}, {query_vector_str}) < {score_threshold}")
|
||||
filter_clauses.append(f"{distance_func}({Field.VECTOR}, {query_vector_str}) < {score_threshold}")
|
||||
|
||||
where_clause = " AND ".join(filter_clauses) if filter_clauses else "1=1"
|
||||
|
||||
# Execute vector search query
|
||||
query_vector_str = f"CAST('[{self._format_vector_simple(query_vector)}]' AS VECTOR({vector_dimension}))"
|
||||
search_sql = f"""
|
||||
SELECT id, {Field.CONTENT_KEY.value}, {Field.METADATA_KEY.value},
|
||||
{distance_func}({Field.VECTOR.value}, {query_vector_str}) AS distance
|
||||
SELECT id, {Field.CONTENT_KEY}, {Field.METADATA_KEY},
|
||||
{distance_func}({Field.VECTOR}, {query_vector_str}) AS distance
|
||||
FROM {self._config.schema_name}.{self._table_name}
|
||||
WHERE {where_clause}
|
||||
ORDER BY distance
|
||||
@@ -887,9 +883,7 @@ class ClickzettaVector(BaseVector):
|
||||
safe_doc_ids = [str(id).replace("'", "''") for id in document_ids_filter]
|
||||
doc_ids_str = ",".join(f"'{id}'" for id in safe_doc_ids)
|
||||
# Use json_extract_string function for ClickZetta compatibility
|
||||
filter_clauses.append(
|
||||
f"json_extract_string({Field.METADATA_KEY.value}, '$.document_id') IN ({doc_ids_str})"
|
||||
)
|
||||
filter_clauses.append(f"json_extract_string({Field.METADATA_KEY}, '$.document_id') IN ({doc_ids_str})")
|
||||
|
||||
# No need for dataset_id filter since each dataset has its own table
|
||||
|
||||
@@ -897,13 +891,13 @@ class ClickzettaVector(BaseVector):
|
||||
# match_all requires all terms to be present
|
||||
# Use simple quote escaping for MATCH_ALL since it needs to be in the WHERE clause
|
||||
escaped_query = query.replace("'", "''")
|
||||
filter_clauses.append(f"MATCH_ALL({Field.CONTENT_KEY.value}, '{escaped_query}')")
|
||||
filter_clauses.append(f"MATCH_ALL({Field.CONTENT_KEY}, '{escaped_query}')")
|
||||
|
||||
where_clause = " AND ".join(filter_clauses)
|
||||
|
||||
# Execute full-text search query
|
||||
search_sql = f"""
|
||||
SELECT id, {Field.CONTENT_KEY.value}, {Field.METADATA_KEY.value}
|
||||
SELECT id, {Field.CONTENT_KEY}, {Field.METADATA_KEY}
|
||||
FROM {self._config.schema_name}.{self._table_name}
|
||||
WHERE {where_clause}
|
||||
LIMIT {top_k}
|
||||
@@ -986,19 +980,17 @@ class ClickzettaVector(BaseVector):
|
||||
safe_doc_ids = [str(id).replace("'", "''") for id in document_ids_filter]
|
||||
doc_ids_str = ",".join(f"'{id}'" for id in safe_doc_ids)
|
||||
# Use json_extract_string function for ClickZetta compatibility
|
||||
filter_clauses.append(
|
||||
f"json_extract_string({Field.METADATA_KEY.value}, '$.document_id') IN ({doc_ids_str})"
|
||||
)
|
||||
filter_clauses.append(f"json_extract_string({Field.METADATA_KEY}, '$.document_id') IN ({doc_ids_str})")
|
||||
|
||||
# No need for dataset_id filter since each dataset has its own table
|
||||
|
||||
# Use simple quote escaping for LIKE clause
|
||||
escaped_query = query.replace("'", "''")
|
||||
filter_clauses.append(f"{Field.CONTENT_KEY.value} LIKE '%{escaped_query}%'")
|
||||
filter_clauses.append(f"{Field.CONTENT_KEY} LIKE '%{escaped_query}%'")
|
||||
where_clause = " AND ".join(filter_clauses)
|
||||
|
||||
search_sql = f"""
|
||||
SELECT id, {Field.CONTENT_KEY.value}, {Field.METADATA_KEY.value}
|
||||
SELECT id, {Field.CONTENT_KEY}, {Field.METADATA_KEY}
|
||||
FROM {self._config.schema_name}.{self._table_name}
|
||||
WHERE {where_clause}
|
||||
LIMIT {top_k}
|
||||
|
||||
@@ -57,18 +57,18 @@ class ElasticSearchJaVector(ElasticSearchVector):
|
||||
}
|
||||
mappings = {
|
||||
"properties": {
|
||||
Field.CONTENT_KEY.value: {
|
||||
Field.CONTENT_KEY: {
|
||||
"type": "text",
|
||||
"analyzer": "ja_analyzer",
|
||||
"search_analyzer": "ja_analyzer",
|
||||
},
|
||||
Field.VECTOR.value: { # Make sure the dimension is correct here
|
||||
Field.VECTOR: { # Make sure the dimension is correct here
|
||||
"type": "dense_vector",
|
||||
"dims": dim,
|
||||
"index": True,
|
||||
"similarity": "cosine",
|
||||
},
|
||||
Field.METADATA_KEY.value: {
|
||||
Field.METADATA_KEY: {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"doc_id": {"type": "keyword"} # Map doc_id to keyword type
|
||||
|
||||
@@ -163,9 +163,9 @@ class ElasticSearchVector(BaseVector):
|
||||
index=self._collection_name,
|
||||
id=uuids[i],
|
||||
document={
|
||||
Field.CONTENT_KEY.value: documents[i].page_content,
|
||||
Field.VECTOR.value: embeddings[i] or None,
|
||||
Field.METADATA_KEY.value: documents[i].metadata or {},
|
||||
Field.CONTENT_KEY: documents[i].page_content,
|
||||
Field.VECTOR: embeddings[i] or None,
|
||||
Field.METADATA_KEY: documents[i].metadata or {},
|
||||
},
|
||||
)
|
||||
self._client.indices.refresh(index=self._collection_name)
|
||||
@@ -193,7 +193,7 @@ class ElasticSearchVector(BaseVector):
|
||||
def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
|
||||
top_k = kwargs.get("top_k", 4)
|
||||
num_candidates = math.ceil(top_k * 1.5)
|
||||
knn = {"field": Field.VECTOR.value, "query_vector": query_vector, "k": top_k, "num_candidates": num_candidates}
|
||||
knn = {"field": Field.VECTOR, "query_vector": query_vector, "k": top_k, "num_candidates": num_candidates}
|
||||
document_ids_filter = kwargs.get("document_ids_filter")
|
||||
if document_ids_filter:
|
||||
knn["filter"] = {"terms": {"metadata.document_id": document_ids_filter}}
|
||||
@@ -205,9 +205,9 @@ class ElasticSearchVector(BaseVector):
|
||||
docs_and_scores.append(
|
||||
(
|
||||
Document(
|
||||
page_content=hit["_source"][Field.CONTENT_KEY.value],
|
||||
vector=hit["_source"][Field.VECTOR.value],
|
||||
metadata=hit["_source"][Field.METADATA_KEY.value],
|
||||
page_content=hit["_source"][Field.CONTENT_KEY],
|
||||
vector=hit["_source"][Field.VECTOR],
|
||||
metadata=hit["_source"][Field.METADATA_KEY],
|
||||
),
|
||||
hit["_score"],
|
||||
)
|
||||
@@ -224,13 +224,13 @@ class ElasticSearchVector(BaseVector):
|
||||
return docs
|
||||
|
||||
def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
|
||||
query_str: dict[str, Any] = {"match": {Field.CONTENT_KEY.value: query}}
|
||||
query_str: dict[str, Any] = {"match": {Field.CONTENT_KEY: query}}
|
||||
document_ids_filter = kwargs.get("document_ids_filter")
|
||||
|
||||
if document_ids_filter:
|
||||
query_str = {
|
||||
"bool": {
|
||||
"must": {"match": {Field.CONTENT_KEY.value: query}},
|
||||
"must": {"match": {Field.CONTENT_KEY: query}},
|
||||
"filter": {"terms": {"metadata.document_id": document_ids_filter}},
|
||||
}
|
||||
}
|
||||
@@ -240,9 +240,9 @@ class ElasticSearchVector(BaseVector):
|
||||
for hit in results["hits"]["hits"]:
|
||||
docs.append(
|
||||
Document(
|
||||
page_content=hit["_source"][Field.CONTENT_KEY.value],
|
||||
vector=hit["_source"][Field.VECTOR.value],
|
||||
metadata=hit["_source"][Field.METADATA_KEY.value],
|
||||
page_content=hit["_source"][Field.CONTENT_KEY],
|
||||
vector=hit["_source"][Field.VECTOR],
|
||||
metadata=hit["_source"][Field.METADATA_KEY],
|
||||
)
|
||||
)
|
||||
|
||||
@@ -270,14 +270,14 @@ class ElasticSearchVector(BaseVector):
|
||||
dim = len(embeddings[0])
|
||||
mappings = {
|
||||
"properties": {
|
||||
Field.CONTENT_KEY.value: {"type": "text"},
|
||||
Field.VECTOR.value: { # Make sure the dimension is correct here
|
||||
Field.CONTENT_KEY: {"type": "text"},
|
||||
Field.VECTOR: { # Make sure the dimension is correct here
|
||||
"type": "dense_vector",
|
||||
"dims": dim,
|
||||
"index": True,
|
||||
"similarity": "cosine",
|
||||
},
|
||||
Field.METADATA_KEY.value: {
|
||||
Field.METADATA_KEY: {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"doc_id": {"type": "keyword"}, # Map doc_id to keyword type
|
||||
|
||||
@@ -67,9 +67,9 @@ class HuaweiCloudVector(BaseVector):
|
||||
index=self._collection_name,
|
||||
id=uuids[i],
|
||||
document={
|
||||
Field.CONTENT_KEY.value: documents[i].page_content,
|
||||
Field.VECTOR.value: embeddings[i] or None,
|
||||
Field.METADATA_KEY.value: documents[i].metadata or {},
|
||||
Field.CONTENT_KEY: documents[i].page_content,
|
||||
Field.VECTOR: embeddings[i] or None,
|
||||
Field.METADATA_KEY: documents[i].metadata or {},
|
||||
},
|
||||
)
|
||||
self._client.indices.refresh(index=self._collection_name)
|
||||
@@ -101,7 +101,7 @@ class HuaweiCloudVector(BaseVector):
|
||||
"size": top_k,
|
||||
"query": {
|
||||
"vector": {
|
||||
Field.VECTOR.value: {
|
||||
Field.VECTOR: {
|
||||
"vector": query_vector,
|
||||
"topk": top_k,
|
||||
}
|
||||
@@ -116,9 +116,9 @@ class HuaweiCloudVector(BaseVector):
|
||||
docs_and_scores.append(
|
||||
(
|
||||
Document(
|
||||
page_content=hit["_source"][Field.CONTENT_KEY.value],
|
||||
vector=hit["_source"][Field.VECTOR.value],
|
||||
metadata=hit["_source"][Field.METADATA_KEY.value],
|
||||
page_content=hit["_source"][Field.CONTENT_KEY],
|
||||
vector=hit["_source"][Field.VECTOR],
|
||||
metadata=hit["_source"][Field.METADATA_KEY],
|
||||
),
|
||||
hit["_score"],
|
||||
)
|
||||
@@ -135,15 +135,15 @@ class HuaweiCloudVector(BaseVector):
|
||||
return docs
|
||||
|
||||
def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
|
||||
query_str = {"match": {Field.CONTENT_KEY.value: query}}
|
||||
query_str = {"match": {Field.CONTENT_KEY: query}}
|
||||
results = self._client.search(index=self._collection_name, query=query_str, size=kwargs.get("top_k", 4))
|
||||
docs = []
|
||||
for hit in results["hits"]["hits"]:
|
||||
docs.append(
|
||||
Document(
|
||||
page_content=hit["_source"][Field.CONTENT_KEY.value],
|
||||
vector=hit["_source"][Field.VECTOR.value],
|
||||
metadata=hit["_source"][Field.METADATA_KEY.value],
|
||||
page_content=hit["_source"][Field.CONTENT_KEY],
|
||||
vector=hit["_source"][Field.VECTOR],
|
||||
metadata=hit["_source"][Field.METADATA_KEY],
|
||||
)
|
||||
)
|
||||
|
||||
@@ -171,8 +171,8 @@ class HuaweiCloudVector(BaseVector):
|
||||
dim = len(embeddings[0])
|
||||
mappings = {
|
||||
"properties": {
|
||||
Field.CONTENT_KEY.value: {"type": "text"},
|
||||
Field.VECTOR.value: { # Make sure the dimension is correct here
|
||||
Field.CONTENT_KEY: {"type": "text"},
|
||||
Field.VECTOR: { # Make sure the dimension is correct here
|
||||
"type": "vector",
|
||||
"dimension": dim,
|
||||
"indexing": True,
|
||||
@@ -181,7 +181,7 @@ class HuaweiCloudVector(BaseVector):
|
||||
"neighbors": 32,
|
||||
"efc": 128,
|
||||
},
|
||||
Field.METADATA_KEY.value: {
|
||||
Field.METADATA_KEY: {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"doc_id": {"type": "keyword"} # Map doc_id to keyword type
|
||||
|
||||
@@ -125,9 +125,9 @@ class LindormVectorStore(BaseVector):
|
||||
}
|
||||
}
|
||||
action_values: dict[str, Any] = {
|
||||
Field.CONTENT_KEY.value: documents[i].page_content,
|
||||
Field.VECTOR.value: embeddings[i],
|
||||
Field.METADATA_KEY.value: documents[i].metadata,
|
||||
Field.CONTENT_KEY: documents[i].page_content,
|
||||
Field.VECTOR: embeddings[i],
|
||||
Field.METADATA_KEY: documents[i].metadata,
|
||||
}
|
||||
if self._using_ugc:
|
||||
action_header["index"]["routing"] = self._routing
|
||||
@@ -149,7 +149,7 @@ class LindormVectorStore(BaseVector):
|
||||
|
||||
def get_ids_by_metadata_field(self, key: str, value: str):
|
||||
query: dict[str, Any] = {
|
||||
"query": {"bool": {"must": [{"term": {f"{Field.METADATA_KEY.value}.{key}.keyword": value}}]}}
|
||||
"query": {"bool": {"must": [{"term": {f"{Field.METADATA_KEY}.{key}.keyword": value}}]}}
|
||||
}
|
||||
if self._using_ugc:
|
||||
query["query"]["bool"]["must"].append({"term": {f"{ROUTING_FIELD}.keyword": self._routing}})
|
||||
@@ -252,14 +252,14 @@ class LindormVectorStore(BaseVector):
|
||||
search_query: dict[str, Any] = {
|
||||
"size": top_k,
|
||||
"_source": True,
|
||||
"query": {"knn": {Field.VECTOR.value: {"vector": query_vector, "k": top_k}}},
|
||||
"query": {"knn": {Field.VECTOR: {"vector": query_vector, "k": top_k}}},
|
||||
}
|
||||
|
||||
final_ext: dict[str, Any] = {"lvector": {}}
|
||||
if filters is not None and len(filters) > 0:
|
||||
# when using filter, transform filter from List[Dict] to Dict as valid format
|
||||
filter_dict = {"bool": {"must": filters}} if len(filters) > 1 else filters[0]
|
||||
search_query["query"]["knn"][Field.VECTOR.value]["filter"] = filter_dict # filter should be Dict
|
||||
search_query["query"]["knn"][Field.VECTOR]["filter"] = filter_dict # filter should be Dict
|
||||
final_ext["lvector"]["filter_type"] = "pre_filter"
|
||||
|
||||
if final_ext != {"lvector": {}}:
|
||||
@@ -279,9 +279,9 @@ class LindormVectorStore(BaseVector):
|
||||
docs_and_scores.append(
|
||||
(
|
||||
Document(
|
||||
page_content=hit["_source"][Field.CONTENT_KEY.value],
|
||||
vector=hit["_source"][Field.VECTOR.value],
|
||||
metadata=hit["_source"][Field.METADATA_KEY.value],
|
||||
page_content=hit["_source"][Field.CONTENT_KEY],
|
||||
vector=hit["_source"][Field.VECTOR],
|
||||
metadata=hit["_source"][Field.METADATA_KEY],
|
||||
),
|
||||
hit["_score"],
|
||||
)
|
||||
@@ -318,9 +318,9 @@ class LindormVectorStore(BaseVector):
|
||||
|
||||
docs = []
|
||||
for hit in response["hits"]["hits"]:
|
||||
metadata = hit["_source"].get(Field.METADATA_KEY.value)
|
||||
vector = hit["_source"].get(Field.VECTOR.value)
|
||||
page_content = hit["_source"].get(Field.CONTENT_KEY.value)
|
||||
metadata = hit["_source"].get(Field.METADATA_KEY)
|
||||
vector = hit["_source"].get(Field.VECTOR)
|
||||
page_content = hit["_source"].get(Field.CONTENT_KEY)
|
||||
doc = Document(page_content=page_content, vector=vector, metadata=metadata)
|
||||
docs.append(doc)
|
||||
|
||||
@@ -342,8 +342,8 @@ class LindormVectorStore(BaseVector):
|
||||
"settings": {"index": {"knn": True, "knn_routing": self._using_ugc}},
|
||||
"mappings": {
|
||||
"properties": {
|
||||
Field.CONTENT_KEY.value: {"type": "text"},
|
||||
Field.VECTOR.value: {
|
||||
Field.CONTENT_KEY: {"type": "text"},
|
||||
Field.VECTOR: {
|
||||
"type": "knn_vector",
|
||||
"dimension": len(embeddings[0]), # Make sure the dimension is correct here
|
||||
"method": {
|
||||
|
||||
@@ -85,7 +85,7 @@ class MilvusVector(BaseVector):
|
||||
collection_info = self._client.describe_collection(self._collection_name)
|
||||
fields = [field["name"] for field in collection_info["fields"]]
|
||||
# Since primary field is auto-id, no need to track it
|
||||
self._fields = [f for f in fields if f != Field.PRIMARY_KEY.value]
|
||||
self._fields = [f for f in fields if f != Field.PRIMARY_KEY]
|
||||
|
||||
def _check_hybrid_search_support(self) -> bool:
|
||||
"""
|
||||
@@ -130,9 +130,9 @@ class MilvusVector(BaseVector):
|
||||
insert_dict = {
|
||||
# Do not need to insert the sparse_vector field separately, as the text_bm25_emb
|
||||
# function will automatically convert the native text into a sparse vector for us.
|
||||
Field.CONTENT_KEY.value: documents[i].page_content,
|
||||
Field.VECTOR.value: embeddings[i],
|
||||
Field.METADATA_KEY.value: documents[i].metadata,
|
||||
Field.CONTENT_KEY: documents[i].page_content,
|
||||
Field.VECTOR: embeddings[i],
|
||||
Field.METADATA_KEY: documents[i].metadata,
|
||||
}
|
||||
insert_dict_list.append(insert_dict)
|
||||
# Total insert count
|
||||
@@ -243,15 +243,15 @@ class MilvusVector(BaseVector):
|
||||
results = self._client.search(
|
||||
collection_name=self._collection_name,
|
||||
data=[query_vector],
|
||||
anns_field=Field.VECTOR.value,
|
||||
anns_field=Field.VECTOR,
|
||||
limit=kwargs.get("top_k", 4),
|
||||
output_fields=[Field.CONTENT_KEY.value, Field.METADATA_KEY.value],
|
||||
output_fields=[Field.CONTENT_KEY, Field.METADATA_KEY],
|
||||
filter=filter,
|
||||
)
|
||||
|
||||
return self._process_search_results(
|
||||
results,
|
||||
output_fields=[Field.CONTENT_KEY.value, Field.METADATA_KEY.value],
|
||||
output_fields=[Field.CONTENT_KEY, Field.METADATA_KEY],
|
||||
score_threshold=float(kwargs.get("score_threshold") or 0.0),
|
||||
)
|
||||
|
||||
@@ -264,7 +264,7 @@ class MilvusVector(BaseVector):
|
||||
"Full-text search is disabled: set MILVUS_ENABLE_HYBRID_SEARCH=true (requires Milvus >= 2.5.0)."
|
||||
)
|
||||
return []
|
||||
if not self.field_exists(Field.SPARSE_VECTOR.value):
|
||||
if not self.field_exists(Field.SPARSE_VECTOR):
|
||||
logger.warning(
|
||||
"Full-text search unavailable: collection missing 'sparse_vector' field; "
|
||||
"recreate the collection after enabling MILVUS_ENABLE_HYBRID_SEARCH to add BM25 sparse index."
|
||||
@@ -279,15 +279,15 @@ class MilvusVector(BaseVector):
|
||||
results = self._client.search(
|
||||
collection_name=self._collection_name,
|
||||
data=[query],
|
||||
anns_field=Field.SPARSE_VECTOR.value,
|
||||
anns_field=Field.SPARSE_VECTOR,
|
||||
limit=kwargs.get("top_k", 4),
|
||||
output_fields=[Field.CONTENT_KEY.value, Field.METADATA_KEY.value],
|
||||
output_fields=[Field.CONTENT_KEY, Field.METADATA_KEY],
|
||||
filter=filter,
|
||||
)
|
||||
|
||||
return self._process_search_results(
|
||||
results,
|
||||
output_fields=[Field.CONTENT_KEY.value, Field.METADATA_KEY.value],
|
||||
output_fields=[Field.CONTENT_KEY, Field.METADATA_KEY],
|
||||
score_threshold=float(kwargs.get("score_threshold") or 0.0),
|
||||
)
|
||||
|
||||
@@ -311,7 +311,7 @@ class MilvusVector(BaseVector):
|
||||
dim = len(embeddings[0])
|
||||
fields = []
|
||||
if metadatas:
|
||||
fields.append(FieldSchema(Field.METADATA_KEY.value, DataType.JSON, max_length=65_535))
|
||||
fields.append(FieldSchema(Field.METADATA_KEY, DataType.JSON, max_length=65_535))
|
||||
|
||||
# Create the text field, enable_analyzer will be set True to support milvus automatically
|
||||
# transfer text to sparse_vector, reference: https://milvus.io/docs/full-text-search.md
|
||||
@@ -326,15 +326,15 @@ class MilvusVector(BaseVector):
|
||||
):
|
||||
content_field_kwargs["analyzer_params"] = self._client_config.analyzer_params
|
||||
|
||||
fields.append(FieldSchema(Field.CONTENT_KEY.value, DataType.VARCHAR, **content_field_kwargs))
|
||||
fields.append(FieldSchema(Field.CONTENT_KEY, DataType.VARCHAR, **content_field_kwargs))
|
||||
|
||||
# Create the primary key field
|
||||
fields.append(FieldSchema(Field.PRIMARY_KEY.value, DataType.INT64, is_primary=True, auto_id=True))
|
||||
fields.append(FieldSchema(Field.PRIMARY_KEY, DataType.INT64, is_primary=True, auto_id=True))
|
||||
# Create the vector field, supports binary or float vectors
|
||||
fields.append(FieldSchema(Field.VECTOR.value, infer_dtype_bydata(embeddings[0]), dim=dim))
|
||||
fields.append(FieldSchema(Field.VECTOR, infer_dtype_bydata(embeddings[0]), dim=dim))
|
||||
# Create Sparse Vector Index for the collection
|
||||
if self._hybrid_search_enabled:
|
||||
fields.append(FieldSchema(Field.SPARSE_VECTOR.value, DataType.SPARSE_FLOAT_VECTOR))
|
||||
fields.append(FieldSchema(Field.SPARSE_VECTOR, DataType.SPARSE_FLOAT_VECTOR))
|
||||
|
||||
schema = CollectionSchema(fields)
|
||||
|
||||
@@ -342,8 +342,8 @@ class MilvusVector(BaseVector):
|
||||
if self._hybrid_search_enabled:
|
||||
bm25_function = Function(
|
||||
name="text_bm25_emb",
|
||||
input_field_names=[Field.CONTENT_KEY.value],
|
||||
output_field_names=[Field.SPARSE_VECTOR.value],
|
||||
input_field_names=[Field.CONTENT_KEY],
|
||||
output_field_names=[Field.SPARSE_VECTOR],
|
||||
function_type=FunctionType.BM25,
|
||||
)
|
||||
schema.add_function(bm25_function)
|
||||
@@ -352,12 +352,12 @@ class MilvusVector(BaseVector):
|
||||
|
||||
# Create Index params for the collection
|
||||
index_params_obj = IndexParams()
|
||||
index_params_obj.add_index(field_name=Field.VECTOR.value, **index_params)
|
||||
index_params_obj.add_index(field_name=Field.VECTOR, **index_params)
|
||||
|
||||
# Create Sparse Vector Index for the collection
|
||||
if self._hybrid_search_enabled:
|
||||
index_params_obj.add_index(
|
||||
field_name=Field.SPARSE_VECTOR.value, index_type="AUTOINDEX", metric_type="BM25"
|
||||
field_name=Field.SPARSE_VECTOR, index_type="AUTOINDEX", metric_type="BM25"
|
||||
)
|
||||
|
||||
# Create the collection
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import json
|
||||
import logging
|
||||
from typing import Any, Literal
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
|
||||
from opensearchpy import OpenSearch, Urllib3AWSV4SignerAuth, Urllib3HttpConnection, helpers
|
||||
@@ -8,6 +8,7 @@ from opensearchpy.helpers import BulkIndexError
|
||||
from pydantic import BaseModel, model_validator
|
||||
|
||||
from configs import dify_config
|
||||
from configs.middleware.vdb.opensearch_config import AuthMethod
|
||||
from core.rag.datasource.vdb.field import Field
|
||||
from core.rag.datasource.vdb.vector_base import BaseVector
|
||||
from core.rag.datasource.vdb.vector_factory import AbstractVectorFactory
|
||||
@@ -25,7 +26,7 @@ class OpenSearchConfig(BaseModel):
|
||||
port: int
|
||||
secure: bool = False # use_ssl
|
||||
verify_certs: bool = True
|
||||
auth_method: Literal["basic", "aws_managed_iam"] = "basic"
|
||||
auth_method: AuthMethod = AuthMethod.BASIC
|
||||
user: str | None = None
|
||||
password: str | None = None
|
||||
aws_region: str | None = None
|
||||
@@ -98,9 +99,9 @@ class OpenSearchVector(BaseVector):
|
||||
"_op_type": "index",
|
||||
"_index": self._collection_name.lower(),
|
||||
"_source": {
|
||||
Field.CONTENT_KEY.value: documents[i].page_content,
|
||||
Field.VECTOR.value: embeddings[i], # Make sure you pass an array here
|
||||
Field.METADATA_KEY.value: documents[i].metadata,
|
||||
Field.CONTENT_KEY: documents[i].page_content,
|
||||
Field.VECTOR: embeddings[i], # Make sure you pass an array here
|
||||
Field.METADATA_KEY: documents[i].metadata,
|
||||
},
|
||||
}
|
||||
# See https://github.com/langchain-ai/langchainjs/issues/4346#issuecomment-1935123377
|
||||
@@ -116,7 +117,7 @@ class OpenSearchVector(BaseVector):
|
||||
)
|
||||
|
||||
def get_ids_by_metadata_field(self, key: str, value: str):
|
||||
query = {"query": {"term": {f"{Field.METADATA_KEY.value}.{key}": value}}}
|
||||
query = {"query": {"term": {f"{Field.METADATA_KEY}.{key}": value}}}
|
||||
response = self._client.search(index=self._collection_name.lower(), body=query)
|
||||
if response["hits"]["hits"]:
|
||||
return [hit["_id"] for hit in response["hits"]["hits"]]
|
||||
@@ -180,17 +181,17 @@ class OpenSearchVector(BaseVector):
|
||||
|
||||
query = {
|
||||
"size": kwargs.get("top_k", 4),
|
||||
"query": {"knn": {Field.VECTOR.value: {Field.VECTOR.value: query_vector, "k": kwargs.get("top_k", 4)}}},
|
||||
"query": {"knn": {Field.VECTOR: {Field.VECTOR: query_vector, "k": kwargs.get("top_k", 4)}}},
|
||||
}
|
||||
document_ids_filter = kwargs.get("document_ids_filter")
|
||||
if document_ids_filter:
|
||||
query["query"] = {
|
||||
"script_score": {
|
||||
"query": {"bool": {"filter": [{"terms": {Field.DOCUMENT_ID.value: document_ids_filter}}]}},
|
||||
"query": {"bool": {"filter": [{"terms": {Field.DOCUMENT_ID: document_ids_filter}}]}},
|
||||
"script": {
|
||||
"source": "knn_score",
|
||||
"lang": "knn",
|
||||
"params": {"field": Field.VECTOR.value, "query_value": query_vector, "space_type": "l2"},
|
||||
"params": {"field": Field.VECTOR, "query_value": query_vector, "space_type": "l2"},
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -203,7 +204,7 @@ class OpenSearchVector(BaseVector):
|
||||
|
||||
docs = []
|
||||
for hit in response["hits"]["hits"]:
|
||||
metadata = hit["_source"].get(Field.METADATA_KEY.value, {})
|
||||
metadata = hit["_source"].get(Field.METADATA_KEY, {})
|
||||
|
||||
# Make sure metadata is a dictionary
|
||||
if metadata is None:
|
||||
@@ -212,7 +213,7 @@ class OpenSearchVector(BaseVector):
|
||||
metadata["score"] = hit["_score"]
|
||||
score_threshold = float(kwargs.get("score_threshold") or 0.0)
|
||||
if hit["_score"] >= score_threshold:
|
||||
doc = Document(page_content=hit["_source"].get(Field.CONTENT_KEY.value), metadata=metadata)
|
||||
doc = Document(page_content=hit["_source"].get(Field.CONTENT_KEY), metadata=metadata)
|
||||
docs.append(doc)
|
||||
|
||||
return docs
|
||||
@@ -227,9 +228,9 @@ class OpenSearchVector(BaseVector):
|
||||
|
||||
docs = []
|
||||
for hit in response["hits"]["hits"]:
|
||||
metadata = hit["_source"].get(Field.METADATA_KEY.value)
|
||||
vector = hit["_source"].get(Field.VECTOR.value)
|
||||
page_content = hit["_source"].get(Field.CONTENT_KEY.value)
|
||||
metadata = hit["_source"].get(Field.METADATA_KEY)
|
||||
vector = hit["_source"].get(Field.VECTOR)
|
||||
page_content = hit["_source"].get(Field.CONTENT_KEY)
|
||||
doc = Document(page_content=page_content, vector=vector, metadata=metadata)
|
||||
docs.append(doc)
|
||||
|
||||
@@ -250,8 +251,8 @@ class OpenSearchVector(BaseVector):
|
||||
"settings": {"index": {"knn": True}},
|
||||
"mappings": {
|
||||
"properties": {
|
||||
Field.CONTENT_KEY.value: {"type": "text"},
|
||||
Field.VECTOR.value: {
|
||||
Field.CONTENT_KEY: {"type": "text"},
|
||||
Field.VECTOR: {
|
||||
"type": "knn_vector",
|
||||
"dimension": len(embeddings[0]), # Make sure the dimension is correct here
|
||||
"method": {
|
||||
@@ -261,7 +262,7 @@ class OpenSearchVector(BaseVector):
|
||||
"parameters": {"ef_construction": 64, "m": 8},
|
||||
},
|
||||
},
|
||||
Field.METADATA_KEY.value: {
|
||||
Field.METADATA_KEY: {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"doc_id": {"type": "keyword"}, # Map doc_id to keyword type
|
||||
@@ -293,7 +294,7 @@ class OpenSearchVectorFactory(AbstractVectorFactory):
|
||||
port=dify_config.OPENSEARCH_PORT,
|
||||
secure=dify_config.OPENSEARCH_SECURE,
|
||||
verify_certs=dify_config.OPENSEARCH_VERIFY_CERTS,
|
||||
auth_method=dify_config.OPENSEARCH_AUTH_METHOD.value,
|
||||
auth_method=dify_config.OPENSEARCH_AUTH_METHOD,
|
||||
user=dify_config.OPENSEARCH_USER,
|
||||
password=dify_config.OPENSEARCH_PASSWORD,
|
||||
aws_region=dify_config.OPENSEARCH_AWS_REGION,
|
||||
|
||||
@@ -147,15 +147,13 @@ class QdrantVector(BaseVector):
|
||||
|
||||
# create group_id payload index
|
||||
self._client.create_payload_index(
|
||||
collection_name, Field.GROUP_KEY.value, field_schema=PayloadSchemaType.KEYWORD
|
||||
collection_name, Field.GROUP_KEY, field_schema=PayloadSchemaType.KEYWORD
|
||||
)
|
||||
# create doc_id payload index
|
||||
self._client.create_payload_index(
|
||||
collection_name, Field.DOC_ID.value, field_schema=PayloadSchemaType.KEYWORD
|
||||
)
|
||||
self._client.create_payload_index(collection_name, Field.DOC_ID, field_schema=PayloadSchemaType.KEYWORD)
|
||||
# create document_id payload index
|
||||
self._client.create_payload_index(
|
||||
collection_name, Field.DOCUMENT_ID.value, field_schema=PayloadSchemaType.KEYWORD
|
||||
collection_name, Field.DOCUMENT_ID, field_schema=PayloadSchemaType.KEYWORD
|
||||
)
|
||||
# create full text index
|
||||
text_index_params = TextIndexParams(
|
||||
@@ -165,9 +163,7 @@ class QdrantVector(BaseVector):
|
||||
max_token_len=20,
|
||||
lowercase=True,
|
||||
)
|
||||
self._client.create_payload_index(
|
||||
collection_name, Field.CONTENT_KEY.value, field_schema=text_index_params
|
||||
)
|
||||
self._client.create_payload_index(collection_name, Field.CONTENT_KEY, field_schema=text_index_params)
|
||||
redis_client.set(collection_exist_cache_key, 1, ex=3600)
|
||||
|
||||
def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
|
||||
@@ -220,10 +216,10 @@ class QdrantVector(BaseVector):
|
||||
self._build_payloads(
|
||||
batch_texts,
|
||||
batch_metadatas,
|
||||
Field.CONTENT_KEY.value,
|
||||
Field.METADATA_KEY.value,
|
||||
Field.CONTENT_KEY,
|
||||
Field.METADATA_KEY,
|
||||
group_id or "", # Ensure group_id is never None
|
||||
Field.GROUP_KEY.value,
|
||||
Field.GROUP_KEY,
|
||||
),
|
||||
)
|
||||
]
|
||||
@@ -381,12 +377,12 @@ class QdrantVector(BaseVector):
|
||||
for result in results:
|
||||
if result.payload is None:
|
||||
continue
|
||||
metadata = result.payload.get(Field.METADATA_KEY.value) or {}
|
||||
metadata = result.payload.get(Field.METADATA_KEY) or {}
|
||||
# duplicate check score threshold
|
||||
if result.score >= score_threshold:
|
||||
metadata["score"] = result.score
|
||||
doc = Document(
|
||||
page_content=result.payload.get(Field.CONTENT_KEY.value, ""),
|
||||
page_content=result.payload.get(Field.CONTENT_KEY, ""),
|
||||
metadata=metadata,
|
||||
)
|
||||
docs.append(doc)
|
||||
@@ -433,7 +429,7 @@ class QdrantVector(BaseVector):
|
||||
documents = []
|
||||
for result in results:
|
||||
if result:
|
||||
document = self._document_from_scored_point(result, Field.CONTENT_KEY.value, Field.METADATA_KEY.value)
|
||||
document = self._document_from_scored_point(result, Field.CONTENT_KEY, Field.METADATA_KEY)
|
||||
documents.append(document)
|
||||
|
||||
return documents
|
||||
|
||||
@@ -55,7 +55,7 @@ class TableStoreVector(BaseVector):
|
||||
self._normalize_full_text_bm25_score = config.normalize_full_text_bm25_score
|
||||
self._table_name = f"{collection_name}"
|
||||
self._index_name = f"{collection_name}_idx"
|
||||
self._tags_field = f"{Field.METADATA_KEY.value}_tags"
|
||||
self._tags_field = f"{Field.METADATA_KEY}_tags"
|
||||
|
||||
def create_collection(self, embeddings: list[list[float]], **kwargs):
|
||||
dimension = len(embeddings[0])
|
||||
@@ -64,7 +64,7 @@ class TableStoreVector(BaseVector):
|
||||
def get_by_ids(self, ids: list[str]) -> list[Document]:
|
||||
docs = []
|
||||
request = BatchGetRowRequest()
|
||||
columns_to_get = [Field.METADATA_KEY.value, Field.CONTENT_KEY.value]
|
||||
columns_to_get = [Field.METADATA_KEY, Field.CONTENT_KEY]
|
||||
rows_to_get = [[("id", _id)] for _id in ids]
|
||||
request.add(TableInBatchGetRowItem(self._table_name, rows_to_get, columns_to_get, None, 1))
|
||||
|
||||
@@ -73,11 +73,7 @@ class TableStoreVector(BaseVector):
|
||||
for item in table_result:
|
||||
if item.is_ok and item.row:
|
||||
kv = {k: v for k, v, _ in item.row.attribute_columns}
|
||||
docs.append(
|
||||
Document(
|
||||
page_content=kv[Field.CONTENT_KEY.value], metadata=json.loads(kv[Field.METADATA_KEY.value])
|
||||
)
|
||||
)
|
||||
docs.append(Document(page_content=kv[Field.CONTENT_KEY], metadata=json.loads(kv[Field.METADATA_KEY])))
|
||||
return docs
|
||||
|
||||
def get_type(self) -> str:
|
||||
@@ -95,9 +91,9 @@ class TableStoreVector(BaseVector):
|
||||
self._write_row(
|
||||
primary_key=uuids[i],
|
||||
attributes={
|
||||
Field.CONTENT_KEY.value: documents[i].page_content,
|
||||
Field.VECTOR.value: embeddings[i],
|
||||
Field.METADATA_KEY.value: documents[i].metadata,
|
||||
Field.CONTENT_KEY: documents[i].page_content,
|
||||
Field.VECTOR: embeddings[i],
|
||||
Field.METADATA_KEY: documents[i].metadata,
|
||||
},
|
||||
)
|
||||
return uuids
|
||||
@@ -180,7 +176,7 @@ class TableStoreVector(BaseVector):
|
||||
|
||||
field_schemas = [
|
||||
tablestore.FieldSchema(
|
||||
Field.CONTENT_KEY.value,
|
||||
Field.CONTENT_KEY,
|
||||
tablestore.FieldType.TEXT,
|
||||
analyzer=tablestore.AnalyzerType.MAXWORD,
|
||||
index=True,
|
||||
@@ -188,7 +184,7 @@ class TableStoreVector(BaseVector):
|
||||
store=False,
|
||||
),
|
||||
tablestore.FieldSchema(
|
||||
Field.VECTOR.value,
|
||||
Field.VECTOR,
|
||||
tablestore.FieldType.VECTOR,
|
||||
vector_options=tablestore.VectorOptions(
|
||||
data_type=tablestore.VectorDataType.VD_FLOAT_32,
|
||||
@@ -197,7 +193,7 @@ class TableStoreVector(BaseVector):
|
||||
),
|
||||
),
|
||||
tablestore.FieldSchema(
|
||||
Field.METADATA_KEY.value,
|
||||
Field.METADATA_KEY,
|
||||
tablestore.FieldType.KEYWORD,
|
||||
index=True,
|
||||
store=False,
|
||||
@@ -233,15 +229,15 @@ class TableStoreVector(BaseVector):
|
||||
pk = [("id", primary_key)]
|
||||
|
||||
tags = []
|
||||
for key, value in attributes[Field.METADATA_KEY.value].items():
|
||||
for key, value in attributes[Field.METADATA_KEY].items():
|
||||
tags.append(str(key) + "=" + str(value))
|
||||
|
||||
attribute_columns = [
|
||||
(Field.CONTENT_KEY.value, attributes[Field.CONTENT_KEY.value]),
|
||||
(Field.VECTOR.value, json.dumps(attributes[Field.VECTOR.value])),
|
||||
(Field.CONTENT_KEY, attributes[Field.CONTENT_KEY]),
|
||||
(Field.VECTOR, json.dumps(attributes[Field.VECTOR])),
|
||||
(
|
||||
Field.METADATA_KEY.value,
|
||||
json.dumps(attributes[Field.METADATA_KEY.value]),
|
||||
Field.METADATA_KEY,
|
||||
json.dumps(attributes[Field.METADATA_KEY]),
|
||||
),
|
||||
(self._tags_field, json.dumps(tags)),
|
||||
]
|
||||
@@ -270,7 +266,7 @@ class TableStoreVector(BaseVector):
|
||||
index_name=self._index_name,
|
||||
search_query=query,
|
||||
columns_to_get=tablestore.ColumnsToGet(
|
||||
column_names=[Field.PRIMARY_KEY.value], return_type=tablestore.ColumnReturnType.SPECIFIED
|
||||
column_names=[Field.PRIMARY_KEY], return_type=tablestore.ColumnReturnType.SPECIFIED
|
||||
),
|
||||
)
|
||||
|
||||
@@ -288,7 +284,7 @@ class TableStoreVector(BaseVector):
|
||||
self, query_vector: list[float], document_ids_filter: list[str] | None, top_k: int, score_threshold: float
|
||||
) -> list[Document]:
|
||||
knn_vector_query = tablestore.KnnVectorQuery(
|
||||
field_name=Field.VECTOR.value,
|
||||
field_name=Field.VECTOR,
|
||||
top_k=top_k,
|
||||
float32_query_vector=query_vector,
|
||||
)
|
||||
@@ -311,8 +307,8 @@ class TableStoreVector(BaseVector):
|
||||
for col in search_hit.row[1]:
|
||||
ots_column_map[col[0]] = col[1]
|
||||
|
||||
vector_str = ots_column_map.get(Field.VECTOR.value)
|
||||
metadata_str = ots_column_map.get(Field.METADATA_KEY.value)
|
||||
vector_str = ots_column_map.get(Field.VECTOR)
|
||||
metadata_str = ots_column_map.get(Field.METADATA_KEY)
|
||||
|
||||
vector = json.loads(vector_str) if vector_str else None
|
||||
metadata = json.loads(metadata_str) if metadata_str else {}
|
||||
@@ -321,7 +317,7 @@ class TableStoreVector(BaseVector):
|
||||
|
||||
documents.append(
|
||||
Document(
|
||||
page_content=ots_column_map.get(Field.CONTENT_KEY.value) or "",
|
||||
page_content=ots_column_map.get(Field.CONTENT_KEY) or "",
|
||||
vector=vector,
|
||||
metadata=metadata,
|
||||
)
|
||||
@@ -343,7 +339,7 @@ class TableStoreVector(BaseVector):
|
||||
self, query: str, document_ids_filter: list[str] | None, top_k: int, score_threshold: float
|
||||
) -> list[Document]:
|
||||
bool_query = tablestore.BoolQuery(must_queries=[], filter_queries=[], should_queries=[], must_not_queries=[])
|
||||
bool_query.must_queries.append(tablestore.MatchQuery(text=query, field_name=Field.CONTENT_KEY.value))
|
||||
bool_query.must_queries.append(tablestore.MatchQuery(text=query, field_name=Field.CONTENT_KEY))
|
||||
|
||||
if document_ids_filter:
|
||||
bool_query.filter_queries.append(tablestore.TermsQuery(self._tags_field, document_ids_filter))
|
||||
@@ -374,10 +370,10 @@ class TableStoreVector(BaseVector):
|
||||
for col in search_hit.row[1]:
|
||||
ots_column_map[col[0]] = col[1]
|
||||
|
||||
metadata_str = ots_column_map.get(Field.METADATA_KEY.value)
|
||||
metadata_str = ots_column_map.get(Field.METADATA_KEY)
|
||||
metadata = json.loads(metadata_str) if metadata_str else {}
|
||||
|
||||
vector_str = ots_column_map.get(Field.VECTOR.value)
|
||||
vector_str = ots_column_map.get(Field.VECTOR)
|
||||
vector = json.loads(vector_str) if vector_str else None
|
||||
|
||||
if score:
|
||||
@@ -385,7 +381,7 @@ class TableStoreVector(BaseVector):
|
||||
|
||||
documents.append(
|
||||
Document(
|
||||
page_content=ots_column_map.get(Field.CONTENT_KEY.value) or "",
|
||||
page_content=ots_column_map.get(Field.CONTENT_KEY) or "",
|
||||
vector=vector,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
@@ -141,15 +141,13 @@ class TidbOnQdrantVector(BaseVector):
|
||||
|
||||
# create group_id payload index
|
||||
self._client.create_payload_index(
|
||||
collection_name, Field.GROUP_KEY.value, field_schema=PayloadSchemaType.KEYWORD
|
||||
collection_name, Field.GROUP_KEY, field_schema=PayloadSchemaType.KEYWORD
|
||||
)
|
||||
# create doc_id payload index
|
||||
self._client.create_payload_index(
|
||||
collection_name, Field.DOC_ID.value, field_schema=PayloadSchemaType.KEYWORD
|
||||
)
|
||||
self._client.create_payload_index(collection_name, Field.DOC_ID, field_schema=PayloadSchemaType.KEYWORD)
|
||||
# create document_id payload index
|
||||
self._client.create_payload_index(
|
||||
collection_name, Field.DOCUMENT_ID.value, field_schema=PayloadSchemaType.KEYWORD
|
||||
collection_name, Field.DOCUMENT_ID, field_schema=PayloadSchemaType.KEYWORD
|
||||
)
|
||||
# create full text index
|
||||
text_index_params = TextIndexParams(
|
||||
@@ -159,9 +157,7 @@ class TidbOnQdrantVector(BaseVector):
|
||||
max_token_len=20,
|
||||
lowercase=True,
|
||||
)
|
||||
self._client.create_payload_index(
|
||||
collection_name, Field.CONTENT_KEY.value, field_schema=text_index_params
|
||||
)
|
||||
self._client.create_payload_index(collection_name, Field.CONTENT_KEY, field_schema=text_index_params)
|
||||
redis_client.set(collection_exist_cache_key, 1, ex=3600)
|
||||
|
||||
def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
|
||||
@@ -211,10 +207,10 @@ class TidbOnQdrantVector(BaseVector):
|
||||
self._build_payloads(
|
||||
batch_texts,
|
||||
batch_metadatas,
|
||||
Field.CONTENT_KEY.value,
|
||||
Field.METADATA_KEY.value,
|
||||
Field.CONTENT_KEY,
|
||||
Field.METADATA_KEY,
|
||||
group_id or "",
|
||||
Field.GROUP_KEY.value,
|
||||
Field.GROUP_KEY,
|
||||
),
|
||||
)
|
||||
]
|
||||
@@ -349,13 +345,13 @@ class TidbOnQdrantVector(BaseVector):
|
||||
for result in results:
|
||||
if result.payload is None:
|
||||
continue
|
||||
metadata = result.payload.get(Field.METADATA_KEY.value) or {}
|
||||
metadata = result.payload.get(Field.METADATA_KEY) or {}
|
||||
# duplicate check score threshold
|
||||
score_threshold = kwargs.get("score_threshold") or 0.0
|
||||
if result.score >= score_threshold:
|
||||
metadata["score"] = result.score
|
||||
doc = Document(
|
||||
page_content=result.payload.get(Field.CONTENT_KEY.value, ""),
|
||||
page_content=result.payload.get(Field.CONTENT_KEY, ""),
|
||||
metadata=metadata,
|
||||
)
|
||||
docs.append(doc)
|
||||
@@ -392,7 +388,7 @@ class TidbOnQdrantVector(BaseVector):
|
||||
documents = []
|
||||
for result in results:
|
||||
if result:
|
||||
document = self._document_from_scored_point(result, Field.CONTENT_KEY.value, Field.METADATA_KEY.value)
|
||||
document = self._document_from_scored_point(result, Field.CONTENT_KEY, Field.METADATA_KEY)
|
||||
documents.append(document)
|
||||
|
||||
return documents
|
||||
|
||||
@@ -55,13 +55,13 @@ class TiDBVector(BaseVector):
|
||||
return Table(
|
||||
self._collection_name,
|
||||
self._orm_base.metadata,
|
||||
Column(Field.PRIMARY_KEY.value, String(36), primary_key=True, nullable=False),
|
||||
Column(Field.PRIMARY_KEY, String(36), primary_key=True, nullable=False),
|
||||
Column(
|
||||
Field.VECTOR.value,
|
||||
Field.VECTOR,
|
||||
VectorType(dim),
|
||||
nullable=False,
|
||||
),
|
||||
Column(Field.TEXT_KEY.value, TEXT, nullable=False),
|
||||
Column(Field.TEXT_KEY, TEXT, nullable=False),
|
||||
Column("meta", JSON, nullable=False),
|
||||
Column("create_time", DateTime, server_default=sqlalchemy.text("CURRENT_TIMESTAMP")),
|
||||
Column(
|
||||
|
||||
@@ -76,11 +76,11 @@ class VikingDBVector(BaseVector):
|
||||
|
||||
if not self._has_collection():
|
||||
fields = [
|
||||
Field(field_name=vdb_Field.PRIMARY_KEY.value, field_type=FieldType.String, is_primary_key=True),
|
||||
Field(field_name=vdb_Field.METADATA_KEY.value, field_type=FieldType.String),
|
||||
Field(field_name=vdb_Field.GROUP_KEY.value, field_type=FieldType.String),
|
||||
Field(field_name=vdb_Field.CONTENT_KEY.value, field_type=FieldType.Text),
|
||||
Field(field_name=vdb_Field.VECTOR.value, field_type=FieldType.Vector, dim=dimension),
|
||||
Field(field_name=vdb_Field.PRIMARY_KEY, field_type=FieldType.String, is_primary_key=True),
|
||||
Field(field_name=vdb_Field.METADATA_KEY, field_type=FieldType.String),
|
||||
Field(field_name=vdb_Field.GROUP_KEY, field_type=FieldType.String),
|
||||
Field(field_name=vdb_Field.CONTENT_KEY, field_type=FieldType.Text),
|
||||
Field(field_name=vdb_Field.VECTOR, field_type=FieldType.Vector, dim=dimension),
|
||||
]
|
||||
|
||||
self._client.create_collection(
|
||||
@@ -100,7 +100,7 @@ class VikingDBVector(BaseVector):
|
||||
collection_name=self._collection_name,
|
||||
index_name=self._index_name,
|
||||
vector_index=vector_index,
|
||||
partition_by=vdb_Field.GROUP_KEY.value,
|
||||
partition_by=vdb_Field.GROUP_KEY,
|
||||
description="Index For Dify",
|
||||
)
|
||||
redis_client.set(collection_exist_cache_key, 1, ex=3600)
|
||||
@@ -126,11 +126,11 @@ class VikingDBVector(BaseVector):
|
||||
# FIXME: fix the type of metadata later
|
||||
doc = Data(
|
||||
{
|
||||
vdb_Field.PRIMARY_KEY.value: metadatas[i]["doc_id"], # type: ignore
|
||||
vdb_Field.VECTOR.value: embeddings[i] if embeddings else None,
|
||||
vdb_Field.CONTENT_KEY.value: page_content,
|
||||
vdb_Field.METADATA_KEY.value: json.dumps(metadata),
|
||||
vdb_Field.GROUP_KEY.value: self._group_id,
|
||||
vdb_Field.PRIMARY_KEY: metadatas[i]["doc_id"], # type: ignore
|
||||
vdb_Field.VECTOR: embeddings[i] if embeddings else None,
|
||||
vdb_Field.CONTENT_KEY: page_content,
|
||||
vdb_Field.METADATA_KEY: json.dumps(metadata),
|
||||
vdb_Field.GROUP_KEY: self._group_id,
|
||||
}
|
||||
)
|
||||
docs.append(doc)
|
||||
@@ -151,7 +151,7 @@ class VikingDBVector(BaseVector):
|
||||
# Note: Metadata field value is an dict, but vikingdb field
|
||||
# not support json type
|
||||
results = self._client.get_index(self._collection_name, self._index_name).search(
|
||||
filter={"op": "must", "field": vdb_Field.GROUP_KEY.value, "conds": [self._group_id]},
|
||||
filter={"op": "must", "field": vdb_Field.GROUP_KEY, "conds": [self._group_id]},
|
||||
# max value is 5000
|
||||
limit=5000,
|
||||
)
|
||||
@@ -161,7 +161,7 @@ class VikingDBVector(BaseVector):
|
||||
|
||||
ids = []
|
||||
for result in results:
|
||||
metadata = result.fields.get(vdb_Field.METADATA_KEY.value)
|
||||
metadata = result.fields.get(vdb_Field.METADATA_KEY)
|
||||
if metadata is not None:
|
||||
metadata = json.loads(metadata)
|
||||
if metadata.get(key) == value:
|
||||
@@ -189,12 +189,12 @@ class VikingDBVector(BaseVector):
|
||||
|
||||
docs = []
|
||||
for result in results:
|
||||
metadata = result.fields.get(vdb_Field.METADATA_KEY.value)
|
||||
metadata = result.fields.get(vdb_Field.METADATA_KEY)
|
||||
if metadata is not None:
|
||||
metadata = json.loads(metadata)
|
||||
if result.score >= score_threshold:
|
||||
metadata["score"] = result.score
|
||||
doc = Document(page_content=result.fields.get(vdb_Field.CONTENT_KEY.value), metadata=metadata)
|
||||
doc = Document(page_content=result.fields.get(vdb_Field.CONTENT_KEY), metadata=metadata)
|
||||
docs.append(doc)
|
||||
docs = sorted(docs, key=lambda x: x.metadata.get("score", 0) if x.metadata else 0, reverse=True)
|
||||
return docs
|
||||
|
||||
@@ -104,7 +104,7 @@ class WeaviateVector(BaseVector):
|
||||
|
||||
with self._client.batch as batch:
|
||||
for i, text in enumerate(texts):
|
||||
data_properties = {Field.TEXT_KEY.value: text}
|
||||
data_properties = {Field.TEXT_KEY: text}
|
||||
if metadatas is not None:
|
||||
# metadata maybe None
|
||||
for key, val in (metadatas[i] or {}).items():
|
||||
@@ -182,7 +182,7 @@ class WeaviateVector(BaseVector):
|
||||
"""Look up similar documents by embedding vector in Weaviate."""
|
||||
collection_name = self._collection_name
|
||||
properties = self._attributes
|
||||
properties.append(Field.TEXT_KEY.value)
|
||||
properties.append(Field.TEXT_KEY)
|
||||
query_obj = self._client.query.get(collection_name, properties)
|
||||
|
||||
vector = {"vector": query_vector}
|
||||
@@ -204,7 +204,7 @@ class WeaviateVector(BaseVector):
|
||||
|
||||
docs_and_scores = []
|
||||
for res in result["data"]["Get"][collection_name]:
|
||||
text = res.pop(Field.TEXT_KEY.value)
|
||||
text = res.pop(Field.TEXT_KEY)
|
||||
score = 1 - res["_additional"]["distance"]
|
||||
docs_and_scores.append((Document(page_content=text, metadata=res), score))
|
||||
|
||||
@@ -232,7 +232,7 @@ class WeaviateVector(BaseVector):
|
||||
collection_name = self._collection_name
|
||||
content: dict[str, Any] = {"concepts": [query]}
|
||||
properties = self._attributes
|
||||
properties.append(Field.TEXT_KEY.value)
|
||||
properties.append(Field.TEXT_KEY)
|
||||
if kwargs.get("search_distance"):
|
||||
content["certainty"] = kwargs.get("search_distance")
|
||||
query_obj = self._client.query.get(collection_name, properties)
|
||||
@@ -250,7 +250,7 @@ class WeaviateVector(BaseVector):
|
||||
raise ValueError(f"Error during query: {result['errors']}")
|
||||
docs = []
|
||||
for res in result["data"]["Get"][collection_name]:
|
||||
text = res.pop(Field.TEXT_KEY.value)
|
||||
text = res.pop(Field.TEXT_KEY)
|
||||
additional = res.pop("_additional")
|
||||
docs.append(Document(page_content=text, vector=additional["vector"], metadata=res))
|
||||
return docs
|
||||
|
||||
Reference in New Issue
Block a user