Feat/assistant app (#2086)

Co-authored-by: chenhe <guchenhe@gmail.com>
Co-authored-by: Pascal M <11357019+perzeuss@users.noreply.github.com>
This commit is contained in:
Yeuoly
2024-01-23 19:58:23 +08:00
committed by GitHub
parent 7bbe12b2bd
commit 86286e1ac8
175 changed files with 11619 additions and 1235 deletions

View File

@@ -0,0 +1,222 @@
from typing import Any, Dict, List, Union
from json import dumps
from core.tools.entities.tool_bundle import ApiBasedToolBundle
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.tool import Tool
from core.tools.errors import ToolProviderCredentialValidationError
import httpx
import requests
class ApiTool(Tool):
api_bundle: ApiBasedToolBundle
"""
Api tool
"""
def fork_tool_runtime(self, meta: Dict[str, Any]) -> 'Tool':
"""
fork a new tool with meta data
:param meta: the meta data of a tool call processing, tenant_id is required
:return: the new tool
"""
return self.__class__(
identity=self.identity.copy() if self.identity else None,
parameters=self.parameters.copy() if self.parameters else None,
description=self.description.copy() if self.description else None,
api_bundle=self.api_bundle.copy() if self.api_bundle else None,
runtime=Tool.Runtime(**meta)
)
def validate_credentials(self, credentails: Dict[str, Any], parameters: Dict[str, Any], format_only: bool = False) -> None:
"""
validate the credentials for Api tool
"""
# assemble validate request and request parameters
headers = self.assembling_request(parameters)
if format_only:
return
response = self.do_http_request(self.api_bundle.server_url, self.api_bundle.method, headers, parameters)
# validate response
self.validate_and_parse_response(response)
def assembling_request(self, parameters: Dict[str, Any]) -> Dict[str, Any]:
headers = {}
credentials = self.runtime.credentials or {}
if 'auth_type' not in credentials:
raise ToolProviderCredentialValidationError('Missing auth_type')
if credentials['auth_type'] == 'api_key':
api_key_header = 'api_key'
if 'api_key_header' in credentials:
api_key_header = credentials['api_key_header']
if 'api_key_value' not in credentials:
raise ToolProviderCredentialValidationError('Missing api_key_value')
headers[api_key_header] = credentials['api_key_value']
needed_parameters = [parameter for parameter in self.api_bundle.parameters if parameter.required]
for parameter in needed_parameters:
if parameter.required and parameter.name not in parameters:
raise ToolProviderCredentialValidationError(f"Missing required parameter {parameter.name}")
if parameter.default is not None and parameter.name not in parameters:
parameters[parameter.name] = parameter.default
return headers
def validate_and_parse_response(self, response: Union[httpx.Response, requests.Response]) -> str:
"""
validate the response
"""
if isinstance(response, httpx.Response):
if response.status_code >= 400:
raise ToolProviderCredentialValidationError(f"Request failed with status code {response.status_code}")
return response.text
elif isinstance(response, requests.Response):
if not response.ok:
raise ToolProviderCredentialValidationError(f"Request failed with status code {response.status_code}")
return response.text
else:
raise ValueError(f'Invalid response type {type(response)}')
def do_http_request(self, url: str, method: str, headers: Dict[str, Any], parameters: Dict[str, Any]) -> httpx.Response:
"""
do http request depending on api bundle
"""
method = method.lower()
params = {}
path_params = {}
body = {}
cookies = {}
# check parameters
for parameter in self.api_bundle.openapi.get('parameters', []):
if parameter['in'] == 'path':
value = ''
if parameter['name'] in parameters:
value = parameters[parameter['name']]
elif parameter['required']:
raise ToolProviderCredentialValidationError(f"Missing required parameter {parameter['name']}")
path_params[parameter['name']] = value
elif parameter['in'] == 'query':
value = ''
if parameter['name'] in parameters:
value = parameters[parameter['name']]
elif parameter['required']:
raise ToolProviderCredentialValidationError(f"Missing required parameter {parameter['name']}")
params[parameter['name']] = value
elif parameter['in'] == 'cookie':
value = ''
if parameter['name'] in parameters:
value = parameters[parameter['name']]
elif parameter['required']:
raise ToolProviderCredentialValidationError(f"Missing required parameter {parameter['name']}")
cookies[parameter['name']] = value
elif parameter['in'] == 'header':
value = ''
if parameter['name'] in parameters:
value = parameters[parameter['name']]
elif parameter['required']:
raise ToolProviderCredentialValidationError(f"Missing required parameter {parameter['name']}")
headers[parameter['name']] = value
# check if there is a request body and handle it
if 'requestBody' in self.api_bundle.openapi and self.api_bundle.openapi['requestBody'] is not None:
# handle json request body
if 'content' in self.api_bundle.openapi['requestBody']:
for content_type in self.api_bundle.openapi['requestBody']['content']:
headers['Content-Type'] = content_type
body_schema = self.api_bundle.openapi['requestBody']['content'][content_type]['schema']
required = body_schema['required'] if 'required' in body_schema else []
properties = body_schema['properties'] if 'properties' in body_schema else {}
for name, property in properties.items():
if name in parameters:
# convert type
try:
value = parameters[name]
if property['type'] == 'integer':
value = int(value)
elif property['type'] == 'number':
# check if it is a float
if '.' in value:
value = float(value)
else:
value = int(value)
elif property['type'] == 'boolean':
value = bool(value)
body[name] = value
except ValueError as e:
body[name] = parameters[name]
elif name in required:
raise ToolProviderCredentialValidationError(
f"Missing required parameter {name} in operation {self.api_bundle.operation_id}"
)
elif 'default' in property:
body[name] = property['default']
else:
body[name] = None
break
# replace path parameters
for name, value in path_params.items():
url = url.replace(f'{{{name}}}', value)
# parse http body data if needed, for GET/HEAD/OPTIONS/TRACE, the body is ignored
if 'Content-Type' in headers:
if headers['Content-Type'] == 'application/json':
body = dumps(body)
else:
body = body
# do http request
if method == 'get':
response = httpx.get(url, params=params, headers=headers, cookies=cookies, timeout=10, follow_redirects=True)
elif method == 'post':
response = httpx.post(url, params=params, headers=headers, cookies=cookies, data=body, timeout=10, follow_redirects=True)
elif method == 'put':
response = httpx.put(url, params=params, headers=headers, cookies=cookies, data=body, timeout=10, follow_redirects=True)
elif method == 'delete':
"""
request body data is unsupported for DELETE method in standard http protocol
however, OpenAPI 3.0 supports request body data for DELETE method, so we support it here by using requests
"""
response = requests.delete(url, params=params, headers=headers, cookies=cookies, data=body, timeout=10, allow_redirects=True)
elif method == 'patch':
response = httpx.patch(url, params=params, headers=headers, cookies=cookies, data=body, timeout=10, follow_redirects=True)
elif method == 'head':
response = httpx.head(url, params=params, headers=headers, cookies=cookies, timeout=10, follow_redirects=True)
elif method == 'options':
response = httpx.options(url, params=params, headers=headers, cookies=cookies, timeout=10, follow_redirects=True)
else:
raise ValueError(f'Invalid http method {method}')
return response
def _invoke(self, user_id: str, tool_paramters: Dict[str, Any]) -> ToolInvokeMessage | List[ToolInvokeMessage]:
"""
invoke http request
"""
# assemble request
headers = self.assembling_request(tool_paramters)
# do http request
response = self.do_http_request(self.api_bundle.server_url, self.api_bundle.method, headers, tool_paramters)
# validate response
response = self.validate_and_parse_response(response)
# assemble invoke message
return self.create_text_message(response)

View File

@@ -0,0 +1,140 @@
from core.tools.tool.tool import Tool
from core.tools.model.tool_model_manager import ToolModelManager
from core.model_runtime.entities.message_entities import PromptMessage
from core.model_runtime.entities.llm_entities import LLMResult
from core.model_runtime.entities.message_entities import SystemPromptMessage, UserPromptMessage
from core.tools.utils.web_reader_tool import get_url
from typing import List
from enum import Enum
_SUMMARY_PROMPT = """You are a professional language researcher, you are interested in the language
and you can quickly aimed at the main point of an webpage and reproduce it in your own words but
retain the original meaning and keep the key points.
however, the text you got is too long, what you got is possible a part of the text.
Please summarize the text you got.
"""
class BuiltinTool(Tool):
"""
Builtin tool
:param meta: the meta data of a tool call processing
"""
def invoke_model(
self, user_id: str, prompt_messages: List[PromptMessage], stop: List[str]
) -> LLMResult:
"""
invoke model
:param model_config: the model config
:param prompt_messages: the prompt messages
:param stop: the stop words
:return: the model result
"""
# invoke model
return ToolModelManager.invoke(
user_id=user_id,
tenant_id=self.runtime.tenant_id,
tool_type='builtin',
tool_name=self.identity.name,
prompt_messages=prompt_messages,
)
def get_max_tokens(self) -> int:
"""
get max tokens
:param model_config: the model config
:return: the max tokens
"""
return ToolModelManager.get_max_llm_context_tokens(
tenant_id=self.runtime.tenant_id,
)
def get_prompt_tokens(self, prompt_messages: List[PromptMessage]) -> int:
"""
get prompt tokens
:param prompt_messages: the prompt messages
:return: the tokens
"""
return ToolModelManager.calculate_tokens(
tenant_id=self.runtime.tenant_id,
prompt_messages=prompt_messages
)
def summary(self, user_id: str, content: str) -> str:
max_tokens = self.get_max_tokens()
if self.get_prompt_tokens(prompt_messages=[
UserPromptMessage(content=content)
]) < max_tokens * 0.6:
return content
def get_prompt_tokens(content: str) -> int:
return self.get_prompt_tokens(prompt_messages=[
SystemPromptMessage(content=_SUMMARY_PROMPT),
UserPromptMessage(content=content)
])
def summarize(content: str) -> str:
summary = self.invoke_model(user_id=user_id, prompt_messages=[
SystemPromptMessage(content=_SUMMARY_PROMPT),
UserPromptMessage(content=content)
], stop=[])
return summary.message.content
lines = content.split('\n')
new_lines = []
# split long line into multiple lines
for i in range(len(lines)):
line = lines[i]
if not line.strip():
continue
if len(line) < max_tokens * 0.5:
new_lines.append(line)
elif get_prompt_tokens(line) > max_tokens * 0.7:
while get_prompt_tokens(line) > max_tokens * 0.7:
new_lines.append(line[:int(max_tokens * 0.5)])
line = line[int(max_tokens * 0.5):]
new_lines.append(line)
else:
new_lines.append(line)
# merge lines into messages with max tokens
messages: List[str] = []
for i in new_lines:
if len(messages) == 0:
messages.append(i)
else:
if len(messages[-1]) + len(i) < max_tokens * 0.5:
messages[-1] += i
if get_prompt_tokens(messages[-1] + i) > max_tokens * 0.7:
messages.append(i)
else:
messages[-1] += i
summaries = []
for i in range(len(messages)):
message = messages[i]
summary = summarize(message)
summaries.append(summary)
result = '\n'.join(summaries)
if self.get_prompt_tokens(prompt_messages=[
UserPromptMessage(content=result)
]) > max_tokens * 0.7:
return self.summary(user_id=user_id, content=result)
return result
def get_url(self, url: str, user_agent: str = None) -> str:
"""
get url
"""
return get_url(url, user_agent=user_agent)

View File

@@ -0,0 +1,249 @@
import json
import threading
from typing import List, Optional, Type
from core.callback_handler.index_tool_callback_handler import DatasetIndexToolCallbackHandler
from core.embedding.cached_embedding import CacheEmbedding
from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError
from core.index.keyword_table_index.keyword_table_index import KeywordTableConfig, KeywordTableIndex
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.rerank.rerank import RerankRunner
from extensions.ext_database import db
from flask import Flask, current_app
from langchain.tools import BaseTool
from models.dataset import Dataset, Document, DocumentSegment
from pydantic import BaseModel, Field
from services.retrieval_service import RetrievalService
default_retrieval_model = {
'search_method': 'semantic_search',
'reranking_enable': False,
'reranking_model': {
'reranking_provider_name': '',
'reranking_model_name': ''
},
'top_k': 2,
'score_threshold_enabled': False
}
class DatasetMultiRetrieverToolInput(BaseModel):
query: str = Field(..., description="dataset multi retriever and rerank")
class DatasetMultiRetrieverTool(BaseTool):
"""Tool for querying multi dataset."""
name: str = "dataset-"
args_schema: Type[BaseModel] = DatasetMultiRetrieverToolInput
description: str = "dataset multi retriever and rerank. "
tenant_id: str
dataset_ids: List[str]
top_k: int = 2
score_threshold: Optional[float] = None
reranking_provider_name: str
reranking_model_name: str
return_resource: bool
retriever_from: str
hit_callbacks: List[DatasetIndexToolCallbackHandler] = []
@classmethod
def from_dataset(cls, dataset_ids: List[str], tenant_id: str, **kwargs):
return cls(
name=f'dataset-{tenant_id}',
tenant_id=tenant_id,
dataset_ids=dataset_ids,
**kwargs
)
def _run(self, query: str) -> str:
threads = []
all_documents = []
for dataset_id in self.dataset_ids:
retrieval_thread = threading.Thread(target=self._retriever, kwargs={
'flask_app': current_app._get_current_object(),
'dataset_id': dataset_id,
'query': query,
'all_documents': all_documents,
'hit_callbacks': self.hit_callbacks
})
threads.append(retrieval_thread)
retrieval_thread.start()
for thread in threads:
thread.join()
# do rerank for searched documents
model_manager = ModelManager()
rerank_model_instance = model_manager.get_model_instance(
tenant_id=self.tenant_id,
provider=self.reranking_provider_name,
model_type=ModelType.RERANK,
model=self.reranking_model_name
)
rerank_runner = RerankRunner(rerank_model_instance)
all_documents = rerank_runner.run(query, all_documents, self.score_threshold, self.top_k)
for hit_callback in self.hit_callbacks:
hit_callback.on_tool_end(all_documents)
document_score_list = {}
for item in all_documents:
if 'score' in item.metadata and item.metadata['score']:
document_score_list[item.metadata['doc_id']] = item.metadata['score']
document_context_list = []
index_node_ids = [document.metadata['doc_id'] for document in all_documents]
segments = DocumentSegment.query.filter(
DocumentSegment.dataset_id.in_(self.dataset_ids),
DocumentSegment.completed_at.isnot(None),
DocumentSegment.status == 'completed',
DocumentSegment.enabled == True,
DocumentSegment.index_node_id.in_(index_node_ids)
).all()
if segments:
index_node_id_to_position = {id: position for position, id in enumerate(index_node_ids)}
sorted_segments = sorted(segments,
key=lambda segment: index_node_id_to_position.get(segment.index_node_id,
float('inf')))
for segment in sorted_segments:
if segment.answer:
document_context_list.append(f'question:{segment.content} answer:{segment.answer}')
else:
document_context_list.append(segment.content)
if self.return_resource:
context_list = []
resource_number = 1
for segment in sorted_segments:
dataset = Dataset.query.filter_by(
id=segment.dataset_id
).first()
document = Document.query.filter(Document.id == segment.document_id,
Document.enabled == True,
Document.archived == False,
).first()
if dataset and document:
source = {
'position': resource_number,
'dataset_id': dataset.id,
'dataset_name': dataset.name,
'document_id': document.id,
'document_name': document.name,
'data_source_type': document.data_source_type,
'segment_id': segment.id,
'retriever_from': self.retriever_from,
'score': document_score_list.get(segment.index_node_id, None)
}
if self.retriever_from == 'dev':
source['hit_count'] = segment.hit_count
source['word_count'] = segment.word_count
source['segment_position'] = segment.position
source['index_node_hash'] = segment.index_node_hash
if segment.answer:
source['content'] = f'question:{segment.content} \nanswer:{segment.answer}'
else:
source['content'] = segment.content
context_list.append(source)
resource_number += 1
for hit_callback in self.hit_callbacks:
hit_callback.return_retriever_resource_info(context_list)
return str("\n".join(document_context_list))
async def _arun(self, tool_input: str) -> str:
raise NotImplementedError()
def _retriever(self, flask_app: Flask, dataset_id: str, query: str, all_documents: List,
hit_callbacks: List[DatasetIndexToolCallbackHandler]):
with flask_app.app_context():
dataset = db.session.query(Dataset).filter(
Dataset.tenant_id == self.tenant_id,
Dataset.id == dataset_id
).first()
if not dataset:
return []
for hit_callback in hit_callbacks:
hit_callback.on_query(query, dataset.id)
# get retrieval model , if the model is not setting , using default
retrieval_model = dataset.retrieval_model if dataset.retrieval_model else default_retrieval_model
if dataset.indexing_technique == "economy":
# use keyword table query
kw_table_index = KeywordTableIndex(
dataset=dataset,
config=KeywordTableConfig(
max_keywords_per_chunk=5
)
)
documents = kw_table_index.search(query, search_kwargs={'k': self.top_k})
if documents:
all_documents.extend(documents)
else:
try:
model_manager = ModelManager()
embedding_model = model_manager.get_model_instance(
tenant_id=dataset.tenant_id,
provider=dataset.embedding_model_provider,
model_type=ModelType.TEXT_EMBEDDING,
model=dataset.embedding_model
)
except LLMBadRequestError:
return []
except ProviderTokenNotInitError:
return []
embeddings = CacheEmbedding(embedding_model)
documents = []
threads = []
if self.top_k > 0:
# retrieval_model source with semantic
if retrieval_model['search_method'] == 'semantic_search' or retrieval_model[
'search_method'] == 'hybrid_search':
embedding_thread = threading.Thread(target=RetrievalService.embedding_search, kwargs={
'flask_app': current_app._get_current_object(),
'dataset_id': str(dataset.id),
'query': query,
'top_k': self.top_k,
'score_threshold': self.score_threshold,
'reranking_model': None,
'all_documents': documents,
'search_method': 'hybrid_search',
'embeddings': embeddings
})
threads.append(embedding_thread)
embedding_thread.start()
# retrieval_model source with full text
if retrieval_model['search_method'] == 'full_text_search' or retrieval_model[
'search_method'] == 'hybrid_search':
full_text_index_thread = threading.Thread(target=RetrievalService.full_text_index_search,
kwargs={
'flask_app': current_app._get_current_object(),
'dataset_id': str(dataset.id),
'query': query,
'search_method': 'hybrid_search',
'embeddings': embeddings,
'score_threshold': retrieval_model[
'score_threshold'] if retrieval_model[
'score_threshold_enabled'] else None,
'top_k': self.top_k,
'reranking_model': retrieval_model[
'reranking_model'] if retrieval_model[
'reranking_enable'] else None,
'all_documents': documents
})
threads.append(full_text_index_thread)
full_text_index_thread.start()
for thread in threads:
thread.join()
all_documents.extend(documents)

View File

@@ -0,0 +1,236 @@
import threading
from typing import List, Optional, Type
from core.callback_handler.index_tool_callback_handler import DatasetIndexToolCallbackHandler
from core.embedding.cached_embedding import CacheEmbedding
from core.index.keyword_table_index.keyword_table_index import KeywordTableConfig, KeywordTableIndex
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.errors.invoke import InvokeAuthorizationError
from core.rerank.rerank import RerankRunner
from extensions.ext_database import db
from flask import current_app
from langchain.tools import BaseTool
from models.dataset import Dataset, Document, DocumentSegment
from pydantic import BaseModel, Field
from services.retrieval_service import RetrievalService
default_retrieval_model = {
'search_method': 'semantic_search',
'reranking_enable': False,
'reranking_model': {
'reranking_provider_name': '',
'reranking_model_name': ''
},
'top_k': 2,
'score_threshold_enabled': False
}
class DatasetRetrieverToolInput(BaseModel):
query: str = Field(..., description="Query for the dataset to be used to retrieve the dataset.")
class DatasetRetrieverTool(BaseTool):
"""Tool for querying a Dataset."""
name: str = "dataset"
args_schema: Type[BaseModel] = DatasetRetrieverToolInput
description: str = "use this to retrieve a dataset. "
tenant_id: str
dataset_id: str
top_k: int = 2
score_threshold: Optional[float] = None
hit_callbacks: List[DatasetIndexToolCallbackHandler] = []
return_resource: bool
retriever_from: str
@classmethod
def from_dataset(cls, dataset: Dataset, **kwargs):
description = dataset.description
if not description:
description = 'useful for when you want to answer queries about the ' + dataset.name
description = description.replace('\n', '').replace('\r', '')
return cls(
name=f'dataset-{dataset.id}',
tenant_id=dataset.tenant_id,
dataset_id=dataset.id,
description=description,
**kwargs
)
def _run(self, query: str) -> str:
dataset = db.session.query(Dataset).filter(
Dataset.tenant_id == self.tenant_id,
Dataset.id == self.dataset_id
).first()
if not dataset:
return ''
for hit_callback in self.hit_callbacks:
hit_callback.on_query(query, dataset.id)
# get retrieval model , if the model is not setting , using default
retrieval_model = dataset.retrieval_model if dataset.retrieval_model else default_retrieval_model
if dataset.indexing_technique == "economy":
# use keyword table query
kw_table_index = KeywordTableIndex(
dataset=dataset,
config=KeywordTableConfig(
max_keywords_per_chunk=5
)
)
documents = kw_table_index.search(query, search_kwargs={'k': self.top_k})
return str("\n".join([document.page_content for document in documents]))
else:
# get embedding model instance
try:
model_manager = ModelManager()
embedding_model = model_manager.get_model_instance(
tenant_id=dataset.tenant_id,
provider=dataset.embedding_model_provider,
model_type=ModelType.TEXT_EMBEDDING,
model=dataset.embedding_model
)
except InvokeAuthorizationError:
return ''
embeddings = CacheEmbedding(embedding_model)
documents = []
threads = []
if self.top_k > 0:
# retrieval source with semantic
if retrieval_model['search_method'] == 'semantic_search' or retrieval_model['search_method'] == 'hybrid_search':
embedding_thread = threading.Thread(target=RetrievalService.embedding_search, kwargs={
'flask_app': current_app._get_current_object(),
'dataset_id': str(dataset.id),
'query': query,
'top_k': self.top_k,
'score_threshold': retrieval_model['score_threshold'] if retrieval_model[
'score_threshold_enabled'] else None,
'reranking_model': retrieval_model['reranking_model'] if retrieval_model[
'reranking_enable'] else None,
'all_documents': documents,
'search_method': retrieval_model['search_method'],
'embeddings': embeddings
})
threads.append(embedding_thread)
embedding_thread.start()
# retrieval_model source with full text
if retrieval_model['search_method'] == 'full_text_search' or retrieval_model['search_method'] == 'hybrid_search':
full_text_index_thread = threading.Thread(target=RetrievalService.full_text_index_search, kwargs={
'flask_app': current_app._get_current_object(),
'dataset_id': str(dataset.id),
'query': query,
'search_method': retrieval_model['search_method'],
'embeddings': embeddings,
'score_threshold': retrieval_model['score_threshold'] if retrieval_model[
'score_threshold_enabled'] else None,
'top_k': self.top_k,
'reranking_model': retrieval_model['reranking_model'] if retrieval_model[
'reranking_enable'] else None,
'all_documents': documents
})
threads.append(full_text_index_thread)
full_text_index_thread.start()
for thread in threads:
thread.join()
# hybrid search: rerank after all documents have been searched
if retrieval_model['search_method'] == 'hybrid_search':
# get rerank model instance
try:
model_manager = ModelManager()
rerank_model_instance = model_manager.get_model_instance(
tenant_id=dataset.tenant_id,
provider=retrieval_model['reranking_model']['reranking_provider_name'],
model_type=ModelType.RERANK,
model=retrieval_model['reranking_model']['reranking_model_name']
)
except InvokeAuthorizationError:
return ''
rerank_runner = RerankRunner(rerank_model_instance)
documents = rerank_runner.run(
query=query,
documents=documents,
score_threshold=retrieval_model['score_threshold'] if retrieval_model[
'score_threshold_enabled'] else None,
top_n=self.top_k
)
else:
documents = []
for hit_callback in self.hit_callbacks:
hit_callback.on_tool_end(documents)
document_score_list = {}
if dataset.indexing_technique != "economy":
for item in documents:
if 'score' in item.metadata and item.metadata['score']:
document_score_list[item.metadata['doc_id']] = item.metadata['score']
document_context_list = []
index_node_ids = [document.metadata['doc_id'] for document in documents]
segments = DocumentSegment.query.filter(DocumentSegment.dataset_id == self.dataset_id,
DocumentSegment.completed_at.isnot(None),
DocumentSegment.status == 'completed',
DocumentSegment.enabled == True,
DocumentSegment.index_node_id.in_(index_node_ids)
).all()
if segments:
index_node_id_to_position = {id: position for position, id in enumerate(index_node_ids)}
sorted_segments = sorted(segments,
key=lambda segment: index_node_id_to_position.get(segment.index_node_id,
float('inf')))
for segment in sorted_segments:
if segment.answer:
document_context_list.append(f'question:{segment.content} answer:{segment.answer}')
else:
document_context_list.append(segment.content)
if self.return_resource:
context_list = []
resource_number = 1
for segment in sorted_segments:
context = {}
document = Document.query.filter(Document.id == segment.document_id,
Document.enabled == True,
Document.archived == False,
).first()
if dataset and document:
source = {
'position': resource_number,
'dataset_id': dataset.id,
'dataset_name': dataset.name,
'document_id': document.id,
'document_name': document.name,
'data_source_type': document.data_source_type,
'segment_id': segment.id,
'retriever_from': self.retriever_from,
'score': document_score_list.get(segment.index_node_id, None)
}
if self.retriever_from == 'dev':
source['hit_count'] = segment.hit_count
source['word_count'] = segment.word_count
source['segment_position'] = segment.position
source['index_node_hash'] = segment.index_node_hash
if segment.answer:
source['content'] = f'question:{segment.content} \nanswer:{segment.answer}'
else:
source['content'] = segment.content
context_list.append(source)
resource_number += 1
for hit_callback in self.hit_callbacks:
hit_callback.return_retriever_resource_info(context_list)
return str("\n".join(document_context_list))
async def _arun(self, tool_input: str) -> str:
raise NotImplementedError()

View File

@@ -0,0 +1,95 @@
from typing import Any, Dict, List, Union
from core.features.dataset_retrieval import DatasetRetrievalFeature
from core.tools.entities.tool_entities import ToolInvokeMessage, ToolParamter, ToolIdentity, ToolDescription
from core.tools.tool.tool import Tool
from core.tools.entities.common_entities import I18nObject
from core.callback_handler.index_tool_callback_handler import DatasetIndexToolCallbackHandler
from core.entities.application_entities import DatasetRetrieveConfigEntity, InvokeFrom
from langchain.tools import BaseTool
class DatasetRetrieverTool(Tool):
langchain_tool: BaseTool
@staticmethod
def get_dataset_tools(tenant_id: str,
dataset_ids: list[str],
retrieve_config: DatasetRetrieveConfigEntity,
return_resource: bool,
invoke_from: InvokeFrom,
hit_callback: DatasetIndexToolCallbackHandler
) -> List['DatasetRetrieverTool']:
"""
get dataset tool
"""
# check if retrieve_config is valid
if dataset_ids is None or len(dataset_ids) == 0:
return []
if retrieve_config is None:
return []
feature = DatasetRetrievalFeature()
# save original retrieve strategy, and set retrieve strategy to SINGLE
# Agent only support SINGLE mode
original_retriever_mode = retrieve_config.retrieve_strategy
retrieve_config.retrieve_strategy = DatasetRetrieveConfigEntity.RetrieveStrategy.SINGLE
langchain_tools = feature.to_dataset_retriever_tool(
tenant_id=tenant_id,
dataset_ids=dataset_ids,
retrieve_config=retrieve_config,
return_resource=return_resource,
invoke_from=invoke_from,
hit_callback=hit_callback
)
# restore retrieve strategy
retrieve_config.retrieve_strategy = original_retriever_mode
# convert langchain tools to Tools
tools = []
for langchain_tool in langchain_tools:
tool = DatasetRetrieverTool(
langchain_tool=langchain_tool,
identity=ToolIdentity(author='', name=langchain_tool.name, label=I18nObject(en_US='', zh_Hans='')),
parameters=[],
is_team_authorization=True,
description=ToolDescription(
human=I18nObject(en_US='', zh_Hans=''),
llm=langchain_tool.description),
runtime=DatasetRetrieverTool.Runtime()
)
tools.append(tool)
return tools
def get_runtime_parameters(self) -> List[ToolParamter]:
return [
ToolParamter(name='query',
label=I18nObject(en_US='', zh_Hans=''),
human_description=I18nObject(en_US='', zh_Hans=''),
type=ToolParamter.ToolParameterType.STRING,
form=ToolParamter.ToolParameterForm.LLM,
llm_description='Query for the dataset to be used to retrieve the dataset.',
required=True,
default=''),
]
def _invoke(self, user_id: str, tool_paramters: Dict[str, Any]) -> ToolInvokeMessage | List[ToolInvokeMessage]:
"""
invoke dataset retriever tool
"""
query = tool_paramters.get('query', None)
if not query:
return self.create_text_message(text='please input query')
# invoke dataset retriever tool
result = self.langchain_tool._run(query=query)
return self.create_text_message(text=result)
def validate_credentials(self, credentails: Dict[str, Any], parameters: Dict[str, Any]) -> None:
"""
validate the credentials for dataset retriever tool
"""
pass

302
api/core/tools/tool/tool.py Normal file
View File

@@ -0,0 +1,302 @@
from pydantic import BaseModel
from typing import List, Dict, Any, Union, Optional
from abc import abstractmethod, ABC
from enum import Enum
from core.tools.entities.tool_entities import ToolIdentity, ToolInvokeMessage,\
ToolParamter, ToolDescription, ToolRuntimeVariablePool, ToolRuntimeVariable, ToolRuntimeImageVariable
from core.tools.tool_file_manager import ToolFileManager
from core.callback_handler.agent_tool_callback_handler import DifyAgentCallbackHandler
class Tool(BaseModel, ABC):
identity: ToolIdentity = None
parameters: Optional[List[ToolParamter]] = None
description: ToolDescription = None
is_team_authorization: bool = False
agent_callback: Optional[DifyAgentCallbackHandler] = None
use_callback: bool = False
class Runtime(BaseModel):
"""
Meta data of a tool call processing
"""
def __init__(self, **data: Any):
super().__init__(**data)
if not self.runtime_parameters:
self.runtime_parameters = {}
tenant_id: str = None
tool_id: str = None
credentials: Dict[str, Any] = None
runtime_parameters: Dict[str, Any] = None
runtime: Runtime = None
variables: ToolRuntimeVariablePool = None
def __init__(self, **data: Any):
super().__init__(**data)
if not self.agent_callback:
self.use_callback = False
else:
self.use_callback = True
class VARIABLE_KEY(Enum):
IMAGE = 'image'
def fork_tool_runtime(self, meta: Dict[str, Any], agent_callback: DifyAgentCallbackHandler = None) -> 'Tool':
"""
fork a new tool with meta data
:param meta: the meta data of a tool call processing, tenant_id is required
:return: the new tool
"""
return self.__class__(
identity=self.identity.copy() if self.identity else None,
parameters=self.parameters.copy() if self.parameters else None,
description=self.description.copy() if self.description else None,
runtime=Tool.Runtime(**meta),
agent_callback=agent_callback
)
def load_variables(self, variables: ToolRuntimeVariablePool):
"""
load variables from database
:param conversation_id: the conversation id
"""
self.variables = variables
def set_image_variable(self, variable_name: str, image_key: str) -> None:
"""
set an image variable
"""
if not self.variables:
return
self.variables.set_file(self.identity.name, variable_name, image_key)
def set_text_variable(self, variable_name: str, text: str) -> None:
"""
set a text variable
"""
if not self.variables:
return
self.variables.set_text(self.identity.name, variable_name, text)
def get_variable(self, name: Union[str, Enum]) -> Optional[ToolRuntimeVariable]:
"""
get a variable
:param name: the name of the variable
:return: the variable
"""
if not self.variables:
return None
if isinstance(name, Enum):
name = name.value
for variable in self.variables.pool:
if variable.name == name:
return variable
return None
def get_default_image_variable(self) -> Optional[ToolRuntimeVariable]:
"""
get the default image variable
:return: the image variable
"""
if not self.variables:
return None
return self.get_variable(self.VARIABLE_KEY.IMAGE)
def get_variable_file(self, name: Union[str, Enum]) -> Optional[bytes]:
"""
get a variable file
:param name: the name of the variable
:return: the variable file
"""
variable = self.get_variable(name)
if not variable:
return None
if not isinstance(variable, ToolRuntimeImageVariable):
return None
message_file_id = variable.value
# get file binary
file_binary = ToolFileManager.get_file_binary_by_message_file_id(message_file_id)
if not file_binary:
return None
return file_binary[0]
def list_variables(self) -> List[ToolRuntimeVariable]:
"""
list all variables
:return: the variables
"""
if not self.variables:
return []
return self.variables.pool
def list_default_image_variables(self) -> List[ToolRuntimeVariable]:
"""
list all image variables
:return: the image variables
"""
if not self.variables:
return []
result = []
for variable in self.variables.pool:
if variable.name.startswith(self.VARIABLE_KEY.IMAGE.value):
result.append(variable)
return result
def invoke(self, user_id: str, tool_paramters: Dict[str, Any]) -> List[ToolInvokeMessage]:
# update tool_paramters
if self.runtime.runtime_parameters:
tool_paramters.update(self.runtime.runtime_parameters)
# hit callback
if self.use_callback:
self.agent_callback.on_tool_start(
tool_name=self.identity.name,
tool_inputs=tool_paramters
)
try:
result = self._invoke(
user_id=user_id,
tool_paramters=tool_paramters,
)
except Exception as e:
if self.use_callback:
self.agent_callback.on_tool_error(e)
raise e
if not isinstance(result, list):
result = [result]
# hit callback
if self.use_callback:
self.agent_callback.on_tool_end(
tool_name=self.identity.name,
tool_inputs=tool_paramters,
tool_outputs=self._convert_tool_response_to_str(result)
)
return result
def _convert_tool_response_to_str(self, tool_response: List[ToolInvokeMessage]) -> str:
"""
Handle tool response
"""
result = ''
for response in tool_response:
if response.type == ToolInvokeMessage.MessageType.TEXT:
result += response.message
elif response.type == ToolInvokeMessage.MessageType.LINK:
result += f"result link: {response.message}. please dirct user to check it."
elif response.type == ToolInvokeMessage.MessageType.IMAGE_LINK or \
response.type == ToolInvokeMessage.MessageType.IMAGE:
result += f"image has been created and sent to user already, you should tell user to check it now."
elif response.type == ToolInvokeMessage.MessageType.BLOB:
if len(response.message) > 114:
result += str(response.message[:114]) + '...'
else:
result += str(response.message)
else:
result += f"tool response: {response.message}."
return result
@abstractmethod
def _invoke(self, user_id: str, tool_paramters: Dict[str, Any]) -> Union[ToolInvokeMessage, List[ToolInvokeMessage]]:
pass
def validate_credentials(self, credentails: Dict[str, Any], parameters: Dict[str, Any]) -> None:
"""
validate the credentials
:param credentails: the credentials
:param parameters: the parameters
"""
pass
def get_runtime_parameters(self) -> List[ToolParamter]:
"""
get the runtime parameters
interface for developer to dynamic change the parameters of a tool depends on the variables pool
:return: the runtime parameters
"""
return self.parameters
def is_tool_avaliable(self) -> bool:
"""
check if the tool is avaliable
:return: if the tool is avaliable
"""
return True
def create_image_message(self, image: str, save_as: str = '') -> ToolInvokeMessage:
"""
create an image message
:param image: the url of the image
:return: the image message
"""
return ToolInvokeMessage(type=ToolInvokeMessage.MessageType.IMAGE,
message=image,
save_as=save_as)
def create_link_message(self, link: str, save_as: str = '') -> ToolInvokeMessage:
"""
create a link message
:param link: the url of the link
:return: the link message
"""
return ToolInvokeMessage(type=ToolInvokeMessage.MessageType.LINK,
message=link,
save_as=save_as)
def create_text_message(self, text: str, save_as: str = '') -> ToolInvokeMessage:
"""
create a text message
:param text: the text
:return: the text message
"""
return ToolInvokeMessage(type=ToolInvokeMessage.MessageType.TEXT,
message=text,
save_as=save_as
)
def create_blob_message(self, blob: bytes, meta: dict = None, save_as: str = '') -> ToolInvokeMessage:
"""
create a blob message
:param blob: the blob
:return: the blob message
"""
return ToolInvokeMessage(type=ToolInvokeMessage.MessageType.BLOB,
message=blob, meta=meta,
save_as=save_as
)