Feat/dify rag (#2528)

Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
Jyong
2024-02-22 23:31:57 +08:00
committed by GitHub
parent 97fe817186
commit 6c4e6bf1d6
119 changed files with 3181 additions and 5892 deletions

View File

@@ -21,9 +21,9 @@ from pydantic import BaseModel, Field
from regex import regex
from core.chain.llm_chain import LLMChain
from core.data_loader import file_extractor
from core.data_loader.file_extractor import FileExtractor
from core.entities.application_entities import ModelConfigEntity
from core.rag.extractor import extract_processor
from core.rag.extractor.extract_processor import ExtractProcessor
FULL_TEMPLATE = """
TITLE: {title}
@@ -149,7 +149,7 @@ def get_url(url: str, user_agent: str = None) -> str:
if user_agent:
headers["User-Agent"] = user_agent
supported_content_types = file_extractor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
head_response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
@@ -161,8 +161,8 @@ def get_url(url: str, user_agent: str = None) -> str:
if main_content_type not in supported_content_types:
return "Unsupported content-type [{}] of URL.".format(main_content_type)
if main_content_type in file_extractor.SUPPORT_URL_CONTENT_TYPES:
return FileExtractor.load_from_url(url, return_text=True)
if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
return ExtractProcessor.load_from_url(url, return_text=True)
response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 30))
a = extract_using_readabilipy(response.text)