feat: knowledge pipeline (#25360)
Signed-off-by: -LAN- <laipz8200@outlook.com> Co-authored-by: twwu <twwu@dify.ai> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> Co-authored-by: jyong <718720800@qq.com> Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com> Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com> Co-authored-by: lyzno1 <yuanyouhuilyz@gmail.com> Co-authored-by: quicksand <quicksandzn@gmail.com> Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com> Co-authored-by: lyzno1 <92089059+lyzno1@users.noreply.github.com> Co-authored-by: zxhlyh <jasonapring2015@outlook.com> Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: nite-knite <nkCoding@gmail.com> Co-authored-by: Hanqing Zhao <sherry9277@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry <xh001x@hotmail.com>
This commit is contained in:
@@ -15,7 +15,7 @@ from typing import Any, cast
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy import DateTime, String, func, select
|
||||
from sqlalchemy.dialects.postgresql import JSONB
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy.orm import Mapped, Session, mapped_column
|
||||
|
||||
from configs import dify_config
|
||||
from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
|
||||
@@ -61,12 +61,35 @@ class Dataset(Base):
|
||||
created_by = mapped_column(StringUUID, nullable=False)
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
|
||||
updated_by = mapped_column(StringUUID, nullable=True)
|
||||
updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
|
||||
embedding_model = mapped_column(String(255), nullable=True)
|
||||
embedding_model_provider = mapped_column(String(255), nullable=True)
|
||||
updated_at = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
|
||||
embedding_model = mapped_column(db.String(255), nullable=True)
|
||||
embedding_model_provider = mapped_column(db.String(255), nullable=True)
|
||||
keyword_number = db.Column(db.Integer, nullable=True, server_default=db.text("10"))
|
||||
collection_binding_id = mapped_column(StringUUID, nullable=True)
|
||||
retrieval_model = mapped_column(JSONB, nullable=True)
|
||||
built_in_field_enabled: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false"))
|
||||
built_in_field_enabled = mapped_column(db.Boolean, nullable=False, server_default=db.text("false"))
|
||||
icon_info = db.Column(JSONB, nullable=True)
|
||||
runtime_mode = db.Column(db.String(255), nullable=True, server_default=db.text("'general'::character varying"))
|
||||
pipeline_id = db.Column(StringUUID, nullable=True)
|
||||
chunk_structure = db.Column(db.String(255), nullable=True)
|
||||
enable_api = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
|
||||
|
||||
@property
|
||||
def total_documents(self):
|
||||
return db.session.query(func.count(Document.id)).where(Document.dataset_id == self.id).scalar()
|
||||
|
||||
@property
|
||||
def total_available_documents(self):
|
||||
return (
|
||||
db.session.query(func.count(Document.id))
|
||||
.where(
|
||||
Document.dataset_id == self.id,
|
||||
Document.indexing_status == "completed",
|
||||
Document.enabled == True,
|
||||
Document.archived == False,
|
||||
)
|
||||
.scalar()
|
||||
)
|
||||
|
||||
@property
|
||||
def dataset_keyword_table(self):
|
||||
@@ -150,7 +173,9 @@ class Dataset(Base):
|
||||
)
|
||||
|
||||
@property
|
||||
def doc_form(self):
|
||||
def doc_form(self) -> str | None:
|
||||
if self.chunk_structure:
|
||||
return self.chunk_structure
|
||||
document = db.session.query(Document).where(Document.dataset_id == self.id).first()
|
||||
if document:
|
||||
return document.doc_form
|
||||
@@ -206,6 +231,14 @@ class Dataset(Base):
|
||||
"external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
|
||||
}
|
||||
|
||||
@property
|
||||
def is_published(self):
|
||||
if self.pipeline_id:
|
||||
pipeline = db.session.query(Pipeline).where(Pipeline.id == self.pipeline_id).first()
|
||||
if pipeline:
|
||||
return pipeline.is_published
|
||||
return False
|
||||
|
||||
@property
|
||||
def doc_metadata(self):
|
||||
dataset_metadatas = db.session.scalars(
|
||||
@@ -394,7 +427,7 @@ class Document(Base):
|
||||
return status
|
||||
|
||||
@property
|
||||
def data_source_info_dict(self) -> dict[str, Any] | None:
|
||||
def data_source_info_dict(self) -> dict[str, Any]:
|
||||
if self.data_source_info:
|
||||
try:
|
||||
data_source_info_dict: dict[str, Any] = json.loads(self.data_source_info)
|
||||
@@ -402,7 +435,7 @@ class Document(Base):
|
||||
data_source_info_dict = {}
|
||||
|
||||
return data_source_info_dict
|
||||
return None
|
||||
return {}
|
||||
|
||||
@property
|
||||
def data_source_detail_dict(self) -> dict[str, Any]:
|
||||
@@ -759,7 +792,7 @@ class DocumentSegment(Base):
|
||||
text = self.content
|
||||
|
||||
# For data before v0.10.0
|
||||
pattern = r"/files/([a-f0-9\-]+)/image-preview"
|
||||
pattern = r"/files/([a-f0-9\-]+)/image-preview(?:\?.*?)?"
|
||||
matches = re.finditer(pattern, text)
|
||||
for match in matches:
|
||||
upload_file_id = match.group(1)
|
||||
@@ -771,11 +804,12 @@ class DocumentSegment(Base):
|
||||
encoded_sign = base64.urlsafe_b64encode(sign).decode()
|
||||
|
||||
params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
|
||||
signed_url = f"{match.group(0)}?{params}"
|
||||
base_url = f"/files/{upload_file_id}/image-preview"
|
||||
signed_url = f"{base_url}?{params}"
|
||||
signed_urls.append((match.start(), match.end(), signed_url))
|
||||
|
||||
# For data after v0.10.0
|
||||
pattern = r"/files/([a-f0-9\-]+)/file-preview"
|
||||
pattern = r"/files/([a-f0-9\-]+)/file-preview(?:\?.*?)?"
|
||||
matches = re.finditer(pattern, text)
|
||||
for match in matches:
|
||||
upload_file_id = match.group(1)
|
||||
@@ -787,7 +821,27 @@ class DocumentSegment(Base):
|
||||
encoded_sign = base64.urlsafe_b64encode(sign).decode()
|
||||
|
||||
params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
|
||||
signed_url = f"{match.group(0)}?{params}"
|
||||
base_url = f"/files/{upload_file_id}/file-preview"
|
||||
signed_url = f"{base_url}?{params}"
|
||||
signed_urls.append((match.start(), match.end(), signed_url))
|
||||
|
||||
# For tools directory - direct file formats (e.g., .png, .jpg, etc.)
|
||||
# Match URL including any query parameters up to common URL boundaries (space, parenthesis, quotes)
|
||||
pattern = r"/files/tools/([a-f0-9\-]+)\.([a-zA-Z0-9]+)(?:\?[^\s\)\"\']*)?"
|
||||
matches = re.finditer(pattern, text)
|
||||
for match in matches:
|
||||
upload_file_id = match.group(1)
|
||||
file_extension = match.group(2)
|
||||
nonce = os.urandom(16).hex()
|
||||
timestamp = str(int(time.time()))
|
||||
data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
|
||||
secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
|
||||
sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
|
||||
encoded_sign = base64.urlsafe_b64encode(sign).decode()
|
||||
|
||||
params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
|
||||
base_url = f"/files/tools/{upload_file_id}.{file_extension}"
|
||||
signed_url = f"{base_url}?{params}"
|
||||
signed_urls.append((match.start(), match.end(), signed_url))
|
||||
|
||||
# Reconstruct the text with signed URLs
|
||||
@@ -1166,3 +1220,112 @@ class DatasetMetadataBinding(Base):
|
||||
document_id = mapped_column(StringUUID, nullable=False)
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
|
||||
created_by = mapped_column(StringUUID, nullable=False)
|
||||
|
||||
|
||||
class PipelineBuiltInTemplate(Base): # type: ignore[name-defined]
|
||||
__tablename__ = "pipeline_built_in_templates"
|
||||
__table_args__ = (db.PrimaryKeyConstraint("id", name="pipeline_built_in_template_pkey"),)
|
||||
|
||||
id = db.Column(StringUUID, server_default=db.text("uuidv7()"))
|
||||
name = db.Column(db.String(255), nullable=False)
|
||||
description = db.Column(db.Text, nullable=False)
|
||||
chunk_structure = db.Column(db.String(255), nullable=False)
|
||||
icon = db.Column(db.JSON, nullable=False)
|
||||
yaml_content = db.Column(db.Text, nullable=False)
|
||||
copyright = db.Column(db.String(255), nullable=False)
|
||||
privacy_policy = db.Column(db.String(255), nullable=False)
|
||||
position = db.Column(db.Integer, nullable=False)
|
||||
install_count = db.Column(db.Integer, nullable=False, default=0)
|
||||
language = db.Column(db.String(255), nullable=False)
|
||||
created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
|
||||
updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
|
||||
created_by = db.Column(StringUUID, nullable=False)
|
||||
updated_by = db.Column(StringUUID, nullable=True)
|
||||
|
||||
@property
|
||||
def created_user_name(self):
|
||||
account = db.session.query(Account).where(Account.id == self.created_by).first()
|
||||
if account:
|
||||
return account.name
|
||||
return ""
|
||||
|
||||
|
||||
class PipelineCustomizedTemplate(Base): # type: ignore[name-defined]
|
||||
__tablename__ = "pipeline_customized_templates"
|
||||
__table_args__ = (
|
||||
db.PrimaryKeyConstraint("id", name="pipeline_customized_template_pkey"),
|
||||
db.Index("pipeline_customized_template_tenant_idx", "tenant_id"),
|
||||
)
|
||||
|
||||
id = db.Column(StringUUID, server_default=db.text("uuidv7()"))
|
||||
tenant_id = db.Column(StringUUID, nullable=False)
|
||||
name = db.Column(db.String(255), nullable=False)
|
||||
description = db.Column(db.Text, nullable=False)
|
||||
chunk_structure = db.Column(db.String(255), nullable=False)
|
||||
icon = db.Column(db.JSON, nullable=False)
|
||||
position = db.Column(db.Integer, nullable=False)
|
||||
yaml_content = db.Column(db.Text, nullable=False)
|
||||
install_count = db.Column(db.Integer, nullable=False, default=0)
|
||||
language = db.Column(db.String(255), nullable=False)
|
||||
created_by = db.Column(StringUUID, nullable=False)
|
||||
updated_by = db.Column(StringUUID, nullable=True)
|
||||
created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
|
||||
updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
|
||||
|
||||
@property
|
||||
def created_user_name(self):
|
||||
account = db.session.query(Account).where(Account.id == self.created_by).first()
|
||||
if account:
|
||||
return account.name
|
||||
return ""
|
||||
|
||||
|
||||
class Pipeline(Base): # type: ignore[name-defined]
|
||||
__tablename__ = "pipelines"
|
||||
__table_args__ = (db.PrimaryKeyConstraint("id", name="pipeline_pkey"),)
|
||||
|
||||
id = db.Column(StringUUID, server_default=db.text("uuidv7()"))
|
||||
tenant_id: Mapped[str] = db.Column(StringUUID, nullable=False)
|
||||
name = db.Column(db.String(255), nullable=False)
|
||||
description = db.Column(db.Text, nullable=False, server_default=db.text("''::character varying"))
|
||||
workflow_id = db.Column(StringUUID, nullable=True)
|
||||
is_public = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
|
||||
is_published = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
|
||||
created_by = db.Column(StringUUID, nullable=True)
|
||||
created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
|
||||
updated_by = db.Column(StringUUID, nullable=True)
|
||||
updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
|
||||
|
||||
def retrieve_dataset(self, session: Session):
|
||||
return session.query(Dataset).where(Dataset.pipeline_id == self.id).first()
|
||||
|
||||
|
||||
class DocumentPipelineExecutionLog(Base):
|
||||
__tablename__ = "document_pipeline_execution_logs"
|
||||
__table_args__ = (
|
||||
db.PrimaryKeyConstraint("id", name="document_pipeline_execution_log_pkey"),
|
||||
db.Index("document_pipeline_execution_logs_document_id_idx", "document_id"),
|
||||
)
|
||||
|
||||
id = db.Column(StringUUID, server_default=db.text("uuidv7()"))
|
||||
pipeline_id = db.Column(StringUUID, nullable=False)
|
||||
document_id = db.Column(StringUUID, nullable=False)
|
||||
datasource_type = db.Column(db.String(255), nullable=False)
|
||||
datasource_info = db.Column(db.Text, nullable=False)
|
||||
datasource_node_id = db.Column(db.String(255), nullable=False)
|
||||
input_data = db.Column(db.JSON, nullable=False)
|
||||
created_by = db.Column(StringUUID, nullable=True)
|
||||
created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
|
||||
|
||||
|
||||
class PipelineRecommendedPlugin(Base):
|
||||
__tablename__ = "pipeline_recommended_plugins"
|
||||
__table_args__ = (db.PrimaryKeyConstraint("id", name="pipeline_recommended_plugin_pkey"),)
|
||||
|
||||
id = db.Column(StringUUID, server_default=db.text("uuidv7()"))
|
||||
plugin_id = db.Column(db.Text, nullable=False)
|
||||
provider_name = db.Column(db.Text, nullable=False)
|
||||
position = db.Column(db.Integer, nullable=False, default=0)
|
||||
active = db.Column(db.Boolean, nullable=False, default=True)
|
||||
created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
|
||||
updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
|
||||
|
||||
Reference in New Issue
Block a user