feat: knowledge pipeline (#25360)

Signed-off-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: twwu <twwu@dify.ai>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com>
Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
Co-authored-by: lyzno1 <yuanyouhuilyz@gmail.com>
Co-authored-by: quicksand <quicksandzn@gmail.com>
Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: lyzno1 <92089059+lyzno1@users.noreply.github.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: nite-knite <nkCoding@gmail.com>
Co-authored-by: Hanqing Zhao <sherry9277@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry <xh001x@hotmail.com>
This commit is contained in:
-LAN-
2025-09-18 12:49:10 +08:00
committed by GitHub
parent 7dadb33003
commit 85cda47c70
1772 changed files with 102407 additions and 31710 deletions

View File

@@ -0,0 +1,222 @@
"""knowledge_pipeline_migrate
Revision ID: 68519ad5cd18
Revises: cf7c38a32b2d
Create Date: 2025-09-17 15:15:50.697885
"""
from alembic import op
import models as models
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = '68519ad5cd18'
down_revision = 'cf7c38a32b2d'
branch_labels = None
depends_on = None
def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.create_table('datasource_oauth_params',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuidv7()'), nullable=False),
sa.Column('plugin_id', sa.String(length=255), nullable=False),
sa.Column('provider', sa.String(length=255), nullable=False),
sa.Column('system_credentials', postgresql.JSONB(astext_type=sa.Text()), nullable=False),
sa.PrimaryKeyConstraint('id', name='datasource_oauth_config_pkey'),
sa.UniqueConstraint('plugin_id', 'provider', name='datasource_oauth_config_datasource_id_provider_idx')
)
op.create_table('datasource_oauth_tenant_params',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuidv7()'), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
sa.Column('provider', sa.String(length=255), nullable=False),
sa.Column('plugin_id', sa.String(length=255), nullable=False),
sa.Column('client_params', postgresql.JSONB(astext_type=sa.Text()), nullable=False),
sa.Column('enabled', sa.Boolean(), nullable=False),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.PrimaryKeyConstraint('id', name='datasource_oauth_tenant_config_pkey'),
sa.UniqueConstraint('tenant_id', 'plugin_id', 'provider', name='datasource_oauth_tenant_config_unique')
)
op.create_table('datasource_providers',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuidv7()'), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
sa.Column('name', sa.String(length=255), nullable=False),
sa.Column('provider', sa.String(length=255), nullable=False),
sa.Column('plugin_id', sa.String(length=255), nullable=False),
sa.Column('auth_type', sa.String(length=255), nullable=False),
sa.Column('encrypted_credentials', postgresql.JSONB(astext_type=sa.Text()), nullable=False),
sa.Column('avatar_url', sa.String(length=255), nullable=True),
sa.Column('is_default', sa.Boolean(), server_default=sa.text('false'), nullable=False),
sa.Column('expires_at', sa.Integer(), server_default='-1', nullable=False),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.PrimaryKeyConstraint('id', name='datasource_provider_pkey'),
sa.UniqueConstraint('tenant_id', 'plugin_id', 'provider', 'name', name='datasource_provider_unique_name')
)
with op.batch_alter_table('datasource_providers', schema=None) as batch_op:
batch_op.create_index('datasource_provider_auth_type_provider_idx', ['tenant_id', 'plugin_id', 'provider'], unique=False)
op.create_table('document_pipeline_execution_logs',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuidv7()'), nullable=False),
sa.Column('pipeline_id', models.types.StringUUID(), nullable=False),
sa.Column('document_id', models.types.StringUUID(), nullable=False),
sa.Column('datasource_type', sa.String(length=255), nullable=False),
sa.Column('datasource_info', sa.Text(), nullable=False),
sa.Column('datasource_node_id', sa.String(length=255), nullable=False),
sa.Column('input_data', sa.JSON(), nullable=False),
sa.Column('created_by', models.types.StringUUID(), nullable=True),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.PrimaryKeyConstraint('id', name='document_pipeline_execution_log_pkey')
)
with op.batch_alter_table('document_pipeline_execution_logs', schema=None) as batch_op:
batch_op.create_index('document_pipeline_execution_logs_document_id_idx', ['document_id'], unique=False)
op.create_table('pipeline_built_in_templates',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuidv7()'), nullable=False),
sa.Column('name', sa.String(length=255), nullable=False),
sa.Column('description', sa.Text(), nullable=False),
sa.Column('chunk_structure', sa.String(length=255), nullable=False),
sa.Column('icon', sa.JSON(), nullable=False),
sa.Column('yaml_content', sa.Text(), nullable=False),
sa.Column('copyright', sa.String(length=255), nullable=False),
sa.Column('privacy_policy', sa.String(length=255), nullable=False),
sa.Column('position', sa.Integer(), nullable=False),
sa.Column('install_count', sa.Integer(), nullable=False),
sa.Column('language', sa.String(length=255), nullable=False),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.Column('created_by', models.types.StringUUID(), nullable=False),
sa.Column('updated_by', models.types.StringUUID(), nullable=True),
sa.PrimaryKeyConstraint('id', name='pipeline_built_in_template_pkey')
)
op.create_table('pipeline_customized_templates',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuidv7()'), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
sa.Column('name', sa.String(length=255), nullable=False),
sa.Column('description', sa.Text(), nullable=False),
sa.Column('chunk_structure', sa.String(length=255), nullable=False),
sa.Column('icon', sa.JSON(), nullable=False),
sa.Column('position', sa.Integer(), nullable=False),
sa.Column('yaml_content', sa.Text(), nullable=False),
sa.Column('install_count', sa.Integer(), nullable=False),
sa.Column('language', sa.String(length=255), nullable=False),
sa.Column('created_by', models.types.StringUUID(), nullable=False),
sa.Column('updated_by', models.types.StringUUID(), nullable=True),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.PrimaryKeyConstraint('id', name='pipeline_customized_template_pkey')
)
with op.batch_alter_table('pipeline_customized_templates', schema=None) as batch_op:
batch_op.create_index('pipeline_customized_template_tenant_idx', ['tenant_id'], unique=False)
op.create_table('pipeline_recommended_plugins',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuidv7()'), nullable=False),
sa.Column('plugin_id', sa.Text(), nullable=False),
sa.Column('provider_name', sa.Text(), nullable=False),
sa.Column('position', sa.Integer(), nullable=False),
sa.Column('active', sa.Boolean(), nullable=False),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.PrimaryKeyConstraint('id', name='pipeline_recommended_plugin_pkey')
)
op.create_table('pipelines',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuidv7()'), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
sa.Column('name', sa.String(length=255), nullable=False),
sa.Column('description', sa.Text(), server_default=sa.text("''::character varying"), nullable=False),
sa.Column('workflow_id', models.types.StringUUID(), nullable=True),
sa.Column('is_public', sa.Boolean(), server_default=sa.text('false'), nullable=False),
sa.Column('is_published', sa.Boolean(), server_default=sa.text('false'), nullable=False),
sa.Column('created_by', models.types.StringUUID(), nullable=True),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.Column('updated_by', models.types.StringUUID(), nullable=True),
sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.PrimaryKeyConstraint('id', name='pipeline_pkey')
)
op.create_table('workflow_draft_variable_files',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuidv7()'), nullable=False),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False, comment='The tenant to which the WorkflowDraftVariableFile belongs, referencing Tenant.id'),
sa.Column('app_id', models.types.StringUUID(), nullable=False, comment='The application to which the WorkflowDraftVariableFile belongs, referencing App.id'),
sa.Column('user_id', models.types.StringUUID(), nullable=False, comment='The owner to of the WorkflowDraftVariableFile, referencing Account.id'),
sa.Column('upload_file_id', models.types.StringUUID(), nullable=False, comment='Reference to UploadFile containing the large variable data'),
sa.Column('size', sa.BigInteger(), nullable=False, comment='Size of the original variable content in bytes'),
sa.Column('length', sa.Integer(), nullable=True, comment='Length of the original variable content. For array and array-like types, this represents the number of elements. For object types, it indicates the number of keys. For other types, the value is NULL.'),
sa.Column('value_type', sa.String(20), nullable=False),
sa.PrimaryKeyConstraint('id', name=op.f('workflow_draft_variable_files_pkey'))
)
op.create_table('workflow_node_execution_offload',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuidv7()'), nullable=False),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
sa.Column('app_id', models.types.StringUUID(), nullable=False),
sa.Column('node_execution_id', models.types.StringUUID(), nullable=True),
sa.Column('type', sa.String(20), nullable=False),
sa.Column('file_id', models.types.StringUUID(), nullable=False),
sa.PrimaryKeyConstraint('id', name=op.f('workflow_node_execution_offload_pkey')),
sa.UniqueConstraint('node_execution_id', 'type', name=op.f('workflow_node_execution_offload_node_execution_id_key'))
)
with op.batch_alter_table('datasets', schema=None) as batch_op:
batch_op.add_column(sa.Column('keyword_number', sa.Integer(), server_default=sa.text('10'), nullable=True))
batch_op.add_column(sa.Column('icon_info', postgresql.JSONB(astext_type=sa.Text()), nullable=True))
batch_op.add_column(sa.Column('runtime_mode', sa.String(length=255), server_default=sa.text("'general'::character varying"), nullable=True))
batch_op.add_column(sa.Column('pipeline_id', models.types.StringUUID(), nullable=True))
batch_op.add_column(sa.Column('chunk_structure', sa.String(length=255), nullable=True))
batch_op.add_column(sa.Column('enable_api', sa.Boolean(), server_default=sa.text('true'), nullable=False))
with op.batch_alter_table('workflow_draft_variables', schema=None) as batch_op:
batch_op.add_column(sa.Column('file_id', models.types.StringUUID(), nullable=True, comment='Reference to WorkflowDraftVariableFile if variable is offloaded to external storage'))
batch_op.add_column(
sa.Column(
'is_default_value', sa.Boolean(), nullable=False,
server_default=sa.text(text="FALSE"),
comment='Indicates whether the current value is the default for a conversation variable. Always `FALSE` for other types of variables.',)
)
batch_op.create_index('workflow_draft_variable_file_id_idx', ['file_id'], unique=False)
with op.batch_alter_table('workflows', schema=None) as batch_op:
batch_op.add_column(sa.Column('rag_pipeline_variables', sa.Text(), server_default='{}', nullable=False))
# ### end Alembic commands ###
def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('workflows', schema=None) as batch_op:
batch_op.drop_column('rag_pipeline_variables')
with op.batch_alter_table('workflow_draft_variables', schema=None) as batch_op:
batch_op.drop_index('workflow_draft_variable_file_id_idx')
batch_op.drop_column('is_default_value')
batch_op.drop_column('file_id')
with op.batch_alter_table('datasets', schema=None) as batch_op:
batch_op.drop_column('enable_api')
batch_op.drop_column('chunk_structure')
batch_op.drop_column('pipeline_id')
batch_op.drop_column('runtime_mode')
batch_op.drop_column('icon_info')
batch_op.drop_column('keyword_number')
op.drop_table('workflow_node_execution_offload')
op.drop_table('workflow_draft_variable_files')
op.drop_table('pipelines')
op.drop_table('pipeline_recommended_plugins')
with op.batch_alter_table('pipeline_customized_templates', schema=None) as batch_op:
batch_op.drop_index('pipeline_customized_template_tenant_idx')
op.drop_table('pipeline_customized_templates')
op.drop_table('pipeline_built_in_templates')
with op.batch_alter_table('document_pipeline_execution_logs', schema=None) as batch_op:
batch_op.drop_index('document_pipeline_execution_logs_document_id_idx')
op.drop_table('document_pipeline_execution_logs')
with op.batch_alter_table('datasource_providers', schema=None) as batch_op:
batch_op.drop_index('datasource_provider_auth_type_provider_idx')
op.drop_table('datasource_providers')
op.drop_table('datasource_oauth_tenant_params')
op.drop_table('datasource_oauth_params')
# ### end Alembic commands ###