feat: knowledge pipeline (#25360)

Signed-off-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: twwu <twwu@dify.ai>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com>
Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
Co-authored-by: lyzno1 <yuanyouhuilyz@gmail.com>
Co-authored-by: quicksand <quicksandzn@gmail.com>
Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: lyzno1 <92089059+lyzno1@users.noreply.github.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: nite-knite <nkCoding@gmail.com>
Co-authored-by: Hanqing Zhao <sherry9277@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry <xh001x@hotmail.com>
This commit is contained in:
-LAN-
2025-09-18 12:49:10 +08:00
committed by GitHub
parent 7dadb33003
commit 85cda47c70
1772 changed files with 102407 additions and 31710 deletions

View File

@@ -1,9 +1,12 @@
import type { DataSourceNotionPage, DataSourceProvider } from './common'
import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
import type { AppIconType, AppMode, RetrievalConfig, TransferMethod } from '@/types/app'
import type { Tag } from '@/app/components/base/tag-management/constant'
import type { IndexingType } from '@/app/components/datasets/create/step-two'
import type { MetadataFilteringVariableType } from '@/app/components/workflow/nodes/knowledge-retrieval/types'
import type { MetadataItemWithValue } from '@/app/components/datasets/metadata/types'
import { ExternalKnowledgeBase, General, ParentChild, Qa } from '@/app/components/base/icons/src/public/knowledge/dataset-card'
import { GeneralChunk, ParentChildChunk, QuestionAndAnswer } from '@/app/components/base/icons/src/vender/knowledge'
import type { DatasourceType } from './pipeline'
export enum DataSourceType {
FILE = 'upload_file',
@@ -21,6 +24,7 @@ export enum ChunkingMode {
text = 'text_model', // General text
qa = 'qa_model', // General QA
parentChild = 'hierarchical_model', // Parent-Child
// graph = 'graph', // todo: Graph RAG
}
export type MetadataInDoc = {
@@ -30,11 +34,18 @@ export type MetadataInDoc = {
name: string
}
export type IconInfo = {
icon: string
icon_background?: string
icon_type: AppIconType
icon_url?: string
}
export type DataSet = {
id: string
name: string
icon: string
icon_background: string
indexing_status: DocumentIndexingStatus
icon_info: IconInfo
description: string
permission: DatasetPermission
data_source_type: DataSourceType
@@ -45,6 +56,8 @@ export type DataSet = {
app_count: number
doc_form: ChunkingMode
document_count: number
total_document_count: number
total_available_documents?: number
word_count: number
provider: string
embedding_model: string
@@ -67,6 +80,11 @@ export type DataSet = {
}
built_in_field_enabled: boolean
doc_metadata?: MetadataInDoc[]
keyword_number?: number
pipeline_id?: string
is_published?: boolean // Indicates if the pipeline is published
runtime_mode: 'rag_pipeline' | 'general'
enable_api: boolean
}
export type ExternalAPIItem = {
@@ -136,11 +154,22 @@ export type CrawlOptions = {
export type CrawlResultItem = {
title: string
markdown: string
content: string
description: string
source_url: string
}
export type CrawlResult = {
data: CrawlResultItem[]
time_consuming: number | string
}
export enum CrawlStep {
init = 'init',
running = 'running',
finished = 'finished',
}
export type FileItem = {
fileID: string
file: CustomFile
@@ -159,6 +188,14 @@ export type FetchDatasetsParams = {
}
}
export type DatasetListRequest = {
initialPage: number
tag_ids?: string[]
limit: number
include_all?: boolean
keyword?: string
}
export type DataSetListResponse = {
data: DataSet[]
has_more: boolean
@@ -272,7 +309,7 @@ export const DisplayStatusList = [
export type DocumentDisplayStatus = typeof DisplayStatusList[number]
export type DataSourceInfo = {
export type LegacyDataSourceInfo = {
upload_file: {
id: string
name: string
@@ -288,18 +325,60 @@ export type DataSourceInfo = {
provider?: DataSourceProvider
job_id: string
url: string
credential_id?: string
}
export type LocalFileInfo = {
extension: string
mime_type: string
name: string
related_id: string
size: number
transfer_method: TransferMethod
url: string
}
export type WebsiteCrawlInfo = {
content: string
credential_id: string
description: string
source_url: string
title: string
}
export type OnlineDocumentInfo = {
credential_id: string
workspace_id: string
page: {
last_edited_time: string
page_icon: DataSourceNotionPage['page_icon']
page_id: string
page_name: string
parent_id: string
type: string
},
}
export type OnlineDriveInfo = {
bucket: string
credential_id: string
id: string
name: string
type: 'file' | 'folder'
}
export type DataSourceInfo = LegacyDataSourceInfo | LocalFileInfo | OnlineDocumentInfo | WebsiteCrawlInfo
export type InitialDocumentDetail = {
id: string
batch: string
position: number
dataset_id: string
data_source_type: DataSourceType
data_source_type: DataSourceType | DatasourceType
data_source_info: DataSourceInfo
dataset_process_rule_id: string
name: string
created_from: 'api' | 'web'
created_from: 'rag-pipeline' | 'api' | 'web'
created_by: string
created_at: number
indexing_status: DocumentIndexingStatus
@@ -313,7 +392,6 @@ export type InitialDocumentDetail = {
export type SimpleDocumentDetail = InitialDocumentDetail & {
enabled: boolean
word_count: number
is_qa: boolean // TODO waiting for backend to add this field
error?: string | null
archived: boolean
updated_at: number
@@ -338,7 +416,7 @@ export type DocumentListResponse = {
export type DocumentReq = {
original_document_id?: string
indexing_technique?: string
indexing_technique?: IndexingType
doc_form: ChunkingMode
doc_language: string
process_rule: ProcessRule
@@ -374,6 +452,7 @@ export type DataSource = {
export type NotionInfo = {
workspace_id: string
pages: DataSourceNotionPage[]
credential_id: string
}
export type NotionPage = {
page_id: string
@@ -688,3 +767,47 @@ export type BatchImportResponse = {
job_id: string
job_status: string
}
export const DOC_FORM_ICON_WITH_BG: Record<ChunkingMode | 'external', React.ComponentType<{ className: string }>> = {
[ChunkingMode.text]: General,
[ChunkingMode.qa]: Qa,
[ChunkingMode.parentChild]: ParentChild,
// [ChunkingMode.graph]: Graph, // todo: Graph RAG
external: ExternalKnowledgeBase,
}
export const DOC_FORM_ICON: Record<ChunkingMode.text | ChunkingMode.qa | ChunkingMode.parentChild, React.ComponentType<{ className: string }>> = {
[ChunkingMode.text]: GeneralChunk,
[ChunkingMode.qa]: QuestionAndAnswer,
[ChunkingMode.parentChild]: ParentChildChunk,
}
export const DOC_FORM_TEXT: Record<ChunkingMode, string> = {
[ChunkingMode.text]: 'general',
[ChunkingMode.qa]: 'qa',
[ChunkingMode.parentChild]: 'parentChild',
// [ChunkingMode.graph]: 'graph', // todo: Graph RAG
}
export type CreateDatasetReq = {
yaml_content?: string
}
export type CreateDatasetResponse = {
id: string
name: string
description: string
permission: DatasetPermission
indexing_technique: IndexingType
created_by: string
created_at: number
updated_by: string
updated_at: number
pipeline_id: string
dataset_id: string
}
export type IndexingStatusBatchRequest = {
datasetId: string
batchId: string
}