117 lines
4.9 KiB
Python
117 lines
4.9 KiB
Python
|
|
"""
|
|||
|
|
知识库 RAG 模型。
|
|||
|
|
|
|||
|
|
KnowledgeBase — 知识库容器
|
|||
|
|
Document — 上传的源文档
|
|||
|
|
DocumentChunk — 文档切片(含 embedding)
|
|||
|
|
"""
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import uuid
|
|||
|
|
from datetime import datetime
|
|||
|
|
|
|||
|
|
from sqlalchemy import Column, String, Text, Integer, DateTime, ForeignKey, Index
|
|||
|
|
from sqlalchemy.dialects.mysql import JSON as MySQLJSON
|
|||
|
|
from sqlalchemy.orm import relationship
|
|||
|
|
|
|||
|
|
from app.core.database import Base
|
|||
|
|
|
|||
|
|
|
|||
|
|
class KnowledgeBase(Base):
    """Knowledge base: the top-level container that owns documents.

    Rows are stored in the ``knowledge_bases`` table. The ``documents``
    relationship cascades ORM deletes to the child ``Document`` rows and
    sets ``passive_deletes=True``, so rows that are not loaded in the
    session are left to the foreign key's ``ON DELETE CASCADE``.
    """

    __tablename__ = "knowledge_bases"

    # Primary key: a freshly generated UUID4 in its 36-character string form.
    id = Column(
        String(36),
        primary_key=True,
        default=lambda: str(uuid.uuid4()),
    )
    name = Column(
        String(200),
        nullable=False,
        comment="知识库名称",
    )
    description = Column(
        Text,
        nullable=True,
        comment="描述",
    )
    user_id = Column(
        String(36),
        nullable=True,
        index=True,
        comment="创建者 ID",
    )
    # Chunking parameters; per the column comments both are character counts.
    chunk_size = Column(
        Integer,
        default=500,
        comment="分块大小(字符数)",
    )
    chunk_overlap = Column(
        Integer,
        default=50,
        comment="分块重叠(字符数)",
    )
    doc_count = Column(
        Integer,
        default=0,
        comment="文档数量",
    )
    # Naive UTC timestamps; ``updated_at`` is also refreshed on every UPDATE.
    created_at = Column(
        DateTime,
        default=datetime.utcnow,
        nullable=False,
    )
    updated_at = Column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
        nullable=False,
    )

    documents = relationship(
        "Document",
        back_populates="kb",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )

    def to_dict(self) -> dict:
        """Return the knowledge base as a plain dictionary.

        Scalar columns are copied as-is. ``created_at`` and ``updated_at``
        are rendered with ``isoformat()``, or ``None`` when unset.
        """
        plain_fields = (
            "id",
            "name",
            "description",
            "user_id",
            "chunk_size",
            "chunk_overlap",
            "doc_count",
        )
        payload = {field: getattr(self, field) for field in plain_fields}

        for stamp_name in ("created_at", "updated_at"):
            stamp = getattr(self, stamp_name)
            payload[stamp_name] = stamp.isoformat() if stamp else None

        return payload
|
|||
|
|
|
|||
|
|
|
|||
|
|
class Document(Base):
    """Knowledge-base document: one uploaded source file.

    Rows are stored in the ``kb_documents`` table and belong to exactly one
    ``KnowledgeBase`` through ``kb_id``. The ``chunks`` relationship cascades
    ORM deletes to the ``DocumentChunk`` rows and sets
    ``passive_deletes=True``.
    """

    __tablename__ = "kb_documents"

    # Primary key: a freshly generated UUID4 in its 36-character string form.
    id = Column(
        String(36),
        primary_key=True,
        default=lambda: str(uuid.uuid4()),
    )
    kb_id = Column(
        String(36),
        ForeignKey("knowledge_bases.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
        comment="所属知识库",
    )
    filename = Column(
        String(500),
        nullable=False,
        comment="原始文件名",
    )
    file_type = Column(
        String(20),
        nullable=False,
        comment="文件类型: txt/pdf/docx/md/csv",
    )
    file_size = Column(
        Integer,
        default=0,
        comment="文件大小(字节)",
    )
    # Processing state; the column comment lists
    # pending / processing / ready / failed as the expected values.
    status = Column(
        String(20),
        default="pending",
        comment="状态: pending/processing/ready/failed",
    )
    error_message = Column(
        Text,
        nullable=True,
        comment="处理失败信息",
    )
    chunk_count = Column(
        Integer,
        default=0,
        comment="分块数量",
    )
    created_at = Column(
        DateTime,
        default=datetime.utcnow,
        nullable=False,
    )

    kb = relationship(
        "KnowledgeBase",
        back_populates="documents",
    )
    chunks = relationship(
        "DocumentChunk",
        back_populates="document",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )

    def to_dict(self) -> dict:
        """Return the document as a plain dictionary.

        Scalar columns are copied as-is. ``created_at`` is rendered with
        ``isoformat()``, or ``None`` when unset.
        """
        plain_fields = (
            "id",
            "kb_id",
            "filename",
            "file_type",
            "file_size",
            "status",
            "error_message",
            "chunk_count",
        )
        payload = {field: getattr(self, field) for field in plain_fields}

        created = self.created_at
        payload["created_at"] = created.isoformat() if created else None
        return payload
|
|||
|
|
|
|||
|
|
|
|||
|
|
class DocumentChunk(Base):
    """Document chunk: one slice of a document's text and its embedding.

    Rows are stored in the ``kb_document_chunks`` table. Each chunk points
    at its source ``Document`` (``document_id``) and at the owning knowledge
    base (``kb_id``); both foreign keys use ``ON DELETE CASCADE``.
    """

    __tablename__ = "kb_document_chunks"

    # Primary key: a freshly generated UUID4 in its 36-character string form.
    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))

    # The two foreign keys deliberately do NOT set ``index=True``. Each one
    # already has an explicitly named index in ``__table_args__``; declaring
    # both made SQLAlchemy emit two identical single-column indexes per
    # column, which only adds write and storage overhead.
    document_id = Column(
        String(36),
        ForeignKey("kb_documents.id", ondelete="CASCADE"),
        nullable=False,
        comment="所属文档",
    )
    kb_id = Column(
        String(36),
        ForeignKey("knowledge_bases.id", ondelete="CASCADE"),
        nullable=False,
        comment="所属知识库",
    )
    chunk_index = Column(Integer, default=0, comment="块序号")
    content = Column(Text, nullable=False, comment="切片文本内容")
    # Per the column comment, the embedding vector is stored as JSON text.
    embedding = Column(Text, nullable=True, comment="JSON 序列化的 embedding 向量")
    # The attribute is ``metadata_`` while the database column stays
    # ``metadata``; ``metadata`` itself is reserved on declarative classes.
    metadata_ = Column("metadata", MySQLJSON, nullable=True, comment="元数据")
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)

    document = relationship("Document", back_populates="chunks")

    __table_args__ = (
        Index("ix_kb_chunks_kb_id", "kb_id"),
        Index("ix_kb_chunks_doc_id", "document_id"),
    )

    def to_dict(self) -> dict:
        """Return the chunk as a plain dictionary.

        ``content`` is shortened to its first 200 characters followed by
        ``"..."`` when it is longer than that; shorter content is returned
        unchanged. A chunk whose ``content`` has not been set yet yields
        ``None`` instead of raising ``TypeError``. The ``embedding`` column
        is not included. ``metadata`` falls back to an empty dict and
        ``created_at`` is rendered with ``isoformat()``, or ``None`` when
        unset.
        """
        preview_limit = 200

        preview = self.content
        # Guard against ``None``: a freshly constructed instance may not
        # have its content assigned yet.
        if preview is not None and len(preview) > preview_limit:
            preview = preview[:preview_limit] + "..."

        created = self.created_at
        return {
            "id": self.id,
            "document_id": self.document_id,
            "kb_id": self.kb_id,
            "chunk_index": self.chunk_index,
            "content": preview,
            "metadata": self.metadata_ or {},
            "created_at": created.isoformat() if created else None,
        }
|