Files
aiagent/backend/app/models/knowledge_base.py

117 lines
4.9 KiB
Python
Raw Normal View History

"""
知识库 RAG 模型
KnowledgeBase 知识库容器
Document 上传的源文档
DocumentChunk 文档切片 embedding
"""
from __future__ import annotations
import uuid
from datetime import datetime
from sqlalchemy import Column, String, Text, Integer, DateTime, ForeignKey, Index
from sqlalchemy.dialects.mysql import JSON as MySQLJSON
from sqlalchemy.orm import relationship
from app.core.database import Base
class KnowledgeBase(Base):
    """Knowledge base: the container that owns a set of uploaded documents.

    Besides the descriptive fields it stores the chunking settings
    (``chunk_size`` / ``chunk_overlap``, both in characters) and a
    ``doc_count`` counter column. Timestamps are naive UTC values produced
    by ``datetime.utcnow``.
    """

    __tablename__ = "knowledge_bases"

    # Primary key: a UUID4 rendered as a 36-character string.
    id = Column(
        String(36),
        primary_key=True,
        default=lambda: str(uuid.uuid4()),
    )
    name = Column(String(200), nullable=False, comment="知识库名称")
    description = Column(Text, nullable=True, comment="描述")
    user_id = Column(
        String(36),
        nullable=True,
        index=True,
        comment="创建者 ID",
    )
    chunk_size = Column(Integer, default=500, comment="分块大小(字符数)")
    chunk_overlap = Column(Integer, default=50, comment="分块重叠(字符数)")
    doc_count = Column(Integer, default=0, comment="文档数量")
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
        nullable=False,
    )

    # Child documents are deleted together with the knowledge base.
    # passive_deletes=True lets the database's ON DELETE CASCADE on
    # kb_documents.kb_id remove rows that are not loaded in the session.
    documents = relationship(
        "Document",
        back_populates="kb",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )

    def to_dict(self) -> dict:
        """Return a JSON-friendly dict of this knowledge base.

        Timestamps are rendered with ``isoformat()``; a missing timestamp
        becomes ``None``.
        """
        plain_fields = (
            "id",
            "name",
            "description",
            "user_id",
            "chunk_size",
            "chunk_overlap",
            "doc_count",
        )
        payload = {field: getattr(self, field) for field in plain_fields}
        for stamp_field in ("created_at", "updated_at"):
            moment = getattr(self, stamp_field)
            payload[stamp_field] = moment.isoformat() if moment else None
        return payload
class Document(Base):
    """Source document uploaded into a knowledge base.

    Tracks the original file name/type/size, a processing ``status``
    (``pending`` by default) with an optional ``error_message``, and the
    number of chunks recorded for the document.
    """

    __tablename__ = "kb_documents"

    # Primary key: a UUID4 rendered as a 36-character string.
    id = Column(
        String(36),
        primary_key=True,
        default=lambda: str(uuid.uuid4()),
    )
    # Owning knowledge base; the row is removed by the database when the
    # parent knowledge base row is deleted (ON DELETE CASCADE).
    kb_id = Column(
        String(36),
        ForeignKey("knowledge_bases.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
        comment="所属知识库",
    )
    filename = Column(String(500), nullable=False, comment="原始文件名")
    file_type = Column(
        String(20),
        nullable=False,
        comment="文件类型: txt/pdf/docx/md/csv",
    )
    file_size = Column(Integer, default=0, comment="文件大小(字节)")
    status = Column(
        String(20),
        default="pending",
        comment="状态: pending/processing/ready/failed",
    )
    error_message = Column(Text, nullable=True, comment="处理失败信息")
    chunk_count = Column(Integer, default=0, comment="分块数量")
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)

    kb = relationship("KnowledgeBase", back_populates="documents")
    # Chunks are deleted together with the document; passive_deletes=True
    # leaves unloaded rows to the database's ON DELETE CASCADE.
    chunks = relationship(
        "DocumentChunk",
        back_populates="document",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )

    def to_dict(self) -> dict:
        """Return a JSON-friendly dict of this document.

        ``created_at`` is rendered with ``isoformat()``, or ``None`` when
        it is not set.
        """
        created = self.created_at
        return dict(
            id=self.id,
            kb_id=self.kb_id,
            filename=self.filename,
            file_type=self.file_type,
            file_size=self.file_size,
            status=self.status,
            error_message=self.error_message,
            chunk_count=self.chunk_count,
            created_at=created.isoformat() if created else None,
        )
class DocumentChunk(Base):
    """Document chunk: one slice of a document's text plus its embedding.

    Each chunk references both its source document and the owning
    knowledge base. The embedding vector is stored as a JSON-serialized
    string in a ``Text`` column.
    """

    __tablename__ = "kb_document_chunks"

    # Primary key: a UUID4 rendered as a 36-character string.
    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    # NOTE: no ``index=True`` on the two foreign keys below. They are indexed
    # by the explicitly named indexes in ``__table_args__``; declaring both
    # would create two identical single-column indexes per column.
    document_id = Column(
        String(36),
        ForeignKey("kb_documents.id", ondelete="CASCADE"),
        nullable=False,
        comment="所属文档",
    )
    kb_id = Column(
        String(36),
        ForeignKey("knowledge_bases.id", ondelete="CASCADE"),
        nullable=False,
        comment="所属知识库",
    )
    chunk_index = Column(Integer, default=0, comment="块序号")
    content = Column(Text, nullable=False, comment="切片文本内容")
    embedding = Column(Text, nullable=True, comment="JSON 序列化的 embedding 向量")
    # The Python attribute is ``metadata_`` because ``metadata`` is reserved
    # on SQLAlchemy declarative classes; the database column is "metadata".
    metadata_ = Column("metadata", MySQLJSON, nullable=True, comment="元数据")
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)

    document = relationship("Document", back_populates="chunks")

    __table_args__ = (
        Index("ix_kb_chunks_kb_id", "kb_id"),
        Index("ix_kb_chunks_doc_id", "document_id"),
    )

    def to_dict(self) -> dict:
        """Return a JSON-friendly preview of this chunk.

        ``content`` is truncated to its first 200 characters followed by
        ``"..."`` when it is longer than that. The ``embedding`` column is
        not included. Missing metadata is reported as an empty dict, and a
        missing ``created_at`` as ``None``.
        """
        preview_limit = 200
        preview = self.content
        # Guard against an instance whose content has not been set yet;
        # len(None) would raise TypeError.
        if preview is not None and len(preview) > preview_limit:
            preview = preview[:preview_limit] + "..."
        return {
            "id": self.id,
            "document_id": self.document_id,
            "kb_id": self.kb_id,
            "chunk_index": self.chunk_index,
            "content": preview,
            "metadata": self.metadata_ or {},
            "created_at": self.created_at.isoformat() if self.created_at else None,
        }