# aiagent/backend/app/models/knowledge_base.py

"""
知识库 RAG 模型。

KnowledgeBase — 知识库容器
Document     — 上传的源文档
DocumentChunk — 文档切片（含 embedding）
"""
from __future__ import annotations

import uuid
from datetime import datetime

from sqlalchemy import Column, String, Text, Integer, DateTime, ForeignKey, Index
from sqlalchemy.dialects.mysql import JSON as MySQLJSON
from sqlalchemy.orm import relationship

from app.core.database import Base


class KnowledgeBase(Base):
    """Knowledge base: the top-level container that owns uploaded documents."""

    __tablename__ = "knowledge_bases"

    # Primary key: a freshly generated UUID4 rendered as a 36-character string.
    id = Column(
        String(36),
        primary_key=True,
        default=lambda: str(uuid.uuid4()),
    )
    name = Column(String(200), nullable=False, comment="知识库名称")
    description = Column(Text, nullable=True, comment="描述")
    # Creator id; nullable and indexed, but not declared as a foreign key here.
    user_id = Column(String(36), nullable=True, index=True, comment="创建者 ID")
    # Chunking parameters, expressed in characters (per the column comments).
    chunk_size = Column(Integer, default=500, comment="分块大小（字符数）")
    chunk_overlap = Column(Integer, default=50, comment="分块重叠（字符数）")
    # Stored document counter; nothing in this class recomputes it.
    doc_count = Column(Integer, default=0, comment="文档数量")
    # Naive UTC timestamps; updated_at is refreshed on every UPDATE.
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
        nullable=False,
    )

    # Child documents. passive_deletes=True leaves removal of child rows to
    # the database-level ON DELETE CASCADE declared on Document.kb_id.
    documents = relationship(
        "Document",
        back_populates="kb",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )

    def to_dict(self) -> dict:
        """Return the column values as a plain dict.

        Datetime columns are rendered with ``isoformat()``; an unset
        timestamp is reported as ``None``.
        """
        plain_fields = (
            "id",
            "name",
            "description",
            "user_id",
            "chunk_size",
            "chunk_overlap",
            "doc_count",
        )
        payload = {field: getattr(self, field) for field in plain_fields}
        for stamp in ("created_at", "updated_at"):
            moment = getattr(self, stamp)
            payload[stamp] = moment.isoformat() if moment else None
        return payload


class Document(Base):
    """A source document uploaded into a knowledge base."""

    __tablename__ = "kb_documents"

    # Primary key: a freshly generated UUID4 rendered as a 36-character string.
    id = Column(
        String(36),
        primary_key=True,
        default=lambda: str(uuid.uuid4()),
    )
    # Owning knowledge base; the row is removed by the database when the
    # parent knowledge base is deleted (ON DELETE CASCADE).
    kb_id = Column(
        String(36),
        ForeignKey("knowledge_bases.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
        comment="所属知识库",
    )
    filename = Column(String(500), nullable=False, comment="原始文件名")
    file_type = Column(
        String(20),
        nullable=False,
        comment="文件类型: txt/pdf/docx/md/csv",
    )
    file_size = Column(Integer, default=0, comment="文件大小（字节）")
    # Processing state; the column comment lists pending/processing/ready/failed.
    status = Column(
        String(20),
        default="pending",
        comment="状态: pending/processing/ready/failed",
    )
    error_message = Column(Text, nullable=True, comment="处理失败信息")
    # Stored chunk counter; nothing in this class recomputes it.
    chunk_count = Column(Integer, default=0, comment="分块数量")
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)

    # Parent knowledge base (mirror of KnowledgeBase.documents).
    kb = relationship("KnowledgeBase", back_populates="documents")
    # Chunks cut from this document. passive_deletes=True leaves removal of
    # child rows to the ON DELETE CASCADE on DocumentChunk.document_id.
    chunks = relationship(
        "DocumentChunk",
        back_populates="document",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )

    def to_dict(self) -> dict:
        """Return the column values as a plain dict.

        ``created_at`` is rendered with ``isoformat()``, or ``None`` when it
        has not been set yet.
        """
        plain_fields = (
            "id",
            "kb_id",
            "filename",
            "file_type",
            "file_size",
            "status",
            "error_message",
            "chunk_count",
        )
        payload = {field: getattr(self, field) for field in plain_fields}
        created = self.created_at
        payload["created_at"] = created.isoformat() if created else None
        return payload


class DocumentChunk(Base):
    """A slice of a document's text, optionally carrying its embedding."""

    __tablename__ = "kb_document_chunks"

    # Primary key: a freshly generated UUID4 rendered as a 36-character string.
    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    # NOTE: document_id and kb_id deliberately do NOT set index=True. Each is
    # indexed exactly once through the named Index entries in __table_args__;
    # setting both produced two identical indexes per column.
    # NOTE(review): databases created before this change may still carry the
    # auto-named duplicates and need a migration to drop them.
    document_id = Column(String(36), ForeignKey("kb_documents.id", ondelete="CASCADE"),
                         nullable=False, comment="所属文档")
    kb_id = Column(String(36), ForeignKey("knowledge_bases.id", ondelete="CASCADE"),
                   nullable=False, comment="所属知识库")
    chunk_index = Column(Integer, default=0, comment="块序号")
    content = Column(Text, nullable=False, comment="切片文本内容")
    # Embedding stored as text; the column comment says it is JSON-serialized.
    embedding = Column(Text, nullable=True, comment="JSON 序列化的 embedding 向量")
    # The attribute is metadata_ because `metadata` is reserved on SQLAlchemy
    # declarative classes; the underlying column is still named "metadata".
    metadata_ = Column("metadata", MySQLJSON, nullable=True, comment="元数据")
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)

    # Parent document (mirror of Document.chunks).
    document = relationship("Document", back_populates="chunks")

    __table_args__ = (
        Index("ix_kb_chunks_kb_id", "kb_id"),
        Index("ix_kb_chunks_doc_id", "document_id"),
    )

    def to_dict(self) -> dict:
        """Return a summary dict of the chunk.

        ``content`` is truncated to its first 200 characters followed by
        ``"..."`` when longer than that; a ``None`` content (e.g. on an
        instance that has not been populated yet) is passed through as
        ``None`` instead of raising. The embedding is not included.
        ``metadata`` falls back to an empty dict, and ``created_at`` is
        rendered with ``isoformat()`` or ``None``.
        """
        preview = self.content
        # Guard against None before len(): the column is NOT NULL in the
        # database, but a transient instance can still have no content.
        if preview is not None and len(preview) > 200:
            preview = preview[:200] + "..."
        return {
            "id": self.id,
            "document_id": self.document_id,
            "kb_id": self.kb_id,
            "chunk_index": self.chunk_index,
            "content": preview,
            "metadata": self.metadata_ or {},
            "created_at": self.created_at.isoformat() if self.created_at else None,
        }