- 新增 embedding_service(语义检索)、knowledge_service(RAG)、text_chunker、document_parser - 新增 tool_registry(自定义工具注册表)并完善工具市场 API(CRUD + code/http 执行) - 新增 agent_vector_memory / knowledge_base 模型及对应数据库表 - 实现 SSE 流式响应与 Agent 预算控制 - AgentChat.vue 集成 MainLayout 导航布局 - 完善测试体系:7 个新测试文件共 110 个测试覆盖 - 修复 conftest.py SQLite 内存数据库连接隔离问题 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
117 lines
4.9 KiB
Python
117 lines
4.9 KiB
Python
"""
|
||
Knowledge-base RAG models.

KnowledgeBase — knowledge-base container
Document — uploaded source document
DocumentChunk — document slice (with embedding)
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import uuid
|
||
from datetime import datetime
|
||
|
||
from sqlalchemy import Column, String, Text, Integer, DateTime, ForeignKey, Index
|
||
from sqlalchemy.dialects.mysql import JSON as MySQLJSON
|
||
from sqlalchemy.orm import relationship
|
||
|
||
from app.core.database import Base
|
||
|
||
|
||
class KnowledgeBase(Base):
    """Knowledge base: the container that uploaded documents belong to.

    Documents are attached through the ``documents`` relationship with an
    ``all, delete-orphan`` cascade. ``passive_deletes=True`` lets the
    database-level ``ON DELETE CASCADE`` on the child foreign key remove
    children that are not loaded in the session.
    """

    __tablename__ = "knowledge_bases"

    # Primary key: a UUID4 rendered as a 36-character string.
    id = Column(
        String(36),
        primary_key=True,
        default=lambda: str(uuid.uuid4()),
    )
    # Display name of the knowledge base (required).
    name = Column(
        String(200),
        nullable=False,
        comment="知识库名称",
    )
    # Optional free-form description.
    description = Column(
        Text,
        nullable=True,
        comment="描述",
    )
    # ID of the creating user; indexed, may be NULL.
    user_id = Column(
        String(36),
        nullable=True,
        index=True,
        comment="创建者 ID",
    )
    # Chunk size in characters used when splitting documents.
    chunk_size = Column(
        Integer,
        default=500,
        comment="分块大小(字符数)",
    )
    # Overlap in characters between neighbouring chunks.
    chunk_overlap = Column(
        Integer,
        default=50,
        comment="分块重叠(字符数)",
    )
    # Number of documents in this knowledge base.
    doc_count = Column(
        Integer,
        default=0,
        comment="文档数量",
    )
    # Timestamps come from datetime.utcnow, i.e. naive UTC values.
    created_at = Column(
        DateTime,
        default=datetime.utcnow,
        nullable=False,
    )
    updated_at = Column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
        nullable=False,
    )

    documents = relationship(
        "Document",
        back_populates="kb",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )

    def to_dict(self) -> dict:
        """Return the knowledge base as a plain dict.

        Scalar columns are copied as-is; ``created_at`` and ``updated_at``
        are rendered with ``isoformat()`` or ``None`` when unset.
        """
        plain_fields = (
            "id",
            "name",
            "description",
            "user_id",
            "chunk_size",
            "chunk_overlap",
            "doc_count",
        )
        payload = {field: getattr(self, field) for field in plain_fields}

        created, updated = self.created_at, self.updated_at
        payload["created_at"] = created.isoformat() if created else None
        payload["updated_at"] = updated.isoformat() if updated else None
        return payload
|
||
|
||
|
||
class Document(Base):
    """A source file uploaded into a knowledge base.

    Each row belongs to one ``KnowledgeBase`` (``kb``) and owns the
    ``DocumentChunk`` rows produced from it (``chunks``).
    """

    __tablename__ = "kb_documents"

    # Primary key: a UUID4 rendered as a 36-character string.
    id = Column(
        String(36),
        primary_key=True,
        default=lambda: str(uuid.uuid4()),
    )
    # Owning knowledge base; the foreign key declares ON DELETE CASCADE.
    kb_id = Column(
        String(36),
        ForeignKey("knowledge_bases.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
        comment="所属知识库",
    )
    # Original file name as uploaded.
    filename = Column(
        String(500),
        nullable=False,
        comment="原始文件名",
    )
    # File type, e.g. txt/pdf/docx/md/csv.
    file_type = Column(
        String(20),
        nullable=False,
        comment="文件类型: txt/pdf/docx/md/csv",
    )
    # File size in bytes.
    file_size = Column(
        Integer,
        default=0,
        comment="文件大小(字节)",
    )
    # Processing status: pending/processing/ready/failed.
    status = Column(
        String(20),
        default="pending",
        comment="状态: pending/processing/ready/failed",
    )
    # Failure details when processing did not succeed.
    error_message = Column(
        Text,
        nullable=True,
        comment="处理失败信息",
    )
    # Number of chunks produced from this document.
    chunk_count = Column(
        Integer,
        default=0,
        comment="分块数量",
    )
    # Creation timestamp from datetime.utcnow, i.e. a naive UTC value.
    created_at = Column(
        DateTime,
        default=datetime.utcnow,
        nullable=False,
    )

    kb = relationship("KnowledgeBase", back_populates="documents")
    chunks = relationship(
        "DocumentChunk",
        back_populates="document",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )

    def to_dict(self) -> dict:
        """Return the document as a plain dict.

        ``created_at`` is rendered with ``isoformat()``, or ``None`` when
        it is unset; every other column is copied unchanged.
        """
        created = self.created_at
        return dict(
            id=self.id,
            kb_id=self.kb_id,
            filename=self.filename,
            file_type=self.file_type,
            file_size=self.file_size,
            status=self.status,
            error_message=self.error_message,
            chunk_count=self.chunk_count,
            created_at=created.isoformat() if created else None,
        )
|
||
|
||
|
||
class DocumentChunk(Base):
    """A slice of a document's text, stored together with its embedding.

    Each chunk references both its source ``Document`` and the owning
    knowledge base; both foreign keys declare ``ON DELETE CASCADE``.
    """

    __tablename__ = "kb_document_chunks"

    # Primary key: a UUID4 rendered as a 36-character string.
    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    # Source document. The index for this column is the named one declared in
    # __table_args__; also setting index=True here would make SQLAlchemy emit
    # a second, identical index on the same column.
    document_id = Column(String(36), ForeignKey("kb_documents.id", ondelete="CASCADE"),
                         nullable=False, comment="所属文档")
    # Owning knowledge base. Indexed through __table_args__ (see above).
    kb_id = Column(String(36), ForeignKey("knowledge_bases.id", ondelete="CASCADE"),
                   nullable=False, comment="所属知识库")
    # Position of this chunk within its document.
    chunk_index = Column(Integer, default=0, comment="块序号")
    # Text content of the chunk.
    content = Column(Text, nullable=False, comment="切片文本内容")
    # Embedding vector serialized as JSON text.
    embedding = Column(Text, nullable=True, comment="JSON 序列化的 embedding 向量")
    # The attribute is named ``metadata_`` because ``metadata`` is reserved on
    # declarative classes; the database column itself is called "metadata".
    metadata_ = Column("metadata", MySQLJSON, nullable=True, comment="元数据")
    # Creation timestamp from datetime.utcnow, i.e. a naive UTC value.
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)

    document = relationship("Document", back_populates="chunks")

    # Single source of truth for the lookup indexes on this table.
    __table_args__ = (
        Index("ix_kb_chunks_kb_id", "kb_id"),
        Index("ix_kb_chunks_doc_id", "document_id"),
    )

    def to_dict(self) -> dict:
        """Return the chunk as a plain dict with a shortened content preview.

        ``content`` longer than 200 characters is cut to its first 200
        characters followed by ``"..."``; shorter content is returned whole.
        A ``None`` content (e.g. on an instance that has not been populated
        yet) is passed through as ``None`` instead of raising ``TypeError``.
        The embedding is not included in the result.
        """
        preview_limit = 200
        preview = self.content
        if preview is not None and len(preview) > preview_limit:
            preview = preview[:preview_limit] + "..."

        return {
            "id": self.id,
            "document_id": self.document_id,
            "kb_id": self.kb_id,
            "chunk_index": self.chunk_index,
            "content": preview,
            "metadata": self.metadata_ or {},
            "created_at": self.created_at.isoformat() if self.created_at else None,
        }
|