Files
aiagent/backend/app/services/text_chunker.py
renjianbo 7b9e0826de feat: 向量记忆 RAG、工具市场、SSE 流式响应、前端集成与测试覆盖
- 新增 embedding_service(语义检索)、knowledge_service(RAG)、text_chunker、document_parser
- 新增 tool_registry(自定义工具注册表)并完善工具市场 API(CRUD + code/http 执行)
- 新增 agent_vector_memory / knowledge_base 模型及对应数据库表
- 实现 SSE 流式响应与 Agent 预算控制
- AgentChat.vue 集成 MainLayout 导航布局
- 完善测试体系:7 个新测试文件共 110 个测试覆盖
- 修复 conftest.py SQLite 内存数据库连接隔离问题

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-01 22:30:46 +08:00

133 lines
3.7 KiB
Python

"""
文本分块器 — 将长文本切分为适合 Embedding + 检索的块。
策略:
1. 按段落分割(连续换行)
2. 超长段落按句子切分(句号、问号、感叹号、换行)
3. 短段落合并,直到接近 chunk_size
4. chunk 之间保留 overlap 字符的重叠
"""
from __future__ import annotations
import re
from typing import List
def chunk_text(
    text: str,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
) -> List[str]:
    """Split *text* into retrieval-friendly chunks.

    Pipeline: paragraph split -> sentence split for oversized
    paragraphs -> greedy merge of short pieces, carrying
    ``chunk_overlap`` characters between adjacent chunks.

    Args:
        text: Raw input text.
        chunk_size: Target character count per chunk.
        chunk_overlap: Characters shared by neighbouring chunks.

    Returns:
        List of chunk strings; empty for blank input.
    """
    # Guard: nothing to do for empty / whitespace-only input.
    if not text or not text.strip():
        return []
    paragraphs = _split_paragraphs(text)
    if not paragraphs:
        return []
    # Oversized paragraphs are broken down further; short ones pass through.
    pieces: List[str] = []
    for paragraph in paragraphs:
        if len(paragraph) > chunk_size:
            pieces.extend(_split_long_paragraph(paragraph, chunk_size))
        else:
            pieces.append(paragraph)
    # Greedily pack the pieces into chunks with inter-chunk overlap.
    return _merge_segments(pieces, chunk_size, chunk_overlap)
def _split_paragraphs(text: str) -> List[str]:
"""按连续换行分割段落,过滤空白段落。"""
# 按两个以上换行分割
raw = re.split(r"\n\s*\n", text)
paras = []
for p in raw:
p = p.strip()
if p:
paras.append(p)
return paras
def _split_long_paragraph(para: str, chunk_size: int) -> List[str]:
"""将超长段落按句子切分。"""
# 先按句子结束符分割(去掉 look-behind 长度限制)
sentences = re.split(r"(?<=[。!?])|(?<=[.!?])\s*", para)
sentences = [s.strip() for s in sentences if s.strip()]
if not sentences:
# 无法按句子分割,按字符硬切
return [para[i : i + chunk_size] for i in range(0, len(para), chunk_size)]
chunks: List[str] = []
current = ""
for sent in sentences:
if len(current) + len(sent) <= chunk_size:
current += sent
else:
if current:
chunks.append(current.strip())
current = sent
if current:
chunks.append(current.strip())
# 如果某个块还是太长(单句超长),按字符硬切
result: List[str] = []
for c in chunks:
if len(c) <= chunk_size:
result.append(c)
else:
for i in range(0, len(c), chunk_size):
result.append(c[i : i + chunk_size])
return result
def _merge_segments(segments: List[str], chunk_size: int, overlap: int) -> List[str]:
"""合并短段落为块,块间带重叠。"""
if not segments:
return []
chunks: List[str] = []
current = ""
for seg in segments:
# 如果当前块 + 新段不超过上限,追加
if not current:
current = seg
elif len(current) + 1 + len(seg) <= chunk_size:
current += "\n\n" + seg
else:
# 当前块已满
chunks.append(current.strip())
# 重叠:取前一块末尾的 overlap 字符
if overlap > 0 and len(current) > overlap:
# 从上一个块末尾取 overlap 字符作为新块的起始
tail = current[-overlap:]
# 从上一个换行处开始,避免截断句子
newline_pos = tail.find("\n")
if newline_pos > 0 and newline_pos < overlap // 2:
tail = tail[newline_pos + 1 :]
current = tail + "\n\n" + seg
else:
current = seg
if current.strip():
chunks.append(current.strip())
return chunks