Files
aiagent/backend/tests/test_text_chunker.py
renjianbo 7b9e0826de feat: 向量记忆 RAG、工具市场、SSE 流式响应、前端集成与测试覆盖
- 新增 embedding_service(语义检索)、knowledge_service(RAG)、text_chunker、document_parser
- 新增 tool_registry(自定义工具注册表)并完善工具市场 API(CRUD + code/http 执行)
- 新增 agent_vector_memory / knowledge_base 模型及对应数据库表
- 实现 SSE 流式响应与 Agent 预算控制
- AgentChat.vue 集成 MainLayout 导航布局
- 完善测试体系:7 个新测试文件共 110 个测试覆盖
- 修复 conftest.py SQLite 内存数据库连接隔离问题

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-01 22:30:46 +08:00

140 lines
4.5 KiB
Python

"""
文本分块器单元测试
"""
from __future__ import annotations
import pytest
from app.services.text_chunker import chunk_text, _split_paragraphs, _split_long_paragraph, _merge_segments
class TestSplitParagraphs:
    """Tests for ``_split_paragraphs``."""

    def test_empty_text(self):
        """Empty, single-space and newline-only inputs produce an empty list."""
        for blank in ("", " ", "\n\n\n"):
            assert _split_paragraphs(blank) == []

    def test_single_paragraph(self):
        """Text without a blank-line separator comes back as one paragraph."""
        assert _split_paragraphs("Hello world") == ["Hello world"]

    def test_multiple_paragraphs(self):
        """Three blank-line-separated paragraphs are returned in order."""
        paragraphs = _split_paragraphs("第一段内容。\n\n第二段内容。\n\n第三段内容。")
        assert len(paragraphs) == 3
        # Each paragraph must carry its own marker, in the original order.
        for marker, paragraph in zip(("第一段", "第二段", "第三段"), paragraphs):
            assert marker in paragraph

    def test_mixed_newlines(self):
        """Runs of two or three newlines both act as a single separator."""
        paragraphs = _split_paragraphs("段1\n\n\n段2\n\n段3")
        assert len(paragraphs) == 3
class TestSplitLongParagraph:
    """Tests for ``_split_long_paragraph``."""

    def test_short_paragraph(self):
        """A paragraph shorter than ``chunk_size`` is returned unchanged."""
        assert _split_long_paragraph("短文本", chunk_size=500) == ["短文本"]

    def test_long_paragraph_chinese(self):
        """A long Chinese paragraph is split into pieces of at most 500 chars."""
        # 4 characters repeated 200 times -> 800 characters, above the 500 limit.
        paragraph = "第一句。" * 200
        pieces = _split_long_paragraph(paragraph, chunk_size=500)
        assert len(pieces) >= 2
        assert not any(len(piece) > 500 for piece in pieces)

    def test_long_paragraph_english(self):
        """A long English paragraph is split into pieces of at most 500 chars."""
        paragraph = "Sentence. " * 200
        pieces = _split_long_paragraph(paragraph, chunk_size=500)
        assert len(pieces) >= 2
        assert not any(len(piece) > 500 for piece in pieces)

    def test_no_sentence_boundary(self):
        """Without any sentence boundary the text is hard-cut by character count."""
        pieces = _split_long_paragraph("a" * 1000, chunk_size=500)
        # 1000 characters with a 500 limit -> exactly two full-size pieces.
        assert len(pieces) == 2
        first, second = pieces[0], pieces[1]
        assert len(first) == 500
        assert len(second) == 500
class TestMergeSegments:
    """Tests for ``_merge_segments``."""

    def test_empty(self):
        """No segments in, no chunks out."""
        merged = _merge_segments([], 500, 0)
        assert merged == []

    def test_single_segment(self):
        """A lone short segment is passed through as the only chunk."""
        merged = _merge_segments(["hello"], 500, 0)
        assert merged == ["hello"]

    def test_merge_short_segments(self):
        """Segments that together fit under the limit collapse into one chunk."""
        # Three 100-character segments against a limit of 500.
        segments = [letter * 100 for letter in "abc"]
        merged = _merge_segments(segments, 500, 0)
        assert len(merged) == 1
        assert len(merged[0]) > 200

    def test_split_large_segments(self):
        """Segments that cannot all fit in one chunk yield several chunks."""
        # Three 300-character segments against a limit of 500.
        segments = [letter * 300 for letter in "abc"]
        merged = _merge_segments(segments, 500, 0)
        assert len(merged) >= 2

    def test_overlap(self):
        """Merging with a non-zero overlap still yields at least one chunk."""
        # NOTE(review): only non-emptiness is checked here; the overlapping
        # content itself is not asserted.
        segments = [letter * 300 for letter in "ab"]
        merged = _merge_segments(segments, 500, 50)
        assert len(merged) >= 1
class TestChunkText:
    """End-to-end tests for ``chunk_text``."""

    def test_empty_text(self):
        """Empty string, single space and ``None`` all produce no chunks."""
        for blank in ("", " ", None):
            assert chunk_text(blank) == []  # type: ignore

    def test_short_text(self):
        """Text well under ``chunk_size`` ends up in exactly one chunk."""
        chunks = chunk_text("Hello world", chunk_size=500, chunk_overlap=0)
        assert len(chunks) == 1
        assert "Hello world" in chunks[0]

    def test_normal_text(self):
        """Every paragraph of a multi-line text survives chunking."""
        text = (
            "\n"
            "这是第一段。它包含一些内容。\n"
            "这是第二段。它也有一些内容。而且更长一些。\n"
            "这是第三段。最后一段内容。\n"
        )
        chunks = chunk_text(text, chunk_size=500, chunk_overlap=0)
        assert len(chunks) >= 1
        # All of the content must be present somewhere in the output.
        joined = "".join(chunks)
        for marker in ("第一段", "第二段", "第三段"):
            assert marker in joined

    def test_chinese_text(self):
        """Chinese text with several sentences yields at least one chunk."""
        text = "我喜欢吃川菜。特别是麻辣火锅。还有水煮鱼。这些都很美味。"
        chunks = chunk_text(text, chunk_size=100, chunk_overlap=0)
        assert len(chunks) >= 1

    def test_overlap_between_chunks(self):
        """Chunking with overlap: the first two chunks must be non-empty."""
        paragraph = "这是一个很长的段落。" * 50
        chunks = chunk_text(paragraph, chunk_size=200, chunk_overlap=50)
        if len(chunks) <= 1:
            return
        # NOTE(review): the shared content between adjacent chunks is not
        # asserted; only that both chunks are non-empty.
        assert len(chunks[0]) > 0
        assert len(chunks[1]) > 0

    @pytest.mark.unit
    def test_with_mixed_punctuation(self):
        """Mixed English and Chinese punctuation yields at least one chunk."""
        text = "Hello! How are you? I am fine. Thank you. 你好!最近怎么样?我很好。谢谢。"
        chunks = chunk_text(text, chunk_size=200, chunk_overlap=0)
        assert len(chunks) >= 1