138 lines
4.4 KiB
Python
138 lines
4.4 KiB
Python
|
|
"""
|
||
|
|
文档解析器单元测试
|
||
|
|
"""
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import os
|
||
|
|
import tempfile
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
from app.services.document_parser import parse_document
|
||
|
|
|
||
|
|
|
||
|
|
class TestDocumentParser:
    """Tests for ``parse_document`` over several document formats.

    Every test that needs a file creates it with
    ``tempfile.NamedTemporaryFile(delete=False)`` so the path can be handed
    to ``parse_document`` after the handle is closed, and removes it again
    in a ``finally`` block.
    """

    @pytest.mark.unit
    def test_parse_txt(self):
        """A plain-text file yields output containing both written lines."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", encoding="utf-8", delete=False) as f:
            f.write("Hello world\nThis is a test.")
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "txt")
            assert "Hello world" in result
            assert "This is a test" in result
        finally:
            os.unlink(tmp_path)

    @pytest.mark.unit
    def test_parse_txt_utf8_bom(self):
        """A UTF-8 text file that starts with a BOM is still parsed."""
        # The "utf-8-sig" codec prepends the BOM on its own, so the text must
        # not also begin with U+FEFF; otherwise the file would carry two BOMs
        # and would no longer represent an ordinary BOM-prefixed file.
        content = "你好世界\n测试内容"
        with tempfile.NamedTemporaryFile(mode="wb", suffix=".txt", delete=False) as f:
            f.write(content.encode("utf-8-sig"))
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "txt")
            assert "你好" in result
            assert "测试" in result
        finally:
            os.unlink(tmp_path)

    @pytest.mark.unit
    def test_parse_md(self):
        """A Markdown file yields output containing its heading and body text."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".md", encoding="utf-8", delete=False) as f:
            f.write("# Title\n\nThis is **bold** text.")
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "md")
            assert "Title" in result
            assert "bold" in result
        finally:
            os.unlink(tmp_path)

    @pytest.mark.unit
    def test_parse_csv(self):
        """A comma-separated file yields output containing its cell values."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", encoding="utf-8", delete=False) as f:
            f.write("name,age,city\nAlice,30,Beijing\nBob,25,Shanghai")
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "csv")
            assert "Alice" in result
            assert "Beijing" in result
            assert "Bob" in result
        finally:
            os.unlink(tmp_path)

    @pytest.mark.unit
    def test_parse_csv_with_different_delimiter(self):
        """A pipe-delimited ``.csv`` file still produces some of its content."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", encoding="utf-8", delete=False) as f:
            f.write("a|b|c\n1|2|3\n4|5|6")
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "csv")
            # Deliberately loose: only require that either a header cell or a
            # data cell made it into the output.
            assert "1" in result or "a" in result
        finally:
            os.unlink(tmp_path)

    @pytest.mark.unit
    def test_unsupported_format(self):
        """An unknown format identifier is expected to give ``None``."""
        with tempfile.NamedTemporaryFile(suffix=".xyz", delete=False) as f:
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "xyz")
            assert result is None  # unsupported formats return None
        finally:
            os.unlink(tmp_path)

    @pytest.mark.unit
    def test_file_not_found(self):
        """A path that does not exist is expected to give ``None``."""
        result = parse_document("/nonexistent/file.txt", "txt")
        assert result is None  # a missing file returns None

    @pytest.mark.unit
    def test_empty_file(self):
        """An empty text file is expected to give a non-``None`` result."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", encoding="utf-8", delete=False) as f:
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "txt")
            assert result is not None
        finally:
            os.unlink(tmp_path)

    @pytest.mark.integration
    def test_parse_docx(self):
        """A generated ``.docx`` file yields both paragraphs.

        Requires the python-docx package; skipped when it is not installed.
        """
        pytest.importorskip("docx")
        from docx import Document

        doc = Document()
        doc.add_paragraph("Hello from docx")
        doc.add_paragraph("第二段落")
        # Only reserve the path here. Saving by name while this handle is
        # still open fails on Windows, so the save happens after the close.
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
            tmp_path = f.name
        try:
            # Inside the try so the temp file is removed even if save() fails.
            doc.save(tmp_path)
            result = parse_document(tmp_path, "docx")
            assert "Hello from docx" in result
            assert "第二段落" in result
        finally:
            os.unlink(tmp_path)

    @pytest.mark.integration
    def test_parse_pdf(self):
        """A generated blank-page PDF is expected to give a non-``None`` result.

        Requires the PyPDF2 package; skipped when it is not installed.
        """
        pytest.importorskip("PyPDF2")
        from PyPDF2 import PdfWriter

        writer = PdfWriter()
        writer.add_blank_page(72, 72)  # 1 inch page
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
            writer.write(f)
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "pdf")
            assert result is not None
        finally:
            os.unlink(tmp_path)
|