""" 文档解析器单元测试 """ from __future__ import annotations import os import tempfile import pytest from app.services.document_parser import parse_document class TestDocumentParser: """文档解析器测试""" @pytest.mark.unit def test_parse_txt(self): with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", encoding="utf-8", delete=False) as f: f.write("Hello world\nThis is a test.") tmp_path = f.name try: result = parse_document(tmp_path, "txt") assert "Hello world" in result assert "This is a test" in result finally: os.unlink(tmp_path) @pytest.mark.unit def test_parse_txt_utf8_bom(self): """带 BOM 的 UTF-8 文件""" content = "\ufeff你好世界\n测试内容" with tempfile.NamedTemporaryFile(mode="wb", suffix=".txt", delete=False) as f: f.write(content.encode("utf-8-sig")) tmp_path = f.name try: result = parse_document(tmp_path, "txt") assert "你好" in result assert "测试" in result finally: os.unlink(tmp_path) @pytest.mark.unit def test_parse_md(self): with tempfile.NamedTemporaryFile(mode="w", suffix=".md", encoding="utf-8", delete=False) as f: f.write("# Title\n\nThis is **bold** text.") tmp_path = f.name try: result = parse_document(tmp_path, "md") assert "Title" in result assert "bold" in result finally: os.unlink(tmp_path) @pytest.mark.unit def test_parse_csv(self): with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", encoding="utf-8", delete=False) as f: f.write("name,age,city\nAlice,30,Beijing\nBob,25,Shanghai") tmp_path = f.name try: result = parse_document(tmp_path, "csv") assert "Alice" in result assert "Beijing" in result assert "Bob" in result finally: os.unlink(tmp_path) @pytest.mark.unit def test_parse_csv_with_different_delimiter(self): with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", encoding="utf-8", delete=False) as f: f.write("a|b|c\n1|2|3\n4|5|6") tmp_path = f.name try: result = parse_document(tmp_path, "csv") assert "1" in result or "a" in result finally: os.unlink(tmp_path) @pytest.mark.unit def test_unsupported_format(self): with tempfile.NamedTemporaryFile(suffix=".xyz", delete=False) as f: tmp_path = f.name try: result = parse_document(tmp_path, "xyz") assert result is None # 不支持的格式返回 None finally: os.unlink(tmp_path) @pytest.mark.unit def test_file_not_found(self): result = parse_document("/nonexistent/file.txt", "txt") assert result is None # 文件不存在返回 None @pytest.mark.unit def test_empty_file(self): with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", encoding="utf-8", delete=False) as f: tmp_path = f.name try: result = parse_document(tmp_path, "txt") assert result is not None finally: os.unlink(tmp_path) @pytest.mark.integration def test_parse_docx(self): """需要 python-docx 包""" pytest.importorskip("docx") from docx import Document doc = Document() doc.add_paragraph("Hello from docx") doc.add_paragraph("第二段落") with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f: doc.save(f.name) tmp_path = f.name try: result = parse_document(tmp_path, "docx") assert "Hello from docx" in result assert "第二段落" in result finally: os.unlink(tmp_path) @pytest.mark.integration def test_parse_pdf(self): """需要 PyPDF2 包""" pytest.importorskip("PyPDF2") from PyPDF2 import PdfWriter writer = PdfWriter() writer.add_blank_page(72, 72) # 1 inch page with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f: writer.write(f) tmp_path = f.name try: result = parse_document(tmp_path, "pdf") assert result is not None finally: os.unlink(tmp_path)