# backend/tests/test_document_parser.py

"""
文档解析器单元测试
"""
from __future__ import annotations

import os
import tempfile
import pytest

from app.services.document_parser import parse_document


class TestDocumentParser:
    """Tests for ``parse_document`` across the file formats exercised here.

    Every test writes a small fixture to a temporary file created with
    ``delete=False`` so the handle can be closed before the parser opens the
    path, and removes the file again in a ``finally`` block.
    """

    @pytest.mark.unit
    def test_parse_txt(self):
        """Parsed output of a plain UTF-8 text file contains both lines."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", encoding="utf-8", delete=False) as f:
            f.write("Hello world\nThis is a test.")
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "txt")
            # Fail with a clear assertion rather than a TypeError from `in None`.
            assert result is not None
            assert "Hello world" in result
            assert "This is a test" in result
        finally:
            os.unlink(tmp_path)

    @pytest.mark.unit
    def test_parse_txt_utf8_bom(self):
        """UTF-8 file that starts with a byte order mark (BOM)."""
        # The "utf-8-sig" codec prepends the BOM itself when encoding, so the
        # text must not also start with U+FEFF; otherwise the file would carry
        # two BOMs instead of the single one this test is meant to cover.
        content = "你好世界\n测试内容"
        with tempfile.NamedTemporaryFile(mode="wb", suffix=".txt", delete=False) as f:
            f.write(content.encode("utf-8-sig"))
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "txt")
            assert result is not None
            assert "你好" in result
            assert "测试" in result
        finally:
            os.unlink(tmp_path)

    @pytest.mark.unit
    def test_parse_md(self):
        """Parsed output of a Markdown file keeps the heading and body words."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".md", encoding="utf-8", delete=False) as f:
            f.write("# Title\n\nThis is **bold** text.")
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "md")
            assert result is not None
            assert "Title" in result
            assert "bold" in result
        finally:
            os.unlink(tmp_path)

    @pytest.mark.unit
    def test_parse_csv(self):
        """Parsed output of a comma-separated file contains the cell values."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", encoding="utf-8", delete=False) as f:
            f.write("name,age,city\nAlice,30,Beijing\nBob,25,Shanghai")
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "csv")
            assert result is not None
            assert "Alice" in result
            assert "Beijing" in result
            assert "Bob" in result
        finally:
            os.unlink(tmp_path)

    @pytest.mark.unit
    def test_parse_csv_with_different_delimiter(self):
        """A pipe-delimited ``.csv`` file still yields some of its content."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", encoding="utf-8", delete=False) as f:
            f.write("a|b|c\n1|2|3\n4|5|6")
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "csv")
            assert result is not None
            # Deliberately loose: only require that header or data text survives.
            assert "1" in result or "a" in result
        finally:
            os.unlink(tmp_path)

    @pytest.mark.unit
    def test_unsupported_format(self):
        """An unknown format name makes ``parse_document`` return ``None``."""
        with tempfile.NamedTemporaryFile(suffix=".xyz", delete=False) as f:
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "xyz")
            assert result is None  # unsupported formats yield None
        finally:
            os.unlink(tmp_path)

    @pytest.mark.unit
    def test_file_not_found(self):
        """A path that does not exist makes ``parse_document`` return ``None``."""
        result = parse_document("/nonexistent/file.txt", "txt")
        assert result is None  # a missing file yields None

    @pytest.mark.unit
    def test_empty_file(self):
        """An empty text file is parsed to a non-``None`` result."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", encoding="utf-8", delete=False) as f:
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "txt")
            assert result is not None
        finally:
            os.unlink(tmp_path)

    @pytest.mark.integration
    def test_parse_docx(self):
        """Requires the python-docx package; skipped when it is not installed."""
        pytest.importorskip("docx")
        from docx import Document

        doc = Document()
        doc.add_paragraph("Hello from docx")
        doc.add_paragraph("第二段落")
        # Only reserve a unique path here. Saving by name while the temporary
        # file handle is still open is not portable (it fails on Windows), so
        # the document is written after the handle has been closed.
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
            tmp_path = f.name
        try:
            doc.save(tmp_path)
            result = parse_document(tmp_path, "docx")
            assert result is not None
            assert "Hello from docx" in result
            assert "第二段落" in result
        finally:
            os.unlink(tmp_path)

    @pytest.mark.integration
    def test_parse_pdf(self):
        """Requires the PyPDF2 package; skipped when it is not installed."""
        pytest.importorskip("PyPDF2")
        from PyPDF2 import PdfWriter

        writer = PdfWriter()
        writer.add_blank_page(72, 72)  # 72 x 72 points, i.e. a 1 inch page
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
            # Written through the open handle, so no second open by name occurs.
            writer.write(f)
            tmp_path = f.name
        try:
            result = parse_document(tmp_path, "pdf")
            # The page is blank, so only a non-None result is required.
            assert result is not None
        finally:
            os.unlink(tmp_path)
feat: 向量记忆 RAG、工具市场、SSE 流式响应、前端集成与测试覆盖

- 新增 embedding_service（语义检索）、knowledge_service（RAG）、text_chunker、document_parser
- 新增 tool_registry（自定义工具注册表）并完善工具市场 API（CRUD + code/http 执行）
- 新增 agent_vector_memory / knowledge_base 模型及对应数据库表
- 实现 SSE 流式响应与 Agent 预算控制
- AgentChat.vue 集成 MainLayout 导航布局
- 完善测试体系：7 个新测试文件共 110 个测试覆盖
- 修复 conftest.py SQLite 内存数据库连接隔离问题

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-01 22:30:46 +08:00
			`"""`
			`文档解析器单元测试`
			`"""`
			`from __future__ import annotations`

			`import os`
			`import tempfile`
			`import pytest`

			`from app.services.document_parser import parse_document`


			`class TestDocumentParser:`
			`"""文档解析器测试"""`

			`@pytest.mark.unit`
			`def test_parse_txt(self):`
			`with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", encoding="utf-8", delete=False) as f:`
			`f.write("Hello world\nThis is a test.")`
			`tmp_path = f.name`
			`try:`
			`result = parse_document(tmp_path, "txt")`
			`assert "Hello world" in result`
			`assert "This is a test" in result`
			`finally:`
			`os.unlink(tmp_path)`

			`@pytest.mark.unit`
			`def test_parse_txt_utf8_bom(self):`
			`"""带 BOM 的 UTF-8 文件"""`
			`content = "\ufeff你好世界\n测试内容"`
			`with tempfile.NamedTemporaryFile(mode="wb", suffix=".txt", delete=False) as f:`
			`f.write(content.encode("utf-8-sig"))`
			`tmp_path = f.name`
			`try:`
			`result = parse_document(tmp_path, "txt")`
			`assert "你好" in result`
			`assert "测试" in result`
			`finally:`
			`os.unlink(tmp_path)`

			`@pytest.mark.unit`
			`def test_parse_md(self):`
			`with tempfile.NamedTemporaryFile(mode="w", suffix=".md", encoding="utf-8", delete=False) as f:`
			`f.write("# Title\n\nThis is bold text.")`
			`tmp_path = f.name`
			`try:`
			`result = parse_document(tmp_path, "md")`
			`assert "Title" in result`
			`assert "bold" in result`
			`finally:`
			`os.unlink(tmp_path)`

			`@pytest.mark.unit`
			`def test_parse_csv(self):`
			`with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", encoding="utf-8", delete=False) as f:`
			`f.write("name,age,city\nAlice,30,Beijing\nBob,25,Shanghai")`
			`tmp_path = f.name`
			`try:`
			`result = parse_document(tmp_path, "csv")`
			`assert "Alice" in result`
			`assert "Beijing" in result`
			`assert "Bob" in result`
			`finally:`
			`os.unlink(tmp_path)`

			`@pytest.mark.unit`
			`def test_parse_csv_with_different_delimiter(self):`
			`with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", encoding="utf-8", delete=False) as f:`
			`f.write("a\|b\|c\n1\|2\|3\n4\|5\|6")`
			`tmp_path = f.name`
			`try:`
			`result = parse_document(tmp_path, "csv")`
			`assert "1" in result or "a" in result`
			`finally:`
			`os.unlink(tmp_path)`

			`@pytest.mark.unit`
			`def test_unsupported_format(self):`
			`with tempfile.NamedTemporaryFile(suffix=".xyz", delete=False) as f:`
			`tmp_path = f.name`
			`try:`
			`result = parse_document(tmp_path, "xyz")`
			`assert result is None # 不支持的格式返回 None`
			`finally:`
			`os.unlink(tmp_path)`

			`@pytest.mark.unit`
			`def test_file_not_found(self):`
			`result = parse_document("/nonexistent/file.txt", "txt")`
			`assert result is None # 文件不存在返回 None`

			`@pytest.mark.unit`
			`def test_empty_file(self):`
			`with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", encoding="utf-8", delete=False) as f:`
			`tmp_path = f.name`
			`try:`
			`result = parse_document(tmp_path, "txt")`
			`assert result is not None`
			`finally:`
			`os.unlink(tmp_path)`

			`@pytest.mark.integration`
			`def test_parse_docx(self):`
			`"""需要 python-docx 包"""`
			`pytest.importorskip("docx")`
			`from docx import Document`

			`doc = Document()`
			`doc.add_paragraph("Hello from docx")`
			`doc.add_paragraph("第二段落")`
			`with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:`
			`doc.save(f.name)`
			`tmp_path = f.name`
			`try:`
			`result = parse_document(tmp_path, "docx")`
			`assert "Hello from docx" in result`
			`assert "第二段落" in result`
			`finally:`
			`os.unlink(tmp_path)`

			`@pytest.mark.integration`
			`def test_parse_pdf(self):`
			`"""需要 PyPDF2 包"""`
			`pytest.importorskip("PyPDF2")`
			`from PyPDF2 import PdfWriter`

			`writer = PdfWriter()`
			`writer.add_blank_page(72, 72) # 1 inch page`
			`with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:`
			`writer.write(f)`
			`tmp_path = f.name`
			`try:`
			`result = parse_document(tmp_path, "pdf")`
			`assert result is not None`
			`finally:`
			`os.unlink(tmp_path)`