Files
aiagent/backend/app/services/document_parser.py
renjianbo 7b9e0826de feat: 向量记忆 RAG、工具市场、SSE 流式响应、前端集成与测试覆盖
- 新增 embedding_service(语义检索)、knowledge_service(RAG)、text_chunker、document_parser
- 新增 tool_registry(自定义工具注册表)并完善工具市场 API(CRUD + code/http 执行)
- 新增 agent_vector_memory / knowledge_base 模型及对应数据库表
- 实现 SSE 流式响应与 Agent 预算控制
- AgentChat.vue 集成 MainLayout 导航布局
- 完善测试体系:7 个新测试文件共 110 个测试覆盖
- 修复 conftest.py SQLite 内存数据库连接隔离问题

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-01 22:30:46 +08:00

102 lines
2.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
文档解析器 — 支持 txt / pdf / docx / md / csv。
解析结果统一返回纯文本字符串,由调用方做分块处理。
"""
from __future__ import annotations
import csv
import io
import logging
import os
from typing import Optional
logger = logging.getLogger(__name__)
def parse_document(file_path: str, file_type: str) -> Optional[str]:
    """Parse a document into plain text.

    Args:
        file_path: Path of the file to parse.
        file_type: One of txt / pdf / docx / md / csv. Matching is
            case-insensitive and tolerates surrounding whitespace and a
            leading dot, so "PDF" and ".pdf" both select the PDF parser.

    Returns:
        The extracted text with leading/trailing whitespace stripped
        (possibly empty), or None when the file does not exist, the type
        is unsupported, or the selected parser raises.
    """
    if not os.path.isfile(file_path):
        logger.warning("文件不存在: %s", file_path)
        return None

    parsers = {
        "txt": _parse_text,
        "md": _parse_text,
        "pdf": _parse_pdf,
        "docx": _parse_docx,
        "csv": _parse_csv,
    }

    # Callers frequently pass a raw file extension (".PDF", "Docx"), so
    # normalize before the lookup. Non-string values are passed through
    # untouched and simply fail the lookup as before.
    if isinstance(file_type, str):
        type_key = file_type.strip().lower().lstrip(".")
    else:
        type_key = file_type

    parser = parsers.get(type_key)
    if not parser:
        logger.warning("不支持的文件类型: %s", file_type)
        return None

    # Only the parser call is guarded: any failure inside a parser
    # (corrupt file, missing optional dependency, ...) is logged with its
    # traceback and reported to the caller as None.
    try:
        text = parser(file_path)
        if text:
            text = text.strip()
    except Exception as e:
        logger.error("文档解析失败: %s (%s)", file_path, e, exc_info=True)
        return None

    logger.info("文档解析完成: %s (%s, %d 字符)", file_path, type_key, len(text or ""))
    return text
def _parse_text(file_path: str) -> str:
    """Read a plain-text or Markdown file and return its content.

    The file is decoded as UTF-8 and undecodable bytes are replaced with
    U+FFFD instead of raising. The "utf-8-sig" codec is used so that a
    leading byte-order mark is dropped rather than returned as an
    invisible first character; files without a BOM decode exactly as
    plain UTF-8.
    """
    with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
        return f.read()
def _parse_pdf(file_path: str) -> str:
    """Extract the text of a PDF file.

    Each page is extracted in order; pages that yield no text are skipped
    and the remaining page texts are joined with a blank line.
    """
    # Imported lazily so the module loads even when pypdf is not installed.
    from pypdf import PdfReader

    extracted = (page.extract_text() for page in PdfReader(file_path).pages)
    return "\n\n".join(chunk for chunk in extracted if chunk)
def _parse_docx(file_path: str) -> str:
    """Extract the text of a Word (.docx) document.

    Non-blank paragraphs come first, each stripped of surrounding
    whitespace. Table content follows: every row with at least one
    non-blank cell becomes a single line whose cell texts are joined
    with " | ". All entries are separated by a blank line.
    """
    # Imported lazily so the module loads even when python-docx is missing.
    from docx import Document as DocxDocument

    document = DocxDocument(file_path)

    stripped_paragraphs = ((p.text or "").strip() for p in document.paragraphs)
    body_lines = [line for line in stripped_paragraphs if line]

    # Tables are not part of `document.paragraphs`, so collect them as well.
    table_lines = []
    for table in document.tables:
        for row in table.rows:
            stripped_cells = (cell.text.strip() for cell in row.cells)
            filled = [value for value in stripped_cells if value]
            if filled:
                table_lines.append(" | ".join(filled))

    return "\n\n".join(body_lines + table_lines)
def _parse_csv(file_path: str) -> str:
    """Render a CSV file as a plain-text table.

    Each record becomes one output line whose cells are stripped of
    surrounding whitespace and joined with " | "; records are joined with
    a newline.

    The file is decoded with "utf-8-sig" so that a leading byte-order mark
    is discarded. Left in place, the BOM would both leak into the first
    cell and stop the csv reader from recognising a quoted first field.
    Undecodable bytes are replaced with U+FFFD. The file is opened with
    newline="" as the csv module requires, so the reader does its own
    line-ending handling.
    """
    with open(
        file_path, "r", encoding="utf-8-sig", errors="replace", newline=""
    ) as f:
        # The join consumes the reader while the file is still open.
        return "\n".join(
            " | ".join(cell.strip() for cell in row) for row in csv.reader(f)
        )