Files
aiagent/backend/scripts/check_ocr_env.py
2026-04-13 22:52:36 +08:00

59 lines
1.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Check the image OCR environment: Pillow, pytesseract, the Tesseract executable, and the chi_sim language pack.

Run from the backend directory:
.\\venv\\Scripts\\python scripts\\check_ocr_env.py
"""
from __future__ import annotations
import sys
from pathlib import Path
# Put the parent of the scripts/ directory on sys.path so the `app` package (and its config) can be imported
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from app.core.config import settings # noqa: E402
from app.services.builtin_tools import _local_file_workspace_root, _tessdata_dir_for_ocr # noqa: E402
def main() -> int:
    """Print a step-by-step report of the image-OCR prerequisites.

    The checks run in order and stop at the first hard failure:

    1. ``PIL`` (Pillow) can be imported.
    2. ``pytesseract`` can be imported.
    3. ``pytesseract.get_tesseract_version()`` succeeds, after applying
       ``settings.TESSERACT_CMD`` when it is non-blank.
    4. The tessdata directory returned by ``_tessdata_dir_for_ocr()`` is
       printed, and ``chi_sim.traineddata`` is looked up under the
       ``tessdata`` folder of ``_local_file_workspace_root()``.

    Returns:
        0 when steps 1-3 pass (step 4 is informational only),
        2 when Pillow cannot be imported,
        3 when pytesseract cannot be imported,
        4 when the Tesseract version query raises.
    """
    print("TESSERACT_CMD (settings):", settings.TESSERACT_CMD or "(空,将尝试 PATH)")
    print("TESSERACT_TESSDATA_DIR (settings):", settings.TESSERACT_TESSDATA_DIR or "(空,将尝试仓库 tessdata/)")

    # Imports are done lazily so a missing package yields a report line and a
    # dedicated exit code instead of a traceback at module import time.
    try:
        import PIL  # noqa: F401
    except ImportError as e:
        print("Pillow: 缺失 —", e)
        print(" 请执行: pip install Pillow")
        return 2
    else:
        print("Pillow: OK")

    try:
        import pytesseract as pt
    except ImportError as e:
        print("pytesseract: 缺失 —", e)
        print(" 请执行: pip install pytesseract")
        return 3
    else:
        print("pytesseract: OK")

    # Only override pytesseract's executable path when one is configured;
    # a blank setting leaves pytesseract's own default in place.
    cmd = (settings.TESSERACT_CMD or "").strip()
    if cmd:
        pt.pytesseract.tesseract_cmd = cmd

    try:
        ver = pt.get_tesseract_version()
    except Exception as e:
        # Deliberately broad: this is a diagnostic boundary and any failure
        # to query the version is reported the same way.
        print("Tesseract 可执行文件: 不可用 —", e)
        print(" Windows 请安装 Tesseract并在 .env 设置 TESSERACT_CMD=.../tesseract.exe")
        return 4
    else:
        print("Tesseract 版本:", ver)

    td = _tessdata_dir_for_ocr()
    print("解析到的 tessdata 目录:", td or "(未找到)")

    root = _local_file_workspace_root()
    loc = root / "tessdata"
    if loc.is_dir():
        # Exact-name lookup: is_file() rejects a directory that happens to
        # carry the traineddata name, which a glob match would accept.
        has_chi = (loc / "chi_sim.traineddata").is_file()
        print("仓库 tessdata/chi_sim.traineddata:", "存在" if has_chi else "缺失(中文识别差)")
    else:
        # Say so explicitly instead of silently skipping the chi_sim check.
        print("仓库 tessdata/ 目录:", "不存在(未检查 chi_sim.traineddata)")
    return 0
# Script entry point: run the checks and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())