Files
aiagent/backend/app/services/document_review_agent.py
renjianbo ab1589921a fix: 修复35个安全与功能缺陷,补全知识进化/数字孪生/行为采集模块
## 安全修复 (12项)
- Webhook接口添加全局Token认证,过滤敏感请求头
- 修复JWT Base64 padding公式,防止签名验证绕过
- 数据库密码/飞书Token从源码移除,改为环境变量
- 工作流引擎添加路径遍历防护 (_resolve_safe_path)
- eval()添加模板长度上限检查
- 审批API添加认证依赖
- 前端v-html增强XSS转义,console.log仅开发模式输出
- 500错误不再暴露内部异常详情

## Agent运行时修复 (7项)
- 删除_inject_knowledge_context中未定义db变量的finally块
- 工具执行添加try/except保护,异常不崩溃Agent
- LLM重试计入budget计数器
- self_review异常时passed=False
- max_iterations截断标记success=False
- 工具参数JSON解析失败时记录警告日志
- run()开始时重置_llm_invocations计数器

## 配置与基础设施
- DEBUG默认False,SQL_ECHO独立配置项
- init_db()补全13个缺失模型导入
- 新增WEBHOOK_AUTH_TOKEN/SQL_ECHO配置项
- 新增.env.example模板文件

## 前端修复 (12项)
- 登录改用URLSearchParams替代FormData
- 401拦截器通过Pinia store统一清理状态
- SSE流超时从60s延长至300s
- final/error事件时清除streamTimeout
- localStorage聊天记录添加24h TTL
- safeParseArgCount替代模板中裸JSON.parse
- fetchUser 401时同时清除user对象

## 新增模块
- 知识进化: knowledge_extractor/retriever/tasks
- 数字孪生: shadow_executor/comparison模型
- 行为采集: behavior_middleware/collector/fingerprint_engine
- 代码审查: code_review_agent/document_review_agent
- 反馈学习: feedback_learner
- 瓶颈检测/优化引擎/成本估算/需求估算
- 速率限制器 (rate_limiter)
- Alembic迁移 015-020

## 文档
- 商业化落地计划
- 8篇docs文档 (架构/API/部署/开发/贡献等)
- Docker Compose生产配置

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-10 19:50:20 +08:00

258 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
文档审查代理 — 数字分身自动审查合同/文档,检查完整性、风险点、格式
"""
from __future__ import annotations
import logging
import re
from typing import Any, Dict, List, Optional
from app.services.fingerprint_engine import fingerprint_engine
from app.services.behavior_collector import behavior_collector
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Completeness checklists, keyed by document type and then by section id.
# Each section carries a display ``label`` and a list of ``checks``; every
# check string is both shown in the report and used as the keyword query when
# searching the document text, so the wording here is runtime behaviour.
DOCUMENT_CHECKLISTS = {
    "contract": {
        "parties": {"label": "签约方信息", "checks": ["甲方全称", "乙方全称", "统一社会信用代码", "法定代表人", "联系方式"]},
        "subject": {"label": "合同标的", "checks": ["标的描述是否明确", "数量/规格是否清晰", "质量标准是否定义"]},
        "payment": {"label": "付款条款", "checks": ["金额大小写一致", "付款节点清晰", "发票要求", "违约金比例 ≤ 30%"]},
        "term": {"label": "履约期限", "checks": ["起止日期明确", "延期条款", "不可抗力定义"]},
        "liability": {"label": "违约责任", "checks": ["双方对等", "赔偿上限合理", "免责条款不违法"]},
        "confidentiality": {"label": "保密条款", "checks": ["保密范围", "保密期限", "违约责任"]},
        "termination": {"label": "终止条款", "checks": ["解除条件", "善后义务", "争议解决方式"]},
        "signature": {"label": "签章", "checks": ["签字日期", "盖章位置", "骑缝章"]},
    },
    "technical_doc": {
        "overview": {"label": "概述", "checks": ["背景说明", "目标定义", "适用范围"]},
        "terms": {"label": "术语定义", "checks": ["关键术语有定义", "缩写全称首次出现"]},
        "architecture": {"label": "架构设计", "checks": ["架构图/示意图", "模块划分", "接口定义"]},
        "implementation": {"label": "实现细节", "checks": ["核心逻辑描述", "关键配置项", "异常处理设计"]},
        "deployment": {"label": "部署运维", "checks": ["环境要求", "部署步骤", "监控指标", "回滚方案"]},
    },
    "email": {
        "subject": {"label": "主题", "checks": ["主题简明扼要", "无多余前缀"]},
        "recipient": {"label": "收件人", "checks": ["To/CC/BCC 区分正确", "收件人数量合理"]},
        # Fixed typo: "类系" -> "关系" (greeting should match the relationship
        # with the recipient).
        "greeting": {"label": "称呼", "checks": ["称呼得体", "与收件人关系匹配"]},
        "body": {"label": "正文", "checks": ["核心信息在前", "段落简短", "行动项明确"]},
        "attachments": {"label": "附件", "checks": ["附件已提及", "文件大小合理", "格式通用"]},
        "closing": {"label": "结尾", "checks": ["署名完整", "签名档信息准确"]},
    },
}
# Risk-clause heuristics, keyed by severity. Each entry is a
# ``(regex, label)`` pair; the regex is searched in the document text and the
# label is what the report shows. ``.`` does not match a newline, so every
# pattern only matches within a single line of text.
RISK_PATTERNS = {
    "high": [
        (r"不承担.*任何.*责任", "单方面免责声明"),
        (r"自动续期.*未.*通知", "自动续期条款需关注"),
        # The lookbehinds skip "不超过" / "不得超过": a clause that CAPS the
        # penalty at 30% is the compliant wording and must not be reported as
        # an excessive penalty.
        (r"违约金.*(?<!不)(?<!不得)超过.*30%", "违约金比例可能过高"),
        (r"放弃.*诉讼.*权利", "放弃诉讼权利条款"),
        (r"无条件.*连带.*保证", "无条件连带保证责任"),
    ],
    "medium": [
        (r"以.*为准.*解释", "单方解释权条款"),
        (r"仲裁.*北京|仲裁.*上海", "仲裁地条款需确认"),
        (r"保密.*永久", "永久保密义务可能过于严苛"),
        (r"竞业.*2.*年|竞业.*3.*年", "竞业限制期限较长"),
    ],
    "low": [
        (r"未尽事宜.*协商", "兜底条款可接受"),
        (r"一式.*份", "签署份数需确认"),
    ],
}
class DocumentReviewAgent:
    """Rule-based reviewer for contracts, technical documents and e-mails.

    A review runs three heuristic passes over the raw text (checklist
    completeness, risk-clause scanning and format checks) and folds the
    results into a 0-100 score with a letter grade.
    """

    # Sort order for risk findings: most severe first.
    _SEVERITY_RANK = {"high": 0, "medium": 1, "low": 2}
    # Points deducted from the 40-point risk budget per finding.
    _SEVERITY_PENALTY = {"high": 20, "medium": 10, "low": 5}
    # Clause numbering accepted as evidence of a structured contract:
    # "(一)" / "(1)" with ASCII or full-width brackets, or "第X条/款/项".
    # The numeral length is bounded so that e.g. "(2024)" is not mistaken
    # for a clause number.
    _CONTRACT_CLAUSE_RE = re.compile(
        r'[((]\s*(?:[一二三四五六七八九十]{1,3}|\d{1,2})\s*[))]'
        r'|第\s*(?:[一二三四五六七八九十百零]{1,4}|\d{1,3})\s*[条款项]'
    )

    def review_document(self, user_id: str, document_type: str,
                        title: str, content: str,
                        metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Review a document and return a structured report.

        Args:
            user_id: User ID; used to look up the user's fingerprint and to
                attribute the behavior-log entry.
            document_type: ``contract`` / ``technical_doc`` / ``email``. Any
                other value is checked against the contract checklist and
                receives only the generic format checks.
            title: Document title (echoed in the report and the log entry).
            content: Document text to analyse.
            metadata: Extra metadata. Accepted for interface compatibility;
                not used by the review itself.

        Returns:
            A dict with the keys ``document_type``, ``title``,
            ``overall_score``, ``grade``, ``completeness``, ``risks``,
            ``format_issues``, ``suggestions`` and ``user_preference_applied``.
        """
        fp = fingerprint_engine.get_fingerprint(user_id)
        # "preference_weights" may be absent OR present with a None value;
        # ``or {}`` covers both (a ``.get(key, {})`` default only covers the
        # first case and would crash on the second).
        weights = (fp.get("preference_weights") or {}) if fp else {}
        preference = weights.get("document") or {}

        checklist = DOCUMENT_CHECKLISTS.get(document_type, DOCUMENT_CHECKLISTS["contract"])

        # 1. Completeness against the checklist
        completeness = self._check_completeness(content, checklist)
        # 2. Risk-clause scan
        risks = self._scan_risks(content)
        # 3. Format checks
        format_issues = self._check_format(content, document_type)
        # 4. Suggestions and score
        suggestions = self._generate_suggestions(completeness, risks, format_issues)
        score = self._calculate_score(completeness, risks, format_issues)

        report = {
            "document_type": document_type,
            "title": title,
            "overall_score": score,
            "grade": self._grade(score),
            "completeness": completeness,
            "risks": risks,
            "format_issues": format_issues,
            "suggestions": suggestions,
            # Only reports whether a stored preference exists; the checks
            # above do not read it.
            "user_preference_applied": bool(preference),
        }

        # Record the review as a user behavior event.
        behavior_collector.log_fire_and_forget(
            user_id=user_id,
            category="document",
            action=f"review_{document_type}",
            context={"title": title, "doc_type": document_type, "content_length": len(content)},
            result={"score": score, "risks_count": len(risks)},
        )
        return report

    def _check_completeness(self, content: str, checklist: Dict[str, Any]) -> Dict[str, Any]:
        """Evaluate every checklist section against the document text.

        Returns a dict keyed by section id; each value holds the section
        ``label``, the ``passed``/``total`` counts, the pass ``percentage``
        (one decimal, 0 for an empty section) and per-item ``details``.
        """
        results = {}
        for section_id, section in checklist.items():
            details = [
                {"item": item, "present": self._fuzzy_contains(content, item)}
                for item in section["checks"]
            ]
            passed = sum(1 for entry in details if entry["present"])
            total = len(details)
            results[section_id] = {
                "label": section["label"],
                "passed": passed,
                "total": total,
                "percentage": round(passed / total * 100, 1) if total > 0 else 0,
                "details": details,
            }
        return results

    def _fuzzy_contains(self, content: str, query: str) -> bool:
        """Return True if the text contains the query or one of its keywords.

        The query is split on full-width commas, "、" and whitespace; a
        case-insensitive substring hit on any fragment of at least two
        characters (or on the whole query) counts as present.
        """
        haystack = content.lower()
        candidates = re.split(r'[,、\s]+', query)
        candidates.append(query)
        return any(len(c) >= 2 and c.lower() in haystack for c in candidates)

    def _scan_risks(self, content: str) -> List[Dict[str, Any]]:
        """Scan the text for risk clauses listed in ``RISK_PATTERNS``.

        Returns one finding per matching pattern (severity, label, the regex
        itself, match count and the first matching line), ordered
        high -> medium -> low.
        """
        risks = []
        for severity, patterns in RISK_PATTERNS.items():
            for pattern, label in patterns:
                matches = re.findall(pattern, content)
                if matches:
                    risks.append({
                        "severity": severity,
                        "label": label,
                        "pattern": pattern,
                        "match_count": len(matches),
                        "snippet": self._extract_snippet(content, pattern),
                    })
        risks.sort(key=lambda r: self._SEVERITY_RANK[r["severity"]])
        return risks

    def _extract_snippet(self, content: str, pattern: str) -> str:
        """Return the first line matching ``pattern``, stripped and cut to
        150 characters, or an empty string when no line matches."""
        for line in content.split("\n"):
            if re.search(pattern, line):
                return line.strip()[:150]
        return ""

    def _check_format(self, content: str, doc_type: str) -> List[Dict[str, Any]]:
        """Run format heuristics and return a list of ``{type, message}`` issues.

        Contracts are checked for clause numbering and a minimum length,
        technical documents for section headings, e-mails for a maximum
        length. A missing-paragraph-break check is applied afterwards.
        """
        issues = []
        if doc_type == "contract":
            if not self._CONTRACT_CLAUSE_RE.search(content):
                issues.append({"type": "structure", "message": "未检测到分条结构,合同建议使用条/款/项结构"})
            if len(content) < 500:
                issues.append({"type": "length", "message": f"文档仅 {len(content)} 字符,合同内容可能不完整"})
        if doc_type == "technical_doc":
            if not re.search(r'#+\s|第[一二三1-3]章|第[一二三1-3]节', content):
                issues.append({"type": "structure", "message": "未检测到章节标题,建议添加层次化标题"})
        if doc_type == "email":
            if len(content) > 5000:
                issues.append({"type": "length", "message": "邮件过长,建议精简核心内容"})
        # NOTE(review): treated as a generic check for every document type;
        # confirm it was not meant to apply to e-mails only.
        has_line_breaks = len(re.findall(r'\n\s*\n', content))
        if has_line_breaks == 0 and len(content) > 1000:
            issues.append({"type": "readability", "message": "大段文字缺少分段,建议增加段落间距"})
        return issues

    def _generate_suggestions(self, completeness: Dict, risks: List,
                              format_issues: List) -> List[str]:
        """Turn the three analysis results into human-readable suggestions.

        Sections below 50% completeness get a generic "fill in" hint, those
        below 80% list up to three missing items; high-severity risks are
        summarised in one line and every format issue is echoed.
        """
        suggestions = []
        for section_id, result in completeness.items():
            if result["percentage"] < 50:
                suggestions.append(
                    f"[{result['label']}] 完整性仅 {result['percentage']}%,建议补充缺少的要素"
                )
            elif result["percentage"] < 80:
                missing = [d["item"] for d in result.get("details", []) if not d["present"]]
                suggestions.append(
                    f"[{result['label']}] 缺少: {', '.join(missing[:3])}"
                )
        high_risks = [r for r in risks if r["severity"] == "high"]
        if high_risks:
            suggestions.append(f"发现 {len(high_risks)} 个高风险条款: "
                               f"{', '.join(r['label'] for r in high_risks)}")
        for issue in format_issues:
            suggestions.append(f"[格式] {issue['message']}")
        return suggestions

    def _calculate_score(self, completeness: Dict, risks: List,
                         format_issues: List) -> float:
        """Compute the overall score (0-100), rounded to one decimal.

        Completeness contributes up to 40 points (mean section percentage
        times 0.4), risks up to 40 (minus 20/10/5 per high/medium/low
        finding, floored at 0) and format up to 20 (minus 5 per issue,
        floored at 0).
        """
        # Completeness: 40% weight
        if completeness:
            avg = sum(r["percentage"] for r in completeness.values()) / len(completeness)
        else:
            avg = 0
        completeness_score = avg * 0.4
        # Risk: 40% weight
        risk_penalty = sum(self._SEVERITY_PENALTY[r["severity"]] for r in risks)
        risk_score = max(0, 40 - risk_penalty)
        # Format: 20% weight
        format_score = max(0, 20 - len(format_issues) * 5)
        return round(completeness_score + risk_score + format_score, 1)

    def _grade(self, score: float) -> str:
        """Map a score to its grade label (A >= 90, B >= 75, C >= 60,
        D >= 40, otherwise F)."""
        if score >= 90:
            return "A — 优秀"
        if score >= 75:
            return "B — 良好"
        if score >= 60:
            return "C — 需改进"
        if score >= 40:
            return "D — 风险较高"
        return "F — 不建议签署/发布"


# Module-level shared instance.
document_review_agent = DocumentReviewAgent()