# ## 安全修复 (12项)
# - Webhook接口添加全局Token认证,过滤敏感请求头
# - 修复JWT Base64 padding公式,防止签名验证绕过
# - 数据库密码/飞书Token从源码移除,改为环境变量
# - 工作流引擎添加路径遍历防护 (_resolve_safe_path)
# - eval()添加模板长度上限检查
# - 审批API添加认证依赖
# - 前端v-html增强XSS转义,console.log仅开发模式输出
# - 500错误不再暴露内部异常详情
#
# ## Agent运行时修复 (7项)
# - 删除_inject_knowledge_context中未定义db变量的finally块
# - 工具执行添加try/except保护,异常不崩溃Agent
# - LLM重试计入budget计数器
# - self_review异常时passed=False
# - max_iterations截断标记success=False
# - 工具参数JSON解析失败时记录警告日志
# - run()开始时重置_llm_invocations计数器
#
# ## 配置与基础设施
# - DEBUG默认False,SQL_ECHO独立配置项
# - init_db()补全13个缺失模型导入
# - 新增WEBHOOK_AUTH_TOKEN/SQL_ECHO配置项
# - 新增.env.example模板文件
#
# ## 前端修复 (12项)
# - 登录改用URLSearchParams替代FormData
# - 401拦截器通过Pinia store统一清理状态
# - SSE流超时从60s延长至300s
# - final/error事件时清除streamTimeout
# - localStorage聊天记录添加24h TTL
# - safeParseArgCount替代模板中裸JSON.parse
# - fetchUser 401时同时清除user对象
#
# ## 新增模块
# - 知识进化: knowledge_extractor/retriever/tasks
# - 数字孪生: shadow_executor/comparison模型
# - 行为采集: behavior_middleware/collector/fingerprint_engine
# - 代码审查: code_review_agent/document_review_agent
# - 反馈学习: feedback_learner
# - 瓶颈检测/优化引擎/成本估算/需求估算
# - 速率限制器 (rate_limiter)
# - Alembic迁移 015-020
#
# ## 文档
# - 商业化落地计划
# - 8篇docs文档 (架构/API/部署/开发/贡献等)
# - Docker Compose生产配置
#
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
258 lines
11 KiB
Python
258 lines
11 KiB
Python
"""
文档审查代理 — 数字分身自动审查合同/文档,检查完整性、风险点、格式
"""
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
import re
|
||
from typing import Any, Dict, List, Optional
|
||
|
||
from app.services.fingerprint_engine import fingerprint_engine
|
||
from app.services.behavior_collector import behavior_collector
|
||
|
||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
||
|
||
# Completeness checklists, keyed by document type and then by section id.
# Each section carries a human-readable "label" and a list of "checks";
# every check is a phrase that is looked up in the document text.
DOCUMENT_CHECKLISTS = {
    "contract": {
        "parties": {"label": "签约方信息", "checks": ["甲方全称", "乙方全称", "统一社会信用代码", "法定代表人", "联系方式"]},
        "subject": {"label": "合同标的", "checks": ["标的描述是否明确", "数量/规格是否清晰", "质量标准是否定义"]},
        "payment": {"label": "付款条款", "checks": ["金额大小写一致", "付款节点清晰", "发票要求", "违约金比例 ≤ 30%"]},
        "term": {"label": "履约期限", "checks": ["起止日期明确", "延期条款", "不可抗力定义"]},
        "liability": {"label": "违约责任", "checks": ["双方对等", "赔偿上限合理", "免责条款不违法"]},
        "confidentiality": {"label": "保密条款", "checks": ["保密范围", "保密期限", "违约责任"]},
        "termination": {"label": "终止条款", "checks": ["解除条件", "善后义务", "争议解决方式"]},
        "signature": {"label": "签章", "checks": ["签字日期", "盖章位置", "骑缝章"]},
    },
    "technical_doc": {
        "overview": {"label": "概述", "checks": ["背景说明", "目标定义", "适用范围"]},
        "terms": {"label": "术语定义", "checks": ["关键术语有定义", "缩写全称首次出现"]},
        "architecture": {"label": "架构设计", "checks": ["架构图/示意图", "模块划分", "接口定义"]},
        "implementation": {"label": "实现细节", "checks": ["核心逻辑描述", "关键配置项", "异常处理设计"]},
        "deployment": {"label": "部署运维", "checks": ["环境要求", "部署步骤", "监控指标", "回滚方案"]},
    },
    "email": {
        "subject": {"label": "主题", "checks": ["主题简明扼要", "无多余前缀"]},
        "recipient": {"label": "收件人", "checks": ["To/CC/BCC 区分正确", "收件人数量合理"]},
        # "关系" (relationship) replaces the former typo "类系".
        "greeting": {"label": "称呼", "checks": ["称呼得体", "与收件人关系匹配"]},
        "body": {"label": "正文", "checks": ["核心信息在前", "段落简短", "行动项明确"]},
        "attachments": {"label": "附件", "checks": ["附件已提及", "文件大小合理", "格式通用"]},
        "closing": {"label": "结尾", "checks": ["署名完整", "签名档信息准确"]},
    },
}
|
||
|
||
# Risk-clause heuristics, keyed by severity ("high", "medium", "low").
# Each entry is a (regex, label) pair. The patterns contain no DOTALL flag,
# so ".*" never crosses a newline and every match stays within one line.
RISK_PATTERNS = {
    "high": [
        (r"不承担.*任何.*责任", "单方面免责声明"),
        (r"自动续期.*未.*通知", "自动续期条款需关注"),
        (r"违约金.*超过.*30%", "违约金比例可能过高"),
        (r"放弃.*诉讼.*权利", "放弃诉讼权利条款"),
        (r"无条件.*连带.*保证", "无条件连带保证责任"),
    ],
    "medium": [
        (r"以.*为准.*解释", "单方解释权条款"),
        (r"仲裁.*北京|仲裁.*上海", "仲裁地条款需确认"),
        (r"保密.*永久", "永久保密义务可能过于严苛"),
        (r"竞业.*2.*年|竞业.*3.*年", "竞业限制期限较长"),
    ],
    "low": [
        (r"未尽事宜.*协商", "兜底条款可接受"),
        (r"一式.*份", "签署份数需确认"),
    ],
}
|
||
|
||
|
||
class DocumentReviewAgent:
    """Rule-based reviewer for contracts, technical documents and e-mails.

    A review combines three heuristics -- checklist completeness, regex-based
    risk-clause scanning and simple format checks -- into a 0-100 score, a
    letter grade and a list of improvement suggestions.
    """

    # Sort rank per severity (lower sorts first).
    _SEVERITY_RANK = {"high": 0, "medium": 1, "low": 2}
    # Points deducted from the 40-point risk budget per finding.
    _SEVERITY_PENALTY = {"high": 20, "medium": 10, "low": 5}
    # (minimum score, grade label), checked from best to worst.
    _GRADE_BANDS = (
        (90, "A — 优秀"),
        (75, "B — 良好"),
        (60, "C — 需改进"),
        (40, "D — 风险较高"),
    )
    _LOWEST_GRADE = "F — 不建议签署/发布"

    # Separators for checklist phrases: ASCII comma, full-width comma
    # (U+FF0C), ideographic comma and whitespace.
    _QUERY_SPLIT_RE = re.compile(r'[,\uFF0C、\s]+')
    # Enumerated clauses such as "(一)" / "(1)" with ASCII or full-width
    # parentheses (U+FF08 / U+FF09), or article numbering like "第一条".
    _CONTRACT_CLAUSE_RE = re.compile(
        r'[(\uFF08]\s*[一二三1-3]\s*[)\uFF09]|第\s*[一二三四五六七八九十百\d]+\s*条'
    )
    # Markdown-style "#" headings or "第N章" / "第N节" chapter markers.
    _TECH_HEADING_RE = re.compile(r'#+\s|第[一二三1-3]章|第[一二三1-3]节')
    # A blank line between two paragraphs.
    _PARAGRAPH_BREAK_RE = re.compile(r'\n\s*\n')

    def review_document(self, user_id: str, document_type: str,
                        title: str, content: str,
                        metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Review a document or contract and return a structured report.

        Args:
            user_id: User ID, used to look up the user's fingerprint.
            document_type: ``contract`` / ``technical_doc`` / ``email``. Any
                other value falls back to the contract checklist, but no
                type-specific format checks are applied.
            title: Document title, echoed back in the report.
            content: Document text. ``None`` is treated as empty text.
            metadata: Additional metadata; accepted but not used here.

        Returns:
            A dict with ``document_type``, ``title``, ``overall_score``,
            ``grade``, ``completeness``, ``risks``, ``format_issues``,
            ``suggestions`` and ``user_preference_applied``.
        """
        content = content or ""

        fp = fingerprint_engine.get_fingerprint(user_id)
        # Tolerate a missing fingerprint and an explicit None under
        # "preference_weights" instead of raising AttributeError.
        weights = (fp.get("preference_weights") or {}) if fp else {}
        preference = weights.get("document") or {}

        checklist = DOCUMENT_CHECKLISTS.get(document_type, DOCUMENT_CHECKLISTS["contract"])

        # 1. Completeness check
        completeness = self._check_completeness(content, checklist)
        # 2. Risk scan
        risks = self._scan_risks(content)
        # 3. Format check
        format_issues = self._check_format(content, document_type)
        # 4. Suggestions and overall score
        suggestions = self._generate_suggestions(completeness, risks, format_issues)
        score = self._calculate_score(completeness, risks, format_issues)

        report = {
            "document_type": document_type,
            "title": title,
            "overall_score": score,
            "grade": self._grade(score),
            "completeness": completeness,
            "risks": risks,
            "format_issues": format_issues,
            "suggestions": suggestions,
            # Only reports whether a preference exists; it does not alter
            # the checks above.
            "user_preference_applied": bool(preference),
        }

        # Record the review as a behavior event.
        behavior_collector.log_fire_and_forget(
            user_id=user_id,
            category="document",
            action=f"review_{document_type}",
            context={"title": title, "doc_type": document_type, "content_length": len(content)},
            result={"score": score, "risks_count": len(risks)},
        )

        return report

    def _check_completeness(self, content: str, checklist: Dict[str, Any]) -> Dict[str, Any]:
        """Check which checklist items appear in ``content``.

        Returns:
            A dict keyed by section id. Each value holds the section
            ``label``, the ``passed`` and ``total`` item counts, the pass
            ``percentage`` (one decimal; 0 for a section with no checks) and
            per-item ``details``.
        """
        results: Dict[str, Any] = {}
        for section_id, section in checklist.items():
            details = [
                {"item": item, "present": self._fuzzy_contains(content, item)}
                for item in section["checks"]
            ]
            passed = sum(1 for entry in details if entry["present"])
            total = len(details)
            results[section_id] = {
                "label": section["label"],
                "passed": passed,
                "total": total,
                "percentage": round(passed / total * 100, 1) if total > 0 else 0,
                "details": details,
            }
        return results

    def _fuzzy_contains(self, content: str, query: str) -> bool:
        """Return True if ``content`` loosely contains ``query``.

        The query is split on commas (ASCII or full-width), ideographic
        commas and whitespace. It counts as present when any fragment of at
        least two characters, or the whole query, occurs in ``content``
        case-insensitively.
        """
        haystack = content.lower()
        for fragment in self._QUERY_SPLIT_RE.split(query):
            if len(fragment) >= 2 and fragment.lower() in haystack:
                return True
        # Covers queries whose fragments are all shorter than two characters.
        return len(query) >= 2 and query.lower() in haystack

    def _scan_risks(self, content: str) -> List[Dict[str, Any]]:
        """Scan ``content`` for risky clauses defined in ``RISK_PATTERNS``.

        Returns:
            One dict per matching pattern (``severity``, ``label``,
            ``pattern``, ``match_count``, ``snippet``), ordered high,
            medium, low.
        """
        risks = []
        for severity, patterns in RISK_PATTERNS.items():
            for pattern, label in patterns:
                match_count = len(re.findall(pattern, content))
                if not match_count:
                    continue
                risks.append({
                    "severity": severity,
                    "label": label,
                    "pattern": pattern,
                    "match_count": match_count,
                    "snippet": self._extract_snippet(content, pattern),
                })
        # The sort is stable, so findings of equal severity keep pattern order.
        risks.sort(key=lambda risk: self._SEVERITY_RANK[risk["severity"]])
        return risks

    def _extract_snippet(self, content: str, pattern: str) -> str:
        """Return the first line matching ``pattern``, stripped and cut to 150 chars.

        Returns an empty string when no single line matches.
        """
        compiled = re.compile(pattern)
        # Split on "\n" only: "." in a pattern stops at "\n" but not at other
        # line separators, so this keeps the snippet consistent with findall.
        for line in content.split("\n"):
            if compiled.search(line):
                return line.strip()[:150]
        return ""

    def _check_format(self, content: str, doc_type: str) -> List[Dict[str, Any]]:
        """Run format heuristics for ``doc_type`` and return the issues found.

        Each issue is a dict with a ``type`` (``structure``, ``length`` or
        ``readability``) and a user-facing ``message``.
        """
        issues = []

        if doc_type == "contract":
            if not self._CONTRACT_CLAUSE_RE.search(content):
                issues.append({"type": "structure", "message": "未检测到分条结构,合同建议使用条/款/项结构"})
            if len(content) < 500:
                issues.append({"type": "length", "message": f"文档仅 {len(content)} 字符,合同内容可能不完整"})

        if doc_type == "technical_doc":
            if not self._TECH_HEADING_RE.search(content):
                issues.append({"type": "structure", "message": "未检测到章节标题,建议添加层次化标题"})

        if doc_type == "email":
            if len(content) > 5000:
                issues.append({"type": "length", "message": "邮件过长,建议精简核心内容"})

        # Applies to every document type: long text without a blank line.
        paragraph_breaks = len(self._PARAGRAPH_BREAK_RE.findall(content))
        if paragraph_breaks == 0 and len(content) > 1000:
            issues.append({"type": "readability", "message": "大段文字缺少分段,建议增加段落间距"})

        return issues

    def _generate_suggestions(self, completeness: Dict, risks: List,
                              format_issues: List) -> List[str]:
        """Collect improvement suggestions from the three check results.

        Sections below 50% completeness get a generic hint, and sections
        below 80% list up to three missing items. High-severity risks are
        summarised in one line, and each format issue is echoed.
        """
        suggestions = []

        for result in completeness.values():
            if result["percentage"] < 50:
                suggestions.append(
                    f"[{result['label']}] 完整性仅 {result['percentage']}%,建议补充缺少的要素"
                )
            elif result["percentage"] < 80:
                missing = [d["item"] for d in result.get("details", []) if not d["present"]]
                suggestions.append(
                    f"[{result['label']}] 缺少: {', '.join(missing[:3])}"
                )

        high_risks = [r for r in risks if r["severity"] == "high"]
        if high_risks:
            suggestions.append(f"发现 {len(high_risks)} 个高风险条款: "
                               f"{', '.join(r['label'] for r in high_risks)}")

        for issue in format_issues:
            suggestions.append(f"[格式] {issue['message']}")

        return suggestions

    def _calculate_score(self, completeness: Dict, risks: List,
                         format_issues: List) -> float:
        """Compute the overall score (0-100), rounded to one decimal.

        Completeness contributes up to 40 points (the average section
        percentage times 0.4). Risk contributes 40 points minus 20/10/5 per
        high/medium/low finding, and format contributes 20 points minus 5
        per issue. The risk and format parts never go below 0.
        """
        # Completeness weight: 40%
        if completeness:
            avg = sum(r["percentage"] for r in completeness.values()) / len(completeness)
        else:
            avg = 0
        completeness_score = avg * 0.4

        # Risk weight: 40%
        risk_penalty = sum(self._SEVERITY_PENALTY[r["severity"]] for r in risks)
        risk_score = max(0, 40 - risk_penalty)

        # Format weight: 20%
        format_score = max(0, 20 - len(format_issues) * 5)

        return round(completeness_score + risk_score + format_score, 1)

    def _grade(self, score: float) -> str:
        """Map a numeric score to a grade label (A >= 90, B >= 75, C >= 60, D >= 40, else F)."""
        for threshold, label in self._GRADE_BANDS:
            if score >= threshold:
                return label
        return self._LOWEST_GRADE
|
||
|
||
|
||
# Module-level shared instance of DocumentReviewAgent.
document_review_agent = DocumentReviewAgent()
|