""" 文档审查代理 — 数字分身自动审查合同/文档,检查完整性、风险点、格式 """ from __future__ import annotations import logging import re from typing import Any, Dict, List, Optional from app.services.fingerprint_engine import fingerprint_engine from app.services.behavior_collector import behavior_collector logger = logging.getLogger(__name__) DOCUMENT_CHECKLISTS = { "contract": { "parties": {"label": "签约方信息", "checks": ["甲方全称", "乙方全称", "统一社会信用代码", "法定代表人", "联系方式"]}, "subject": {"label": "合同标的", "checks": ["标的描述是否明确", "数量/规格是否清晰", "质量标准是否定义"]}, "payment": {"label": "付款条款", "checks": ["金额大小写一致", "付款节点清晰", "发票要求", "违约金比例 ≤ 30%"]}, "term": {"label": "履约期限", "checks": ["起止日期明确", "延期条款", "不可抗力定义"]}, "liability": {"label": "违约责任", "checks": ["双方对等", "赔偿上限合理", "免责条款不违法"]}, "confidentiality": {"label": "保密条款", "checks": ["保密范围", "保密期限", "违约责任"]}, "termination": {"label": "终止条款", "checks": ["解除条件", "善后义务", "争议解决方式"]}, "signature": {"label": "签章", "checks": ["签字日期", "盖章位置", "骑缝章"]}, }, "technical_doc": { "overview": {"label": "概述", "checks": ["背景说明", "目标定义", "适用范围"]}, "terms": {"label": "术语定义", "checks": ["关键术语有定义", "缩写全称首次出现"]}, "architecture": {"label": "架构设计", "checks": ["架构图/示意图", "模块划分", "接口定义"]}, "implementation": {"label": "实现细节", "checks": ["核心逻辑描述", "关键配置项", "异常处理设计"]}, "deployment": {"label": "部署运维", "checks": ["环境要求", "部署步骤", "监控指标", "回滚方案"]}, }, "email": { "subject": {"label": "主题", "checks": ["主题简明扼要", "无多余前缀"]}, "recipient": {"label": "收件人", "checks": ["To/CC/BCC 区分正确", "收件人数量合理"]}, "greeting": {"label": "称呼", "checks": ["称呼得体", "与收件人类系匹配"]}, "body": {"label": "正文", "checks": ["核心信息在前", "段落简短", "行动项明确"]}, "attachments": {"label": "附件", "checks": ["附件已提及", "文件大小合理", "格式通用"]}, "closing": {"label": "结尾", "checks": ["署名完整", "签名档信息准确"]}, }, } RISK_PATTERNS = { "high": [ (r"不承担.*任何.*责任", "单方面免责声明"), (r"自动续期.*未.*通知", "自动续期条款需关注"), (r"违约金.*超过.*30%", "违约金比例可能过高"), (r"放弃.*诉讼.*权利", "放弃诉讼权利条款"), (r"无条件.*连带.*保证", "无条件连带保证责任"), ], "medium": [ (r"以.*为准.*解释", "单方解释权条款"), (r"仲裁.*北京|仲裁.*上海", "仲裁地条款需确认"), (r"保密.*永久", "永久保密义务可能过于严苛"), (r"竞业.*2.*年|竞业.*3.*年", "竞业限制期限较长"), ], "low": [ (r"未尽事宜.*协商", "兜底条款可接受"), (r"一式.*份", "签署份数需确认"), ], } class DocumentReviewAgent: """文档审查代理""" def review_document(self, user_id: str, document_type: str, title: str, content: str, metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """审查文档/合同。 Args: user_id: 用户ID(用于加载偏好) document_type: contract / technical_doc / email title: 文档标题 content: 文档内容 metadata: 附加元数据 """ fp = fingerprint_engine.get_fingerprint(user_id) preference = (fp.get("preference_weights", {}).get("document") if fp else None) or {} checklist = DOCUMENT_CHECKLISTS.get(document_type, DOCUMENT_CHECKLISTS["contract"]) # 1. 完整性检查 completeness = self._check_completeness(content, checklist) # 2. 风险扫描 risks = self._scan_risks(content) # 3. 格式检查 format_issues = self._check_format(content, document_type) # 4. 生成建议 suggestions = self._generate_suggestions(completeness, risks, format_issues) score = self._calculate_score(completeness, risks, format_issues) report = { "document_type": document_type, "title": title, "overall_score": score, "grade": self._grade(score), "completeness": completeness, "risks": risks, "format_issues": format_issues, "suggestions": suggestions, "user_preference_applied": bool(preference), } # 记录行为 behavior_collector.log_fire_and_forget( user_id=user_id, category="document", action=f"review_{document_type}", context={"title": title, "doc_type": document_type, "content_length": len(content)}, result={"score": score, "risks_count": len(risks)}, ) return report def _check_completeness(self, content: str, checklist: Dict[str, Any]) -> Dict[str, Any]: """检查文档完整性。""" results = {} for section_id, section in checklist.items(): checked = [] for item in section["checks"]: present = self._fuzzy_contains(content, item) checked.append({"item": item, "present": present}) passed = sum(1 for c in checked if c["present"]) total = len(checked) results[section_id] = { "label": section["label"], "passed": passed, "total": total, "percentage": round(passed / total * 100, 1) if total > 0 else 0, "details": checked, } return results def _fuzzy_contains(self, content: str, query: str) -> bool: """模糊检查内容是否包含关键信息。""" content_lower = content.lower() parts = re.split(r'[,,、\s]+', query) for part in parts: if len(part) >= 2 and part.lower() in content_lower: return True if len(query) >= 2 and query.lower() in content_lower: return True return False def _scan_risks(self, content: str) -> List[Dict[str, Any]]: """扫描风险条款。""" risks = [] for severity, patterns in RISK_PATTERNS.items(): for pattern, label in patterns: matches = re.findall(pattern, content) if matches: risks.append({ "severity": severity, "label": label, "pattern": pattern, "match_count": len(matches), "snippet": self._extract_snippet(content, pattern), }) risks.sort(key=lambda r: {"high": 0, "medium": 1, "low": 2}[r["severity"]]) return risks def _extract_snippet(self, content: str, pattern: str) -> str: """提取匹配行的上下文片段。""" for line in content.split("\n"): if re.search(pattern, line): return line.strip()[:150] return "" def _check_format(self, content: str, doc_type: str) -> List[Dict[str, Any]]: """检查格式问题。""" issues = [] if doc_type == "contract": if not re.search(r'[((]\s*[一二三1-3]\s*[))]', content): issues.append({"type": "structure", "message": "未检测到分条结构,合同建议使用条/款/项结构"}) if len(content) < 500: issues.append({"type": "length", "message": f"文档仅 {len(content)} 字符,合同内容可能不完整"}) if doc_type == "technical_doc": if not re.search(r'#+\s|第[一二三1-3]章|第[一二三1-3]节', content): issues.append({"type": "structure", "message": "未检测到章节标题,建议添加层次化标题"}) if doc_type == "email": if len(content) > 5000: issues.append({"type": "length", "message": "邮件过长,建议精简核心内容"}) has_line_breaks = len(re.findall(r'\n\s*\n', content)) if has_line_breaks == 0 and len(content) > 1000: issues.append({"type": "readability", "message": "大段文字缺少分段,建议增加段落间距"}) return issues def _generate_suggestions(self, completeness: Dict, risks: List, format_issues: List) -> List[str]: """汇总改善建议。""" suggestions = [] for section_id, result in completeness.items(): if result["percentage"] < 50: suggestions.append( f"[{result['label']}] 完整性仅 {result['percentage']}%,建议补充缺少的要素" ) elif result["percentage"] < 80: missing = [d["item"] for d in result.get("details", []) if not d["present"]] suggestions.append( f"[{result['label']}] 缺少: {', '.join(missing[:3])}" ) high_risks = [r for r in risks if r["severity"] == "high"] if high_risks: suggestions.append(f"发现 {len(high_risks)} 个高风险条款: " f"{', '.join(r['label'] for r in high_risks)}") for issue in format_issues: suggestions.append(f"[格式] {issue['message']}") return suggestions def _calculate_score(self, completeness: Dict, risks: List, format_issues: List) -> float: """计算综合评分 (0-100)。""" # 完整性权重 40% if completeness: avg = sum(r["percentage"] for r in completeness.values()) / len(completeness) else: avg = 0 completeness_score = avg * 0.4 # 风险权重 40% risk_penalty = sum({"high": 20, "medium": 10, "low": 5}[r["severity"]] for r in risks) risk_score = max(0, 40 - risk_penalty) # 格式权重 20% format_score = max(0, 20 - len(format_issues) * 5) return round(completeness_score + risk_score + format_score, 1) def _grade(self, score: float) -> str: if score >= 90: return "A — 优秀" elif score >= 75: return "B — 良好" elif score >= 60: return "C — 需改进" elif score >= 40: return "D — 风险较高" else: return "F — 不建议签署/发布" document_review_agent = DocumentReviewAgent()