""" 对话自动压缩引擎 — 三级压缩体系 参考 Claude Code: - src/services/compact/microCompact.ts — Tier 1: 工具结果打桩 - src/services/compact/compact.ts — Tier 2: LLM 摘要替换 - src/services/compact/reactiveCompact.ts — Tier 3: 错误触发压缩 - src/services/compact/grouping.ts — 安全分割点识别 """ from __future__ import annotations import logging import time from enum import Enum from typing import Any, Dict, List, Optional, Tuple from app.core.token_counter import ( TokenCounter, get_model_context_window, is_context_length_error, ) from app.core.compaction_config import CompactionConfig logger = logging.getLogger(__name__) # ──────────────────────────── 数据结构 ──────────────────────────── class CompactionStrategy(str, Enum): NONE = "none" MICRO = "micro" # Tier 1: 工具结果打桩 FULL = "full" # Tier 2: LLM 摘要替换 REACTIVE = "reactive" # Tier 3: 错误触发 class CompactionResult: """压缩操作结果。""" def __init__( self, messages: List[Dict[str, Any]], strategy: CompactionStrategy, tokens_before: int, tokens_after: int, details: Optional[str] = None, ): self.messages = messages self.strategy = strategy self.tokens_before = tokens_before self.tokens_after = tokens_after self.tokens_saved = tokens_before - tokens_after self.details = details self.timestamp = time.time() def __repr__(self) -> str: return ( f"CompactionResult(strategy={self.strategy.value}, " f"saved={self.tokens_saved} tokens, " f"before={self.tokens_before} after={self.tokens_after})" ) # ──────────────────────────── 压缩摘要提示词 ──────────────────────────── COMPACT_SUMMARY_SYSTEM = """你是一个对话摘要专家。你需要将一段 AI 助手与用户的对话历史压缩为简洁的摘要。 规则: - 保留关键事实和决策(文件路径、数值、结论、用户偏好) - 保留未完成的任务或待处理事项 - 忽略纯粹的问候、闲聊和中间工具调用细节 - 用第三人称中文描述 - 控制在你被要求的字数范围内 返回格式:直接返回摘要文本,不要加任何前缀或标记。""" def _build_compact_user_prompt(older_messages: List[Dict[str, Any]], max_chars: int = 2000) -> str: """从旧消息中构建压缩提示词。""" parts = [] total_chars = 0 for msg in older_messages: role = msg.get("role", "?") content = msg.get("content", "") or "" # 截断长内容 if len(content) > 500: content = content[:500] + "..." line = f"[{role}]: {content}" if total_chars + len(line) > max_chars: parts.append("...(更早的消息已省略)") break parts.append(line) total_chars += len(line) return "\n".join(parts) # ──────────────────────────── CompactionEngine ──────────────────────────── class CompactionEngine: """三级对话压缩引擎。 与 Claude Code 一样,在每轮 LLM 调用前检查 token 用量: - >70% → MicroCompact(旧工具结果打桩) - >85% → FullCompact(LLM 摘要替换) - >95% → 等 API 报错后 ReactiveCompact """ def __init__( self, config: CompactionConfig, token_counter: Optional[TokenCounter] = None, model: str = "deepseek-v4-flash", ): self.config = config self.token_counter = token_counter or TokenCounter(model=model) self.model = model # 熔断状态 self._consecutive_failures = 0 self._last_compact_time: float = 0 self._compact_count = 0 # ──────────────────── 入口 ──────────────────── async def maybe_compact( self, messages: List[Dict[str, Any]], context_window: Optional[int] = None, ) -> CompactionResult: """根据当前 token 用量决定是否压缩,返回(可能压缩后的)消息列表。 Args: messages: 当前消息列表 (含 system prompt) context_window: 模型上下文窗口大小(None=自动检测) Returns: CompactionResult,包含(可能压缩后的)消息列表 """ if not self.config.enabled: return CompactionResult( messages, CompactionStrategy.NONE, tokens_before=0, tokens_after=0, details="压缩已禁用", ) # 确定上下文窗口 if context_window is None: context_window = get_model_context_window(self.model) if self.config.context_window_override > 0: context_window = self.config.context_window_override # 有效窗口 = 模型窗口 - 输出余量 effective_window = context_window - self.config.output_reserve_tokens # 计算当前 token 数 tokens_before = self.token_counter.count_messages(messages) usage_ratio = tokens_before / effective_window if effective_window > 0 else 0 # ── 决策 ── # Tier 1: MicroCompact if ( self.config.micro_compact_enabled and usage_ratio >= self.config.micro_compact_threshold and usage_ratio < self.config.full_compact_threshold ): return await self._micro_compact(messages, tokens_before) # Tier 2: FullCompact if ( self.config.full_compact_enabled and usage_ratio >= self.config.full_compact_threshold ): return await self._full_compact(messages, tokens_before) # 不需要压缩 return CompactionResult( messages, CompactionStrategy.NONE, tokens_before=tokens_before, tokens_after=tokens_before, details=f"usage={usage_ratio:.1%} < threshold={self.config.micro_compact_threshold:.0%}", ) async def reactive_compact( self, messages: List[Dict[str, Any]], error: Exception, context_window: Optional[int] = None, ) -> CompactionResult: """响应 API 上下文超限错误的紧急压缩(Tier 3)。 Args: messages: 当前消息列表 error: 触发的异常 context_window: 上下文窗口大小 Returns: CompactionResult """ if not self.config.reactive_compact_enabled: raise error if context_window is None: context_window = get_model_context_window(self.model) tokens_before = self.token_counter.count_messages(messages) logger.warning( "ReactiveCompact 触发: %s, tokens=%d, window=%d", str(error)[:100], tokens_before, context_window, ) return await self._full_compact(messages, tokens_before, is_reactive=True) # ──────────────────── Tier 1: MicroCompact ──────────────────── async def _micro_compact( self, messages: List[Dict[str, Any]], tokens_before: int ) -> CompactionResult: """MicroCompact: 将旧工具结果替换为桩标记。 核心逻辑(参考 Claude Code microCompact.ts): 1. 找到所有 "可压缩" 工具类型的结果消息 2. 保留最近 N 轮的工具结果不动 3. 更早的工具结果 → 替换 content 为 "[Tool result compacted]" 4. 保护 assistant(tool_calls) 消息 — 它们包含推理链 5. 保护破坏性工具结果(write/edit/deploy 等) """ try: compactable = set(self.config.compactable_tools) protected = set(self.config.protected_tools) # ── 第 1 步: 识别各消息角色 ── # 从后往前数 user 消息来确定"轮次" user_indices = [] for i, msg in enumerate(messages): if msg.get("role") == "user": user_indices.append(i) if len(user_indices) <= self.config.min_preserve_messages // 2: # 对话轮次太少,不需要压缩 return CompactionResult( messages, CompactionStrategy.MICRO, tokens_before=tokens_before, tokens_after=tokens_before, details="对话轮次不足,跳过 MicroCompact", ) # 找到"保护线":倒数第 compact_older_than_rounds 个 user 消息的位置 preserve_idx = max(0, len(user_indices) - self.config.compact_older_than_rounds) compact_before = user_indices[preserve_idx] if preserve_idx < len(user_indices) else 0 # ── 第 2 步: 识别 tool_call → tool_result 配对 ── # 收集 assistant(tool_calls) 的 tool_call_id 集合 active_tool_ids = set() for msg in messages: if msg.get("role") == "assistant" and msg.get("tool_calls"): for tc in msg["tool_calls"]: tc_id = tc.get("id") or tc.get("tool_call_id") if tc_id: active_tool_ids.add(tc_id) # ── 第 3 步: 在保护线之前压缩可压缩工具结果 ── stubbed_count = 0 result = [] for i, msg in enumerate(messages): if i >= compact_before: # 在保护线之后,保留原样 result.append(msg) continue role = msg.get("role", "") tool_name = msg.get("name", "") if role == "tool" and tool_name in compactable and tool_name not in protected: # 检查是否有对应的 assistant(tool_calls) 也早于保护线 tc_id = msg.get("tool_call_id", "") result.append({ "role": "tool", "tool_call_id": tc_id or "compacted", "content": "[Tool result compacted]", "name": tool_name, }) stubbed_count += 1 else: result.append(msg) if stubbed_count == 0: return CompactionResult( result, CompactionStrategy.MICRO, tokens_before=tokens_before, tokens_after=self.token_counter.count_messages(result), details="没有可压缩的旧工具结果", ) tokens_after = self.token_counter.count_messages(result) logger.info( "MicroCompact: %d 条工具结果打桩, %d → %d tokens (节省 %d)", stubbed_count, tokens_before, tokens_after, tokens_before - tokens_after, ) return CompactionResult( result, CompactionStrategy.MICRO, tokens_before=tokens_before, tokens_after=tokens_after, details=f"{stubbed_count} 条工具结果已压缩", ) except Exception as e: self._consecutive_failures += 1 logger.error("MicroCompact 失败 (%d/%d): %s", self._consecutive_failures, self.config.max_consecutive_failures, e) if self._consecutive_failures >= self.config.max_consecutive_failures: logger.warning("MicroCompact 熔断!跳过本次压缩") return CompactionResult( messages, CompactionStrategy.MICRO, tokens_before=tokens_before, tokens_after=tokens_before, details=f"熔断 ({self._consecutive_failures}次连续失败)", ) return CompactionResult( messages, CompactionStrategy.MICRO, tokens_before=tokens_before, tokens_after=tokens_before, details=f"失败: {e}", ) # ──────────────────── Tier 2: FullCompact ──────────────────── async def _full_compact( self, messages: List[Dict[str, Any]], tokens_before: int, is_reactive: bool = False, llm_client=None, # 可选:外部传入 LLM 客户端 ) -> CompactionResult: """FullCompact: 用 LLM 将旧对话压缩为摘要消息。 核心逻辑(参考 Claude Code compact.ts): 1. 保留 system 消息 + 最近 N 条消息 2. 中间部分 → 调用轻量 LLM 生成摘要 3. 将摘要作为 compact_boundary 消息插入 4. 熔断保护:连续失败 N 次后放弃 """ try: preserve_count = self.config.min_preserve_messages if len(messages) <= preserve_count + 4: return CompactionResult( messages, CompactionStrategy.FULL, tokens_before=tokens_before, tokens_after=tokens_before, details="消息数不足,跳过 FullCompact", ) # ── 分离各段 ── # 找到 system 消息 system_msgs = [m for m in messages if m.get("role") == "system"] non_system = [m for m in messages if m.get("role") != "system"] middle_start = 0 # 跳过最前面的几条 system 后的过渡消息(通常是首次问候等) # 保留至少 preserve_count 条在末尾 if len(non_system) > preserve_count + 4: middle_end = len(non_system) - preserve_count # 只压缩中间部分:跳过前 2 条(通常是 system 后的首次交互)到倒数 preserve_count 条之间 middle_start = max(2, 0) older = non_system[middle_start:middle_end] recent = non_system[middle_end:] else: # 消息太少,只保留最近的 older = non_system[:-preserve_count] if len(non_system) > preserve_count else [] recent = non_system[-preserve_count:] if len(non_system) >= preserve_count else non_system if len(older) < 3: return CompactionResult( messages, CompactionStrategy.FULL, tokens_before=tokens_before, tokens_after=tokens_before, details="旧消息不足以压缩", ) # ── 调用 LLM 生成摘要 ── summary_text = await self._generate_summary(older, llm_client) # ── 组装结果: system + compact_boundary + recent ── compact_boundary = { "role": "user", "content": ( f"[对话上下文摘要 — 之前的关键信息]\n\n{summary_text}\n\n" f"[以上为自动生成的对话摘要,共压缩 {len(older)} 条消息。" f"以下是最近的对话延续]" ), } new_messages = system_msgs + [compact_boundary] + recent tokens_after = self.token_counter.count_messages(new_messages) strategy = CompactionStrategy.REACTIVE if is_reactive else CompactionStrategy.FULL logger.info( "FullCompact (%s): %d 条消息→摘要 (%d 字), %d → %d tokens (节省 %d)", "被动" if is_reactive else "主动", len(older), len(summary_text), tokens_before, tokens_after, tokens_before - tokens_after, ) self._consecutive_failures = 0 # 成功后重置 self._last_compact_time = time.time() self._compact_count += 1 return CompactionResult( new_messages, strategy, tokens_before=tokens_before, tokens_after=tokens_after, details=f"压缩 {len(older)} 条→{len(summary_text)} 字摘要", ) except Exception as e: self._consecutive_failures += 1 logger.error("FullCompact 失败 (%d/%d): %s", self._consecutive_failures, self.config.max_consecutive_failures, e) if self._consecutive_failures >= self.config.max_consecutive_failures: logger.warning("FullCompact 熔断!返回原始消息") return CompactionResult( messages, CompactionStrategy.FULL, tokens_before=tokens_before, tokens_after=tokens_before, details=f"熔断 ({self._consecutive_failures}次连续失败)", ) return CompactionResult( messages, CompactionStrategy.FULL, tokens_before=tokens_before, tokens_after=tokens_before, details=f"失败: {e}", ) async def _generate_summary( self, older_messages: List[Dict[str, Any]], llm_client=None, ) -> str: """调用轻量 LLM 生成对话摘要。""" # 构建提示词 user_content = _build_compact_user_prompt( older_messages, max_chars=3000, ) if llm_client is not None: # 使用外部传入的 LLM 客户端 from app.agent_runtime.core import _LLMClient from app.agent_runtime.schemas import AgentLLMConfig if not isinstance(llm_client, _LLMClient): # 创建临时客户端 summary_config = AgentLLMConfig( provider="deepseek", model=self.config.summary_model, temperature=self.config.summary_temperature, max_tokens=self.config.summary_max_tokens, request_timeout=30.0, ) llm_client = _LLMClient(summary_config) messages = [ {"role": "system", "content": COMPACT_SUMMARY_SYSTEM}, {"role": "user", "content": f"请将以下对话历史压缩为不超过{self.config.summary_max_tokens // 2}字的摘要:\n\n{user_content}"}, ] response = await llm_client.chat(messages=messages, tools=None, iteration=-1) content = getattr(response, 'content', '') or ( response.get('content', '') if isinstance(response, dict) else "" ) return content.strip() or self._fallback_summary(older_messages) else: # 无 LLM 客户端,使用 fallback return self._fallback_summary(older_messages) @staticmethod def _fallback_summary(older_messages: List[Dict[str, Any]]) -> str: """无 LLM 时的降级摘要(提取关键信息)。""" topics = set() for msg in older_messages: if msg.get("role") == "user": content = msg.get("content", "") if len(content) > 60: content = content[:60] + "..." if content: topics.add(content) if not topics: return "此段对话为助手与用户的交互。" topic_list = ";".join(list(topics)[:10]) return f"对话涉及以下话题: {topic_list}" # ──────────────────────────── 工厂函数 ──────────────────────────── def create_compaction_engine( config: Optional[CompactionConfig] = None, model: str = "deepseek-v4-flash", ) -> CompactionEngine: """创建 CompactionEngine 实例的便捷工厂。""" if config is None: config = CompactionConfig() return CompactionEngine(config=config, model=model)