"""Token budget manager: tracks the token cost of every LLM call and provides warnings and limits.

Modelled on Claude Code:
- src/utils/tokenBudget.ts — budget tracking and automatic continuation
- src/utils/tokenUsageTracker.ts — cumulative usage tracking
- the token usage bar of the UI StatusLine

Core concepts:
- context_window: size of the model context window (e.g. 128K)
- output_reserve: room kept for the model's output (default 8K); only
  (window - reserve) is available to the input
- three alert levels: warning / critical / exhausted
- the user may set a target budget (e.g. +500k) and continue automatically
  once it is reached
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import Any, Dict, Optional

from app.core.token_counter import TokenCounter, get_model_context_window

logger = logging.getLogger(__name__)


# ────────────── Configuration ──────────────

@dataclass
class TokenBudgetConfig:
    """Settings that drive token budget tracking.

    The three ``*_pct`` thresholds are fractions of :attr:`effective_window`
    (context window minus output reserve), not of the raw context window.
    """

    # Master switch.
    enabled: bool = True

    # Window sizing.
    context_window: int = 128_000  # model context window in tokens; 0 = auto-detect
    output_reserve: int = 8_192  # tokens kept free for the model's output

    # Alert thresholds, expressed as a fraction of the effective window.
    warning_threshold_pct: float = 0.75  # 75%: start warning
    compact_threshold_pct: float = 0.85  # 85%: trigger automatic compaction
    hard_limit_pct: float = 0.95  # 95%: compaction required before the next call

    # User budget target.
    user_budget: Optional[int] = None  # cumulative token target (e.g. 500_000)
    auto_continue: bool = False  # continue automatically once the budget is reached

    # Compaction coordination.
    compaction_after_warning: bool = True  # allow compaction to be triggered automatically
    max_compaction_attempts: int = 3  # compaction attempts allowed within one turn

    def _tokens_at(self, fraction: float) -> int:
        """Return ``fraction`` of the effective window, truncated to whole tokens."""
        return int(self.effective_window * fraction)

    @property
    def effective_window(self) -> int:
        """Usable input window: context window minus output reserve, never negative."""
        usable = self.context_window - self.output_reserve
        return usable if usable > 0 else 0

    @property
    def warning_at(self) -> int:
        """Token count at which the warning level starts."""
        return self._tokens_at(self.warning_threshold_pct)

    @property
    def compact_at(self) -> int:
        """Token count at which automatic compaction is triggered."""
        return self._tokens_at(self.compact_threshold_pct)

    @property
    def hard_limit_at(self) -> int:
        """Token count of the hard limit (callers are expected not to call the LLM beyond it)."""
        return self._tokens_at(self.hard_limit_pct)


# ────────────── Snapshot ──────────────

@dataclass
class TokenSnapshot:
    """Token figures recorded for a single LLM call."""

    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0
    iteration: int = 0
    step_type: str = ""  # think / final
    model: str = ""
# ────────────── Budget tracker ──────────────

class TokenBudget:
    """Session-level token budget tracker.

    Tracks:
    - the token count of the current message list (input side)
    - cumulative LLM consumption (prompt + completion)
    - progress towards the optional user budget
    - warning / compaction / hard-limit state

    Usage::

        budget = TokenBudget(TokenBudgetConfig(context_window=128000))
        budget.update_input_estimate(counter.count_messages(messages))
        if budget.needs_compaction:
            ...  # trigger compaction
        budget.record_llm_call(prompt_tokens=5000, completion_tokens=800)
        print(budget.status_line)  # e.g. "95.0k/120k (79%) [WARNING]"

    NOTE(review): ``config.enabled`` is not consulted anywhere in this class;
    presumably callers check it before using the tracker — confirm.
    """

    # Upper bound on retained per-call snapshots; older entries are discarded.
    _MAX_SNAPSHOTS = 50

    def __init__(
        self,
        config: Optional[TokenBudgetConfig] = None,
        model: str = "deepseek-v4-flash",
        token_counter: Optional[TokenCounter] = None,
    ):
        """Create a tracker.

        Args:
            config: Budget settings; a default ``TokenBudgetConfig`` is used
                when omitted.
            model: Model name. Used to build the default counter, to detect
                the context window, and stamped on every snapshot.
            token_counter: Counter used by :meth:`update_from_counter`; a
                ``TokenCounter(model=model)`` is created when omitted.
        """
        import copy  # function-scope import: only needed for the config copy below

        config = config or TokenBudgetConfig()
        # Auto-detect the context window when it is unset (<= 0). Work on a
        # shallow copy so a config object shared between several trackers is
        # not silently rewritten with the first model's window.
        if config.context_window <= 0:
            config = copy.copy(config)
            config.context_window = get_model_context_window(model)

        self.config = config
        self.model = model
        self.counter = token_counter or TokenCounter(model=model)

        # ── Counters ──
        self._input_tokens_estimate: int = 0  # estimate for the current input message list
        self._cumulative_prompt_tokens: int = 0  # cumulative prompt tokens (retries included)
        self._cumulative_completion_tokens: int = 0  # cumulative completion tokens
        self._llm_call_count: int = 0
        self._compaction_attempts_this_turn: int = 0

        # ── Recent call snapshots (at most _MAX_SNAPSHOTS) ──
        self._snapshots: list[TokenSnapshot] = []

    # ──────── Properties ────────

    @property
    def input_tokens(self) -> int:
        """Estimated token count of the current input message list."""
        return self._input_tokens_estimate

    @property
    def cumulative_total(self) -> int:
        """Cumulative tokens consumed (prompt + completion)."""
        return self._cumulative_prompt_tokens + self._cumulative_completion_tokens

    @property
    def cumulative_prompt(self) -> int:
        """Cumulative prompt tokens over all recorded calls."""
        return self._cumulative_prompt_tokens

    @property
    def cumulative_completion(self) -> int:
        """Cumulative completion tokens over all recorded calls."""
        return self._cumulative_completion_tokens

    @property
    def llm_call_count(self) -> int:
        """Number of calls recorded through :meth:`record_llm_call`."""
        return self._llm_call_count

    @property
    def input_usage_pct(self) -> float:
        """Input estimate as a fraction of the effective window (0.0 if the window is 0)."""
        ew = self.config.effective_window
        return self._input_tokens_estimate / ew if ew > 0 else 0.0

    @property
    def input_remaining(self) -> int:
        """Tokens still available on the input side, never negative."""
        return max(0, self.config.effective_window - self._input_tokens_estimate)

    @property
    def user_budget_used(self) -> int:
        """Tokens counted against the user budget (the cumulative total)."""
        return self.cumulative_total

    @property
    def user_budget_remaining(self) -> Optional[int]:
        """Tokens left in the user budget, or None when no budget is set."""
        if self.config.user_budget is None:
            return None
        return max(0, self.config.user_budget - self.cumulative_total)

    @property
    def user_budget_pct(self) -> Optional[float]:
        """Consumed fraction of the user budget; None when unset or non-positive."""
        if self.config.user_budget is None or self.config.user_budget <= 0:
            return None
        return self.cumulative_total / self.config.user_budget

    # ──────── State checks ────────

    @property
    def is_warning(self) -> bool:
        """Whether the input estimate has reached the warning threshold."""
        return self._input_tokens_estimate >= self.config.warning_at

    @property
    def is_critical(self) -> bool:
        """Whether the input estimate has reached the compaction threshold."""
        return self._input_tokens_estimate >= self.config.compact_at

    @property
    def is_exhausted(self) -> bool:
        """Whether the input estimate has reached the hard limit."""
        return self._input_tokens_estimate >= self.config.hard_limit_at

    @property
    def needs_compaction(self) -> bool:
        """Whether compaction should be triggered now.

        False when automatic compaction is disabled or when the per-turn
        attempt limit has been reached (circuit breaker); otherwise mirrors
        :attr:`is_critical`.
        """
        if not self.config.compaction_after_warning:
            return False
        if self._compaction_attempts_this_turn >= self.config.max_compaction_attempts:
            return False  # circuit breaker
        return self.is_critical

    @property
    def compaction_attempts(self) -> int:
        """Compaction attempts recorded since the last reset."""
        return self._compaction_attempts_this_turn

    @property
    def is_user_budget_exhausted(self) -> bool:
        """Whether a user budget is set and nothing of it remains."""
        rem = self.user_budget_remaining
        return rem is not None and rem <= 0

    # ──────── Update methods ────────

    def update_input_estimate(self, tokens: int) -> None:
        """Set the token estimate of the current input message list.

        Meant to be called whenever the message list changes. Negative values
        are treated as 0 so usage figures never go below zero.
        """
        tokens = max(0, tokens)
        self._input_tokens_estimate = tokens
        logger.debug(
            "TokenBudget: input=%d tokens (%.1f%% of %d, compact_at=%d)",
            tokens,
            self.input_usage_pct * 100,
            self.config.effective_window,
            self.config.compact_at,
        )

    def update_from_counter(self, messages: list) -> int:
        """Count ``messages`` with the configured counter, store and return the estimate."""
        tokens = self.counter.count_messages(messages)
        self.update_input_estimate(tokens)
        return tokens

    def record_llm_call(
        self,
        prompt_tokens: int = 0,
        completion_tokens: int = 0,
        iteration: int = 0,
        step_type: str = "think",
    ) -> TokenSnapshot:
        """Record one LLM call and return its snapshot.

        Args:
            prompt_tokens: Actual prompt tokens reported by the API. When
                missing or not positive, the current input estimate is used.
            completion_tokens: Completion tokens reported by the API; missing
                or negative values count as 0.
            iteration: Caller-supplied iteration index stored on the snapshot.
            step_type: Caller-supplied step label stored on the snapshot.
        """
        if not prompt_tokens or prompt_tokens <= 0:
            prompt_tokens = self._input_tokens_estimate
        # A negative count would silently shrink the cumulative totals.
        completion_tokens = max(0, completion_tokens or 0)

        self._cumulative_prompt_tokens += prompt_tokens
        self._cumulative_completion_tokens += completion_tokens
        self._llm_call_count += 1

        snap = TokenSnapshot(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
            iteration=iteration,
            step_type=step_type,
            model=self.model,
        )
        self._snapshots.append(snap)
        # Keep only the most recent _MAX_SNAPSHOTS entries.
        overflow = len(self._snapshots) - self._MAX_SNAPSHOTS
        if overflow > 0:
            del self._snapshots[:overflow]

        # Guarded so the derived figures are only computed when they are logged.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(
                "TokenBudget: call #%d prompt=%d comp=%d total=%d cumulative=%d (%.1f%%)",
                self._llm_call_count,
                prompt_tokens,
                completion_tokens,
                prompt_tokens + completion_tokens,
                self.cumulative_total,
                self.input_usage_pct * 100,
            )
        return snap

    def record_compaction_attempt(self) -> None:
        """Count one compaction attempt (feeds the circuit breaker)."""
        self._compaction_attempts_this_turn += 1

    def reset_compaction_attempts(self) -> None:
        """Reset the compaction attempt counter (call when a new turn starts)."""
        self._compaction_attempts_this_turn = 0

    # ──────── Summary / display ────────

    @property
    def status_line(self) -> str:
        """One-line status summary for logs or UI.

        Format: ``<input>k/<effective window>k (<pct>%)``, followed by the
        highest active alert tag and, when a user budget is set, the budget
        progress.
        """
        pct = self.input_usage_pct * 100
        parts = [
            f"{self._input_tokens_estimate/1000:.1f}k/{self.config.effective_window/1000:.0f}k ({pct:.0f}%)"
        ]
        if self.is_exhausted:
            parts.append("[EXHAUSTED]")
        elif self.is_critical:
            parts.append("[CRITICAL]")
        elif self.is_warning:
            parts.append("[WARNING]")
        if self.config.user_budget:
            parts.append(
                f"| budget: {self.cumulative_total/1000:.1f}k/{self.config.user_budget/1000:.0f}k"
            )
        return " ".join(parts)

    def summary(self) -> Dict[str, Any]:
        """Return a dict describing the budget state, suitable for an API response.

        The ``user_budget*`` keys are present only when a user budget is set.
        ``user_budget_pct`` is None only when the percentage is undefined
        (non-positive budget); 0% usage is reported as ``0.0``.
        """
        result: Dict[str, Any] = {
            "input_tokens": self._input_tokens_estimate,
            "input_remaining": self.input_remaining,
            "input_usage_pct": round(self.input_usage_pct, 4),
            "effective_window": self.config.effective_window,
            "context_window": self.config.context_window,
            "cumulative_total": self.cumulative_total,
            "cumulative_prompt": self._cumulative_prompt_tokens,
            "cumulative_completion": self._cumulative_completion_tokens,
            "llm_call_count": self._llm_call_count,
            "is_warning": self.is_warning,
            "is_critical": self.is_critical,
            "is_exhausted": self.is_exhausted,
            "compaction_attempts": self._compaction_attempts_this_turn,
        }
        if self.config.user_budget is not None:
            budget_pct = self.user_budget_pct
            result["user_budget"] = self.config.user_budget
            result["user_budget_used"] = self.user_budget_used
            result["user_budget_remaining"] = self.user_budget_remaining
            # Compare against None: a legitimate 0.0 must not collapse to None.
            result["user_budget_pct"] = round(budget_pct, 4) if budget_pct is not None else None
        return result

    def needs_user_budget_continue(self) -> bool:
        """Whether the user budget is used up and auto-continue is configured."""
        return self.is_user_budget_exhausted and self.config.auto_continue


# ────────────── Convenience factory ──────────────

def create_token_budget(
    model: str = "deepseek-v4-flash",
    context_window: int = 0,
    user_budget: Optional[int] = None,
    enabled: bool = True,
) -> TokenBudget:
    """Build a TokenBudget with a ready-made config (fits most use cases).

    A falsy ``context_window`` (the default 0) is replaced by the value of
    ``get_model_context_window(model)``.
    """
    config = TokenBudgetConfig(
        enabled=enabled,
        context_window=context_window or get_model_context_window(model),
        user_budget=user_budget,
    )
    return TokenBudget(config=config, model=model)