- Fix delete agent 500: clean up FK records (agent_llm_logs, permissions, schedules, executions, team_members) and unbind goals/tasks before delete - Remove hardcoded personality templates in Android, replace with dynamic system prompt generation from name + description - Set promptSectionsEnabled=false to bypass PromptComposer for personality - Add Tencent Cloud Linux deployment guide (Docker Compose) - Accumulated backend service updates, frontend UI fixes, Android app changes Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
359 lines
13 KiB
Python
359 lines
13 KiB
Python
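"""
Token budget manager — tracks the token consumption of every LLM call and
provides warnings and limit control.

Modeled on Claude Code:
- src/utils/tokenBudget.ts — budget tracking and auto-continue
- src/utils/tokenUsageTracker.ts — cumulative usage tracking
- the token usage bar in the UI StatusLine

Core concepts:
- context_window: size of the model's context window (e.g. 128K)
- output_reserve: space reserved for the model's output (default 8K); only
  (window - reserve) is available to the input
- warning/critical/exhausted: three alert levels
- the user can set a target budget (e.g. +500k); once it is reached the run
  can continue automatically
"""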
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
from dataclasses import dataclass, field
|
||
from typing import Any, Dict, Optional
|
||
|
||
from app.core.token_counter import TokenCounter, get_model_context_window
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
# ────────────── 配置 ──────────────
|
||
|
||
|
||
@dataclass
class TokenBudgetConfig:
    """Configuration for token budgeting.

    All ``*_pct`` thresholds are fractions (0.75 means 75%) of
    ``effective_window``, i.e. of the context window minus the output reserve.
    """

    # -- Master switch --
    enabled: bool = True

    # -- Window configuration --
    context_window: int = 128_000  # model context window in tokens; 0 = auto-detect
    output_reserve: int = 8_192  # space reserved for the model's output

    # -- Alert thresholds (fraction of the effective window) --
    warning_threshold_pct: float = 0.75  # 75% -> start warning
    compact_threshold_pct: float = 0.85  # 85% -> trigger automatic compaction
    hard_limit_pct: float = 0.95  # 95% -> must compact before the next call

    # -- User budget target --
    user_budget: Optional[int] = None  # cumulative token target (e.g. 500_000)
    auto_continue: bool = False  # continue automatically once the budget is reached

    # -- Compaction coordination --
    compaction_after_warning: bool = True  # whether compaction may be triggered automatically
    max_compaction_attempts: int = 3  # maximum compaction attempts per turn

    @property
    def effective_window(self) -> int:
        """Usable input window: context window minus output reserve, never negative."""
        return max(0, self.context_window - self.output_reserve)

    @property
    def warning_at(self) -> int:
        """Token count at which the warning level starts (truncated toward zero)."""
        return int(self.effective_window * self.warning_threshold_pct)

    @property
    def compact_at(self) -> int:
        """Token count at which automatic compaction is triggered (truncated toward zero)."""
        return int(self.effective_window * self.compact_threshold_pct)

    @property
    def hard_limit_at(self) -> int:
        """Token count of the hard limit (truncated toward zero)."""
        return int(self.effective_window * self.hard_limit_pct)
|
||
|
||
|
||
# ────────────── 快照 ──────────────
|
||
|
||
|
||
@dataclass
class TokenSnapshot:
    """Token usage recorded for a single LLM call."""

    prompt_tokens: int = 0  # tokens on the input side of the call
    completion_tokens: int = 0  # tokens produced by the model
    total_tokens: int = 0  # stored by the recorder as prompt + completion
    iteration: int = 0  # caller-supplied iteration index
    step_type: str = ""  # think / final
    model: str = ""  # model name the call was attributed to
|
||
|
||
|
||
# ────────────── 预算追踪器 ──────────────
|
||
|
||
|
||
class TokenBudget:
    """Session-level token budget tracker.

    Tracks:
    - the token count of the current message list (input side)
    - cumulative LLM consumption (input + output)
    - progress toward the user's budget target
    - warning / compaction / limit state

    NOTE(review): ``config.enabled`` is not consulted anywhere in this class;
    presumably callers gate on it themselves — confirm.

    Usage::

        budget = TokenBudget(TokenBudgetConfig(context_window=128000))
        budget.update_input_estimate(counter.count_messages(messages))

        if budget.needs_compaction:
            ...  # trigger compaction

        budget.record_llm_call(prompt_tokens=5000, completion_tokens=800)
        print(budget.status_line)  # e.g. "90.0k/120k (75%) [WARNING]"
    """

    # Number of most recent per-call snapshots kept in memory.
    _MAX_SNAPSHOTS = 50

    def __init__(
        self,
        config: Optional[TokenBudgetConfig] = None,
        model: str = "deepseek-v4-flash",
        token_counter: Optional[TokenCounter] = None,
    ):
        """Create a tracker.

        Args:
            config: Budget configuration; a default ``TokenBudgetConfig`` is
                created when omitted.
            model: Model name, recorded on snapshots and used for context
                window auto-detection.
            token_counter: Counter used by ``update_from_counter``; a
                ``TokenCounter`` for ``model`` is created when omitted.
        """
        # Explicit None checks so a falsy-but-valid injected object is kept.
        self.config = config if config is not None else TokenBudgetConfig()
        self.model = model
        self.counter = (
            token_counter if token_counter is not None else TokenCounter(model=model)
        )

        # Auto-detect the context window. Note this writes the detected value
        # back onto the config object, including one supplied by the caller.
        if self.config.context_window <= 0:
            self.config.context_window = get_model_context_window(model)

        # -- Counters --
        self._input_tokens_estimate: int = 0  # token estimate of the current input message list
        self._cumulative_prompt_tokens: int = 0  # cumulative prompt tokens (retries included)
        self._cumulative_completion_tokens: int = 0  # cumulative completion tokens
        self._llm_call_count: int = 0
        self._compaction_attempts_this_turn: int = 0

        # -- Snapshot history (the most recent _MAX_SNAPSHOTS calls) --
        self._snapshots: list[TokenSnapshot] = []

    # -------- Properties --------

    @property
    def input_tokens(self) -> int:
        """Estimated token count of the current input message list."""
        return self._input_tokens_estimate

    @property
    def cumulative_total(self) -> int:
        """Cumulative tokens consumed (prompt + completion)."""
        return self._cumulative_prompt_tokens + self._cumulative_completion_tokens

    @property
    def cumulative_prompt(self) -> int:
        """Cumulative prompt tokens recorded so far."""
        return self._cumulative_prompt_tokens

    @property
    def cumulative_completion(self) -> int:
        """Cumulative completion tokens recorded so far."""
        return self._cumulative_completion_tokens

    @property
    def llm_call_count(self) -> int:
        """Number of LLM calls recorded so far."""
        return self._llm_call_count

    @property
    def input_usage_pct(self) -> float:
        """Input usage as a fraction of the effective window (1.0 == 100%).

        Returns 0.0 when the effective window is zero.
        """
        ew = self.config.effective_window
        return self._input_tokens_estimate / ew if ew > 0 else 0.0

    @property
    def input_remaining(self) -> int:
        """Tokens still available on the input side, never negative."""
        return max(0, self.config.effective_window - self._input_tokens_estimate)

    @property
    def user_budget_used(self) -> int:
        """Tokens counted against the user budget (same as ``cumulative_total``)."""
        return self.cumulative_total

    @property
    def user_budget_remaining(self) -> Optional[int]:
        """Tokens left in the user budget, never negative; None when no budget is set."""
        if self.config.user_budget is None:
            return None
        return max(0, self.config.user_budget - self.cumulative_total)

    @property
    def user_budget_pct(self) -> Optional[float]:
        """Fraction of the user budget consumed; None when unset or not positive."""
        if self.config.user_budget is None or self.config.user_budget <= 0:
            return None
        return self.cumulative_total / self.config.user_budget

    # -------- State checks --------

    @property
    def is_warning(self) -> bool:
        """Whether the input estimate has reached the warning threshold."""
        return self._input_tokens_estimate >= self.config.warning_at

    @property
    def is_critical(self) -> bool:
        """Whether the input estimate has reached the compaction threshold."""
        return self._input_tokens_estimate >= self.config.compact_at

    @property
    def is_exhausted(self) -> bool:
        """Whether the input estimate has reached the hard limit."""
        return self._input_tokens_estimate >= self.config.hard_limit_at

    @property
    def needs_compaction(self) -> bool:
        """Whether compaction should be triggered now.

        False when automatic compaction is disabled in the config, or when the
        per-turn attempt limit has been reached (circuit breaker); otherwise
        equal to ``is_critical``.
        """
        if not self.config.compaction_after_warning:
            return False
        if self._compaction_attempts_this_turn >= self.config.max_compaction_attempts:
            return False  # circuit breaker
        return self.is_critical

    @property
    def compaction_attempts(self) -> int:
        """Compaction attempts recorded in the current turn."""
        return self._compaction_attempts_this_turn

    @property
    def is_user_budget_exhausted(self) -> bool:
        """Whether a user budget is set and nothing of it remains."""
        rem = self.user_budget_remaining
        return rem is not None and rem <= 0

    # -------- Update methods --------

    def update_input_estimate(self, tokens: int) -> None:
        """Set the token estimate of the current input message list.

        Intended to be called whenever the message list changes.
        """
        self._input_tokens_estimate = tokens
        logger.debug(
            "TokenBudget: input=%d tokens (%.1f%% of %d, compact_at=%d)",
            tokens, self.input_usage_pct * 100,
            self.config.effective_window, self.config.compact_at,
        )

    def update_from_counter(self, messages: list) -> int:
        """Count ``messages`` with the configured counter, store and return the estimate."""
        tokens = self.counter.count_messages(messages)
        self.update_input_estimate(tokens)
        return tokens

    def record_llm_call(
        self,
        prompt_tokens: int = 0,
        completion_tokens: int = 0,
        iteration: int = 0,
        step_type: str = "think",
    ) -> TokenSnapshot:
        """Record one LLM call and return its snapshot.

        Args:
            prompt_tokens: Prefer the actual value returned by the API; when it
                is zero or negative the current input estimate is used instead.
            completion_tokens: Tokens produced by the model for this call.
            iteration: Caller-supplied iteration index stored on the snapshot.
            step_type: Step label stored on the snapshot (e.g. think / final).

        Returns:
            The ``TokenSnapshot`` appended to the history.
        """
        if prompt_tokens <= 0:
            prompt_tokens = self._input_tokens_estimate

        self._cumulative_prompt_tokens += prompt_tokens
        self._cumulative_completion_tokens += completion_tokens
        self._llm_call_count += 1

        snap = TokenSnapshot(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
            iteration=iteration,
            step_type=step_type,
            model=self.model,
        )
        self._snapshots.append(snap)
        # Keep only the most recent _MAX_SNAPSHOTS snapshots.
        if len(self._snapshots) > self._MAX_SNAPSHOTS:
            del self._snapshots[:-self._MAX_SNAPSHOTS]

        # Guarded so the argument expressions are skipped when DEBUG is off.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(
                "TokenBudget: call #%d prompt=%d comp=%d total=%d cumulative=%d (%.1f%%)",
                self._llm_call_count, prompt_tokens, completion_tokens,
                prompt_tokens + completion_tokens,
                self.cumulative_total,
                self.input_usage_pct * 100,
            )

        return snap

    def record_compaction_attempt(self) -> None:
        """Count one compaction attempt (feeds the circuit breaker)."""
        self._compaction_attempts_this_turn += 1

    def reset_compaction_attempts(self) -> None:
        """Reset the compaction attempt counter (call at the start of a new turn)."""
        self._compaction_attempts_this_turn = 0

    # -------- Summary / display --------

    @property
    def status_line(self) -> str:
        """One-line status summary for logs or UI.

        Format: ``<input>k/<window>k (<pct>%)``, then at most one of
        ``[EXHAUSTED]`` / ``[CRITICAL]`` / ``[WARNING]``, then
        ``| budget: <used>k/<budget>k`` when a non-zero user budget is set.
        """
        pct = self.input_usage_pct * 100
        parts = [f"{self._input_tokens_estimate/1000:.1f}k/{self.config.effective_window/1000:.0f}k ({pct:.0f}%)"]

        # Only the most severe level is shown.
        if self.is_exhausted:
            parts.append("[EXHAUSTED]")
        elif self.is_critical:
            parts.append("[CRITICAL]")
        elif self.is_warning:
            parts.append("[WARNING]")

        if self.config.user_budget:
            parts.append(f"| budget: {self.cumulative_total/1000:.1f}k/{self.config.user_budget/1000:.0f}k")

        return " ".join(parts)

    def summary(self) -> Dict[str, Any]:
        """Return a dict summary of the budget state, suitable for an API response.

        The ``user_budget*`` keys are present only when a user budget is set.
        ``user_budget_pct`` is rounded to 4 decimals and is None only when the
        percentage is undefined (budget not positive).
        """
        result: Dict[str, Any] = {
            "input_tokens": self._input_tokens_estimate,
            "input_remaining": self.input_remaining,
            "input_usage_pct": round(self.input_usage_pct, 4),
            "effective_window": self.config.effective_window,
            "context_window": self.config.context_window,
            "cumulative_total": self.cumulative_total,
            "cumulative_prompt": self._cumulative_prompt_tokens,
            "cumulative_completion": self._cumulative_completion_tokens,
            "llm_call_count": self._llm_call_count,
            "is_warning": self.is_warning,
            "is_critical": self.is_critical,
            "is_exhausted": self.is_exhausted,
            "compaction_attempts": self._compaction_attempts_this_turn,
        }
        if self.config.user_budget is not None:
            budget_pct = self.user_budget_pct
            result["user_budget"] = self.config.user_budget
            result["user_budget_used"] = self.user_budget_used
            result["user_budget_remaining"] = self.user_budget_remaining
            # Compare against None: 0.0 (nothing spent yet) is a valid value
            # and must not be reported as "undefined".
            result["user_budget_pct"] = (
                round(budget_pct, 4) if budget_pct is not None else None
            )
        return result

    def needs_user_budget_continue(self) -> bool:
        """Whether the user budget is exhausted and auto-continue is configured."""
        return self.is_user_budget_exhausted and self.config.auto_continue
|
||
|
||
|
||
# ────────────── 便捷工厂 ──────────────
|
||
|
||
|
||
def create_token_budget(
    model: str = "deepseek-v4-flash",
    context_window: int = 0,
    user_budget: Optional[int] = None,
    enabled: bool = True,
) -> TokenBudget:
    """Create a pre-configured ``TokenBudget`` (suitable for most cases).

    Args:
        model: Model name passed to the tracker and used for auto-detection.
        context_window: Context window in tokens; a value of zero or less
            means "look it up for ``model``".
        user_budget: Optional cumulative token target.
        enabled: Value stored in the config's master switch.

    Returns:
        A ``TokenBudget`` built from the resulting config.
    """
    # Same "<= 0 means auto-detect" rule that TokenBudget.__init__ applies.
    if context_window > 0:
        resolved_window = context_window
    else:
        resolved_window = get_model_context_window(model)

    config = TokenBudgetConfig(
        enabled=enabled,
        context_window=resolved_window,
        user_budget=user_budget,
    )
    return TokenBudget(config=config, model=model)
|