Files
aiagent/backend/app/core/token_budget.py
renjianbo beff3fac8d fix: delete agent 500 error + dynamic personality + deployment guide
- Fix delete agent 500: clean up FK records (agent_llm_logs, permissions,
  schedules, executions, team_members) and unbind goals/tasks before delete
- Remove hardcoded personality templates in Android, replace with dynamic
  system prompt generation from name + description
- Set promptSectionsEnabled=false to bypass PromptComposer for personality
- Add Tencent Cloud Linux deployment guide (Docker Compose)
- Accumulated backend service updates, frontend UI fixes, Android app changes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-29 01:17:21 +08:00

359 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Token 预算管理器 — 追踪每次 LLM 调用的 token 消耗,提供预警和限额控制。
参考 Claude Code:
- src/utils/tokenBudget.ts — 预算追踪与自动续行
- src/utils/tokenUsageTracker.ts — 累计用量追踪
- UI StatusLine 的 token 用量条
核心概念:
- context_window: 模型上下文窗口大小(如 128K
- output_reserve: 留给模型输出的空间(默认 8K只有 (window - reserve) 可被输入使用
- warning/critical/exhausted 三级预警
- 用户可设置 target budget如 +500k达到后自动继续
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, Optional
from app.core.token_counter import TokenCounter, get_model_context_window
logger = logging.getLogger(__name__)
# ────────────── 配置 ──────────────
@dataclass
class TokenBudgetConfig:
    """Settings that control token budgeting.

    All ``*_pct`` thresholds are fractions of the effective input window
    (``context_window - output_reserve``); the derived ``*_at`` properties
    convert them to absolute token counts, truncated toward zero.
    """

    # -- Master switch --
    enabled: bool = True

    # -- Window sizing --
    context_window: int = 128_000  # model context window in tokens; 0 = auto-detect
    output_reserve: int = 8_192  # tokens kept free for the model's output

    # -- Alert thresholds (fraction of the effective window) --
    warning_threshold_pct: float = 0.75  # 75% -> start warning
    compact_threshold_pct: float = 0.85  # 85% -> trigger automatic compaction
    hard_limit_pct: float = 0.95  # 95% -> must compact before the next call

    # -- User budget target --
    user_budget: Optional[int] = None  # cumulative token target (e.g. 500_000)
    auto_continue: bool = False  # keep going automatically once the budget is reached

    # -- Compaction coordination --
    compaction_after_warning: bool = True  # whether alerts may trigger compaction
    max_compaction_attempts: int = 3  # compaction attempts allowed per turn

    def _share_of_window(self, fraction: float) -> int:
        """Return ``fraction`` of the effective window as a whole token count."""
        return int(self.effective_window * fraction)

    @property
    def effective_window(self) -> int:
        """Usable input window: context window minus output reserve, never negative."""
        usable = self.context_window - self.output_reserve
        return usable if usable > 0 else 0

    @property
    def warning_at(self) -> int:
        """Token count at which the warning level starts."""
        return self._share_of_window(self.warning_threshold_pct)

    @property
    def compact_at(self) -> int:
        """Token count at which automatic compaction is triggered."""
        return self._share_of_window(self.compact_threshold_pct)

    @property
    def hard_limit_at(self) -> int:
        """Token count of the hard limit (beyond it, LLM calls are refused)."""
        return self._share_of_window(self.hard_limit_pct)
# ────────────── 快照 ──────────────
@dataclass
class TokenSnapshot:
    """Token accounting record for a single LLM call."""

    # Tokens on the input (prompt) side of the call.
    prompt_tokens: int = 0
    # Tokens generated by the model.
    completion_tokens: int = 0
    # Combined count stored by the producer of the snapshot.
    total_tokens: int = 0
    # Iteration number the call belongs to.
    iteration: int = 0
    # Kind of step: "think" / "final".
    step_type: str = ""
    # Model identifier used for the call.
    model: str = ""
# ────────────── 预算追踪器 ──────────────
class TokenBudget:
    """Session-level token budget tracker.

    Tracks:
    - the token count of the current message list (input side);
    - cumulative LLM consumption (prompt + completion);
    - progress toward the optional user budget target;
    - warning / compaction / hard-limit state.

    Usage::

        budget = TokenBudget(TokenBudgetConfig(context_window=128000))
        budget.update_input_estimate(counter.count_messages(messages))
        if budget.needs_compaction:
            ...  # trigger compaction
        budget.record_llm_call(prompt_tokens=5000, completion_tokens=800)
        print(budget.status_line)  # e.g. "12.5k/120k (10%)"
    """

    # Maximum number of per-call snapshots kept in the history.
    _MAX_SNAPSHOTS = 50

    def __init__(
        self,
        config: Optional[TokenBudgetConfig] = None,
        model: str = "deepseek-v4-flash",
        token_counter: Optional[TokenCounter] = None,
    ):
        """Create a tracker.

        Args:
            config: Budget configuration; a default ``TokenBudgetConfig`` is
                created when omitted.
            model: Model identifier, used for the default token counter,
                context-window detection and snapshots.
            token_counter: Counter used by ``update_from_counter``; a
                ``TokenCounter(model=model)`` is created when omitted.
        """
        self.config = config or TokenBudgetConfig()
        self.model = model
        self.counter = token_counter or TokenCounter(model=model)
        # Auto-detect the context window. NOTE: this writes the detected value
        # back into the config object that was passed in.
        if self.config.context_window <= 0:
            self.config.context_window = get_model_context_window(model)
        # -- Counters --
        self._input_tokens_estimate: int = 0  # estimate for the current message list
        self._cumulative_prompt_tokens: int = 0  # cumulative prompt tokens (incl. retries)
        self._cumulative_completion_tokens: int = 0  # cumulative completion tokens
        self._llm_call_count: int = 0
        self._compaction_attempts_this_turn: int = 0
        # -- Snapshot history (most recent _MAX_SNAPSHOTS calls) --
        self._snapshots: list[TokenSnapshot] = []

    # -------- Properties --------
    @property
    def input_tokens(self) -> int:
        """Estimated token count of the current input message list."""
        return self._input_tokens_estimate

    @property
    def cumulative_total(self) -> int:
        """Cumulative tokens consumed (prompt + completion)."""
        return self._cumulative_prompt_tokens + self._cumulative_completion_tokens

    @property
    def cumulative_prompt(self) -> int:
        """Cumulative prompt tokens recorded so far."""
        return self._cumulative_prompt_tokens

    @property
    def cumulative_completion(self) -> int:
        """Cumulative completion tokens recorded so far."""
        return self._cumulative_completion_tokens

    @property
    def llm_call_count(self) -> int:
        """Number of LLM calls recorded so far."""
        return self._llm_call_count

    @property
    def input_usage_pct(self) -> float:
        """Fraction of the effective window used by the input (0.0 if the window is 0)."""
        ew = self.config.effective_window
        return self._input_tokens_estimate / ew if ew > 0 else 0.0

    @property
    def input_remaining(self) -> int:
        """Tokens still available on the input side (never negative)."""
        return max(0, self.config.effective_window - self._input_tokens_estimate)

    @property
    def user_budget_used(self) -> int:
        """Tokens counted against the user budget."""
        return self.cumulative_total

    @property
    def user_budget_remaining(self) -> Optional[int]:
        """Tokens left in the user budget, or ``None`` when no budget is set."""
        if self.config.user_budget is None:
            return None
        return max(0, self.config.user_budget - self.cumulative_total)

    @property
    def user_budget_pct(self) -> Optional[float]:
        """Fraction of the user budget consumed; ``None`` if unset or non-positive."""
        if self.config.user_budget is None or self.config.user_budget <= 0:
            return None
        return self.cumulative_total / self.config.user_budget

    # -------- State checks --------
    @property
    def is_warning(self) -> bool:
        """Whether the input estimate has reached the warning line."""
        return self._input_tokens_estimate >= self.config.warning_at

    @property
    def is_critical(self) -> bool:
        """Whether the input estimate has reached the critical line (compact now)."""
        return self._input_tokens_estimate >= self.config.compact_at

    @property
    def is_exhausted(self) -> bool:
        """Whether the hard limit is reached (must be handled before calling the LLM)."""
        return self._input_tokens_estimate >= self.config.hard_limit_at

    @property
    def needs_compaction(self) -> bool:
        """Whether compaction should be triggered now."""
        if not self.config.compaction_after_warning:
            return False
        if self._compaction_attempts_this_turn >= self.config.max_compaction_attempts:
            return False  # circuit breaker: too many attempts this turn
        return self.is_critical

    @property
    def compaction_attempts(self) -> int:
        """Compaction attempts recorded in the current turn."""
        return self._compaction_attempts_this_turn

    @property
    def is_user_budget_exhausted(self) -> bool:
        """Whether a user budget is set and fully consumed."""
        rem = self.user_budget_remaining
        return rem is not None and rem <= 0

    # -------- Update methods --------
    def update_input_estimate(self, tokens: int) -> None:
        """Set the input token estimate (call after every message-list change).

        Negative values are clamped to 0, since a token count cannot be
        negative and would otherwise distort the usage percentage.
        """
        self._input_tokens_estimate = max(0, tokens)
        logger.debug(
            "TokenBudget: input=%d tokens (%.1f%% of %d, compact_at=%d)",
            self._input_tokens_estimate, self.input_usage_pct * 100,
            self.config.effective_window, self.config.compact_at,
        )

    def update_from_counter(self, messages: list) -> int:
        """Count ``messages`` with the configured counter, store and return the estimate."""
        tokens = self.counter.count_messages(messages)
        self.update_input_estimate(tokens)
        return tokens

    def record_llm_call(
        self,
        prompt_tokens: int = 0,
        completion_tokens: int = 0,
        iteration: int = 0,
        step_type: str = "think",
    ) -> TokenSnapshot:
        """Record one LLM call and return its snapshot.

        ``prompt_tokens`` should preferably be the actual value returned by
        the API; when it is unavailable pass 0 and the current input estimate
        is used instead. A negative ``completion_tokens`` is treated as 0.
        """
        if prompt_tokens <= 0:
            prompt_tokens = self._input_tokens_estimate
        completion_tokens = max(0, completion_tokens)
        self._cumulative_prompt_tokens += prompt_tokens
        self._cumulative_completion_tokens += completion_tokens
        self._llm_call_count += 1
        snap = TokenSnapshot(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
            iteration=iteration,
            step_type=step_type,
            model=self.model,
        )
        self._snapshots.append(snap)
        # Keep only the most recent _MAX_SNAPSHOTS snapshots.
        if len(self._snapshots) > self._MAX_SNAPSHOTS:
            del self._snapshots[: -self._MAX_SNAPSHOTS]
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(
                "TokenBudget: call #%d prompt=%d comp=%d total=%d cumulative=%d (%.1f%%)",
                self._llm_call_count, prompt_tokens, completion_tokens,
                prompt_tokens + completion_tokens,
                self.cumulative_total,
                self.input_usage_pct * 100,
            )
        return snap

    def record_compaction_attempt(self) -> None:
        """Record one compaction attempt (feeds the circuit-breaker count)."""
        self._compaction_attempts_this_turn += 1

    def reset_compaction_attempts(self) -> None:
        """Reset the compaction attempt count (call when a new turn starts)."""
        self._compaction_attempts_this_turn = 0

    # -------- Summary / display --------
    @property
    def status_line(self) -> str:
        """One-line status summary (for logs / UI)."""
        pct = self.input_usage_pct * 100
        parts = [f"{self._input_tokens_estimate/1000:.1f}k/{self.config.effective_window/1000:.0f}k ({pct:.0f}%)"]
        if self.is_exhausted:
            parts.append("[EXHAUSTED]")
        elif self.is_critical:
            parts.append("[CRITICAL]")
        elif self.is_warning:
            parts.append("[WARNING]")
        if self.config.user_budget:
            parts.append(f"| budget: {self.cumulative_total/1000:.1f}k/{self.config.user_budget/1000:.0f}k")
        return " ".join(parts)

    def summary(self) -> Dict[str, Any]:
        """Return a token budget summary suitable for an API response.

        The ``user_budget*`` keys are present only when a user budget is set.
        ``user_budget_pct`` is ``None`` only when the percentage is undefined
        (non-positive budget); a budget with nothing consumed yet reports 0.0.
        """
        result: Dict[str, Any] = {
            "input_tokens": self._input_tokens_estimate,
            "input_remaining": self.input_remaining,
            "input_usage_pct": round(self.input_usage_pct, 4),
            "effective_window": self.config.effective_window,
            "context_window": self.config.context_window,
            "cumulative_total": self.cumulative_total,
            "cumulative_prompt": self._cumulative_prompt_tokens,
            "cumulative_completion": self._cumulative_completion_tokens,
            "llm_call_count": self._llm_call_count,
            "is_warning": self.is_warning,
            "is_critical": self.is_critical,
            "is_exhausted": self.is_exhausted,
            "compaction_attempts": self._compaction_attempts_this_turn,
        }
        if self.config.user_budget is not None:
            result["user_budget"] = self.config.user_budget
            result["user_budget_used"] = self.user_budget_used
            result["user_budget_remaining"] = self.user_budget_remaining
            # Compare against None explicitly: 0.0 is a valid percentage and
            # must not be collapsed into "undefined".
            budget_pct = self.user_budget_pct
            result["user_budget_pct"] = round(budget_pct, 4) if budget_pct is not None else None
        return result

    def needs_user_budget_continue(self) -> bool:
        """Whether the user budget is exhausted and auto-continue is enabled."""
        return self.is_user_budget_exhausted and self.config.auto_continue
# ────────────── 便捷工厂 ──────────────
def create_token_budget(
    model: str = "deepseek-v4-flash",
    context_window: int = 0,
    user_budget: Optional[int] = None,
    enabled: bool = True,
) -> TokenBudget:
    """Build a pre-configured ``TokenBudget`` (suitable for most scenarios).

    Args:
        model: Model identifier passed to the budget and used for window lookup.
        context_window: Context window in tokens; a falsy value (0) means the
            window is looked up via ``get_model_context_window(model)``.
        user_budget: Optional cumulative token target.
        enabled: Value for the config's master switch.

    Returns:
        A ``TokenBudget`` built from the resulting ``TokenBudgetConfig``.
    """
    # Resolve the window first so the config is created with a concrete size.
    if context_window:
        window = context_window
    else:
        window = get_model_context_window(model)
    budget_config = TokenBudgetConfig(
        enabled=enabled,
        context_window=window,
        user_budget=user_budget,
    )
    return TokenBudget(config=budget_config, model=model)