# aiagent/backend/app/core/token_budget.py

"""
Token 预算管理器 — 追踪每次 LLM 调用的 token 消耗，提供预警和限额控制。

参考 Claude Code:
- src/utils/tokenBudget.ts — 预算追踪与自动续行
- src/utils/tokenUsageTracker.ts — 累计用量追踪
- UI StatusLine 的 token 用量条

核心概念:
- context_window: 模型上下文窗口大小（如 128K）
- output_reserve: 留给模型输出的空间（默认 8K），只有 (window - reserve) 可被输入使用
- warning/critical/exhausted 三级预警
- 用户可设置 target budget（如 +500k），达到后自动继续
"""
from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import Any, Dict, Optional

from app.core.token_counter import TokenCounter, get_model_context_window

logger = logging.getLogger(__name__)


# ────────────── 配置 ──────────────


@dataclass
class TokenBudgetConfig:
    """Tunable settings for token budgeting.

    Every threshold is expressed as a fraction of the *effective* input
    window, i.e. the context window minus the space reserved for the
    model's output. The ``*_at`` properties convert those fractions into
    absolute token counts.
    """

    # -- Master switch --
    enabled: bool = True

    # -- Window sizing --
    context_window: int = 128_000         # model context window in tokens; 0 = auto-detect
    output_reserve: int = 8_192           # tokens held back for the model's output

    # -- Alert thresholds (fraction of the effective window) --
    warning_threshold_pct: float = 0.75   # 75% -> start warning
    compact_threshold_pct: float = 0.85   # 85% -> trigger automatic compaction
    hard_limit_pct: float = 0.95          # 95% -> must compact before the next call

    # -- User budget target --
    user_budget: Optional[int] = None     # cumulative token target (e.g. 500_000)
    auto_continue: bool = False           # auto-continue once the user budget is reached

    # -- Compaction coordination --
    compaction_after_warning: bool = True # auto-trigger compaction after a warning
    max_compaction_attempts: int = 3      # max compaction attempts within one turn

    def _scaled(self, fraction: float) -> int:
        """Return ``fraction`` of the effective window, truncated to an int."""
        return int(self.effective_window * fraction)

    @property
    def effective_window(self) -> int:
        """Usable input window: context window minus output reserve, floored at 0."""
        usable = self.context_window - self.output_reserve
        return usable if usable > 0 else 0

    @property
    def warning_at(self) -> int:
        """Token count at which the warning level starts."""
        return self._scaled(self.warning_threshold_pct)

    @property
    def compact_at(self) -> int:
        """Token count at which automatic compaction is triggered."""
        return self._scaled(self.compact_threshold_pct)

    @property
    def hard_limit_at(self) -> int:
        """Token count of the hard limit (the LLM call is refused beyond it)."""
        return self._scaled(self.hard_limit_pct)


# ────────────── 快照 ──────────────


@dataclass
class TokenSnapshot:
    """Token usage snapshot of a single LLM call.

    A plain record; no invariants are enforced here. In this module,
    instances are created by ``TokenBudget.record_llm_call``.
    """
    # Tokens on the prompt (input) side of the call.
    prompt_tokens: int = 0
    # Tokens produced by the model for this call.
    completion_tokens: int = 0
    # Set to prompt_tokens + completion_tokens by TokenBudget.record_llm_call;
    # not recomputed automatically if the other fields change.
    total_tokens: int = 0
    # Caller-supplied iteration number the call belongs to.
    iteration: int = 0
    step_type: str = ""          # kind of step, e.g. "think" / "final"
    # Model name the call was recorded against.
    model: str = ""


# ────────────── 预算追踪器 ──────────────


class TokenBudget:
    """Session-level token budget tracker.

    Tracks:
    - the token count of the current message list (input side)
    - cumulative LLM consumption (prompt + completion)
    - progress towards the optional user budget target
    - warning / compaction / hard-limit state

    Usage::

        budget = TokenBudget(TokenBudgetConfig(context_window=128000))
        budget.update_input_estimate(counter.count_messages(messages))

        if budget.needs_compaction:
            ...  # trigger compaction

        budget.record_llm_call(prompt_tokens=5000, completion_tokens=800)
        print(budget.status_line)  # e.g. "12.5k/120k (10%)"
    """

    # Upper bound on the number of per-call snapshots kept in memory.
    _MAX_SNAPSHOTS = 50

    def __init__(
        self,
        config: Optional[TokenBudgetConfig] = None,
        model: str = "deepseek-v4-flash",
        token_counter: Optional[TokenCounter] = None,
    ):
        """Create a tracker.

        Args:
            config: Budget settings; a default ``TokenBudgetConfig`` is used
                when omitted.
            model: Model name, used for the default token counter, for
                context-window auto-detection and for snapshot labelling.
            token_counter: Counter used by ``update_from_counter``; a
                ``TokenCounter(model=model)`` is created when omitted.
        """
        self.config = config or TokenBudgetConfig()
        self.model = model
        self.counter = token_counter or TokenCounter(model=model)

        # Auto-detect the context window when it is not set.
        # NOTE: this writes the detected value back into the config object
        # that was passed in (the caller's instance is mutated in place).
        if self.config.context_window <= 0:
            self.config.context_window = get_model_context_window(model)

        # -- Counters --
        self._input_tokens_estimate: int = 0        # estimated tokens of the current input messages
        self._cumulative_prompt_tokens: int = 0     # cumulative prompt tokens (retries included)
        self._cumulative_completion_tokens: int = 0 # cumulative completion tokens
        self._llm_call_count: int = 0
        self._compaction_attempts_this_turn: int = 0

        # -- Snapshot history (most recent _MAX_SNAPSHOTS calls) --
        self._snapshots: list[TokenSnapshot] = []

    # ──────── Properties ────────

    @property
    def input_tokens(self) -> int:
        """Estimated token count of the current input message list."""
        return self._input_tokens_estimate

    @property
    def cumulative_total(self) -> int:
        """Cumulative tokens consumed (prompt + completion)."""
        return self._cumulative_prompt_tokens + self._cumulative_completion_tokens

    @property
    def cumulative_prompt(self) -> int:
        """Cumulative prompt tokens over all recorded calls."""
        return self._cumulative_prompt_tokens

    @property
    def cumulative_completion(self) -> int:
        """Cumulative completion tokens over all recorded calls."""
        return self._cumulative_completion_tokens

    @property
    def llm_call_count(self) -> int:
        """Number of LLM calls recorded so far."""
        return self._llm_call_count

    @property
    def input_usage_pct(self) -> float:
        """Input usage as a fraction of the effective window (0.0 if the window is 0)."""
        ew = self.config.effective_window
        return self._input_tokens_estimate / ew if ew > 0 else 0.0

    @property
    def input_remaining(self) -> int:
        """Remaining input-side token space, never negative."""
        return max(0, self.config.effective_window - self._input_tokens_estimate)

    @property
    def user_budget_used(self) -> int:
        """Tokens counted against the user budget (the cumulative total)."""
        return self.cumulative_total

    @property
    def user_budget_remaining(self) -> Optional[int]:
        """Remaining user budget, never negative; ``None`` when no budget is set."""
        if self.config.user_budget is None:
            return None
        return max(0, self.config.user_budget - self.cumulative_total)

    @property
    def user_budget_pct(self) -> Optional[float]:
        """Consumed fraction of the user budget; ``None`` when unset or not positive."""
        if self.config.user_budget is None or self.config.user_budget <= 0:
            return None
        return self.cumulative_total / self.config.user_budget

    # ──────── State checks ────────

    @property
    def is_warning(self) -> bool:
        """Whether the input estimate has reached the warning line."""
        return self._input_tokens_estimate >= self.config.warning_at

    @property
    def is_critical(self) -> bool:
        """Whether the input estimate has reached the critical line (compact now)."""
        return self._input_tokens_estimate >= self.config.compact_at

    @property
    def is_exhausted(self) -> bool:
        """Whether the input estimate has reached the hard limit."""
        return self._input_tokens_estimate >= self.config.hard_limit_at

    @property
    def needs_compaction(self) -> bool:
        """Whether compaction should be triggered.

        Always ``False`` when automatic compaction is disabled or when the
        per-turn attempt limit has been reached; otherwise mirrors
        ``is_critical``.
        """
        if not self.config.compaction_after_warning:
            return False
        if self._compaction_attempts_this_turn >= self.config.max_compaction_attempts:
            return False  # circuit breaker: stop retrying within this turn
        return self.is_critical

    @property
    def compaction_attempts(self) -> int:
        """Compaction attempts recorded in the current turn."""
        return self._compaction_attempts_this_turn

    @property
    def is_user_budget_exhausted(self) -> bool:
        """Whether a user budget is set and fully consumed."""
        rem = self.user_budget_remaining
        return rem is not None and rem <= 0

    # ──────── Update methods ────────

    def update_input_estimate(self, tokens: int) -> None:
        """Set the input token estimate (call after every message-list change)."""
        self._input_tokens_estimate = tokens
        logger.debug(
            "TokenBudget: input=%d tokens (%.1f%% of %d, compact_at=%d)",
            tokens, self.input_usage_pct * 100,
            self.config.effective_window, self.config.compact_at,
        )

    def update_from_counter(self, messages: list) -> int:
        """Count ``messages`` with the token counter, store and return the estimate."""
        tokens = self.counter.count_messages(messages)
        self.update_input_estimate(tokens)
        return tokens

    def record_llm_call(
        self,
        prompt_tokens: int = 0,
        completion_tokens: int = 0,
        iteration: int = 0,
        step_type: str = "think",
    ) -> TokenSnapshot:
        """Record one LLM call and return its snapshot.

        Args:
            prompt_tokens: Preferably the actual value returned by the API.
                When it is not available pass 0 (any value <= 0); the
                current input estimate is used instead.
            completion_tokens: Completion tokens of the call.
            iteration: Caller-supplied iteration number stored in the snapshot.
            step_type: Step label stored in the snapshot.

        Returns:
            The ``TokenSnapshot`` appended to the bounded history.
        """
        if prompt_tokens <= 0:
            prompt_tokens = self._input_tokens_estimate

        self._cumulative_prompt_tokens += prompt_tokens
        self._cumulative_completion_tokens += completion_tokens
        self._llm_call_count += 1

        snap = TokenSnapshot(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
            iteration=iteration,
            step_type=step_type,
            model=self.model,
        )
        self._snapshots.append(snap)
        # Keep only the most recent _MAX_SNAPSHOTS snapshots.
        if len(self._snapshots) > self._MAX_SNAPSHOTS:
            del self._snapshots[:-self._MAX_SNAPSHOTS]

        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(
                "TokenBudget: call #%d prompt=%d comp=%d total=%d cumulative=%d (%.1f%%)",
                self._llm_call_count, prompt_tokens, completion_tokens,
                prompt_tokens + completion_tokens,
                self.cumulative_total,
                self.input_usage_pct * 100,
            )

        return snap

    def record_compaction_attempt(self) -> None:
        """Count one compaction attempt (feeds the circuit breaker)."""
        self._compaction_attempts_this_turn += 1

    def reset_compaction_attempts(self) -> None:
        """Reset the compaction attempt counter (call when a new turn starts)."""
        self._compaction_attempts_this_turn = 0

    # ──────── Summary / display ────────

    @property
    def status_line(self) -> str:
        """One-line status summary for logs or a UI.

        Format: ``"<input>k/<window>k (<pct>%)"``, followed by one of
        ``[EXHAUSTED]`` / ``[CRITICAL]`` / ``[WARNING]`` when applicable and
        by ``| budget: <used>k/<budget>k`` when a non-zero user budget is set.
        """
        pct = self.input_usage_pct * 100
        parts = [f"{self._input_tokens_estimate/1000:.1f}k/{self.config.effective_window/1000:.0f}k ({pct:.0f}%)"]

        # Only the most severe level is shown.
        if self.is_exhausted:
            parts.append("[EXHAUSTED]")
        elif self.is_critical:
            parts.append("[CRITICAL]")
        elif self.is_warning:
            parts.append("[WARNING]")

        if self.config.user_budget:
            parts.append(f"| budget: {self.cumulative_total/1000:.1f}k/{self.config.user_budget/1000:.0f}k")

        return " ".join(parts)

    def summary(self) -> Dict[str, Any]:
        """Return a token budget summary suitable for an API response.

        The ``user_budget*`` keys are present only when a user budget is
        configured. ``user_budget_pct`` is ``None`` only when the percentage
        is undefined (budget not positive); a budget with nothing consumed
        yet reports ``0.0``.
        """
        result: Dict[str, Any] = {
            "input_tokens": self._input_tokens_estimate,
            "input_remaining": self.input_remaining,
            "input_usage_pct": round(self.input_usage_pct, 4),
            "effective_window": self.config.effective_window,
            "context_window": self.config.context_window,
            "cumulative_total": self.cumulative_total,
            "cumulative_prompt": self._cumulative_prompt_tokens,
            "cumulative_completion": self._cumulative_completion_tokens,
            "llm_call_count": self._llm_call_count,
            "is_warning": self.is_warning,
            "is_critical": self.is_critical,
            "is_exhausted": self.is_exhausted,
            "compaction_attempts": self._compaction_attempts_this_turn,
        }
        if self.config.user_budget is not None:
            budget_pct = self.user_budget_pct
            result["user_budget"] = self.config.user_budget
            result["user_budget_used"] = self.user_budget_used
            result["user_budget_remaining"] = self.user_budget_remaining
            # Compare against None explicitly: 0.0 is a valid percentage and
            # must not be collapsed into "unknown".
            result["user_budget_pct"] = round(budget_pct, 4) if budget_pct is not None else None
        return result

    def needs_user_budget_continue(self) -> bool:
        """Whether the user budget is exhausted and auto-continue is enabled."""
        return self.is_user_budget_exhausted and self.config.auto_continue


# ────────────── 便捷工厂 ──────────────


def create_token_budget(
    model: str = "deepseek-v4-flash",
    context_window: int = 0,
    user_budget: Optional[int] = None,
    enabled: bool = True,
) -> TokenBudget:
    """Build a TokenBudget with a ready-made configuration (fits most use cases).

    Args:
        model: Model name passed to the tracker and used to look up the
            context window when none is given.
        context_window: Context window in tokens; a falsy value (0) means
            the size is looked up via ``get_model_context_window(model)``.
        user_budget: Optional cumulative token target.
        enabled: Value for the configuration's master switch.

    Returns:
        A ``TokenBudget`` bound to the assembled configuration.
    """
    # Resolve the window first so the config always carries a concrete value.
    if context_window:
        resolved_window = context_window
    else:
        resolved_window = get_model_context_window(model)

    settings = TokenBudgetConfig(
        context_window=resolved_window,
        user_budget=user_budget,
        enabled=enabled,
    )
    return TokenBudget(model=model, config=settings)