# aiagent/test/test_stress_long_conversation.py

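# -*- coding: utf-8 -*-
"""End-to-end long-conversation stress test: 50+ rounds verifying that compaction does not lose context.

Test flow:
1. Round  1: inject the key facts (my name is 小明, my favourite book is 《三体》).
2. Rounds 2-9: filler rounds — ordinary conversation that accumulates tokens.
3. Round 10: first recall check — ask for the facts from round 1; the printed
   input-token count helps show whether compaction has been triggered.
4. Rounds 11-48: filler rounds — keep accumulating tokens.
5. Round 49: final recall check — verify that the early context has not been lost.
6. Round 50: last filler round; its input-token count closes the trend report.

Observed metrics:
- compaction_triggered: whether compaction was triggered
- token_usage.input_tokens: input-token trend (should be pushed down by compaction)
- fact_retained: whether the early facts are still remembered after 50 rounds
- context_lost_errors: whether any context-limit errors occurred
"""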
import sys, os, io, json, time, requests

# Force UTF-8 console output with unencodable characters replaced, so the
# Chinese text and symbols printed below cannot abort the run with an encoding
# error.  reconfigure() adjusts the existing stream in place: sys.stdout keeps
# its identity and no second TextIOWrapper ends up owning the same underlying
# buffer.  Streams that do not offer reconfigure() are left untouched.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

# Base URL of the API under test.
BASE = "http://localhost:8038/api/v1"
# Total number of conversation rounds, including the fact-injection round.
TOTAL_ROUNDS = 50
FACT_CHECK_ROUND = 10   # round that performs the first recall check
FINAL_CHECK_ROUND = 49  # round that performs the final recall check

# Filler prompts sent on every round that is not a recall check: short,
# unrelated programming questions used to grow the conversation.  There are
# fewer prompts than filler rounds, so the test indexes this list modulo its
# length and the prompts repeat.
FILL_MESSAGES = [
    "请用Python写一个简单的冒泡排序函数，只输出代码不要解释",
    "请把下面这段话翻译成英语：今天天气真好，适合出去散步。",
    "写一个SQL查询，找出表中最近7天注册的用户，按注册时间降序排列",
    "解释一下什么是Docker的layer缓存机制，控制在100字以内",
    "用JavaScript写一个防抖函数debounce",
    "Python的GIL是什么？有什么影响？请简短回答",
    "写一个简单的Makefile来编译和运行一个C项目",
    "列出5个提高代码可读性的最佳实践",
    "解释HTTP状态码200、301、404、500分别代表什么",
    "写一个正则表达式来匹配中国手机号",
    "什么是RESTful API？请用三句话内解释",
    "写一个简单的git工作流命令序列（clone→branch→commit→push→PR）",
    "Python装饰器的原理是什么？请简短说明",
    "解释什么是数据库索引以及什么时候该用",
    "写一个shell脚本统计当前目录下每种文件扩展名的数量",
    "什么是CI/CD？请用100字以内解释",
    "写一个Python context manager 示例",
    "解释乐观锁和悲观锁的区别",
    "用TypeScript写一个简单的泛型函数",
    "什么是缓存穿透、缓存击穿、缓存雪崩？简短区分",
    "写一个docker-compose.yml 启动 nginx+mysql",
    "Python中__init__和__new__的区别是什么？",
    "解释TCP三次握手和四次挥手",
    "写一个简单的Redis Lua脚本示例",
    "什么是JWT？在分布式系统中如何使用？",
    "解释CSS盒模型，简短回答",
    "写一个awk命令来统计日志中每个IP的访问次数",
    "Python中asyncio的基本用法是什么？",
    "解释微服务架构的优缺点",
    "写一个简单的React Hook示例",
    "什么是Kubernetes的Pod？简短回答",
    "解释HTTPS的握手过程",
    "写一个Python生成器函数的例子",
    "什么是消息队列？列举几个常用的MQ",
    "解释数据库ACID四个特性",
    "写一个简单的Nginx反向代理配置",
    "Python中*args和**kwargs的含义",
    "什么是WebSocket？与HTTP轮询对比有什么优势？",
    "解释面向对象六大原则中的开闭原则",
    "写一个PostgreSQL的窗口函数查询示例",
]

# Personal facts injected in round 1 (name, favourite book, occupation).  The
# recall checks later search the reply for the name and for the book title or
# its genre.
MEMORY_FACT = "我的名字是小明，我最喜欢读的书是刘慈欣写的《三体》三部曲，我的职业是Python后端工程师。"


def login(username="admin", password="123456", timeout=30):
    """Log in to the API under test and return a bearer access token.

    Args:
        username: Account name sent as form data.  Defaults to the value this
            script has always used.
        password: Password sent as form data.  Defaults to the value this
            script has always used.
        timeout: Seconds to wait for the server.  Without a timeout a hung or
            unreachable server would block the whole test run indefinitely.

    Returns:
        The ``access_token`` value from the JSON login response.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error status.
        requests.exceptions.Timeout: If no response arrives within ``timeout``.
        KeyError: If the response JSON has no ``access_token`` field.
    """
    r = requests.post(
        f"{BASE}/auth/login",
        data={"username": username, "password": password},
        timeout=timeout,
    )
    r.raise_for_status()
    return r.json()["access_token"]


def _contains_fact(reply):
    """Return True when *reply* mentions the name and the favourite book (or its genre)."""
    return "小明" in reply and ("三体" in reply or "科幻" in reply)


def _input_tokens(token_usage, default):
    """Return the ``input_tokens`` value for display, or *default* when it is unavailable."""
    return token_usage.get("input_tokens", default) if token_usage else default


def test_long_conversation():
    """Run the multi-round conversation and report whether early context survived.

    Round 1 injects ``MEMORY_FACT`` and creates the session.  Every later round
    reuses that session: ``FACT_CHECK_ROUND`` and ``FINAL_CHECK_ROUND`` ask the
    model to recall the facts, all other rounds send a filler prompt.  A report
    with the recall results, error count and input-token trend is printed at
    the end.

    Returns:
        True when both recall checks found the facts and no context-limit
        error was counted; False otherwise.  The outcome is returned rather
        than asserted.

    Raises:
        requests.exceptions.RequestException: If login or round 1 fails; errors
            in later rounds are printed and the run continues.
    """
    token = login()
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
    print(f"=== 长对话压力测试 — {TOTAL_ROUNDS} 轮 ===\n")
    print(f"注入事实: {MEMORY_FACT}")
    print(f"首次检查: 第 {FACT_CHECK_ROUND} 轮")
    print(f"最终检查: 第 {FINAL_CHECK_ROUND} 轮\n")

    metrics = {
        "token_history": [],
        "compaction_triggered": False,
        "context_lost_errors": 0,
        "fact_retained_at_10": False,
        "fact_retained_at_49": False,
        "total_time_ms": 0,
    }

    start_time = time.time()

    # ── Round 1: inject the facts; not guarded, so a failure here aborts the run ──
    print("─" * 50)
    print(f"Round 1/{TOTAL_ROUNDS}: 注入记忆事实…")
    r = requests.post(f"{BASE}/agent-chat/bare", headers=headers,
                      json={"message": f"请记住以下关于我的信息：{MEMORY_FACT} 请回复'已记住'即可。",
                            "streamlined": True}, timeout=180)
    r.raise_for_status()
    d = r.json()
    sid = d["session_id"]
    tu = d.get("token_usage")
    record_metrics(metrics, 1, tu, d)
    print(f"  session={sid[:16]}…, tokens_in={_input_tokens(tu, 'N/A')}")
    print(f"  reply: {d['content'][:100]}")

    # ── Rounds 2-N: filler prompts plus the two scheduled recall checks ──
    for round_num in range(2, TOTAL_ROUNDS + 1):
        is_first_check = round_num == FACT_CHECK_ROUND
        is_final_check = not is_first_check and round_num == FINAL_CHECK_ROUND

        if is_first_check:
            msg = "我之前告诉过你我叫什么名字？最喜欢什么书？我的职业是什么？请简要回答。"
            label = f"Round {round_num}/{TOTAL_ROUNDS}: 首次记忆检查"
        elif is_final_check:
            msg = "请回忆一下：我们第一次对话时我告诉你我叫什么名字？我最喜欢的书是什么？我的职业是什么？"
            label = f"Round {round_num}/{TOTAL_ROUNDS}: 最终记忆检查"
        else:
            # Round 2 maps to the first prompt; the list wraps around when exhausted.
            msg = FILL_MESSAGES[(round_num - 2) % len(FILL_MESSAGES)]
            label = f"Round {round_num}/{TOTAL_ROUNDS}: 填充"

        print(f"{label}…", end=" ", flush=True)
        try:
            r = requests.post(f"{BASE}/agent-chat/bare", headers=headers,
                              json={"message": msg, "session_id": sid, "streamlined": True},
                              timeout=180)
            r.raise_for_status()
            d = r.json()
            tu = d.get("token_usage")
            record_metrics(metrics, round_num, tu, d)

            if is_first_check or is_final_check:
                key = "fact_retained_at_10" if is_first_check else "fact_retained_at_49"
                metrics[key] = _contains_fact(d["content"])
                status = "✓ 记住" if metrics[key] else "✗ 遗忘"
                print(f"{status} | tokens_in={_input_tokens(tu, '?')}")
                print(f"  reply: {d['content'][:200]}")
            elif round_num % 10 == 0:
                # Every tenth round also shows the token count as a progress marker.
                print(f"✓ tokens_in={_input_tokens(tu, '?')}")
            else:
                print("✓")

        except requests.exceptions.HTTPError as e:
            # The attribute always exists on the exception but may be None,
            # so test the value rather than using hasattr().
            resp = e.response
            if resp is not None:
                status_code = resp.status_code
                err_body = resp.text[:200]
            else:
                status_code = None
                err_body = str(e)
            print(f"✗ HTTP {status_code if status_code is not None else '?'}: {err_body}")
            if "context" in err_body.lower() or status_code == 413:
                metrics["context_lost_errors"] += 1
            # The counter is cumulative over the whole run; three hits abort it.
            if metrics["context_lost_errors"] >= 3:
                print("  ⚠ 连续上下文超限，中止测试")
                break
        except requests.exceptions.Timeout:
            print("✗ 超时")
        except Exception as e:
            # Best effort: any other per-round failure is reported and the run goes on.
            print(f"✗ {e}")

    metrics["total_time_ms"] = int((time.time() - start_time) * 1000)

    # ── Report ──
    print("\n" + "=" * 50)
    print("测试报告")
    print("=" * 50)
    print(f"总轮数: {TOTAL_ROUNDS}")
    print(f"总耗时: {metrics['total_time_ms']/1000:.1f}s")
    print(f"第{FACT_CHECK_ROUND}轮记忆保留: {'✓ 通过' if metrics['fact_retained_at_10'] else '✗ 失败'}")
    print(f"第{FINAL_CHECK_ROUND}轮记忆保留: {'✓ 通过' if metrics['fact_retained_at_49'] else '✗ 失败'}")
    print(f"上下文超限错误: {metrics['context_lost_errors']} 次")

    token_entries = metrics["token_history"]
    if token_entries:
        print("\nToken 用量走势:")
        print(f"  Round 1:    {token_entries[0]['input_tokens']:,} tokens")
        # A drop of more than 30% against the previous recorded round is
        # reported as a compaction point.
        for previous, entry in zip(token_entries, token_entries[1:]):
            if entry["input_tokens"] < previous["input_tokens"] * 0.7:
                print(f"  Round {entry['round']:>2}:   {entry['input_tokens']:,} tokens  ← 压缩触发")
        print(f"  Round {token_entries[-1]['round']:>2}:   {token_entries[-1]['input_tokens']:,} tokens")

    # Both recall checks must succeed: the final one is what shows that the
    # early context survived the long conversation.
    passed = (
        metrics["fact_retained_at_10"]
        and metrics["fact_retained_at_49"]
        and not metrics["context_lost_errors"]
    )
    print(f"\n=== {'测试通过 ✓' if passed else '测试失败 ✗'} ===")
    return passed


def record_metrics(metrics, round_num, token_usage, response):
    """Append one round's token-usage snapshot to ``metrics["token_history"]``.

    Nothing is recorded when *token_usage* is empty or None.  Missing fields
    fall back to 0 (counters) or False (flags).  A truthy ``is_critical`` flag
    also sets ``metrics["compaction_triggered"]``.  *response* is accepted but
    not read.
    """
    if not token_usage:
        return

    history = metrics["token_history"]

    # Field name -> fallback used when the server omitted the field.
    fallbacks = (
        ("input_tokens", 0),
        ("cumulative_total", 0),
        ("is_warning", False),
        ("is_critical", False),
        ("compaction_attempts", 0),
    )
    snapshot = {"round": round_num}
    for field, fallback in fallbacks:
        snapshot[field] = token_usage.get(field, fallback)
    history.append(snapshot)

    if token_usage.get("is_critical"):
        metrics["compaction_triggered"] = True

if __name__ == "__main__":
    # Turn the boolean outcome into the process exit status (0 = passed,
    # 1 = failed) so shells and CI jobs can detect a failed run.
    sys.exit(0 if test_long_conversation() else 1)