# -*- coding: utf-8 -*-
"""End-to-end long-conversation stress test for context compaction.

Drives TOTAL_ROUNDS chat turns against a single agent session and checks
that facts stated in the first turn can still be recalled later.

Flow:
    1. Round 1 sends MEMORY_FACT and asks the agent to remember it.
    2. Filler rounds cycle through FILL_MESSAGES to accumulate tokens.
    3. Round FACT_CHECK_ROUND asks the agent to recall the facts.
    4. More filler rounds.
    5. Round FINAL_CHECK_ROUND asks again (final recall check).
    6. Any remaining rounds are filler.

Collected metrics:
    - compaction_triggered: a response reported ``token_usage.is_critical``.
    - token_history: per-round ``token_usage`` snapshot (input_tokens etc.).
    - fact_retained_at_10 / fact_retained_at_49: recall check results.
    - context_lost_errors: HTTP errors that look like context overflow.
"""
import io
import json
import os
import sys
import time

import requests


def _force_utf8_stdout():
    """Make stdout emit UTF-8 with ``errors="replace"``.

    Reconfigures the existing stream in place when possible. Replacing
    ``sys.stdout`` with a second wrapper over the same buffer leaves two
    wrappers owning one buffer, which breaks output-capturing harnesses
    when either wrapper is closed.
    """
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(encoding="utf-8", errors="replace")
    elif hasattr(sys.stdout, "buffer"):
        # Fallback for interpreters without TextIOWrapper.reconfigure.
        sys.stdout = io.TextIOWrapper(
            sys.stdout.buffer, encoding="utf-8", errors="replace"
        )


_force_utf8_stdout()

BASE = "http://localhost:8038/api/v1"
TOTAL_ROUNDS = 50
FACT_CHECK_ROUND = 10   # first recall check
FINAL_CHECK_ROUND = 49  # final recall check

LOGIN_TIMEOUT_S = 30
CHAT_TIMEOUT_S = 180
MAX_CONTEXT_ERRORS = 3       # abort after this many context-overflow errors
COMPACTION_DROP_RATIO = 0.7  # input tokens falling below 70% of the previous round

FILL_MESSAGES = [
    "请用Python写一个简单的冒泡排序函数,只输出代码不要解释",
    "请把下面这段话翻译成英语:今天天气真好,适合出去散步。",
    "写一个SQL查询,找出表中最近7天注册的用户,按注册时间降序排列",
    "解释一下什么是Docker的layer缓存机制,控制在100字以内",
    "用JavaScript写一个防抖函数debounce",
    "Python的GIL是什么?有什么影响?请简短回答",
    "写一个简单的Makefile来编译和运行一个C项目",
    "列出5个提高代码可读性的最佳实践",
    "解释HTTP状态码200、301、404、500分别代表什么",
    "写一个正则表达式来匹配中国手机号",
    "什么是RESTful API?请用三句话内解释",
    "写一个简单的git工作流命令序列(clone→branch→commit→push→PR)",
    "Python装饰器的原理是什么?请简短说明",
    "解释什么是数据库索引以及什么时候该用",
    "写一个shell脚本统计当前目录下每种文件扩展名的数量",
    "什么是CI/CD?请用100字以内解释",
    "写一个Python context manager 示例",
    "解释乐观锁和悲观锁的区别",
    "用TypeScript写一个简单的泛型函数",
    "什么是缓存穿透、缓存击穿、缓存雪崩?简短区分",
    "写一个docker-compose.yml 启动 nginx+mysql",
    "Python中__init__和__new__的区别是什么?",
    "解释TCP三次握手和四次挥手",
    "写一个简单的Redis Lua脚本示例",
    "什么是JWT?在分布式系统中如何使用?",
    "解释CSS盒模型,简短回答",
    "写一个awk命令来统计日志中每个IP的访问次数",
    "Python中asyncio的基本用法是什么?",
    "解释微服务架构的优缺点",
    "写一个简单的React Hook示例",
    "什么是Kubernetes的Pod?简短回答",
    "解释HTTPS的握手过程",
    "写一个Python生成器函数的例子",
    "什么是消息队列?列举几个常用的MQ",
    "解释数据库ACID四个特性",
    "写一个简单的Nginx反向代理配置",
    "Python中*args和**kwargs的含义",
    "什么是WebSocket?与HTTP轮询对比有什么优势?",
    "解释面向对象六大原则中的开闭原则",
    "写一个PostgreSQL的窗口函数查询示例",
]

MEMORY_FACT = "我的名字是小明,我最喜欢读的书是刘慈欣写的《三体》三部曲,我的职业是Python后端工程师。"


def login():
    """Log in with the fixed test credentials and return the access token.

    Raises:
        requests.exceptions.HTTPError: if the login endpoint returns an
            error status.
        requests.exceptions.Timeout: if no response arrives in time.
    """
    r = requests.post(
        f"{BASE}/auth/login",
        data={"username": "admin", "password": "123456"},
        timeout=LOGIN_TIMEOUT_S,  # never hang forever on a dead server
    )
    r.raise_for_status()
    return r.json()["access_token"]


def _post_chat(headers, message, session_id=None):
    """Send one chat turn and return the decoded JSON response body."""
    payload = {"message": message, "streamlined": True}
    if session_id is not None:
        payload["session_id"] = session_id
    r = requests.post(
        f"{BASE}/agent-chat/bare",
        headers=headers,
        json=payload,
        timeout=CHAT_TIMEOUT_S,
    )
    r.raise_for_status()
    return r.json()


def _fact_retained(content):
    """Return True when a reply mentions the name and the favourite book."""
    return "小明" in content and ("三体" in content or "科幻" in content)


def _tokens_in(token_usage, missing):
    """Return the reported input token count, or *missing* when absent."""
    return token_usage.get("input_tokens", missing) if token_usage else missing


def _compaction_drops(token_history):
    """Return entries whose input tokens fell >30% versus the previous entry."""
    return [
        cur
        for prev, cur in zip(token_history, token_history[1:])
        if cur["input_tokens"] < prev["input_tokens"] * COMPACTION_DROP_RATIO
    ]


def _print_report(metrics):
    """Print the summary report for a finished run."""
    history = metrics["token_history"]
    drops = _compaction_drops(history)
    # Count compaction as seen if the server flagged it or the token trend shows it.
    compaction_seen = metrics["compaction_triggered"] or bool(drops)

    print("\n" + "=" * 50)
    print("测试报告")
    print("=" * 50)
    print(f"总轮数: {TOTAL_ROUNDS}")
    print(f"总耗时: {metrics['total_time_ms']/1000:.1f}s")
    print(f"第{FACT_CHECK_ROUND}轮记忆保留: {'✓ 通过' if metrics['fact_retained_at_10'] else '✗ 失败'}")
    print(f"第{FINAL_CHECK_ROUND}轮记忆保留: {'✓ 通过' if metrics['fact_retained_at_49'] else '✗ 失败'}")
    print(f"上下文超限错误: {metrics['context_lost_errors']} 次")
    print(f"压缩触发: {'是' if compaction_seen else '否'}")

    if history:
        first, last = history[0], history[-1]
        print("\nToken 用量走势:")
        # Label with the recorded round: round 1 may have had no token_usage.
        print(f" Round {first['round']:>2}: {first['input_tokens']:,} tokens")
        for entry in drops:
            print(f" Round {entry['round']:>2}: {entry['input_tokens']:,} tokens ← 压缩触发")
        print(f" Round {last['round']:>2}: {last['input_tokens']:,} tokens")


def test_long_conversation():
    """Run the full multi-round conversation and print a report.

    Returns:
        bool: True when both recall checks succeeded and no context-overflow
        error was observed; False otherwise.

    Raises:
        requests.exceptions.RequestException: if login or round 1 fails.
            Failures in later rounds are printed and the run continues.
    """
    token = login()
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

    print(f"=== 长对话压力测试 — {TOTAL_ROUNDS} 轮 ===\n")
    print(f"注入事实: {MEMORY_FACT}")
    print(f"首次检查: 第 {FACT_CHECK_ROUND} 轮")
    print(f"最终检查: 第 {FINAL_CHECK_ROUND} 轮\n")

    metrics = {
        "token_history": [],
        "compaction_triggered": False,
        "context_lost_errors": 0,
        "fact_retained_at_10": False,
        "fact_retained_at_49": False,
        "total_time_ms": 0,
    }
    start_time = time.time()

    # -- Round 1: inject the facts to remember (also creates the session) --
    print("─" * 50)
    print(f"Round 1/{TOTAL_ROUNDS}: 注入记忆事实…")
    d = _post_chat(
        headers,
        f"请记住以下关于我的信息:{MEMORY_FACT} 请回复'已记住'即可。",
    )
    sid = d["session_id"]
    tu = d.get("token_usage")
    record_metrics(metrics, 1, tu, d)
    print(f" session={sid[:16]}…, tokens_in={_tokens_in(tu, 'N/A')}")
    print(f" reply: {d['content'][:100]}")

    # -- Rounds 2..N: filler turns plus the two recall checks --
    for round_num in range(2, TOTAL_ROUNDS + 1):
        check_key = None  # metrics key to fill in when this is a recall round
        if round_num == FACT_CHECK_ROUND:
            msg = "我之前告诉过你我叫什么名字?最喜欢什么书?我的职业是什么?请简要回答。"
            label = f"Round {round_num}/{TOTAL_ROUNDS}: 首次记忆检查"
            check_key = "fact_retained_at_10"
        elif round_num == FINAL_CHECK_ROUND:
            msg = "请回忆一下:我们第一次对话时我告诉你我叫什么名字?我最喜欢的书是什么?我的职业是什么?"
            label = f"Round {round_num}/{TOTAL_ROUNDS}: 最终记忆检查"
            check_key = "fact_retained_at_49"
        else:
            msg = FILL_MESSAGES[(round_num - 2) % len(FILL_MESSAGES)]
            label = f"Round {round_num}/{TOTAL_ROUNDS}: 填充"

        print(f"{label}…", end=" ", flush=True)
        try:
            d = _post_chat(headers, msg, session_id=sid)
            tu = d.get("token_usage")
            record_metrics(metrics, round_num, tu, d)

            if check_key is not None:
                metrics[check_key] = _fact_retained(d["content"])
                status = "✓ 记住" if metrics[check_key] else "✗ 遗忘"
                print(f"{status} | tokens_in={_tokens_in(tu, '?')}")
                print(f" reply: {d['content'][:200]}")
            elif round_num % 10 == 0:
                print(f"✓ tokens_in={_tokens_in(tu, '?')}")
            else:
                print("✓")
        except requests.exceptions.HTTPError as e:
            # HTTPError always has a ``response`` attribute, but it may be None.
            response = e.response
            if response is not None:
                status_code = response.status_code
                err_body = response.text[:200]
            else:
                status_code = None
                err_body = str(e)
            print(f"✗ HTTP {status_code if status_code is not None else '?'}: {err_body}")
            if "context" in err_body.lower() or status_code == 413:
                # The count is cumulative over the run; it is never reset.
                metrics["context_lost_errors"] += 1
                if metrics["context_lost_errors"] >= MAX_CONTEXT_ERRORS:
                    print(" ⚠ 连续上下文超限,中止测试")
                    break
        except requests.exceptions.Timeout:
            print("✗ 超时")
        except Exception as e:
            # Deliberate best-effort: one bad round must not end the run.
            print(f"✗ {e}")

    metrics["total_time_ms"] = int((time.time() - start_time) * 1000)

    _print_report(metrics)

    # Both recall checks must pass; the final one is what the test is for.
    passed = bool(
        metrics["fact_retained_at_10"]
        and metrics["fact_retained_at_49"]
        and not metrics["context_lost_errors"]
    )
    print(f"\n=== {'测试通过 ✓' if passed else '测试失败 ✗'} ===")
    return passed


def record_metrics(metrics, round_num, token_usage, response):
    """Append one round's token usage snapshot to ``metrics``.

    Args:
        metrics: run-wide metrics dict; ``token_history`` and
            ``compaction_triggered`` are updated in place.
        round_num: 1-based round number the usage belongs to.
        token_usage: the response's ``token_usage`` dict, or a falsy value
            when absent (in which case nothing is recorded).
        response: full response body; currently unused, kept for callers.
    """
    if not token_usage:
        return

    metrics["token_history"].append({
        "round": round_num,
        # ``or 0`` also covers an explicit null, which would break the
        # numeric comparison and formatting in the report.
        "input_tokens": token_usage.get("input_tokens") or 0,
        "cumulative_total": token_usage.get("cumulative_total", 0),
        "is_warning": token_usage.get("is_warning", False),
        "is_critical": token_usage.get("is_critical", False),
        "compaction_attempts": token_usage.get("compaction_attempts", 0),
    })
    if token_usage.get("is_critical"):
        metrics["compaction_triggered"] = True


if __name__ == "__main__":
    # Exit non-zero on failure so shells and CI can see the result.
    sys.exit(0 if test_long_conversation() else 1)