Files
aiagent/test/test_stress_long_conversation.py
renjianbo beff3fac8d fix: delete agent 500 error + dynamic personality + deployment guide
- Fix delete agent 500: clean up FK records (agent_llm_logs, permissions,
  schedules, executions, team_members) and unbind goals/tasks before delete
- Remove hardcoded personality templates in Android, replace with dynamic
  system prompt generation from name + description
- Set promptSectionsEnabled=false to bypass PromptComposer for personality
- Add Tencent Cloud Linux deployment guide (Docker Compose)
- Accumulated backend service updates, frontend UI fixes, Android app changes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-29 01:17:21 +08:00

215 lines
9.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""端到端长对话压力测试 — 50+ 轮验证压缩不丢上下文
测试逻辑:
1. Round 1: 注入关键事实(我的名字是小明,最爱《三体》)
2. Round 2-9: 填充轮 — 正常对话积累 token
3. Round 10: 验证 — 确认压缩是否触发
4. Round 11-48: 填充轮 — 继续积累
5. Round 49: 回忆 — 检查早期上下文是否丢失
6. Round 50: 最终验证
观测指标:
- compaction_triggered: 压缩是否被触发
- token_usage.input_tokens: 输入 token 走势(应被压缩压低)
- fact_retained: 50 轮后是否还记得早期事实
- context_lost_errors: 是否有上下文超限错误
"""
import sys, os, io, json, time, requests
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
BASE = "http://localhost:8038/api/v1"
TOTAL_ROUNDS = 50
FACT_CHECK_ROUND = 10 # 第 10 轮做第一次找回检查
FINAL_CHECK_ROUND = 49 # 第 49 轮做最终找回检查
# Filler prompts sent on every round that is not a recall check. The round
# loop picks an entry with (round_num - 2) % len(FILL_MESSAGES), so the list
# wraps around if there are more filler rounds than entries.
FILL_MESSAGES = [
"请用Python写一个简单的冒泡排序函数只输出代码不要解释",
"请把下面这段话翻译成英语:今天天气真好,适合出去散步。",
"写一个SQL查询找出表中最近7天注册的用户按注册时间降序排列",
"解释一下什么是Docker的layer缓存机制控制在100字以内",
"用JavaScript写一个防抖函数debounce",
"Python的GIL是什么有什么影响请简短回答",
"写一个简单的Makefile来编译和运行一个C项目",
"列出5个提高代码可读性的最佳实践",
"解释HTTP状态码200、301、404、500分别代表什么",
"写一个正则表达式来匹配中国手机号",
"什么是RESTful API请用三句话内解释",
"写一个简单的git工作流命令序列clone→branch→commit→push→PR",
"Python装饰器的原理是什么请简短说明",
"解释什么是数据库索引以及什么时候该用",
"写一个shell脚本统计当前目录下每种文件扩展名的数量",
"什么是CI/CD请用100字以内解释",
"写一个Python context manager 示例",
"解释乐观锁和悲观锁的区别",
"用TypeScript写一个简单的泛型函数",
"什么是缓存穿透、缓存击穿、缓存雪崩?简短区分",
"写一个docker-compose.yml 启动 nginx+mysql",
"Python中__init__和__new__的区别是什么",
"解释TCP三次握手和四次挥手",
"写一个简单的Redis Lua脚本示例",
"什么是JWT在分布式系统中如何使用",
"解释CSS盒模型简短回答",
"写一个awk命令来统计日志中每个IP的访问次数",
"Python中asyncio的基本用法是什么",
"解释微服务架构的优缺点",
"写一个简单的React Hook示例",
"什么是Kubernetes的Pod简短回答",
"解释HTTPS的握手过程",
"写一个Python生成器函数的例子",
"什么是消息队列列举几个常用的MQ",
"解释数据库ACID四个特性",
"写一个简单的Nginx反向代理配置",
"Python中*args和**kwargs的含义",
"什么是WebSocket与HTTP轮询对比有什么优势",
"解释面向对象六大原则中的开闭原则",
"写一个PostgreSQL的窗口函数查询示例",
]
# Facts injected in round 1 (name, favourite book, profession). The recall
# checks later look for "小明" plus "三体" or "科幻" in the model's reply.
MEMORY_FACT = "我的名字是小明我最喜欢读的书是刘慈欣写的《三体》三部曲我的职业是Python后端工程师。"
def login():
    """Authenticate against ``{BASE}/auth/login`` and return the access token.

    The credentials are posted as form data. A timeout is set so that an
    unresponsive backend fails the run instead of blocking it indefinitely.

    Returns:
        The ``access_token`` value from the JSON response body.

    Raises:
        requests.exceptions.HTTPError: if the server answers with an error status.
        requests.exceptions.Timeout: if the server does not answer in time.
        KeyError: if the response JSON has no ``access_token`` field.
    """
    r = requests.post(
        f"{BASE}/auth/login",
        data={"username": "admin", "password": "123456"},
        timeout=30,
    )
    r.raise_for_status()
    return r.json()["access_token"]
def _mentions_memory_fact(content):
    """Return True when *content* names 小明 and refers to 三体 or 科幻."""
    return "小明" in content and ("三体" in content or "科幻" in content)


def _input_tokens_text(token_usage, missing):
    """Return the ``input_tokens`` field of *token_usage*, or *missing* when absent."""
    return token_usage.get("input_tokens", missing) if token_usage else missing


def test_long_conversation():
    """Run the multi-round chat stress test against ``{BASE}/agent-chat/bare``.

    Round 1 injects MEMORY_FACT and obtains the session id. Every later round
    reuses that session and sends a filler prompt, except FACT_CHECK_ROUND and
    FINAL_CHECK_ROUND, which ask the model to recall the injected facts. Token
    usage is recorded per round through ``record_metrics`` and a report is
    printed at the end.

    Errors during login or round 1 propagate to the caller; errors in later
    rounds are caught and printed so the run can continue. The loop stops
    early once three context-limit errors have been counted.

    Returns:
        bool: True when both recall checks found the facts and no
        context-limit error was counted.

    NOTE(review): the verdict is returned rather than asserted, so a runner
    that ignores return values will not fail on False — confirm how this
    function is meant to be collected.
    """
    token = login()
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
    chat_url = f"{BASE}/agent-chat/bare"
    print(f"=== 长对话压力测试 — {TOTAL_ROUNDS} 轮 ===\n")
    print(f"注入事实: {MEMORY_FACT}")
    print(f"首次检查: 第 {FACT_CHECK_ROUND}")
    print(f"最终检查: 第 {FINAL_CHECK_ROUND}\n")
    metrics = {
        "token_history": [],
        "compaction_triggered": False,
        "context_lost_errors": 0,
        "fact_retained_at_10": False,
        "fact_retained_at_49": False,
        "total_time_ms": 0,
    }
    start_time = time.time()

    # ── Round 1: inject the facts to remember ──
    # A visible separator rule; an empty string repeated 50 times prints nothing.
    print("-" * 50)
    print(f"Round 1/{TOTAL_ROUNDS}: 注入记忆事实…")
    r = requests.post(
        chat_url,
        headers=headers,
        json={
            "message": f"请记住以下关于我的信息:{MEMORY_FACT} 请回复'已记住'即可。",
            "streamlined": True,
        },
        timeout=180,
    )
    r.raise_for_status()
    d = r.json()
    sid = d["session_id"]
    tu = d.get("token_usage")
    record_metrics(metrics, 1, tu, d)
    print(f" session={sid[:16]}…, tokens_in={_input_tokens_text(tu, 'N/A')}")
    print(f" reply: {d['content'][:100]}")

    # ── Rounds 2..N: filler prompts plus the two recall checks ──
    for round_num in range(2, TOTAL_ROUNDS + 1):
        # retained_key names the metrics flag a recall round writes to;
        # None marks a filler round.
        if round_num == FACT_CHECK_ROUND:
            msg = "我之前告诉过你我叫什么名字?最喜欢什么书?我的职业是什么?请简要回答。"
            kind = "首次记忆检查"
            retained_key = "fact_retained_at_10"
        elif round_num == FINAL_CHECK_ROUND:
            msg = "请回忆一下:我们第一次对话时我告诉你我叫什么名字?我最喜欢的书是什么?我的职业是什么?"
            kind = "最终记忆检查"
            retained_key = "fact_retained_at_49"
        else:
            msg = FILL_MESSAGES[(round_num - 2) % len(FILL_MESSAGES)]
            kind = "填充"
            retained_key = None
        # No newline yet: the round's outcome is appended to the same line.
        print(f"Round {round_num}/{TOTAL_ROUNDS}: {kind}", end=" ", flush=True)
        try:
            r = requests.post(
                chat_url,
                headers=headers,
                json={"message": msg, "session_id": sid, "streamlined": True},
                timeout=180,
            )
            r.raise_for_status()
            d = r.json()
            tu = d.get("token_usage")
            record_metrics(metrics, round_num, tu, d)
            if retained_key is not None:
                metrics[retained_key] = _mentions_memory_fact(d["content"])
                status = "✓ 记住" if metrics[retained_key] else "✗ 遗忘"
                print(f"{status} | tokens_in={_input_tokens_text(tu, '?')}")
                print(f" reply: {d['content'][:200]}")
            elif round_num % 10 == 0:
                print(f"✓ tokens_in={_input_tokens_text(tu, '?')}")
            else:
                print("")
        except requests.exceptions.HTTPError as e:
            # HTTPError always has a ``response`` attribute, but it can be None.
            resp = e.response
            if resp is not None:
                err_body = resp.text[:200]
                status_code = resp.status_code
            else:
                err_body = str(e)
                status_code = "?"
            print(f"✗ HTTP {status_code}: {err_body}")
            if "context" in err_body.lower() or "413" in str(e):
                metrics["context_lost_errors"] += 1
                if metrics["context_lost_errors"] >= 3:
                    print(" ⚠ 连续上下文超限,中止测试")
                    break
        except requests.exceptions.Timeout:
            print("✗ 超时")
        except Exception as e:
            # Best effort: report the failure and move on to the next round.
            print(f"{e}")
    metrics["total_time_ms"] = int((time.time() - start_time) * 1000)

    # ── Report ──
    print("\n" + "=" * 50)
    print("测试报告")
    print("=" * 50)
    print(f"总轮数: {TOTAL_ROUNDS}")
    print(f"总耗时: {metrics['total_time_ms']/1000:.1f}s")
    print(f"第10轮记忆保留: {'✓ 通过' if metrics['fact_retained_at_10'] else '✗ 失败'}")
    print(f"第49轮记忆保留: {'✓ 通过' if metrics['fact_retained_at_49'] else '✗ 失败'}")
    print(f"上下文超限错误: {metrics['context_lost_errors']}")
    token_entries = metrics["token_history"]
    if token_entries:
        print("\nToken 用量走势:")
        first = token_entries[0]
        # Label with the recorded round: round 1 may have had no token_usage.
        print(f" Round {first['round']}: {first['input_tokens']:,} tokens")
        # Flag every round whose input tokens fell by more than 30% against
        # the previous recorded round as a compaction point.
        prev = first["input_tokens"]
        for entry in token_entries[1:]:
            if entry["input_tokens"] < prev * 0.7:
                print(f" Round {entry['round']:>2}: {entry['input_tokens']:,} tokens ← 压缩触发")
            prev = entry["input_tokens"]
        last = token_entries[-1]
        print(f" Round {last['round']:>2}: {last['input_tokens']:,} tokens")

    # The final recall check is the point of the long run, so it gates the
    # verdict together with the first check and the absence of context errors.
    passed = (
        metrics["fact_retained_at_10"]
        and metrics["fact_retained_at_49"]
        and not metrics["context_lost_errors"]
    )
    print(f"\n=== {'测试通过 ✓' if passed else '测试失败 ✗'} ===")
    return passed
def record_metrics(metrics, round_num, token_usage, response):
    """Append one round's token statistics to ``metrics["token_history"]``.

    Nothing is recorded when *token_usage* is empty or None. Fields missing
    from *token_usage* fall back to 0 (counters) or False (flags). When the
    usage reports ``is_critical``, ``metrics["compaction_triggered"]`` is set.

    Args:
        metrics: Mutable dict holding ``token_history`` and ``compaction_triggered``.
        round_num: Round number stored with the snapshot.
        token_usage: Mapping with the token fields of the round, or None.
        response: Full response payload; not read by this function.
    """
    if not token_usage:
        return

    # (field name, fallback) pairs copied from token_usage into the snapshot.
    tracked_fields = (
        ("input_tokens", 0),
        ("cumulative_total", 0),
        ("is_warning", False),
        ("is_critical", False),
        ("compaction_attempts", 0),
    )
    snapshot = {"round": round_num}
    for field, fallback in tracked_fields:
        snapshot[field] = token_usage.get(field, fallback)
    metrics["token_history"].append(snapshot)

    if token_usage.get("is_critical"):
        metrics["compaction_triggered"] = True
if __name__ == "__main__":
test_long_conversation()