Phase 2: eval harness, 182 examples, live bake-off, playtest infrastructure

- Expanded dataset from 31 to 182 examples (151 added: 45 manual + 106 extracted from server logs)
- Built eval/harness.py with per-category breakdowns and baseline tracking
- Built eval/live_bakeoff.py for RCON-verified model comparison on live server
- Extracted training data from prayer logs, sudo logs, and bug reports on CT 644
- Added Reddit post draft and modmail for playtester recruitment
- Updated server context: all servers now online-mode=false; whitelist enabled on all except the offline dev server (whitelist=false)
- Updated PLAN.md with Phase 2 progress

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-18 13:38:12 -04:00
parent eaa9e0c26b
commit 38b9a02e45
10 changed files with 1522 additions and 31 deletions
+10 -4
View File
@@ -10,9 +10,10 @@
"difficulty": "easy",
"gamemode": "survival",
"max_players": 20,
"online_mode": true,
"online_mode": false,
"whitelist": true,
"pvp": true,
"notes": "Main vanilla survival server. MCSManager-managed."
"notes": "Main vanilla survival server. MCSManager-managed. autoStart + autoRestart enabled."
},
{
"name": "shrink-world",
@@ -24,7 +25,8 @@
"difficulty": "hard",
"gamemode": "survival",
"max_players": 20,
"online_mode": true,
"online_mode": false,
"whitelist": true,
"pvp": true,
"datapacks": ["shrinkborder (world border shrinks on death)", "morespawns (5x creeper spawns)"],
"notes": "Hardcore-style challenge server. World border starts at 500x500."
@@ -39,10 +41,12 @@
"difficulty": "hard",
"gamemode": "survival",
"max_players": 20,
"online_mode": true,
"online_mode": false,
"whitelist": true,
"pvp": true,
"plugins": ["FastAsyncWorldEdit", "LuckPerms"],
"ai_services": ["mc-aigod-paper.service (God/sudo AI)", "mc-langgraph-gateway.service (session gateway)"],
"training_role": "Primary training/eval server for playtesters. Has full AI God + sudo + LangGraph gateway.",
"notes": "Paper fork with AI God, sudo translator, and world observation tools."
},
{
@@ -56,8 +60,10 @@
"gamemode": "creative",
"max_players": 50,
"online_mode": false,
"whitelist": false,
"pvp": false,
"world_type": "flat",
"training_role": "Secondary training server for bot-driven data collection and destructive testing. No auth required.",
"notes": "Offline dev server for AI training bots. Superflat, no mobs."
}
],