0.5.0 bake-off results, knowledge lookup tools, training progress chart
Bake-off (0.5.0 vs 0.4.0):
- Overall: 46.8% vs 45.2% (+1.6%), 0 errors vs 2
- Enchantments: +47% (20% → 67%)
- EssentialsX: +60% (0% → 60%)
- Effects: +25% (0% → 25%)
- Regressions: fill_build -67%, world -20%

Knowledge Lookup Tools (4 new):
- plugin.docs_lookup: WorldGuard, WorldEdit, CoreProtect, EssentialsX, LuckPerms docs
- minecraft.changelog_lookup: version history from Minecraft Wiki
- paper.docs_lookup: Paper server-specific documentation
- Wired into gateway model-driven tool loop and exploration self-play

Exploration Self-Play:
- General (vanilla MC) and plugins focus modes
- Wiki-grounded: model researches before acting, validates through RCON
- 2,243 exploration examples generated, 150 kept after quality filtering

Training Progress Chart:
- SVG chart showing training examples and inverse loss across versions
- Added to MODEL_CARD.md for Gitea display

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,297 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Bake-off — compare model versions on standard test prompts with RCON validation.
|
||||
|
||||
Runs the same prompts through multiple models, executes via RCON, and scores
|
||||
success rate, response quality, and speed.
|
||||
|
||||
Usage:
|
||||
python3 bakeoff.py --models mortdecai:0.4.0,mortdecai:0.5.0 \
|
||||
--ollama-url http://localhost:11434 --rcon-host 192.168.0.244
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Directory three levels above this script; used as the base for OUTPUT_DIR.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
# Put the project root first on the import path. Presumably this is what lets
# the `agent.tools.persistent_rcon` import below resolve when the file is run
# directly as a script — TODO confirm against the repository layout.
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
import requests
|
||||
from agent.tools.persistent_rcon import get_rcon
|
||||
|
||||
# Directory that receives one JSON result file per bake-off run (created by main()).
OUTPUT_DIR = PROJECT_ROOT / "training" / "bakeoff_results"
|
||||
|
||||
# Standard test prompts, grouped by category. Every model in a bake-off is
# asked every prompt; the category name is carried into the per-prompt detail
# records and the category breakdown table.
TEST_PROMPTS = {
    # Plain item grants.
    "basic_give": [
        "sudo give me a diamond sword",
        "sudo give me 64 golden apples",
        "sudo give me full netherite armor",
        "sudo give me a stack of oak logs",
    ],
    # Items carrying one or more enchantments.
    "enchantments": [
        "sudo give me a sword with sharpness 5 and mending",
        "sudo give me a bow with power 5 and infinity",
        "sudo give me boots with feather falling 4 and depth strider 3",
        "sudo give me a trident with loyalty 3 and channeling",
    ],
    # Status effects with levels and durations.
    "effects": [
        "sudo give me speed 2 for 5 minutes",
        "sudo make me invisible for 60 seconds",
        "sudo give me night vision forever",
        "sudo give everyone resistance 3",
    ],
    # Time, weather and entity management.
    "world": [
        "sudo set time to day",
        "sudo clear the weather",
        "sudo kill all zombies",
        "sudo summon 3 cows near me",
    ],
    # Absolute and relative teleports.
    "teleport": [
        "sudo tp me to 0 100 0",
        "sudo tp me 50 blocks up",
    ],
    # Block placement.
    "fill_build": [
        "sudo fill a 5x5 gold platform under me",
        "sudo place a beacon at 0 64 0",
    ],
    # Assorted harder requests.
    "complex": [
        "sudo give me a mace with density 5 and wind burst 3",
        "sudo give me a decorated pot",
        "sudo spawn a warden 10 blocks away",
        "sudo create a team called red with red color",
    ],
    # Plugin-specific requests, one category per plugin.
    "plugins_worldguard": [
        "sudo create a region called test-region",
        "sudo set pvp deny in the test-region",
        "sudo list all regions",
    ],
    "plugins_coreprotect": [
        "sudo check coreprotect status",
        "sudo lookup block changes in the last hour",
    ],
    "plugins_essentials": [
        "sudo set spawn here",
        "sudo create a warp called bakeoff-test",
        "sudo heal me",
    ],
    "plugins_luckperms": [
        "sudo create a group called testers",
        "sudo list all permission groups",
    ],
    # Underspecified or ambiguous requests.
    "error_prone": [
        "sudo give me a bed",
        "sudo give me cooked beef",
        "sudo effect give me speed",
        "sudo fill with stone 10",
    ],
}

# Player name inserted into every user message sent to the model.
PLAYER = "slingshooter08"
|
||||
|
||||
|
||||
def query_model(prompt, model, ollama_url, timeout=60):
    """Send one test prompt to an Ollama model and return its parsed reply.

    Args:
        prompt: Player request text; sent as ``Player {PLAYER}: {prompt}``.
        model: Ollama model tag, e.g. ``mortdecai:0.5.0``.
        ollama_url: Base URL of the Ollama server (a trailing slash is tolerated).
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        A dict with the keys
        ``commands`` (always a list; a bare string reply is wrapped, any other
        non-list value becomes ``[]``),
        ``reasoning`` (the model's ``reasoning`` field, ``""`` if absent),
        ``elapsed`` (seconds until the HTTP response arrived, rounded to 2
        places) and
        ``error`` (``None`` on success, otherwise the first 200 characters of
        the exception text).

    Request, HTTP-status and parsing failures are reported through ``error``
    rather than raised, so one bad reply cannot abort a whole bake-off run.
    """
    system = (
        "/no_think\n"
        "You are a Minecraft 1.21 command translator for a Paper server with plugins: "
        "FastAsyncWorldEdit, WorldGuard, CoreProtect, EssentialsX, Vault, LuckPerms.\n"
        "PERMISSION LEVEL: 4 (generous).\n"
        "Return JSON: {\"commands\": [...], \"reasoning\": \"...\"}"
    )
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": f"Player {PLAYER}: {prompt}"},
        ],
        "stream": False, "format": "json",
        "options": {"temperature": 0.2, "num_predict": 500},
    }

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments during the request.
    start = time.perf_counter()
    try:
        r = requests.post(f"{ollama_url.rstrip('/')}/api/chat", json=payload, timeout=timeout)
        elapsed = time.perf_counter() - start
        # Surface HTTP failures (unknown model, server error) as themselves
        # instead of as a confusing KeyError on the missing "message" field.
        r.raise_for_status()
        content = r.json()["message"]["content"]
        # Drop any <think>...</think> block the model emitted despite /no_think.
        content = re.sub(r'<think>[\s\S]*?</think>\s*', '', content)
        parsed = json.loads(content)
        if not isinstance(parsed, dict):
            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")

        commands = parsed.get("commands")
        if isinstance(commands, str):
            # A single command given as a plain string instead of a list.
            commands = [commands]
        elif not isinstance(commands, list):
            # Missing, null or otherwise malformed: callers rely on a list.
            commands = []

        return {
            "commands": commands,
            "reasoning": parsed.get("reasoning") or "",
            "elapsed": round(elapsed, 2),
            "error": None,
        }
    except Exception as e:
        # Deliberately broad: this is the benchmark's failure boundary and
        # every failure mode is recorded in the result instead of raised.
        return {
            "commands": [],
            "reasoning": "",
            "elapsed": round(time.perf_counter() - start, 2),
            "error": str(e)[:200],
        }
|
||||
|
||||
|
||||
# Substrings of an RCON reply that are taken to mean the server rejected the command.
_RCON_ERROR_MARKERS = ("<--[HERE]", "Unknown", "Incorrect", "Expected", "Invalid")


def validate_commands(commands, rcon, max_commands=8):
    """Run model-generated commands over RCON and record each outcome.

    Args:
        commands: List (or tuple) of command strings. A single bare string is
            treated as one command; any other type yields no results.
        rcon: Object exposing ``command(str)`` that returns the server reply.
        max_commands: Only the first ``max_commands`` entries are considered
            (the cap is applied before blank/non-string entries are skipped).

    Returns:
        A list of ``{"cmd", "result", "ok"}`` dicts, one per executed command.
        ``result`` is the reply (or exception text) cut to 200 characters and
        ``ok`` is False when the reply contains a known error marker or the
        RCON call raised.
    """
    if isinstance(commands, str):
        # Slicing a string would otherwise run it one character at a time.
        commands = [commands]
    elif not isinstance(commands, (list, tuple)):
        return []

    results = []
    for cmd in commands[:max_commands]:
        if not isinstance(cmd, str) or not cmd.strip():
            continue
        # NOTE(review): lstrip removes *all* leading slashes, so a
        # double-slash plugin command such as "//set" is sent as "set" —
        # confirm that is what the server expects over RCON.
        cmd = cmd.strip().lstrip("/")
        if not cmd:
            # The entry was nothing but slashes; there is nothing to execute.
            continue
        try:
            result = rcon.command(cmd)
            is_err = any(marker in result for marker in _RCON_ERROR_MARKERS)
            results.append({"cmd": cmd, "result": result[:200], "ok": not is_err})
        except Exception as e:
            # Connection problems and unexpected reply types count as failures.
            results.append({"cmd": cmd, "result": str(e)[:200], "ok": False})
    return results
|
||||
|
||||
|
||||
def run_bakeoff(models, ollama_url, rcon):
    """Run every prompt in TEST_PROMPTS through every model and score the replies.

    Args:
        models: Model tags to compare; each becomes a key of the result.
        ollama_url: Base URL passed through to ``query_model``.
        rcon: RCON client passed through to ``validate_commands``.

    Returns:
        Mapping of model tag -> stats dict with the counters ``total``
        (prompts asked), ``cmd_total`` / ``cmd_success`` / ``cmd_fail``
        (commands executed over RCON and their outcomes), ``no_commands``
        (replies with nothing executable), ``errors`` (failed queries),
        ``total_time`` (summed query seconds) and ``details`` (one record per
        prompt with the category, prompt, commands, RCON results, elapsed
        time and error).
    """
    results = {m: {"total": 0, "cmd_success": 0, "cmd_fail": 0, "cmd_total": 0,
                   "no_commands": 0, "errors": 0, "total_time": 0, "details": []}
               for m in models}

    total_prompts = sum(len(v) for v in TEST_PROMPTS.values())
    print(f"Running {total_prompts} prompts x {len(models)} models = {total_prompts * len(models)} tests\n")

    for category, prompts in TEST_PROMPTS.items():
        print(f"── {category} ──")
        for prompt in prompts:
            print(f" {prompt[:65]}")
            for model in models:
                resp = query_model(prompt, model, ollama_url)
                # A model may answer {"commands": null}; treat that as an
                # empty list so the len() in the status line cannot crash.
                commands = resp["commands"] or []
                stats = results[model]
                stats["total"] += 1
                stats["total_time"] += resp["elapsed"]

                rcon_results = []
                if resp["error"]:
                    stats["errors"] += 1
                    status = "ERR"
                else:
                    if commands:
                        rcon_results = validate_commands(commands, rcon)
                    if not rcon_results:
                        # Nothing was executed: either no commands came back
                        # or every entry was blank / not a string. Reporting
                        # that as "0✓" would read like a success.
                        stats["no_commands"] += 1
                        status = "EMPTY"
                    else:
                        ok = sum(1 for rr in rcon_results if rr["ok"])
                        fail = len(rcon_results) - ok
                        stats["cmd_success"] += ok
                        stats["cmd_fail"] += fail
                        stats["cmd_total"] += ok + fail
                        status = f"{ok}/{ok+fail}" if fail else f"{ok}✓"

                # Only the tag after the last ":" is shown to keep rows short.
                model_short = model.split(":")[-1]
                print(f" {model_short:8s} {status:8s} {resp['elapsed']:.1f}s {len(commands)} cmds")

                stats["details"].append({
                    "category": category,
                    "prompt": prompt,
                    "commands": resp["commands"],
                    "rcon_results": rcon_results,
                    "elapsed": resp["elapsed"],
                    "error": resp["error"],
                })
        # Blank line between categories.
        print()

    return results
|
||||
|
||||
|
||||
def print_summary(results, models):
    """Print the aggregate comparison table and the per-category success rates.

    Args:
        results: Mapping of model tag -> stats dict. Keys read here are
            ``total``, ``cmd_total``, ``cmd_success``, ``cmd_fail``,
            ``no_commands``, ``errors``, ``total_time`` and ``details``
            (each detail needs ``category`` and ``rcon_results``).
        models: Model tags in column order; each must be a key of ``results``.

    Categories are listed in the order they first appear in the detail
    records, so the function can also summarise results that were produced
    from a different prompt set.
    """
    label_width, category_width, col_width = 30, 25, 12
    # Keep the historical 70-character rules, but grow them when more models
    # are compared than fit in that width.
    rule_width = max(70, label_width + (col_width + 1) * len(models))
    heavy_rule = "=" * rule_width
    light_rule = "-" * rule_width
    # Columns are headed by the part of the tag after the last ":".
    column_heads = "".join(f" {m.split(':')[-1]:>{col_width}s}" for m in models)

    print(heavy_rule)
    print("BAKE-OFF RESULTS")
    print(heavy_rule)
    print(f"{'Metric':<{label_width}s}{column_heads}")
    print(light_rule)

    # max(..., 1) guards the two ratios against a model with no data.
    metrics = [
        ("Prompts tested", lambda r: r["total"]),
        ("Commands generated", lambda r: r["cmd_total"]),
        ("Commands succeeded", lambda r: r["cmd_success"]),
        ("Commands failed", lambda r: r["cmd_fail"]),
        ("Success rate", lambda r: f"{100*r['cmd_success']/max(r['cmd_total'],1):.1f}%"),
        ("Empty responses", lambda r: r["no_commands"]),
        ("Errors", lambda r: r["errors"]),
        ("Avg response time", lambda r: f"{r['total_time']/max(r['total'],1):.2f}s"),
        ("Total time", lambda r: f"{r['total_time']:.1f}s"),
    ]
    for label, fn in metrics:
        cells = "".join(f" {str(fn(results[m])):>{col_width}s}" for m in models)
        print(f"{label:<{label_width}s}{cells}")

    print(heavy_rule)

    # Category breakdown
    print("\nCATEGORY BREAKDOWN (success rate):")
    print(light_rule)
    print(f"{'Category':<{category_width}s}{column_heads}")

    # dict.fromkeys de-duplicates while keeping first-seen order.
    categories = list(dict.fromkeys(
        d["category"] for m in models for d in results[m]["details"]
    ))
    for cat in categories:
        row = f"{cat:<{category_width}s}"
        for m in models:
            outcomes = [
                rr["ok"]
                for d in results[m]["details"] if d["category"] == cat
                for rr in d["rcon_results"]
            ]
            if outcomes:
                cell = f"{100 * sum(1 for ok in outcomes if ok) / len(outcomes):.0f}%"
            else:
                cell = "N/A"
            # Format the finished text, "%" included, to the column width so
            # percentages line up with the header and the N/A cells.
            row += f" {cell:>{col_width}s}"
        print(row)

    print()
|
||||
|
||||
|
||||
def main():
    """Parse the command line, run the bake-off and save the results as JSON.

    The summary table is printed to stdout and a file named
    ``bakeoff_<models>_<unix time>.json`` is written to ``OUTPUT_DIR``
    containing the model list, a UTC timestamp, the aggregate counters and
    the per-prompt details.
    """
    import os  # local import: only needed for the RCON password fallback

    parser = argparse.ArgumentParser(description="Model bake-off")
    parser.add_argument("--models", default="mortdecai:0.4.0,mortdecai:0.5.0")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--rcon-host", default="192.168.0.244")
    parser.add_argument("--rcon-port", type=int, default=25578)
    # Let the password come from the environment so it does not have to be
    # typed on the command line (visible in the process list and shell history).
    parser.add_argument("--rcon-pass", default=os.environ.get("RCON_PASS", "REDACTED_RCON"),
                        help="RCON password (defaults to $RCON_PASS when set)")
    args = parser.parse_args()

    # Ignore empty entries (e.g. a trailing comma) and repeated tags; a
    # repeated tag would otherwise be scored twice into the same record.
    models = list(dict.fromkeys(m.strip() for m in args.models.split(",") if m.strip()))
    if not models:
        parser.error("--models must name at least one model")
    rcon = get_rcon(args.rcon_host, args.rcon_port, args.rcon_pass)

    print(f"Bake-off: {' vs '.join(models)}")
    print(f"Ollama: {args.ollama_url}")
    print(f"RCON: {args.rcon_host}:{args.rcon_port}")
    print()

    results = run_bakeoff(models, args.ollama_url, rcon)
    print_summary(results, models)

    # Save results
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    # Model tags may contain ":" or "/" (namespaced models); map anything that
    # is not filename-safe to "_" so the result is a single path component.
    safe_names = [re.sub(r"[^A-Za-z0-9._-]+", "_", m) for m in models]
    out_path = OUTPUT_DIR / f"bakeoff_{'-vs-'.join(safe_names)}_{int(time.time())}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({
            "models": models,
            # The "Z" suffix means UTC, so format gmtime() rather than local time.
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "results": {m: {k: v for k, v in r.items() if k != "details"} for m, r in results.items()},
            "details": {m: r["details"] for m, r in results.items()},
        }, f, indent=2, default=str)
    print(f"Results saved to {out_path}")
|
||||
|
||||
|
||||
# Run the bake-off only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user