0.5.0 bake-off results, knowledge lookup tools, training progress chart

Bake-off (0.5.0 vs 0.4.0):
- Overall: 46.8% vs 45.2% (+1.6%), 0 errors vs 2
- Enchantments: +47% (20% → 67%)
- EssentialsX: +60% (0% → 60%)
- Effects: +25% (0% → 25%)
- Regressions: fill_build -67%, world -20%

Knowledge Lookup Tools (4 new):
- plugin.docs_lookup: WorldGuard, WorldEdit, CoreProtect, EssentialsX, LuckPerms docs
- minecraft.changelog_lookup: version history from Minecraft Wiki
- paper.docs_lookup: Paper server-specific documentation
- Wired into gateway model-driven tool loop and exploration self-play

Exploration Self-Play:
- General (vanilla MC) and plugins focus modes
- Wiki-grounded: model researches before acting, validates through RCON
- 2,243 exploration examples generated, 150 kept after quality filtering

Training Progress Chart:
- SVG chart showing training examples and inverse loss across versions
- Added to MODEL_CARD.md for Gitea display

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 15:28:09 -04:00
parent da8f557219
commit f5118505b1
10 changed files with 3215 additions and 20 deletions
@@ -0,0 +1,297 @@
+#!/usr/bin/env python3
+"""
+Bake-off — compare model versions on standard test prompts with RCON validation.
+
+Runs the same prompts through multiple models, executes via RCON, and scores
+success rate, response quality, and speed.
+
+Usage:
+    python3 bakeoff.py --models mortdecai:0.4.0,mortdecai:0.5.0 \
+        --ollama-url http://localhost:11434 --rcon-host 192.168.0.244
+"""
+
+import argparse
+import json
+import random
+import re
+import sys
+import time
+from pathlib import Path
+
+# Directory two levels above the one that contains this script; used as the
+# anchor for the import path tweak and for the output directory below.
+PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+# Put PROJECT_ROOT first on the import path. Presumably this is what lets the
+# project-local `agent` package (imported below) resolve when the script is
+# run directly -- confirm against the repository layout.
+sys.path.insert(0, str(PROJECT_ROOT))
+
+import requests
+from agent.tools.persistent_rcon import get_rcon
+
+# Directory that main() creates if needed and writes the JSON results into.
+OUTPUT_DIR = PROJECT_ROOT / "training" / "bakeoff_results"
+
+# Standard test prompts, grouped by category.
+# Each key is a category label: run_bakeoff() prints it as a section header
+# and print_summary() uses it for the per-category success breakdown.
+# Each value is the list of player requests sent to every model under test.
+TEST_PROMPTS = {
+    "basic_give": [
+        "sudo give me a diamond sword",
+        "sudo give me 64 golden apples",
+        "sudo give me full netherite armor",
+        "sudo give me a stack of oak logs",
+    ],
+    "enchantments": [
+        "sudo give me a sword with sharpness 5 and mending",
+        "sudo give me a bow with power 5 and infinity",
+        "sudo give me boots with feather falling 4 and depth strider 3",
+        "sudo give me a trident with loyalty 3 and channeling",
+    ],
+    "effects": [
+        "sudo give me speed 2 for 5 minutes",
+        "sudo make me invisible for 60 seconds",
+        "sudo give me night vision forever",
+        "sudo give everyone resistance 3",
+    ],
+    "world": [
+        "sudo set time to day",
+        "sudo clear the weather",
+        "sudo kill all zombies",
+        "sudo summon 3 cows near me",
+    ],
+    "teleport": [
+        "sudo tp me to 0 100 0",
+        "sudo tp me 50 blocks up",
+    ],
+    "fill_build": [
+        "sudo fill a 5x5 gold platform under me",
+        "sudo place a beacon at 0 64 0",
+    ],
+    "complex": [
+        "sudo give me a mace with density 5 and wind burst 3",
+        "sudo give me a decorated pot",
+        "sudo spawn a warden 10 blocks away",
+        "sudo create a team called red with red color",
+    ],
+    "plugins_worldguard": [
+        "sudo create a region called test-region",
+        "sudo set pvp deny in the test-region",
+        "sudo list all regions",
+    ],
+    "plugins_coreprotect": [
+        "sudo check coreprotect status",
+        "sudo lookup block changes in the last hour",
+    ],
+    "plugins_essentials": [
+        "sudo set spawn here",
+        "sudo create a warp called bakeoff-test",
+        "sudo heal me",
+    ],
+    "plugins_luckperms": [
+        "sudo create a group called testers",
+        "sudo list all permission groups",
+    ],
+    "error_prone": [
+        "sudo give me a bed",
+        "sudo give me cooked beef",
+        "sudo effect give me speed",
+        "sudo fill with stone 10",
+    ],
+}
+
+# Player name that query_model() embeds in every user message, in the form
+# "Player <PLAYER>: <prompt>".
+PLAYER = "slingshooter08"
+
+
+def query_model(prompt, model, ollama_url, timeout=60):
+    """Query a model through the Ollama chat endpoint and parse its JSON reply.
+
+    Args:
+        prompt: Player request text; sent as ``"Player {PLAYER}: {prompt}"``.
+        model: Model name passed through in the request body.
+        ollama_url: Base URL of the Ollama server; a trailing slash is tolerated.
+        timeout: Timeout in seconds handed to ``requests.post``.
+
+    Returns:
+        A dict with the keys:
+          * ``commands``  - always a list (empty on failure or when the model
+            supplied nothing usable).
+          * ``reasoning`` - the model's ``reasoning`` field, or ``""``.
+          * ``elapsed``   - seconds rounded to 2 decimals.
+          * ``error``     - ``None`` on success, otherwise the exception text
+            truncated to 200 characters.
+
+        Request, HTTP and parse failures are reported through ``error``
+        rather than raised, so one bad reply cannot abort a whole run.
+    """
+    system = (
+        "/no_think\n"
+        "You are a Minecraft 1.21 command translator for a Paper server with plugins: "
+        "FastAsyncWorldEdit, WorldGuard, CoreProtect, EssentialsX, Vault, LuckPerms.\n"
+        "PERMISSION LEVEL: 4 (generous).\n"
+        "Return JSON: {\"commands\": [...], \"reasoning\": \"...\"}"
+    )
+    payload = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": system},
+            {"role": "user", "content": f"Player {PLAYER}: {prompt}"},
+        ],
+        "stream": False, "format": "json",
+        "options": {"temperature": 0.2, "num_predict": 500},
+    }
+    # Avoid "//api/chat" when the caller passes a base URL ending in "/".
+    url = ollama_url.rstrip("/") + "/api/chat"
+
+    start = time.time()
+    try:
+        r = requests.post(url, json=payload, timeout=timeout)
+        # Timing covers the HTTP round trip only, not the local parsing below.
+        elapsed = time.time() - start
+        # Surface 4xx/5xx (e.g. unknown model) as a readable HTTP error
+        # instead of a KeyError on the missing "message" field.
+        r.raise_for_status()
+        content = r.json()["message"]["content"]
+        # Drop any <think>...</think> block the model emitted before its JSON.
+        content = re.sub(r'<think>[\s\S]*?</think>\s*', '', content)
+        parsed = json.loads(content)
+        if not isinstance(parsed, dict):
+            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
+
+        # Normalise "commands" so callers can always len()/slice it: a lone
+        # string becomes a one-element list, anything else non-list (null,
+        # number, object) is treated as "no commands".
+        commands = parsed.get("commands", [])
+        if isinstance(commands, str):
+            commands = [commands]
+        elif not isinstance(commands, list):
+            commands = []
+
+        return {
+            "commands": commands,
+            "reasoning": parsed.get("reasoning", ""),
+            "elapsed": round(elapsed, 2),
+            "error": None,
+        }
+    except Exception as e:
+        # Deliberately broad: this is the per-prompt boundary of a benchmark,
+        # and every failure mode must be counted rather than crash the run.
+        return {
+            "commands": [],
+            "reasoning": "",
+            "elapsed": round(time.time() - start, 2),
+            "error": str(e)[:200],
+        }
+
+
+def validate_commands(commands, rcon):
+    """Execute model-generated commands over RCON and classify each outcome.
+
+    Args:
+        commands: List (or tuple) of command strings. A single string is
+            treated as one command; any other type yields no results.
+        rcon: Object exposing ``command(str)`` that returns the server's
+            textual response.
+
+    Returns:
+        A list of ``{"cmd", "result", "ok"}`` dicts, one per executed command.
+        Only the first 8 entries of ``commands`` are considered; non-string
+        and blank entries are skipped. ``result`` is truncated to 200
+        characters. ``ok`` is False when the response contains a known error
+        marker or when ``rcon.command`` raised.
+    """
+    # A bare string would otherwise be sliced and run one character at a time.
+    if isinstance(commands, str):
+        commands = [commands]
+    elif not isinstance(commands, (list, tuple)):
+        return []
+
+    error_markers = ("<--[HERE]", "Unknown", "Incorrect", "Expected", "Invalid")
+    results = []
+    for cmd in commands[:8]:
+        if not isinstance(cmd, str) or not cmd.strip():
+            continue
+        cmd = cmd.strip()
+        # Remove only ONE leading slash: the console form of a chat command
+        # drops the first "/", but double-slash commands (WorldEdit/FAWE
+        # "//set") must keep their second slash.
+        if cmd.startswith("/"):
+            cmd = cmd[1:]
+        if not cmd:
+            continue
+        try:
+            result = rcon.command(cmd)
+            is_err = any(marker in result for marker in error_markers)
+            results.append({"cmd": cmd, "result": result[:200], "ok": not is_err})
+        except Exception as e:
+            # Transport or type errors count as a failed command, not a crash.
+            results.append({"cmd": cmd, "result": str(e)[:200], "ok": False})
+    return results
+
+
+def run_bakeoff(models, ollama_url, rcon):
+    """Run every prompt in TEST_PROMPTS through every model and tally results.
+
+    Args:
+        models: Iterable of model names; each becomes a key of the result.
+        ollama_url: Base URL forwarded to ``query_model``.
+        rcon: RCON client forwarded to ``validate_commands``.
+
+    Returns:
+        A dict keyed by model name. Each value holds the counters ``total``,
+        ``cmd_success``, ``cmd_fail``, ``cmd_total``, ``no_commands``,
+        ``errors`` and ``total_time``, plus ``details``: one record per prompt
+        with the category, prompt, commands, RCON results, elapsed time and
+        error text.
+    """
+    results = {m: {"total": 0, "cmd_success": 0, "cmd_fail": 0, "cmd_total": 0,
+                    "no_commands": 0, "errors": 0, "total_time": 0, "details": []}
+               for m in models}
+
+    total_prompts = sum(len(v) for v in TEST_PROMPTS.values())
+    print(f"Running {total_prompts} prompts x {len(models)} models = {total_prompts * len(models)} tests\n")
+
+    for category, prompts in TEST_PROMPTS.items():
+        print(f"── {category} ──")
+        for prompt in prompts:
+            print(f"  {prompt[:65]}")
+            for model in models:
+                resp = query_model(prompt, model, ollama_url)
+                r = results[model]
+                r["total"] += 1
+                r["total_time"] += resp["elapsed"]
+
+                rcon_results = []
+                if resp["error"]:
+                    r["errors"] += 1
+                    status = "ERR"
+                elif not resp["commands"]:
+                    r["no_commands"] += 1
+                    status = "EMPTY"
+                else:
+                    rcon_results = validate_commands(resp["commands"], rcon)
+                    if not rcon_results:
+                        # Every entry was unusable (non-string or blank), so
+                        # nothing ran. Count the prompt as empty instead of
+                        # printing a misleading "0✓".
+                        r["no_commands"] += 1
+                        status = "EMPTY"
+                    else:
+                        ok = sum(1 for rr in rcon_results if rr["ok"])
+                        fail = len(rcon_results) - ok
+                        r["cmd_success"] += ok
+                        r["cmd_fail"] += fail
+                        r["cmd_total"] += ok + fail
+                        status = f"{ok}/{ok+fail}" if fail else f"{ok}✓"
+
+                # Guard the count: "commands" may be None when the model
+                # returned JSON null for it.
+                cmds = resp["commands"]
+                n_cmds = len(cmds) if hasattr(cmds, "__len__") else 0
+
+                model_short = model.split(":")[-1]
+                print(f"    {model_short:8s} {status:8s} {resp['elapsed']:.1f}s  {n_cmds} cmds")
+
+                r["details"].append({
+                    "category": category,
+                    "prompt": prompt,
+                    "commands": resp["commands"],
+                    "rcon_results": rcon_results,
+                    "elapsed": resp["elapsed"],
+                    "error": resp["error"],
+                })
+        print()
+
+    return results
+
+
+def print_summary(results, models):
+    """Print the overall comparison table and the per-category success rates.
+
+    Args:
+        results: Mapping of model name to the counters and ``details`` list
+            produced by ``run_bakeoff``.
+        models: Model names in the column order to print; each must be a key
+            of ``results``.
+
+    Column labels are the part of each model name after the last ``:``.
+    """
+    labels = [m.split(':')[-1] for m in models]
+
+    print("=" * 70)
+    print("BAKE-OFF RESULTS")
+    print("=" * 70)
+
+    header = f"{'Metric':<30s}"
+    for label in labels:
+        header += f" {label:>12s}"
+    print(header)
+    print("-" * 70)
+
+    # max(..., 1) keeps the ratios defined when nothing was run.
+    metrics = [
+        ("Prompts tested", lambda r: r["total"]),
+        ("Commands generated", lambda r: r["cmd_total"]),
+        ("Commands succeeded", lambda r: r["cmd_success"]),
+        ("Commands failed", lambda r: r["cmd_fail"]),
+        ("Success rate", lambda r: f"{100*r['cmd_success']/max(r['cmd_total'],1):.1f}%"),
+        ("Empty responses", lambda r: r["no_commands"]),
+        ("Errors", lambda r: r["errors"]),
+        ("Avg response time", lambda r: f"{r['total_time']/max(r['total'],1):.2f}s"),
+        ("Total time", lambda r: f"{r['total_time']:.1f}s"),
+    ]
+
+    for label, fn in metrics:
+        row = f"{label:<30s}"
+        for m in models:
+            val = fn(results[m])
+            row += f" {str(val):>12s}"
+        print(row)
+
+    print("=" * 70)
+
+    # Category breakdown
+    print("\nCATEGORY BREAKDOWN (success rate):")
+    print("-" * 70)
+    categories = list(TEST_PROMPTS.keys())
+    header = f"{'Category':<25s}"
+    for label in labels:
+        header += f" {label:>12s}"
+    print(header)
+
+    for cat in categories:
+        row = f"{cat:<25s}"
+        for m in models:
+            cat_details = [d for d in results[m]["details"] if d["category"] == cat]
+            cat_ok = sum(sum(1 for rr in d["rcon_results"] if rr["ok"]) for d in cat_details)
+            cat_total = sum(len(d["rcon_results"]) for d in cat_details)
+            # Build the cell text first, then pad it to the same 12-character
+            # width as the header and "N/A" cells so the columns line up.
+            cell = f"{100*cat_ok/cat_total:.0f}%" if cat_total > 0 else "N/A"
+            row += f" {cell:>12s}"
+        print(row)
+
+    print()
+
+
+def main():
+    """Parse CLI options, run the bake-off, print the summary, save the JSON.
+
+    The results file is written to OUTPUT_DIR as
+    ``bakeoff_<model>-vs-<model>_<unix time>.json``. It holds the model list,
+    a UTC timestamp, the per-model counters and the per-prompt details.
+    """
+    parser = argparse.ArgumentParser(description="Model bake-off")
+    parser.add_argument("--models", default="mortdecai:0.4.0,mortdecai:0.5.0")
+    parser.add_argument("--ollama-url", default="http://localhost:11434")
+    parser.add_argument("--rcon-host", default="192.168.0.244")
+    parser.add_argument("--rcon-port", type=int, default=25578)
+    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
+    args = parser.parse_args()
+
+    # Ignore blank entries such as a trailing comma in "a,b,".
+    models = [m.strip() for m in args.models.split(",") if m.strip()]
+    if not models:
+        parser.error("--models must name at least one model")
+    rcon = get_rcon(args.rcon_host, args.rcon_port, args.rcon_pass)
+
+    print(f"Bake-off: {' vs '.join(models)}")
+    print(f"Ollama: {args.ollama_url}")
+    print(f"RCON: {args.rcon_host}:{args.rcon_port}")
+    print()
+
+    results = run_bakeoff(models, args.ollama_url, rcon)
+    print_summary(results, models)
+
+    # Save results
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+    # Model names may contain ":" or "/" (registry paths); map anything that
+    # is not filename-safe to "_" so the name stays a single path component.
+    safe_names = [re.sub(r"[^A-Za-z0-9._-]+", "_", m) for m in models]
+    out_path = OUTPUT_DIR / f"bakeoff_{'-vs-'.join(safe_names)}_{int(time.time())}.json"
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump({
+            "models": models,
+            # The "Z" suffix denotes UTC, so format from gmtime() rather than
+            # the local-time default.
+            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+            "results": {m: {k: v for k, v in r.items() if k != "details"} for m, r in results.items()},
+            "details": {m: r["details"] for m, r in results.items()},
+        }, f, indent=2, default=str)
+    print(f"Results saved to {out_path}")
+
+
+if __name__ == "__main__":
+    main()