0.5.0 bake-off results, knowledge lookup tools, training progress chart

Bake-off (0.5.0 vs 0.4.0):
- Overall: 46.8% vs 45.2% (+1.6%), 0 errors vs 2
- Enchantments: +47% (20% → 67%)
- EssentialsX: +60% (0% → 60%)
- Effects: +25% (0% → 25%)
- Regressions: fill_build -67%, world -20%

Knowledge Lookup Tools (4 new):
- plugin.docs_lookup: WorldGuard, WorldEdit, CoreProtect, EssentialsX, LuckPerms docs
- minecraft.changelog_lookup: version history from Minecraft Wiki
- paper.docs_lookup: Paper server-specific documentation
- Wired into gateway model-driven tool loop and exploration self-play

Exploration Self-Play:
- General (vanilla MC) and plugins focus modes
- Wiki-grounded: model researches before acting, validates through RCON
- 2,243 exploration examples generated, 150 kept after quality filtering

Training Progress Chart:
- SVG chart showing training examples and inverse loss across versions
- Added to MODEL_CARD.md for Gitea display

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 15:28:09 -04:00
parent da8f557219
commit f5118505b1
10 changed files with 3215 additions and 20 deletions
@@ -0,0 +1,297 @@
+#!/usr/bin/env python3
+"""
+Bake-off — compare model versions on standard test prompts with RCON validation.
+
+Runs the same prompts through multiple models, executes via RCON, and scores
+success rate, response quality, and speed.
+
+Usage:
+    python3 bakeoff.py --models mortdecai:0.4.0,mortdecai:0.5.0 \
+        --ollama-url http://localhost:11434 --rcon-host 192.168.0.244
+"""
+
+import argparse
+import json
+import random
+import re
+import sys
+import time
+from pathlib import Path
+
+PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.insert(0, str(PROJECT_ROOT))
+
+import requests
+from agent.tools.persistent_rcon import get_rcon
+
+OUTPUT_DIR = PROJECT_ROOT / "training" / "bakeoff_results"
+
+# Standard test prompts, grouped by category name.
+# run_bakeoff() sends every prompt to every model, walking the categories in
+# the order written here, and print_summary() reports one success-rate row per
+# category key.  query_model() wraps each prompt as "Player <PLAYER>: <prompt>"
+# before sending it to the model.
+TEST_PROMPTS = {
+    "basic_give": [
+        "sudo give me a diamond sword",
+        "sudo give me 64 golden apples",
+        "sudo give me full netherite armor",
+        "sudo give me a stack of oak logs",
+    ],
+    "enchantments": [
+        "sudo give me a sword with sharpness 5 and mending",
+        "sudo give me a bow with power 5 and infinity",
+        "sudo give me boots with feather falling 4 and depth strider 3",
+        "sudo give me a trident with loyalty 3 and channeling",
+    ],
+    "effects": [
+        "sudo give me speed 2 for 5 minutes",
+        "sudo make me invisible for 60 seconds",
+        "sudo give me night vision forever",
+        "sudo give everyone resistance 3",
+    ],
+    "world": [
+        "sudo set time to day",
+        "sudo clear the weather",
+        "sudo kill all zombies",
+        "sudo summon 3 cows near me",
+    ],
+    "teleport": [
+        "sudo tp me to 0 100 0",
+        "sudo tp me 50 blocks up",
+    ],
+    "fill_build": [
+        "sudo fill a 5x5 gold platform under me",
+        "sudo place a beacon at 0 64 0",
+    ],
+    "complex": [
+        "sudo give me a mace with density 5 and wind burst 3",
+        "sudo give me a decorated pot",
+        "sudo spawn a warden 10 blocks away",
+        "sudo create a team called red with red color",
+    ],
+    # The plugins_* categories target plugins named in query_model()'s system
+    # prompt (WorldGuard, CoreProtect, EssentialsX, LuckPerms).
+    "plugins_worldguard": [
+        "sudo create a region called test-region",
+        "sudo set pvp deny in the test-region",
+        "sudo list all regions",
+    ],
+    "plugins_coreprotect": [
+        "sudo check coreprotect status",
+        "sudo lookup block changes in the last hour",
+    ],
+    "plugins_essentials": [
+        "sudo set spawn here",
+        "sudo create a warp called bakeoff-test",
+        "sudo heal me",
+    ],
+    "plugins_luckperms": [
+        "sudo create a group called testers",
+        "sudo list all permission groups",
+    ],
+    # NOTE(review): presumably prompts that are under-specified (no colour,
+    # amount, duration or target) and therefore easy to get wrong — confirm.
+    "error_prone": [
+        "sudo give me a bed",
+        "sudo give me cooked beef",
+        "sudo effect give me speed",
+        "sudo fill with stone 10",
+    ],
+}
+
+# Player name inserted into the user message that query_model() sends.
+PLAYER = "slingshooter08"
+
+
+def _normalize_commands(value):
+    """Coerce the model's ``commands`` field into a list.
+
+    A bare string is treated as a single command.  Anything that is neither a
+    string nor a list (``null``, a number, an object) yields an empty list, so
+    callers can always apply ``len()`` and slicing to the result.
+    """
+    if isinstance(value, str):
+        return [value]
+    if isinstance(value, list):
+        return value
+    return []
+
+
+def query_model(prompt, model, ollama_url, timeout=60):
+    """Query one model through ``{ollama_url}/api/chat`` and parse its reply.
+
+    Args:
+        prompt: The player's request; sent as ``"Player <PLAYER>: <prompt>"``.
+        model: Model name passed in the request's ``model`` field.
+        ollama_url: Base URL of the server (no trailing ``/api/chat``).
+        timeout: Request timeout in seconds, handed to ``requests.post``.
+
+    Returns:
+        A dict with the keys
+        ``commands`` (always a list), ``reasoning``, ``elapsed`` (seconds,
+        rounded to two decimals) and ``error`` (``None`` on success, otherwise
+        the exception text truncated to 200 characters).
+
+    This function never raises: every failure (network, HTTP status, malformed
+    JSON) is reported through the ``error`` field so that one bad reply does
+    not abort a whole bake-off run.
+    """
+    system = (
+        "/no_think\n"
+        "You are a Minecraft 1.21 command translator for a Paper server with plugins: "
+        "FastAsyncWorldEdit, WorldGuard, CoreProtect, EssentialsX, Vault, LuckPerms.\n"
+        "PERMISSION LEVEL: 4 (generous).\n"
+        "Return JSON: {\"commands\": [...], \"reasoning\": \"...\"}"
+    )
+
+    start = time.time()
+    try:
+        r = requests.post(f"{ollama_url}/api/chat", json={
+            "model": model,
+            "messages": [
+                {"role": "system", "content": system},
+                {"role": "user", "content": f"Player {PLAYER}: {prompt}"},
+            ],
+            "stream": False, "format": "json",
+            "options": {"temperature": 0.2, "num_predict": 500},
+        }, timeout=timeout)
+        # Timing covers the request only, not the local parsing below.
+        elapsed = time.time() - start
+        # Report HTTP failures as such instead of as a KeyError on "message".
+        r.raise_for_status()
+        content = r.json()["message"]["content"]
+        # Drop any <think>...</think> block the model emitted before its JSON.
+        content = re.sub(r'<think>[\s\S]*?</think>\s*', '', content)
+        parsed = json.loads(content)
+        if not isinstance(parsed, dict):
+            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
+        return {
+            # Normalised so callers never see None or a bare string here.
+            "commands": _normalize_commands(parsed.get("commands")),
+            "reasoning": parsed.get("reasoning", ""),
+            "elapsed": round(elapsed, 2),
+            "error": None,
+        }
+    except Exception as e:
+        # Deliberate catch-all: a failed query is a scored outcome, not a crash.
+        return {
+            "commands": [],
+            "reasoning": "",
+            "elapsed": round(time.time() - start, 2),
+            "error": str(e)[:200],
+        }
+
+
+def validate_commands(commands, rcon, max_commands=8):
+    """Run model-generated commands through RCON and classify each result.
+
+    Args:
+        commands: List (or tuple) of command strings.  A bare string is treated
+            as one command; ``None`` or any other type is treated as empty.
+        rcon: Object exposing ``command(str)`` that returns the server's reply.
+        max_commands: Only the first ``max_commands`` entries are considered.
+
+    Returns:
+        A list of ``{"cmd", "result", "ok"}`` dicts, one per command that was
+        actually sent.  Non-string and blank entries are skipped.  ``result``
+        is the reply truncated to 200 characters, or the exception text when
+        the RCON call raised.  ``ok`` is False when the call raised or the
+        reply contains one of the known error markers.
+    """
+    if isinstance(commands, str):
+        commands = [commands]
+    elif not isinstance(commands, (list, tuple)):
+        commands = []
+
+    # Substrings that mark a rejected command in the server's reply.
+    error_markers = ("<--[HERE]", "Unknown", "Incorrect", "Expected", "Invalid")
+
+    results = []
+    for cmd in commands[:max_commands]:
+        if not isinstance(cmd, str) or not cmd.strip():
+            continue
+        cmd = cmd.strip()
+        # Remove only the chat-style leading slash.  WorldEdit commands are
+        # written "//set" in chat and "/set" on the console, so stripping
+        # every slash would turn them into a different command.
+        if cmd.startswith("/"):
+            cmd = cmd[1:]
+        if not cmd:
+            continue
+        try:
+            result = rcon.command(cmd)
+            is_err = any(marker in result for marker in error_markers)
+            results.append({"cmd": cmd, "result": result[:200], "ok": not is_err})
+        except Exception as e:
+            # A transport failure counts as a failed command, not a crash.
+            results.append({"cmd": cmd, "result": str(e), "ok": False})
+    return results
+
+
+def run_bakeoff(models, ollama_url, rcon):
+    """Send every prompt in TEST_PROMPTS to each model and tally the outcome.
+
+    For each (prompt, model) pair the model is queried, any commands it
+    returns are executed through ``rcon``, and that model's counters are
+    updated.  Progress is printed as the run proceeds.
+
+    Returns a dict keyed by model name.  Each value holds the counters
+    ``total``, ``cmd_success``, ``cmd_fail``, ``cmd_total``, ``no_commands``,
+    ``errors`` and ``total_time``, plus a ``details`` list with one record per
+    prompt.
+    """
+    scoreboard = {}
+    for name in models:
+        scoreboard[name] = {
+            "total": 0, "cmd_success": 0, "cmd_fail": 0, "cmd_total": 0,
+            "no_commands": 0, "errors": 0, "total_time": 0, "details": [],
+        }
+
+    total_prompts = sum(map(len, TEST_PROMPTS.values()))
+    print(f"Running {total_prompts} prompts x {len(models)} models = {total_prompts * len(models)} tests\n")
+
+    for category in TEST_PROMPTS:
+        print(f"── {category} ──")
+        for prompt in TEST_PROMPTS[category]:
+            print(f"  {prompt[:65]}")
+            for model in models:
+                reply = query_model(prompt, model, ollama_url)
+                tally = scoreboard[model]
+                tally["total"] += 1
+                tally["total_time"] += reply["elapsed"]
+
+                # Only an error-free, non-empty reply is sent to the server.
+                executed = []
+                if reply["error"]:
+                    tally["errors"] += 1
+                    status = "ERR"
+                elif not reply["commands"]:
+                    tally["no_commands"] += 1
+                    status = "EMPTY"
+                else:
+                    executed = validate_commands(reply["commands"], rcon)
+                    passed = sum(1 for entry in executed if entry["ok"])
+                    failed = sum(1 for entry in executed if not entry["ok"])
+                    tally["cmd_success"] += passed
+                    tally["cmd_fail"] += failed
+                    tally["cmd_total"] += passed + failed
+                    if failed:
+                        status = f"{passed}/{passed+failed}"
+                    else:
+                        status = f"{passed}✓"
+
+                # Label the progress line with the part after the last colon.
+                model_short = model.split(":")[-1]
+                print(f"    {model_short:8s} {status:8s} {reply['elapsed']:.1f}s  {len(reply['commands'])} cmds")
+
+                record = {
+                    "category": category,
+                    "prompt": prompt,
+                    "commands": reply["commands"],
+                    "rcon_results": executed,
+                    "elapsed": reply["elapsed"],
+                    "error": reply["error"],
+                }
+                tally["details"].append(record)
+        print()
+
+    return scoreboard
+
+
+def print_summary(results, models):
+    """Print the overall comparison table and the per-category breakdown.
+
+    Args:
+        results: Mapping of model name to the counters dict built by
+            run_bakeoff() (``total``, ``cmd_total``, ``cmd_success``,
+            ``cmd_fail``, ``no_commands``, ``errors``, ``total_time``,
+            ``details``).
+        models: Model names in the column order to print.
+
+    Each column is headed by the part of the model name after the last colon.
+    Rates use ``max(denominator, 1)`` so a model with no commands or prompts
+    prints 0 instead of dividing by zero.
+    """
+    print("=" * 70)
+    print("BAKE-OFF RESULTS")
+    print("=" * 70)
+
+    header = f"{'Metric':<30s}"
+    for m in models:
+        header += f" {m.split(':')[-1]:>12s}"
+    print(header)
+    print("-" * 70)
+
+    metrics = [
+        ("Prompts tested", lambda r: r["total"]),
+        ("Commands generated", lambda r: r["cmd_total"]),
+        ("Commands succeeded", lambda r: r["cmd_success"]),
+        ("Commands failed", lambda r: r["cmd_fail"]),
+        ("Success rate", lambda r: f"{100*r['cmd_success']/max(r['cmd_total'],1):.1f}%"),
+        ("Empty responses", lambda r: r["no_commands"]),
+        ("Errors", lambda r: r["errors"]),
+        ("Avg response time", lambda r: f"{r['total_time']/max(r['total'],1):.2f}s"),
+        ("Total time", lambda r: f"{r['total_time']:.1f}s"),
+    ]
+
+    for label, fn in metrics:
+        row = f"{label:<30s}"
+        for m in models:
+            val = fn(results[m])
+            row += f" {str(val):>12s}"
+        print(row)
+
+    print("=" * 70)
+
+    # Category breakdown
+    print("\nCATEGORY BREAKDOWN (success rate):")
+    print("-" * 70)
+    categories = list(TEST_PROMPTS.keys())
+    header = f"{'Category':<25s}"
+    for m in models:
+        header += f" {m.split(':')[-1]:>12s}"
+    print(header)
+
+    for cat in categories:
+        row = f"{cat:<25s}"
+        for m in models:
+            cat_details = [d for d in results[m]["details"] if d["category"] == cat]
+            cat_ok = sum(sum(1 for rr in d["rcon_results"] if rr["ok"]) for d in cat_details)
+            cat_total = sum(len(d["rcon_results"]) for d in cat_details)
+            # Build the cell as text first so percentages and "N/A" share the
+            # same 12-character width as the column headers above.
+            if cat_total > 0:
+                cell = f"{100*cat_ok/cat_total:.0f}%"
+            else:
+                cell = "N/A"
+            row += f" {cell:>12s}"
+        print(row)
+
+    print()
+
+
+def main():
+    """Parse the command line, run the bake-off, and save the results as JSON.
+
+    The report is written to ``OUTPUT_DIR`` as
+    ``bakeoff_<model>-vs-<model>_<unix time>.json``.  It contains the model
+    list, a UTC timestamp, the per-model counters under ``results`` and the
+    per-prompt records under ``details``.
+    """
+    parser = argparse.ArgumentParser(description="Model bake-off")
+    parser.add_argument("--models", default="mortdecai:0.4.0,mortdecai:0.5.0")
+    parser.add_argument("--ollama-url", default="http://localhost:11434")
+    parser.add_argument("--rcon-host", default="192.168.0.244")
+    parser.add_argument("--rcon-port", type=int, default=25578)
+    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
+    args = parser.parse_args()
+
+    # Drop blanks so a trailing comma in --models does not add a "" model.
+    models = [m.strip() for m in args.models.split(",") if m.strip()]
+    if not models:
+        parser.error("--models must name at least one model")
+    rcon = get_rcon(args.rcon_host, args.rcon_port, args.rcon_pass)
+
+    print(f"Bake-off: {' vs '.join(models)}")
+    print(f"Ollama: {args.ollama_url}")
+    print(f"RCON: {args.rcon_host}:{args.rcon_port}")
+    print()
+
+    results = run_bakeoff(models, args.ollama_url, rcon)
+    print_summary(results, models)
+
+    # Save results
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+    # Model names may contain ":" or "/"; replace anything that is not safe in
+    # a file name so the report can still be written after a full run.
+    safe_names = [re.sub(r"[^A-Za-z0-9._-]+", "_", m) for m in models]
+    out_path = OUTPUT_DIR / f"bakeoff_{'-vs-'.join(safe_names)}_{int(time.time())}.json"
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump({
+            "models": models,
+            # The trailing "Z" means UTC, so format gmtime rather than local time.
+            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+            "results": {m: {k: v for k, v in r.items() if k != "details"} for m, r in results.items()},
+            "details": {m: r["details"] for m, r in results.items()},
+        }, f, indent=2, default=str)
+    print(f"Results saved to {out_path}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,411 @@
+#!/usr/bin/env python3
+"""
+Exploration Self-Play — model uses wiki_lookup to explore Minecraft knowledge,
+then validates its understanding through RCON commands.
+
+Unlike canned self-play, the model drives its own curiosity:
+1. Gets a broad topic ("explore enchantments", "learn about 1.21 items")
+2. Uses minecraft.wiki_lookup to research
+3. Generates commands based on what it learned
+4. RCON validates correctness
+5. If wrong, researches more and corrects
+
+Produces gold-standard knowledge-grounded training data.
+
+Usage:
+    python3 exploration_self_play.py --ollama-url http://localhost:11434 \
+        --model mortdecai:0.5.0 --rcon-host 192.168.0.244 --rcon-port 25578
+"""
+
+import argparse
+import json
+import random
+import re
+import sys
+import time
+from pathlib import Path
+
+PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.insert(0, str(PROJECT_ROOT))
+
+import requests
+from agent.tools.persistent_rcon import get_rcon
+
+OUTPUT_DIR = PROJECT_ROOT / "data" / "raw" / "exploration_selfplay"
+
+PLAYERS = ["slingshooter08", "Ace13245", "TheBigBoss", "xXDragonSlayerXx"]
+
+# Topics for the model to explore — broad enough that it needs to look things up.
+# This list is the plugin/script-focused set: main() uses it for
+# --focus plugins and appends it to EXPLORATION_TOPICS for --focus all.
+# "{p}" is a placeholder that run_exploration() replaces with the player name.
+EXPLORATION_TOPICS_PLUGINS = [
+    # WorldGuard deep dive
+    "Research all WorldGuard region flags. Create a region and test each flag one at a time for {p}.",
+    "Look up how WorldGuard region priorities work. Create overlapping regions with different rules.",
+    "Research WorldGuard's __global__ region. What flags can you set globally? Test a few.",
+    "Look up WorldGuard entry/exit deny flags. Create a VIP-only zone and test it.",
+    "Research how to make a WorldGuard region that heals players. Set it up near {p}.",
+    "What WorldGuard flags control explosions? Research and create a blast-proof zone.",
+    "Look up how to block specific commands in a WorldGuard region. Test with /home.",
+    "Research WorldGuard greeting and farewell messages. Set up regions with welcome messages.",
+
+    # CoreProtect deep dive
+    "Research all CoreProtect action types (block, container, chat, command). Test /co lookup with each.",
+    "Look up CoreProtect time format syntax. Test rollbacks with different time ranges (1h, 30m, 7d).",
+    "Research how CoreProtect handles container logging. Place a chest, add items, then lookup the history.",
+    "What CoreProtect parameters filter by block type? Test rolling back only specific blocks.",
+    "Look up how to use CoreProtect radius parameter. Test different radius values.",
+    "Research CoreProtect restore vs rollback — what's the difference? Demonstrate both.",
+
+    # EssentialsX deep dive
+    "Research all EssentialsX economy commands. Set up a working economy with /eco, /balance, /pay.",
+    "Look up EssentialsX kit creation syntax. Create a starter kit and a VIP kit.",
+    "Research EssentialsX warp system. Create 5 warps at interesting locations.",
+    "What EssentialsX commands exist for player management? Test /nick, /seen, /whois.",
+    "Look up EssentialsX home system. Set multiple named homes for {p}.",
+    "Research EssentialsX god mode, fly mode, and speed commands. Test all three.",
+    "What EssentialsX commands modify the world? Test /sun, /storm, /day, /night.",
+
+    # LuckPerms deep dive
+    "Research LuckPerms group inheritance. Create parent and child groups and test permission flow.",
+    "Look up LuckPerms temporary permissions. Give {p} temp fly access for 5 minutes.",
+    "Research LuckPerms meta (prefix/suffix). Set up colored chat prefixes for different groups.",
+    "What LuckPerms commands check a user's permissions? Audit {p}'s current permissions.",
+    "Look up how to create a LuckPerms permission ladder (default -> member -> vip -> admin).",
+    "Research LuckPerms weight system. How do group priorities work?",
+
+    # FAWE/WorldEdit deep dive
+    "Research all WorldEdit shape commands (sphere, cyl, pyramid). Build one of each near {p}.",
+    "Look up WorldEdit brush types. What brushes exist beyond sphere brush?",
+    "Research WorldEdit mask syntax. How do masks work with //replace?",
+    "What WorldEdit clipboard operations exist? Test //copy, //paste, //rotate, //flip.",
+    "Look up WorldEdit pattern syntax. Can you mix multiple blocks in one command?",
+    "Research WorldEdit //generate command. Can it make mathematical surfaces?",
+    "What WorldEdit selection modes exist? Test //sel cuboid vs poly vs sphere.",
+
+    # Script writing exploration
+    "Research Minecraft datapack function syntax. Write a mcfunction that creates a parkour course.",
+    "Look up how Minecraft tick functions work. Write one that makes particles at spawn.",
+    "Research how to chain mcfunctions together. Write a main function that calls sub-functions.",
+    "What Minecraft datapack tags control function scheduling? Test tick.json and load.json.",
+    "Look up execute command syntax for mcfunctions. Write a script using execute at/as/if.",
+    "Research scoreboard objectives. Write a script that tracks player kills and announces leaders.",
+
+    # Multi-plugin combos
+    "Research how to combine WorldEdit builds with WorldGuard protection. Build and protect an arena.",
+    "Look up how to use CoreProtect to undo WorldEdit operations specifically.",
+    "Research combining LuckPerms with WorldGuard — can you tie region access to permission groups?",
+    "Create a complete server setup: spawn area (WE), protected (WG), with warps (Ess) and perms (LP).",
+    "Research how to build a minigame arena: WE for building, WG for rules, scoreboards for tracking.",
+]
+
+# Default topic set: main() uses it for --focus general and combines it with
+# EXPLORATION_TOPICS_PLUGINS for --focus all.  "{p}" is replaced with the
+# player name by run_exploration().
+# NOTE(review): --focus help describes "general" as vanilla MC, yet the last
+# section below contains plugin topics — confirm whether that is intended.
+EXPLORATION_TOPICS = [
+    # Items and crafting
+    "What are all the new items added in 1.21? Look them up and give one of each to {p}.",
+    "Research every type of arrow (tipped arrows) and give {p} one of each.",
+    "Look up all the banner patterns available and create a cool banner for {p}.",
+    "What suspicious stew effects exist? Research and give {p} the best one.",
+    "Research all the different types of potions and give {p} the three most useful ones.",
+    "What are all the different horse armor types? Look them up and give one of each to {p}.",
+    "Research all smithing templates and give {p} the rarest ones.",
+    "Look up every type of spawn egg and give {p} five interesting ones.",
+
+    # Enchantments
+    "Research the best enchantment setup for a full netherite armor set. Give it to {p}.",
+    "What enchantments are exclusive to each other? Look them up and explain while giving {p} examples.",
+    "Research the difference between Protection, Fire Protection, Blast Protection, and Projectile Protection. Which is best for general use? Give {p} the optimal set.",
+    "Look up what Thorns does exactly — is it worth using? Give {p} armor with and without it to test.",
+    "Research Sweeping Edge — does it still exist in 1.21? Give {p} a sword with the correct enchantments.",
+    "What's the maximum level for each enchantment? Research and give {p} a tool with impossible levels vs correct levels.",
+
+    # Effects and potions
+    "Research all status effects in 1.21. Which ones are new? Apply the 3 newest ones to {p}.",
+    "Look up the Ominous Bottle effect — what does it do? Give one to {p}.",
+    "What's the difference between Strength and Haste? Research and apply the right one for mining.",
+    "Research what Wind Charged does. Apply it to {p}.",
+    "Look up all negative effects and their max safe durations. Apply a brief demonstration.",
+    "What effect does a Beacon give? Research all beacon effects and apply them.",
+
+    # Mobs and entities
+    "Research all tameable mobs in 1.21. Summon one of each near {p}.",
+    "What mobs were added or changed in 1.21? Look them up and summon the new ones.",
+    "Research the Breeze mob — what does it drop? Summon one for {p}.",
+    "Look up all rideable mobs and summon one for {p} with a saddle.",
+    "What's the strongest mob in the game? Research its stats and summon it (carefully).",
+    "Research all fish types and summon them in water near {p}.",
+
+    # Blocks and building
+    "Research all copper block variants and their oxidation states. Place examples near {p}.",
+    "What blocks emit light? Look up all light-emitting blocks and demonstrate.",
+    "Research all types of stairs, slabs, and walls available in 1.21.",
+    "Look up how to make colored concrete powder and place a rainbow near {p}.",
+    "What are all the glazed terracotta patterns? Research and place one of each.",
+    "Research redstone components — what's the difference between a comparator and repeater?",
+
+    # Commands and mechanics
+    "Research the /place command. What can it place? Demonstrate with a structure.",
+    "Look up the /damage command syntax and demonstrate different damage types on a mob.",
+    "Research /attribute — what attributes can be modified? Give {p} double health.",
+    "What does the /ride command do? Research and demonstrate.",
+    "Look up /fillbiome — can you change the biome? Try it near {p}.",
+    "Research the /random command added in 1.21. What can it do?",
+
+    # Worldgen and structures
+    "Research all structure types that /locate can find. Find the 3 nearest to {p}.",
+    "What biomes exist in 1.21? Look up any new ones and locate them.",
+    "Research Trial Chambers — where do they spawn? Locate one for {p}.",
+
+    # Plugin-specific research
+    "Research WorldGuard region flags — what flags exist? Set up a demo region with interesting flags.",
+    "Look up CoreProtect rollback syntax — what parameters does it accept?",
+    "Research LuckPerms group inheritance — how do child groups work?",
+    "What WorldEdit brushes are available? Research and describe them.",
+    "Look up EssentialsX economy commands — set up a basic economy demonstration.",
+]
+
+
+def wiki_lookup(query, timeout=15):
+    """Look up *query* on the Minecraft Wiki through its ``api.php`` endpoint.
+
+    Two requests are made against ``https://minecraft.wiki/api.php``: an
+    ``opensearch`` call to find matching page titles, then a ``query`` call
+    with ``prop=extracts`` for the plain-text intro of the first match.
+
+    Args:
+        query: Free-text search string.
+        timeout: Per-request timeout in seconds.
+
+    Returns:
+        ``{"content", "url", "ok"}``.  On success ``content`` is the intro
+        text truncated to 1500 characters and ``url`` points at the page.
+        When nothing is found, or any request or parsing step fails, ``ok``
+        is False, ``url`` is empty and ``content`` explains why.  The function
+        never raises, so the result can always be fed back to the model.
+    """
+    from urllib.parse import quote
+
+    api_url = "https://minecraft.wiki/api.php"
+    not_found = {"content": f"No wiki results for: {query}", "url": "", "ok": False}
+    try:
+        search = requests.get(
+            api_url,
+            params={"action": "opensearch", "search": query, "limit": 3, "format": "json"},
+            timeout=timeout,
+        )
+        search.raise_for_status()
+        results = search.json()
+        # The reply is read as a list with titles at index 1 and URLs at
+        # index 3; anything else (e.g. an error object) counts as no result.
+        if not isinstance(results, list) or len(results) < 4 or not results[1]:
+            return not_found
+
+        title = results[1][0]
+        urls = results[3]
+        if urls:
+            page_url = urls[0]
+        else:
+            # Fallback link: wiki page paths use underscores for spaces, and
+            # other special characters must be percent-encoded.
+            page_url = f"https://minecraft.wiki/w/{quote(title.replace(' ', '_'))}"
+
+        # Fetch first result summary
+        summary = requests.get(
+            api_url,
+            params={
+                "action": "query", "prop": "extracts",
+                "exintro": True, "explaintext": True,
+                "titles": title, "format": "json",
+            },
+            timeout=timeout,
+        )
+        summary.raise_for_status()
+        pages = summary.json().get("query", {}).get("pages", {})
+        for page in pages.values():
+            extract = page.get("extract", "")
+            if extract:
+                return {"content": extract[:1500], "url": page_url, "ok": True}
+        return not_found
+    except Exception as e:
+        # Deliberate catch-all: a failed lookup is reported as a tool result.
+        return {"content": f"Wiki lookup failed: {e}", "url": "", "ok": False}
+
+
+def run_exploration(topic, player, ollama_url, model, rcon):
+    """Run one exploration round: the model researches a topic and acts on it.
+
+    The model is prompted with the topic and may answer with ``<tool_call>``
+    blocks.  Each call is dispatched to the wiki lookup, the knowledge-lookup
+    tools or RCON, and the result is appended to the conversation.  The loop
+    ends when a reply contains no tool call, the LLM request fails, or ten
+    steps have been taken.
+
+    Args:
+        topic: Topic text; every ``{p}`` is replaced with ``player``.
+        player: Player name used in the prompt.
+        ollama_url: Base URL of the Ollama-style chat server.
+        model: Model name sent in each chat request.
+        rcon: Object exposing ``command(str)`` for executing server commands.
+
+    Returns:
+        One training-example dict with ``id``, ``source``, ``type``, ``input``,
+        ``output`` (commands, reasoning, wiki_topics), ``tool_trace``,
+        ``messages`` and ``metadata``.  ``metadata["success_rate"]`` is the
+        share of ``rcon.execute`` calls in the trace that succeeded.
+    """
+    system = (
+        "/no_think\n"
+        "You are a Minecraft 1.21 expert on a Paper server with plugins: "
+        "WorldGuard, CoreProtect, EssentialsX, LuckPerms, FastAsyncWorldEdit.\n\n"
+        "You have these lookup tools:\n"
+        "- minecraft.wiki_lookup: {\"query\": \"...\"} — Minecraft Wiki for items, mobs, commands\n"
+        "- plugin.docs_lookup: {\"plugin\": \"worldguard|worldedit|coreprotect|essentialsx|luckperms\", \"query\": \"...\"} — plugin documentation\n"
+        "- minecraft.changelog_lookup: {\"query\": \"...\", \"version\": \"1.21\"} — version changes\n"
+        "- paper.docs_lookup: {\"query\": \"...\"} — Paper server docs\n"
+        "- rcon.execute: {\"command\": \"...\"} — execute a Minecraft command\n\n"
+        "WORKFLOW:\n"
+        "1. Research the topic using the appropriate lookup tool\n"
+        "2. For plugin commands, use plugin.docs_lookup instead of minecraft.wiki_lookup\n"
+        "3. Generate and execute commands via rcon.execute\n"
+        "4. If a command fails, look up the correct syntax and try again\n\n"
+        "To call a tool, respond with:\n"
+        "<tool_call>\n{\"name\": \"tool_name\", \"arguments\": {...}}\n</tool_call>\n\n"
+        "When done, respond with final JSON:\n"
+        "{\"commands\": [...], \"reasoning\": \"what you learned\", \"wiki_topics\": [\"topics you looked up\"]}\n\n"
+        "Be curious. ALWAYS look things up before guessing. Verify your knowledge."
+    )
+
+    topic_resolved = topic.replace("{p}", player)
+    messages = [
+        {"role": "system", "content": system},
+        {"role": "user", "content": f"Player {player}: {topic_resolved}"},
+    ]
+
+    tool_trace = []
+    all_commands = []
+    wiki_topics = []
+    max_steps = 10
+    # Same rejection markers the bake-off's validate_commands() checks, so both
+    # scripts classify a server reply the same way.
+    error_markers = ("<--[HERE]", "Unknown", "Incorrect", "Expected", "Invalid")
+    # Pre-set so the final-answer parsing below still works when the very
+    # first LLM request fails and the loop exits without any reply.
+    raw = ""
+    steps_taken = 0
+
+    for step in range(max_steps):
+        steps_taken = step + 1
+        try:
+            r = requests.post(f"{ollama_url}/api/chat", json={
+                "model": model,
+                "messages": messages,
+                "stream": False,
+                "options": {"temperature": 0.6, "num_predict": 800},
+            }, timeout=120)
+            r.raise_for_status()
+            raw = r.json()["message"]["content"]
+        except Exception as e:
+            print(f"    LLM error: {e}")
+            break
+
+        raw = re.sub(r'<think>[\s\S]*?</think>\s*', '', raw)
+
+        # Check for tool calls
+        tool_matches = re.findall(r'<tool_call>\s*(\{.*?\})\s*</tool_call>', raw, re.DOTALL)
+
+        if not tool_matches:
+            # Final response — done exploring
+            break
+
+        for tc_json in tool_matches:
+            try:
+                tc = json.loads(tc_json)
+            except json.JSONDecodeError:
+                # Malformed call: skip it; the model is simply asked again.
+                continue
+            tool_name = tc.get("name", "")
+            tool_args = tc.get("arguments", {})
+            if not isinstance(tool_args, dict):
+                # The model sometimes sends a bare string; treat it as no args.
+                tool_args = {}
+
+            if tool_name == "minecraft.wiki_lookup":
+                query = str(tool_args.get("query", ""))
+                wiki_topics.append(query)
+                result = wiki_lookup(query)
+                print(f"    wiki: {query[:60]} -> {len(result.get('content',''))} chars")
+            elif tool_name in ("plugin.docs_lookup", "minecraft.changelog_lookup", "paper.docs_lookup"):
+                try:
+                    from agent.tools.knowledge_lookup import handle_knowledge_tool
+                    result = handle_knowledge_tool(tool_name, tool_args)
+                except ImportError:
+                    # Knowledge tools unavailable: fall back to the wiki.
+                    result = wiki_lookup(tool_args.get("query", tool_args.get("plugin", "")))
+                query = str(tool_args.get("query", ""))
+                wiki_topics.append(f"{tool_name}:{query}")
+                print(f"    {tool_name}: {query[:50]} -> {len(result.get('content',''))} chars")
+            elif tool_name == "rcon.execute":
+                cmd = tool_args.get("command", "")
+                if not isinstance(cmd, str) or not cmd.strip():
+                    # Nothing to run; report it instead of sending a blank line.
+                    result = {"success": False, "result": "empty command"}
+                    print("    rcon: <empty> -> FAIL")
+                else:
+                    try:
+                        rcon_result = rcon.command(cmd)
+                        is_err = any(marker in rcon_result for marker in error_markers)
+                        result = {"success": not is_err, "result": rcon_result[:300]}
+                        all_commands.append(cmd)
+                        status = "OK" if not is_err else "ERR"
+                        print(f"    rcon: {cmd[:60]} -> {status}")
+                    except Exception as e:
+                        result = {"success": False, "result": str(e)}
+                        print(f"    rcon: {cmd[:60]} -> FAIL")
+            else:
+                result = {"ok": False, "error": f"unknown tool: {tool_name}"}
+
+            tool_trace.append({
+                "tool": tool_name,
+                "input": str(tool_args)[:200],
+                # Lookup results carry "ok", RCON results carry "success".
+                "ok": result.get("ok", result.get("success", False)),
+                "step": step,
+            })
+
+            messages.append({"role": "assistant", "content": f"<tool_call>\n{json.dumps(tc)}\n</tool_call>"})
+            messages.append({"role": "tool", "content": json.dumps(result)[:3000]})
+
+        time.sleep(0.1)
+
+    # Parse final response if present
+    reasoning = ""
+    try:
+        parsed = json.loads(raw)
+    except json.JSONDecodeError:
+        parsed = None
+    if isinstance(parsed, dict):
+        reasoning = parsed.get("reasoning", "")
+        final_commands = parsed.get("commands")
+        if isinstance(final_commands, str):
+            # A bare string is one command, not a sequence of characters.
+            all_commands.append(final_commands)
+        elif isinstance(final_commands, list):
+            all_commands.extend(final_commands)
+    else:
+        # Not a JSON object: keep the start of the raw text as the reasoning.
+        reasoning = raw[:200]
+
+    rcon_calls = [t for t in tool_trace if t["tool"] == "rcon.execute"]
+    return {
+        "id": f"explore-{int(time.time())}-{random.randint(0,9999):04d}",
+        "source": "exploration_self_play",
+        "type": "exploration",
+        "input": {"user_message": topic_resolved, "player": player},
+        "output": {
+            "commands": all_commands,
+            "reasoning": reasoning,
+            "wiki_topics": wiki_topics,
+        },
+        "tool_trace": tool_trace,
+        "messages": messages,
+        "metadata": {
+            "model": model,
+            "steps": steps_taken,
+            "wiki_lookups": len(wiki_topics),
+            "rcon_commands": len(all_commands),
+            "success_rate": (
+                sum(1 for t in rcon_calls if t["ok"]) / max(len(rcon_calls), 1)
+            ),
+        },
+    }
+
+
+def main():
+    """Run exploration rounds and append each example to a JSONL file.
+
+    Every round picks a random topic (from the list chosen by ``--focus``) and
+    a random player, runs run_exploration(), and appends the resulting example
+    as one JSON line to a timestamped file under ``OUTPUT_DIR``.  A progress
+    line is printed every ten rounds.
+    """
+    parser = argparse.ArgumentParser(description="Exploration self-play")
+    parser.add_argument("--ollama-url", default="http://localhost:11434")
+    parser.add_argument("--model", default="mortdecai:0.5.0")
+    parser.add_argument("--rcon-host", default="192.168.0.244")
+    parser.add_argument("--rcon-port", type=int, default=25578)
+    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
+    parser.add_argument("--rounds", type=int, default=999999)
+    parser.add_argument("--focus", default="general", choices=["general", "plugins", "all"],
+                        help="Topic focus: general (vanilla MC), plugins (WG/CP/Ess/LP/FAWE/scripts), all (both)")
+    args = parser.parse_args()
+
+    if args.focus == "plugins":
+        topics = EXPLORATION_TOPICS_PLUGINS
+    elif args.focus == "all":
+        topics = EXPLORATION_TOPICS + EXPLORATION_TOPICS_PLUGINS
+    else:
+        topics = EXPLORATION_TOPICS
+
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+    output_path = OUTPUT_DIR / f"exploration_{args.focus}_{int(time.time())}.jsonl"
+
+    rcon = get_rcon(args.rcon_host, args.rcon_port, args.rcon_pass)
+
+    print("Exploration Self-Play")
+    print(f"  Model: {args.model} on {args.ollama_url}")
+    print(f"  RCON: {args.rcon_host}:{args.rcon_port}")
+    print(f"  Focus: {args.focus} ({len(topics)} topics)")
+    print(f"  Output: {output_path}")
+    print()
+
+    # rcon_commands mirrors the per-example metadata, which can include
+    # commands listed in the final answer but never run.  rcon_executed and
+    # rcon_success count real RCON calls, so the success percentage is exact.
+    stats = {"total": 0, "wiki_lookups": 0, "rcon_commands": 0,
+             "rcon_executed": 0, "rcon_success": 0}
+
+    for round_num in range(args.rounds):
+        topic = random.choice(topics)
+        player = random.choice(PLAYERS)
+
+        print(f"\n── Round {round_num+1} ──")
+        # Substitute the player first so truncation cannot split the "{p}" token.
+        print(f"  Topic: {topic.replace('{p}', player)[:80]}")
+
+        example = run_exploration(topic, player, args.ollama_url, args.model, rcon)
+        meta = example["metadata"]
+        rcon_calls = [t for t in example["tool_trace"] if t["tool"] == "rcon.execute"]
+
+        stats["total"] += 1
+        stats["wiki_lookups"] += meta["wiki_lookups"]
+        stats["rcon_commands"] += meta["rcon_commands"]
+        stats["rcon_executed"] += len(rcon_calls)
+        stats["rcon_success"] += sum(1 for t in rcon_calls if t["ok"])
+
+        print(f"  Result: {meta['wiki_lookups']} lookups, "
+              f"{meta['rcon_commands']} commands, "
+              f"{meta['success_rate']:.0%} success")
+
+        # ensure_ascii=False writes non-ASCII text as-is, so pin the encoding
+        # instead of relying on the platform default.
+        with open(output_path, "a", encoding="utf-8") as f:
+            f.write(json.dumps(example, ensure_ascii=False) + "\n")
+
+        if (round_num + 1) % 10 == 0:
+            rate = stats["rcon_success"] / max(stats["rcon_executed"], 1) * 100
+            print(f"\n  Progress: {stats['total']} explorations, "
+                  f"{stats['wiki_lookups']} wiki lookups, "
+                  f"{stats['rcon_commands']} commands ({rate:.0f}% success)")
+
+        time.sleep(0.5)
+
+    print(f"\nExploration complete: {stats['total']} topics explored")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+Filter exploration and self-play data for quality.
+
+Keeps:
+- Successful interactions (commands were executed and more than 70% of them succeeded)
+- Wiki-grounded interactions (at least one lookup AND at least one command), even with some failures
+- First instance of each unique error pattern (for error correction training)
+
+Removes:
+- Duplicate topics (keeps first occurrence only)
+- Empty responses (no commands, no lookups)
+- Repeated failures on the same command pattern
+- Rounds where the model looked things up but ran no commands
+
+Output: data/processed/filtered_exploration.jsonl
+"""
+
+import json
+import sys
+from pathlib import Path
+from collections import defaultdict
+
+PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+
+EXPLORATION_DIR = PROJECT_ROOT / "data" / "raw" / "exploration_selfplay"
+TOOL_SELFPLAY_DIR = PROJECT_ROOT / "data" / "raw" / "tool_selfplay"
+OUTPUT_PATH = PROJECT_ROOT / "data" / "processed" / "filtered_exploration.jsonl"
+
+
+def _read_jsonl_dir(directory):
+    """Return every parseable JSON record found in the ``*.jsonl`` files of *directory*.
+
+    Files are visited in sorted name order so the result is deterministic.
+    Blank lines and lines that are not valid JSON are skipped.
+    """
+    records = []
+    for jsonl in sorted(directory.glob("*.jsonl")):
+        # Decode explicitly as UTF-8 instead of relying on the platform's
+        # default encoding; the records may contain non-ASCII text.
+        with open(jsonl, encoding="utf-8") as f:
+            for line in f:
+                if not line.strip():
+                    continue
+                try:
+                    records.append(json.loads(line))
+                except json.JSONDecodeError:
+                    # Best-effort load: a truncated or corrupt line is ignored
+                    # rather than aborting the whole run.
+                    continue
+    return records
+
+
+def load_all_examples():
+    """Load all exploration and tool self-play examples.
+
+    Returns:
+        A list of decoded JSON records: everything from EXPLORATION_DIR
+        first, followed by everything from TOOL_SELFPLAY_DIR.
+    """
+    return _read_jsonl_dir(EXPLORATION_DIR) + _read_jsonl_dir(TOOL_SELFPLAY_DIR)
+
+
+def filter_examples(examples):
+    """Select the examples worth keeping and count why the others were dropped.
+
+    Each example is a dict. The fields read are ``metadata.success_rate``
+    (or the boolean ``metadata.all_success`` as a fallback),
+    ``metadata.wiki_lookups``, ``metadata.rcon_commands``,
+    ``metadata.rcon_results``, ``input.user_message`` and ``output.commands``.
+    Missing or ``None`` fields are treated as empty / zero.
+
+    Args:
+        examples: Iterable of example dicts, in priority order (the first
+            occurrence of a topic or error pattern wins).
+
+    Returns:
+        A ``(kept, stats)`` tuple: the list of retained examples in input
+        order, and a dict of counters (``total``, ``kept_*``, ``dropped_*``).
+    """
+    kept = []
+    seen_topics = set()
+    seen_error_patterns = set()
+    stats = {
+        "total": len(examples),
+        "kept_success": 0,
+        "kept_error_correction": 0,
+        "kept_wiki_grounded": 0,
+        "dropped_duplicate": 0,
+        "dropped_empty": 0,
+        "dropped_repeat_failure": 0,
+    }
+
+    for ex in examples:
+        # ``or {}`` / ``or 0`` also cover keys that are present but null.
+        meta = ex.get("metadata") or {}
+        inp = ex.get("input") or {}
+        topic = (inp.get("user_message") or "")[:80]
+        success_rate = meta.get("success_rate", meta.get("all_success", False))
+        wiki_lookups = meta.get("wiki_lookups") or 0
+        rcon_commands = meta.get("rcon_commands") or 0
+
+        # Skip empty
+        if rcon_commands == 0 and wiki_lookups == 0:
+            stats["dropped_empty"] += 1
+            continue
+
+        # Deduplicate topics (keep first). A record without a user message has
+        # no topic to compare, so such records are never treated as duplicates
+        # of one another.
+        if topic:
+            if topic in seen_topics:
+                stats["dropped_duplicate"] += 1
+                continue
+            seen_topics.add(topic)
+
+        # Categorize: the flag is either a boolean (all_success) or a ratio.
+        if isinstance(success_rate, bool):
+            is_success = success_rate
+        else:
+            is_success = (success_rate or 0) > 0.7
+
+        if is_success and rcon_commands > 0:
+            # Successful interaction — always keep
+            stats["kept_success"] += 1
+            kept.append(ex)
+        elif wiki_lookups > 0 and rcon_commands > 0:
+            # Wiki-grounded (looked things up before acting) — keep even if some failures
+            stats["kept_wiki_grounded"] += 1
+            kept.append(ex)
+        elif not is_success and rcon_commands > 0:
+            # Failed — keep only first instance of each error pattern
+            commands = (ex.get("output") or {}).get("commands") or []
+            if commands:
+                # Use first command as error pattern key
+                pattern = commands[0][:40] if isinstance(commands[0], str) else ""
+            else:
+                rcon_results = meta.get("rcon_results") or []
+                pattern = str(rcon_results[:1])[:60] if rcon_results else ""
+
+            if pattern and pattern not in seen_error_patterns:
+                seen_error_patterns.add(pattern)
+                stats["kept_error_correction"] += 1
+                kept.append(ex)
+            else:
+                stats["dropped_repeat_failure"] += 1
+        else:
+            # Lookups without any executed command: nothing to learn from.
+            stats["dropped_empty"] += 1
+
+    return kept, stats
+
+
+def main():
+    """Load the raw self-play data, filter it, print the statistics and write OUTPUT_PATH.
+
+    The output file is overwritten on every run and holds one JSON object per line.
+    """
+    print("Loading examples...")
+    examples = load_all_examples()
+    print(f"  Loaded {len(examples)} raw examples")
+
+    print("Filtering...")
+    filtered, stats = filter_examples(examples)
+
+    print("\nFilter results:")
+    for k, v in stats.items():
+        print(f"  {k}: {v}")
+
+    # max(..., 1) avoids a division by zero when nothing was loaded.
+    print(f"\nKept: {len(filtered)} ({100*len(filtered)//max(stats['total'],1)}%)")
+
+    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
+    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
+    # written as UTF-8 explicitly rather than in the platform's default encoding.
+    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
+        f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in filtered)
+
+    print(f"Written to {OUTPUT_PATH}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+"""
+Generate SVG training history chart for the Gitea README.
+
+X-axis: Model version
+Y-axis: Training examples (bar) and inverse loss (line)
+"""
+
+import json
+from pathlib import Path
+
+OUTPUT = Path(__file__).resolve().parent.parent.parent / "branding" / "training_progress.svg"
+
+# Historical data from training runs
+VERSIONS = [
+    {"version": "0.1.0", "examples": 500,  "loss": 2.10, "label": "v1 (seed)"},
+    {"version": "0.2.0", "examples": 1200, "loss": 1.45, "label": "v2 (+entities)"},
+    {"version": "0.3.0", "examples": 2100, "loss": 0.82, "label": "v3 (+errors)"},
+    {"version": "0.4.0", "examples": 3175, "loss": 0.35, "label": "v4 (+tools)"},
+    {"version": "0.5.0", "examples": 4358, "loss": 0.16, "label": "v5 (+plugins)"},
+]
+
+# Chart dimensions
+W = 700
+H = 400
+PAD_L = 70
+PAD_R = 30
+PAD_T = 40
+PAD_B = 80
+PLOT_W = W - PAD_L - PAD_R
+PLOT_H = H - PAD_T - PAD_B
+
+# Colors
+BG = "#111111"
+GRID = "#252525"
+TEXT = "#999999"
+BAR_COLOR = "#D35400"
+LINE_COLOR = "#4caf50"
+LABEL_COLOR = "#e0e0e0"
+
+
+def generate_svg():
+    """Build the training-history chart and return it as one SVG document string.
+
+    One bar per entry of VERSIONS shows the number of training examples; a
+    polyline with dots shows 1/loss. Each series is scaled against its own
+    maximum times 1.15, which leaves headroom above the tallest element.
+    """
+    example_cap = max(entry["examples"] for entry in VERSIONS) * 1.15
+    quality_cap = max(1.0 / entry["loss"] for entry in VERSIONS) * 1.15
+    count = len(VERSIONS)
+    slot = PLOT_W / count
+    bar_w = slot * 0.6
+    floor_y = PAD_T + PLOT_H  # y coordinate of the x-axis baseline
+
+    # Horizontal centre of every version's slot, shared by bars, line and dots.
+    centers = [PAD_L + slot * idx + slot / 2 for idx in range(count)]
+
+    # (x, y) of the inverse-loss marker for every version.
+    line_pts = []
+    for x, entry in zip(centers, VERSIONS):
+        quality = 1.0 / entry["loss"]
+        line_pts.append((x, floor_y - (quality / quality_cap) * PLOT_H))
+
+    pieces = [f"""<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {W} {H}" width="{W}" height="{H}">
+<rect width="{W}" height="{H}" fill="{BG}" rx="8"/>
+
+<!-- Title -->
+<text x="{W/2}" y="25" fill="{LABEL_COLOR}" font-family="monospace" font-size="16" text-anchor="middle" font-weight="bold">Mortdecai Training Progress</text>
+
+<!-- Grid lines -->
+"""]
+
+    # Five horizontal grid lines, labelled with the example count they represent.
+    for tick in range(5):
+        y = floor_y - (tick / 4 * PLOT_H)
+        val = int(example_cap * tick / 4)
+        pieces.append(f'<line x1="{PAD_L}" y1="{y}" x2="{W-PAD_R}" y2="{y}" stroke="{GRID}" stroke-width="0.5"/>\n')
+        pieces.append(f'<text x="{PAD_L-5}" y="{y+4}" fill="{TEXT}" font-family="monospace" font-size="10" text-anchor="end">{val:,}</text>\n')
+
+    # Bars (training examples) with their value on top and the version below.
+    for x, entry in zip(centers, VERSIONS):
+        height = (entry["examples"] / example_cap) * PLOT_H
+        top = floor_y - height
+        pieces.append(f'<rect x="{x - bar_w/2}" y="{top}" width="{bar_w}" height="{height}" fill="{BAR_COLOR}" rx="3" opacity="0.85"/>\n')
+        pieces.append(f'<text x="{x}" y="{top - 8}" fill="{BAR_COLOR}" font-family="monospace" font-size="11" text-anchor="middle" font-weight="bold">{entry["examples"]:,}</text>\n')
+        pieces.append(f'<text x="{x}" y="{floor_y + 20}" fill="{LABEL_COLOR}" font-family="monospace" font-size="12" text-anchor="middle">{entry["version"]}</text>\n')
+        pieces.append(f'<text x="{x}" y="{floor_y + 35}" fill="{TEXT}" font-family="monospace" font-size="9" text-anchor="middle">{entry["label"]}</text>\n')
+
+    # Line (inverse loss = quality)
+    polyline = " ".join(f"{x},{y}" for x, y in line_pts)
+    pieces.append(f'<polyline points="{polyline}" fill="none" stroke="{LINE_COLOR}" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round"/>\n')
+
+    # Dots on the line, each annotated with the raw loss value.
+    for (x, y), entry in zip(line_pts, VERSIONS):
+        pieces.append(f'<circle cx="{x}" cy="{y}" r="4" fill="{LINE_COLOR}"/>\n')
+        pieces.append(f'<text x="{x}" y="{y - 10}" fill="{LINE_COLOR}" font-family="monospace" font-size="10" text-anchor="middle">loss={entry["loss"]}</text>\n')
+
+    pieces.extend([
+        # Rotated y-axis title
+        f'<text x="{PAD_L - 45}" y="{PAD_T + PLOT_H/2}" fill="{BAR_COLOR}" font-family="monospace" font-size="11" text-anchor="middle" transform="rotate(-90,{PAD_L-45},{PAD_T+PLOT_H/2})">Training Examples</text>\n',
+        # Legend
+        f'<rect x="{W-180}" y="{PAD_T+5}" width="12" height="12" fill="{BAR_COLOR}" rx="2"/>\n',
+        f'<text x="{W-163}" y="{PAD_T+15}" fill="{TEXT}" font-family="monospace" font-size="10">Training Examples</text>\n',
+        f'<line x1="{W-180}" y1="{PAD_T+28}" x2="{W-168}" y2="{PAD_T+28}" stroke="{LINE_COLOR}" stroke-width="2.5"/>\n',
+        f'<text x="{W-163}" y="{PAD_T+32}" fill="{TEXT}" font-family="monospace" font-size="10">Model Quality (1/loss)</text>\n',
+        # X-axis title
+        f'<text x="{W/2}" y="{H-10}" fill="{TEXT}" font-family="monospace" font-size="11" text-anchor="middle">Model Version</text>\n',
+        "</svg>",
+    ])
+    return "".join(pieces)
+
+
+def main():
+    """Generate the chart and write it to OUTPUT, creating the directory if needed."""
+    svg = generate_svg()
+    OUTPUT.parent.mkdir(parents=True, exist_ok=True)
+    # The document's XML declaration says encoding="UTF-8", so write it as
+    # UTF-8 explicitly instead of using the platform's default encoding.
+    with open(OUTPUT, "w", encoding="utf-8") as f:
+        f.write(svg)
+    print(f"Chart saved to {OUTPUT}")
+    print("Embed in README: ![Training Progress](branding/training_progress.svg)")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,370 @@
+#!/usr/bin/env python3
+"""
+Regenerate tool-calling training data using mortdecai:0.5.0.
+
+Uses the model-driven tool loop: sends prompts to 0.5.0, lets it decide
+which tools to call, executes via RCON, and captures the full multi-turn
+conversation as training data. Keeps examples where at least one RCON command
+ran and at least 70% of the executed commands succeeded.
+
+This produces "distilled" data — the model's best outputs, validated by RCON.
+"""
+
+import json
+import random
+import re
+import sys
+import time
+from pathlib import Path
+
+PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.insert(0, str(PROJECT_ROOT))
+
+import requests
+from agent.tools.persistent_rcon import get_rcon
+from agent.tools.tool_schemas import qwen3_tools_block
+from agent.prompts.system_prompts import SYNTAX_RULES, RISK_GRADIENT
+
+OUTPUT_PATH = PROJECT_ROOT / "data" / "processed" / "tool_training_v05.jsonl"
+
+# Tool schema text produced by the project helper; it is embedded only in
+# SYSTEM_GOD below (SYSTEM lists the tool names inline instead).
+TOOLS_BLOCK = qwen3_tools_block()
+
+# System prompt for "sudo" requests: describes the <tool_call> wire format,
+# names the available tools and fixes the final JSON answer shape, followed by
+# the shared SYNTAX_RULES and RISK_GRADIENT prompt fragments.
+# NOTE(review): query_model_with_tools also dispatches "paper.docs_lookup",
+# which is not advertised in the "Available:" list here — confirm whether the
+# omission is intentional.
+# "/no_think" presumably disables the model's thinking mode — TODO confirm.
+SYSTEM = (
+    "/no_think\n"
+    "You are a Minecraft 1.21 command translator for a Paper server.\n"
+    "Plugins: FastAsyncWorldEdit, WorldGuard, CoreProtect, EssentialsX, Vault, LuckPerms.\n\n"
+    "You have tools. To call one:\n"
+    "<tool_call>\n{\"name\": \"tool_name\", \"arguments\": {...}}\n</tool_call>\n\n"
+    "Available: rcon.execute, minecraft.wiki_lookup, plugin.docs_lookup, "
+    "minecraft.changelog_lookup, world.player_info, world.server_state, "
+    "world.nearby_entities, memory.read, memory.write, "
+    "script.write, script.validate, script.execute, script.read, script.list, "
+    "script.delete, script.schedule.\n\n"
+    "After tool calls, respond with JSON:\n"
+    "{\"risk_level\": <0-5>, \"commands\": [...], \"reasoning\": \"...\"}\n\n"
+    "PERMISSION LEVEL: 4 (generous).\n" + SYNTAX_RULES + RISK_GRADIENT
+)
+
+# System prompt for "pray" requests (mode="god"): the answer JSON additionally
+# carries a "message" field, and the full TOOLS_BLOCK is appended.
+SYSTEM_GOD = (
+    "/no_think\n"
+    "You are God in a Minecraft server with full tool access.\n"
+    "Return JSON: {\"risk_level\": <0-5>, \"message\": \"...\", \"commands\": [...], \"reasoning\": \"...\"}\n\n"
+    + SYNTAX_RULES + "\n" + TOOLS_BLOCK
+)
+
+PLAYERS = ["slingshooter08", "Ace13245", "TheBigBoss", "xXDragonSlayerXx"]
+
+# Comprehensive prompt set — every category we need good data for
+PROMPTS = {
+    "basic_commands": [
+        "sudo give me a diamond sword",
+        "sudo give me 64 golden apples",
+        "sudo give me a stack of oak planks",
+        "sudo give me an elytra",
+        "sudo give me a spyglass",
+        "sudo give me a recovery compass",
+        "sudo give me a bundle",
+        "sudo set time to noon",
+        "sudo set time to midnight",
+        "sudo clear weather for a week",
+        "sudo make it thunder",
+        "sudo kill all hostile mobs",
+        "sudo kill all items on the ground",
+        "sudo gamemode creative",
+        "sudo gamemode survival",
+        "sudo gamemode spectator",
+    ],
+    "enchanted_gear": [
+        "sudo give me a diamond sword with sharpness 5, unbreaking 3, mending, and looting 3",
+        "sudo give me a netherite pickaxe with efficiency 5, fortune 3, unbreaking 3, mending",
+        "sudo give me a bow with power 5, infinity, flame, punch 2",
+        "sudo full netherite armor with protection 4, unbreaking 3, mending on every piece",
+        "sudo give me boots with feather falling 4, depth strider 3, soul speed 3",
+        "sudo give me a trident with loyalty 3 and channeling",
+        "sudo give me a trident with riptide 3",
+        "sudo give me a crossbow with multishot and quick charge 3",
+        "sudo give me a mace with density 5 and wind burst 3",
+        "sudo best fishing rod possible",
+        "sudo give me a shield with unbreaking 3 and mending",
+    ],
+    "effects": [
+        "sudo give me speed 3 for 10 minutes",
+        "sudo night vision permanently",
+        "sudo make me invisible for 5 minutes",
+        "sudo give me fire resistance for an hour",
+        "sudo give everyone online regeneration 2",
+        "sudo give me haste 2 for 10 minutes",
+        "sudo slow falling for 60 seconds",
+        "sudo give me water breathing forever",
+        "sudo give me strength 2 and resistance 2 for 5 minutes",
+        "sudo clear all my effects",
+    ],
+    "teleport_position": [
+        "sudo tp me to 0 100 0",
+        "sudo tp me to the nether",
+        "sudo tp everyone to spawn",
+        "sudo teleport me 100 blocks north",
+        "sudo tp me up 50 blocks",
+        "sudo set my spawn point here",
+    ],
+    "building": [
+        "sudo fill a 10x10 platform of stone under me",
+        "sudo place a beacon at my location",
+        "sudo build a small cobblestone room around me",
+        "sudo fill the area below me with water",
+        "sudo make a glass dome over me",
+        "sudo place 4 lanterns around me",
+        "sudo clear a 20 block area above me",
+    ],
+    "entities": [
+        "sudo summon a horse with a saddle",
+        "sudo summon 5 cows near me",
+        "sudo summon a villager",
+        "sudo spawn an iron golem",
+        "sudo summon a warden 20 blocks away",
+        "sudo summon a wither",
+        "sudo kill all zombies within 50 blocks",
+        "sudo kill all creepers near me",
+    ],
+    "worldguard": [
+        "sudo create a region called my-base and set pvp deny",
+        "sudo prevent mob spawning in the spawn region",
+        "sudo set a greeting message for spawn: Welcome to the server!",
+        "sudo deny entry to non-members in the vault region",
+        "sudo list all regions",
+        "sudo allow TNT in the arena",
+        "sudo prevent fire spread globally",
+        "sudo make a healing zone at spawn",
+    ],
+    "coreprotect": [
+        "sudo enable block inspector",
+        "sudo rollback the last hour of changes",
+        "sudo rollback what TheBigBoss did in the last 30 minutes",
+        "sudo lookup who placed blocks near me today",
+        "sudo rollback TNT damage from the last 2 hours",
+        "sudo check coreprotect status",
+        "sudo restore what was rolled back",
+    ],
+    "essentialsx": [
+        "sudo set my home here",
+        "sudo create a warp called arena",
+        "sudo give Ace13245 1000 coins",
+        "sudo check my balance",
+        "sudo heal me",
+        "sudo feed me",
+        "sudo repair my held item",
+        "sudo set my nickname to DragonLord",
+        "sudo broadcast Welcome to the server!",
+        "sudo god mode on",
+        "sudo fly mode on",
+    ],
+    "luckperms": [
+        "sudo create a VIP group",
+        "sudo add Ace13245 to VIP",
+        "sudo give VIP permission to fly",
+        "sudo give me temporary VIP for 24 hours",
+        "sudo set VIP prefix to gold [VIP]",
+        "sudo list all permission groups",
+        "sudo create a builder group with worldedit access",
+    ],
+    "fawe": [
+        "sudo make a glass sphere radius 8",
+        "sudo hollow stone sphere radius 10",
+        "sudo cylinder of quartz 5 wide 12 tall",
+        "sudo replace all stone with deepslate in selection",
+        "sudo smooth the terrain 5 iterations",
+        "sudo drain water within 20 blocks",
+        "sudo sandstone pyramid 8 tall",
+        "sudo undo my last worldedit operation",
+    ],
+    "god_prayers": [
+        "pray oh great one, bless me with diamonds",
+        "pray lord, protect me from the monsters of the night",
+        "pray I offer this sacrifice of 64 wheat, grant me your favor",
+        "pray god please make it stop raining",
+        "pray smite the wicked TheBigBoss for griefing my base",
+        "pray heal me, I am near death",
+        "pray give me the strength to slay the ender dragon",
+        "pray I am lost in a cave, guide me to the surface",
+    ],
+    "error_prone": [
+        "sudo give me a bed",
+        "sudo give me steak",
+        "sudo give me cooked beef",
+        "sudo effect give me speed",
+        "sudo give me a log",
+        "sudo fill with stone 10",
+        "sudo tp me to spawn",
+        "sudo give @s diamond 1",
+    ],
+    "complex_multi": [
+        "sudo gear me up for the nether: armor, weapons, food, fire resistance",
+        "sudo prepare me for the end fight: bow, arrows, blocks, pearls, slow falling",
+        "sudo set up a new player kit: stone tools, food, bed, torches",
+        "sudo create a mob farm: platform, water channels, collection hopper",
+    ],
+}
+
+
+def _run_tool_call(tool_name, tool_args, rcon, rcon_log):
+    """Execute one tool call and return the JSON-serializable result for the model.
+
+    ``rcon.execute`` calls are run through *rcon* and recorded in *rcon_log*;
+    knowledge lookups go to ``handle_knowledge_tool``; every other tool is
+    answered with a canned "simulated" result.
+    """
+    if not isinstance(tool_args, dict):
+        # The model emitted e.g. a string or list as "arguments"; report it
+        # back instead of crashing on ``.get``.
+        return {"ok": False, "result": "Tool arguments must be a JSON object"}
+
+    if tool_name == "rcon.execute":
+        cmd = tool_args.get("command", "")
+        try:
+            result_text = rcon.command(cmd)
+            # The server reports syntax problems in the reply text rather
+            # than by raising, so look for its usual error markers.
+            is_err = any(e in result_text for e in ("<--[HERE]", "Unknown", "Incorrect", "Expected"))
+        except Exception as e:
+            rcon_log.append({"cmd": cmd, "ok": False, "result": str(e)})
+            return {"success": False, "result": str(e)}
+        rcon_log.append({"cmd": cmd, "ok": not is_err, "result": result_text[:200]})
+        return {"success": not is_err, "result": result_text[:300]}
+
+    # Knowledge tools share one handler; only the fallback text differs.
+    unavailable = {
+        "minecraft.wiki_lookup": "Wiki unavailable",
+        "plugin.docs_lookup": "Docs unavailable",
+        "minecraft.changelog_lookup": "Docs unavailable",
+        "paper.docs_lookup": "Docs unavailable",
+    }
+    if tool_name in unavailable:
+        try:
+            from agent.tools.knowledge_lookup import handle_knowledge_tool
+            return handle_knowledge_tool(tool_name, tool_args)
+        except Exception:
+            return {"content": unavailable[tool_name], "url": "", "ok": False}
+
+    return {"ok": True, "result": "simulated"}
+
+
+def query_model_with_tools(prompt, player, ollama_url, model, rcon, mode="sudo", max_steps=6):
+    """Send prompt to model, let it call tools, execute them, capture full chain.
+
+    Args:
+        prompt: The player's request text.
+        player: Player name, prefixed to the prompt in the user message.
+        ollama_url: Base URL of the Ollama server (``/api/chat`` is appended).
+        model: Model name passed to Ollama.
+        rcon: Object with a ``command(str)`` method used for ``rcon.execute``.
+        mode: ``"god"`` selects SYSTEM_GOD; anything else selects SYSTEM.
+        max_steps: Maximum number of model round-trips.
+
+    Returns:
+        A dict with ``messages``, ``commands``, ``message``, ``reasoning``,
+        ``tool_trace`` and ``rcon_results`` once the model produces a final
+        JSON object, or ``None`` when the request fails, the final answer is
+        not a JSON object, or *max_steps* is exhausted.
+    """
+    system = SYSTEM_GOD if mode == "god" else SYSTEM
+
+    messages = [
+        {"role": "system", "content": system},
+        {"role": "user", "content": f"Player {player}: {prompt}"},
+    ]
+
+    tool_trace = []
+    all_rcon_results = []
+
+    for step in range(max_steps):
+        try:
+            r = requests.post(f"{ollama_url}/api/chat", json={
+                "model": model,
+                "messages": messages,
+                "stream": False,
+                "options": {"temperature": 0.2, "num_predict": 800},
+            }, timeout=90)
+            raw = r.json()["message"]["content"]
+        except (requests.RequestException, KeyError, TypeError, ValueError):
+            # Network failure, non-JSON body or unexpected response shape.
+            return None
+        if not isinstance(raw, str):
+            return None
+
+        # Drop any reasoning block the model emitted despite /no_think.
+        raw = re.sub(r'<think>[\s\S]*?</think>\s*', '', raw)
+
+        # Check for tool calls
+        tool_matches = re.findall(r'<tool_call>\s*(\{.*?\})\s*</tool_call>', raw, re.DOTALL)
+
+        if not tool_matches:
+            # Final response: must be a JSON object to be usable.
+            try:
+                parsed = json.loads(raw)
+            except json.JSONDecodeError:
+                return None
+            if not isinstance(parsed, dict):
+                return None
+            return {
+                "messages": messages + [{"role": "assistant", "content": raw}],
+                "commands": parsed.get("commands", []),
+                "message": parsed.get("message", ""),
+                "reasoning": parsed.get("reasoning", ""),
+                "tool_trace": tool_trace,
+                "rcon_results": all_rcon_results,
+            }
+
+        for tc_json in tool_matches:
+            try:
+                tc = json.loads(tc_json)
+            except json.JSONDecodeError:
+                # Malformed call: skipped without feedback; if every call in
+                # this step is malformed the same request is simply retried.
+                continue
+            tool_name = tc.get("name", "")
+            tool_args = tc.get("arguments", {})
+
+            result = _run_tool_call(tool_name, tool_args, rcon, all_rcon_results)
+
+            tool_trace.append({"tool": tool_name, "args": str(tool_args)[:100], "step": step})
+            # Each call is recorded as its own assistant/tool message pair.
+            messages.append({"role": "assistant", "content": f"<tool_call>\n{json.dumps(tc)}\n</tool_call>"})
+            messages.append({"role": "tool", "content": json.dumps(result)[:2000]})
+
+    return None  # Ran out of steps
+
+
+def main():
+    """Run every prompt in PROMPTS through the model and save the validated examples.
+
+    An example is kept when at least one RCON command was executed and at
+    least 70% of the executed commands succeeded. Prompts starting with
+    "pray " are sent in "god" mode, all others in "sudo" mode. The kept
+    examples are written to OUTPUT_PATH (overwritten) after the last prompt.
+    """
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ollama-url", default="http://localhost:11434")
+    parser.add_argument("--model", default="mortdecai:0.5.0")
+    parser.add_argument("--rcon-host", default="192.168.0.244")
+    parser.add_argument("--rcon-port", type=int, default=25578)
+    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
+    args = parser.parse_args()
+
+    rcon = get_rcon(args.rcon_host, args.rcon_port, args.rcon_pass)
+    print(f"Regenerating tool data with {args.model}")
+    print(f"RCON: {args.rcon_host}:{args.rcon_port}")
+
+    all_examples = []
+    stats = {"total": 0, "kept": 0, "failed": 0, "no_response": 0}
+
+    for category, prompts in PROMPTS.items():
+        print(f"\n── {category} ({len(prompts)} prompts) ──")
+        for prompt in prompts:
+            player = random.choice(PLAYERS)
+            mode = "god" if prompt.startswith("pray ") else "sudo"
+
+            result = query_model_with_tools(prompt, player, args.ollama_url, args.model, rcon, mode)
+            stats["total"] += 1
+
+            if not result:
+                stats["no_response"] += 1
+                print(f"  SKIP: {prompt[:50]} (no response)")
+                continue
+
+            rcon_ok = sum(1 for r in result["rcon_results"] if r["ok"])
+            rcon_total = len(result["rcon_results"])
+            tools_used = len(result["tool_trace"])
+
+            if rcon_total == 0 and tools_used == 0:
+                stats["no_response"] += 1
+                print(f"  SKIP: {prompt[:50]} (empty)")
+                continue
+
+            all_success = rcon_total > 0 and rcon_ok == rcon_total
+
+            # Keep fully successful runs and runs where >= 70% of the
+            # commands succeeded; lookup-only runs (no RCON command) fail.
+            if all_success or (rcon_ok > 0 and rcon_ok >= rcon_total * 0.7):
+                stats["kept"] += 1
+                example = {
+                    "id": f"v05-regen-{stats['total']:04d}",
+                    "source": "model_distillation_v05",
+                    "type": f"tool_{category}",
+                    "messages": result["messages"],
+                    "metadata": {
+                        "model": args.model,
+                        "category": category,
+                        "tools_used": tools_used,
+                        "rcon_total": rcon_total,
+                        "rcon_success": rcon_ok,
+                        "all_success": all_success,
+                    },
+                }
+                all_examples.append(example)
+                print(f"  KEPT: {prompt[:50]} ({rcon_ok}/{rcon_total} cmds, {tools_used} tools)")
+            else:
+                stats["failed"] += 1
+                print(f"  FAIL: {prompt[:50]} ({rcon_ok}/{rcon_total} cmds)")
+
+            # Brief pause between prompts.
+            time.sleep(0.2)
+
+    print(f"\n{'='*60}")
+    print(f"Total: {stats['total']}, Kept: {stats['kept']}, Failed: {stats['failed']}, Empty: {stats['no_response']}")
+    print(f"Quality: {100*stats['kept']//max(stats['total'],1)}%")
+
+    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
+    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
+    # written as UTF-8 explicitly rather than in the platform's default encoding.
+    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
+        for ex in all_examples:
+            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+    print(f"Written to {OUTPUT_PATH}")
+
+
+if __name__ == "__main__":
+    main()