0.5.0 bake-off results, knowledge lookup tools, training progress chart

Bake-off (0.5.0 vs 0.4.0):
- Overall: 46.8% vs 45.2% (+1.6 pp), 0 errors vs 2
- Enchantments: +47 pp (20% → 67%)
- EssentialsX: +60 pp (0% → 60%)
- Effects: +25 pp (0% → 25%)
- Regressions: fill_build -67%, world -20%

Knowledge Lookup Tools (4 new):
- plugin.docs_lookup: WorldGuard, WorldEdit, CoreProtect, EssentialsX, LuckPerms docs
- minecraft.changelog_lookup: version history from Minecraft Wiki
- paper.docs_lookup: Paper server-specific documentation
- Wired into gateway model-driven tool loop and exploration self-play

Exploration Self-Play:
- General (vanilla MC) and plugins focus modes
- Wiki-grounded: model researches before acting, validates through RCON
- 2,243 exploration examples generated, 150 kept after quality filtering

Training Progress Chart:
- SVG chart showing training examples and inverse loss across versions
- Added to MODEL_CARD.md for Gitea display

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mortdecai
2026-03-21 15:28:09 -04:00
parent da8f557219
commit f5118505b1
10 changed files with 3215 additions and 20 deletions
+297
View File
@@ -0,0 +1,297 @@
#!/usr/bin/env python3
"""
Bake-off — compare model versions on standard test prompts with RCON validation.
Runs the same prompts through multiple models, executes via RCON, and scores
success rate, response quality, and speed.
Usage:
python3 bakeoff.py --models mortdecai:0.4.0,mortdecai:0.5.0 \
--ollama-url http://localhost:11434 --rcon-host 192.168.0.244
"""
import argparse
import json
import random
import re
import sys
import time
from pathlib import Path
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
import requests
from agent.tools.persistent_rcon import get_rcon
OUTPUT_DIR = PROJECT_ROOT / "training" / "bakeoff_results"
# Standard test prompts across categories
TEST_PROMPTS = {
"basic_give": [
"sudo give me a diamond sword",
"sudo give me 64 golden apples",
"sudo give me full netherite armor",
"sudo give me a stack of oak logs",
],
"enchantments": [
"sudo give me a sword with sharpness 5 and mending",
"sudo give me a bow with power 5 and infinity",
"sudo give me boots with feather falling 4 and depth strider 3",
"sudo give me a trident with loyalty 3 and channeling",
],
"effects": [
"sudo give me speed 2 for 5 minutes",
"sudo make me invisible for 60 seconds",
"sudo give me night vision forever",
"sudo give everyone resistance 3",
],
"world": [
"sudo set time to day",
"sudo clear the weather",
"sudo kill all zombies",
"sudo summon 3 cows near me",
],
"teleport": [
"sudo tp me to 0 100 0",
"sudo tp me 50 blocks up",
],
"fill_build": [
"sudo fill a 5x5 gold platform under me",
"sudo place a beacon at 0 64 0",
],
"complex": [
"sudo give me a mace with density 5 and wind burst 3",
"sudo give me a decorated pot",
"sudo spawn a warden 10 blocks away",
"sudo create a team called red with red color",
],
"plugins_worldguard": [
"sudo create a region called test-region",
"sudo set pvp deny in the test-region",
"sudo list all regions",
],
"plugins_coreprotect": [
"sudo check coreprotect status",
"sudo lookup block changes in the last hour",
],
"plugins_essentials": [
"sudo set spawn here",
"sudo create a warp called bakeoff-test",
"sudo heal me",
],
"plugins_luckperms": [
"sudo create a group called testers",
"sudo list all permission groups",
],
"error_prone": [
"sudo give me a bed",
"sudo give me cooked beef",
"sudo effect give me speed",
"sudo fill with stone 10",
],
}
PLAYER = "slingshooter08"
def query_model(prompt, model, ollama_url, timeout=60):
    """Query a model through the Ollama chat endpoint and return the parsed reply.

    Args:
        prompt: Player request text; sent as ``"Player {PLAYER}: {prompt}"``.
        model: Model tag placed in the request's ``model`` field.
        ollama_url: Base URL; ``/api/chat`` is appended to it.
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        dict with the keys ``commands`` (always a list), ``reasoning``,
        ``elapsed`` (wall-clock seconds rounded to 2 places) and ``error``
        (``None`` on success, otherwise the exception text cut to 200 chars).
        This function never raises: every failure is reported via ``error``.
    """
    system = (
        "/no_think\n"
        "You are a Minecraft 1.21 command translator for a Paper server with plugins: "
        "FastAsyncWorldEdit, WorldGuard, CoreProtect, EssentialsX, Vault, LuckPerms.\n"
        "PERMISSION LEVEL: 4 (generous).\n"
        "Return JSON: {\"commands\": [...], \"reasoning\": \"...\"}"
    )
    start = time.time()
    try:
        r = requests.post(f"{ollama_url}/api/chat", json={
            "model": model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": f"Player {PLAYER}: {prompt}"},
            ],
            "stream": False, "format": "json",
            "options": {"temperature": 0.2, "num_predict": 500},
        }, timeout=timeout)
        elapsed = time.time() - start

        body = r.json()
        if not isinstance(body, dict) or "message" not in body:
            # Surface the server's own explanation instead of a bare
            # KeyError('message') when the request was rejected.
            detail = body.get("error") if isinstance(body, dict) else body
            raise RuntimeError(f"Ollama HTTP {r.status_code}: {detail}")

        content = body["message"]["content"]
        # Drop any <think>...</think> preamble before parsing the JSON payload.
        content = re.sub(r'<think>[\s\S]*?</think>\s*', '', content)
        parsed = json.loads(content)
        if not isinstance(parsed, dict):
            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")

        # Callers slice and len() this value, so always hand back a list:
        # a bare string becomes a one-element list, anything else is dropped.
        commands = parsed.get("commands")
        if isinstance(commands, str):
            commands = [commands]
        elif not isinstance(commands, list):
            commands = []

        return {
            "commands": commands,
            "reasoning": parsed.get("reasoning", ""),
            "elapsed": round(elapsed, 2),
            "error": None,
        }
    except Exception as e:
        # Deliberately broad: one bad response must not abort the whole run.
        return {
            "commands": [],
            "reasoning": "",
            "elapsed": round(time.time() - start, 2),
            "error": str(e)[:200],
        }
def validate_commands(commands, rcon):
    """Execute up to eight commands over RCON and classify each response.

    Args:
        commands: List of command strings. A bare string is treated as a
            single command (it would otherwise be iterated character by
            character); any other non-sequence yields no results.
        rcon: Object exposing ``command(str)`` that returns the server reply.

    Returns:
        One dict per executed command with the keys ``cmd`` (stripped, with
        leading slashes removed), ``result`` (reply cut to 200 chars, or the
        exception text) and ``ok``. Blank and non-string entries are skipped
        and do not appear in the output.
    """
    if isinstance(commands, str):
        commands = [commands]
    elif not isinstance(commands, (list, tuple)):
        return []

    error_markers = ("<--[HERE]", "Unknown", "Incorrect", "Expected", "Invalid")
    results = []
    for cmd in commands[:8]:
        if not isinstance(cmd, str) or not cmd.strip():
            continue
        cmd = cmd.strip().lstrip("/")
        try:
            result = rcon.command(cmd)
            # A reply is treated as a failure when it contains any marker.
            is_err = any(marker in result for marker in error_markers)
            results.append({"cmd": cmd, "result": result[:200], "ok": not is_err})
        except Exception as e:
            results.append({"cmd": cmd, "result": str(e), "ok": False})
    return results
def run_bakeoff(models, ollama_url, rcon):
    """Run every prompt in TEST_PROMPTS through every model and tally results.

    Args:
        models: Sequence of model tags; each one is queried with every prompt.
        ollama_url: Base URL forwarded to ``query_model``.
        rcon: Connection object forwarded to ``validate_commands``.

    Returns:
        dict keyed by model tag. Each value holds the counters ``total``,
        ``cmd_success``, ``cmd_fail``, ``cmd_total``, ``no_commands``,
        ``errors`` and ``total_time``, plus ``details``: one record per
        prompt with the commands, RCON results, timing and error text.
    """
    results = {m: {"total": 0, "cmd_success": 0, "cmd_fail": 0, "cmd_total": 0,
                   "no_commands": 0, "errors": 0, "total_time": 0, "details": []}
               for m in models}
    total_prompts = sum(len(v) for v in TEST_PROMPTS.values())
    print(f"Running {total_prompts} prompts x {len(models)} models = {total_prompts * len(models)} tests\n")

    for category, prompts in TEST_PROMPTS.items():
        print(f"── {category} ──")
        for prompt in prompts:
            print(f" {prompt[:65]}")
            for model in models:
                resp = query_model(prompt, model, ollama_url)

                # The model may answer with "commands": null or a bare string;
                # normalise once so len() and slicing below cannot blow up.
                commands = resp["commands"]
                if isinstance(commands, str):
                    commands = [commands]
                elif not isinstance(commands, list):
                    commands = []

                r = results[model]
                r["total"] += 1
                r["total_time"] += resp["elapsed"]

                if resp["error"]:
                    r["errors"] += 1
                    status = "ERR"
                    rcon_results = []
                elif not commands:
                    r["no_commands"] += 1
                    status = "EMPTY"
                    rcon_results = []
                else:
                    rcon_results = validate_commands(commands, rcon)
                    ok = sum(1 for rr in rcon_results if rr["ok"])
                    fail = sum(1 for rr in rcon_results if not rr["ok"])
                    r["cmd_success"] += ok
                    r["cmd_fail"] += fail
                    r["cmd_total"] += ok + fail
                    # Show "ok/total" only when something failed.
                    status = f"{ok}/{ok+fail}" if fail else f"{ok}"

                model_short = model.split(":")[-1]
                print(f" {model_short:8s} {status:8s} {resp['elapsed']:.1f}s {len(commands)} cmds")
                r["details"].append({
                    "category": category,
                    "prompt": prompt,
                    "commands": commands,
                    "rcon_results": rcon_results,
                    "elapsed": resp["elapsed"],
                    "error": resp["error"],
                })
        print()
    return results
def print_summary(results, models):
    """Print the overall comparison table and the per-category success rates.

    Args:
        results: Mapping of model tag to the counter dict built by
            ``run_bakeoff`` (including its ``details`` list).
        models: Model tags in the column order to print.

    Every value column is one space plus a 12-character right-aligned cell,
    in both tables, so headers and rows line up.
    """
    print("=" * 70)
    print("BAKE-OFF RESULTS")
    print("=" * 70)
    header = f"{'Metric':<30s}"
    for m in models:
        header += f" {m.split(':')[-1]:>12s}"
    print(header)
    print("-" * 70)

    # max(..., 1) guards the divisions when a model produced nothing.
    metrics = [
        ("Prompts tested", lambda r: r["total"]),
        ("Commands generated", lambda r: r["cmd_total"]),
        ("Commands succeeded", lambda r: r["cmd_success"]),
        ("Commands failed", lambda r: r["cmd_fail"]),
        ("Success rate", lambda r: f"{100*r['cmd_success']/max(r['cmd_total'],1):.1f}%"),
        ("Empty responses", lambda r: r["no_commands"]),
        ("Errors", lambda r: r["errors"]),
        ("Avg response time", lambda r: f"{r['total_time']/max(r['total'],1):.2f}s"),
        ("Total time", lambda r: f"{r['total_time']:.1f}s"),
    ]
    for label, fn in metrics:
        row = f"{label:<30s}"
        for m in models:
            val = fn(results[m])
            row += f" {str(val):>12s}"
        print(row)
    print("=" * 70)

    # Category breakdown
    print("\nCATEGORY BREAKDOWN (success rate):")
    print("-" * 70)
    categories = list(TEST_PROMPTS.keys())
    header = f"{'Category':<25s}"
    for m in models:
        header += f" {m.split(':')[-1]:>12s}"
    print(header)
    for cat in categories:
        row = f"{cat:<25s}"
        for m in models:
            cat_details = [d for d in results[m]["details"] if d["category"] == cat]
            cat_ok = sum(sum(1 for rr in d["rcon_results"] if rr["ok"]) for d in cat_details)
            cat_total = sum(len(d["rcon_results"]) for d in cat_details)
            if cat_total > 0:
                # Format the percentage first, then pad to the same 12-char
                # cell as the header and the N/A cells.
                pct = f"{100*cat_ok/cat_total:.0f}%"
                row += f" {pct:>12s}"
            else:
                row += f" {'N/A':>12s}"
        print(row)
    print()
def main():
    """CLI entry point: run the bake-off, print the summary, save a JSON report.

    The report is written to ``OUTPUT_DIR`` as
    ``bakeoff_<model>-vs-<model>_<unix time>.json`` with the per-model
    counters under ``results`` and the per-prompt records under ``details``.
    """
    parser = argparse.ArgumentParser(description="Model bake-off")
    parser.add_argument("--models", default="mortdecai:0.4.0,mortdecai:0.5.0")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--rcon-host", default="192.168.0.244")
    parser.add_argument("--rcon-port", type=int, default=25578)
    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
    args = parser.parse_args()

    # Ignore empty entries produced by stray or trailing commas.
    models = [m.strip() for m in args.models.split(",") if m.strip()]
    if not models:
        parser.error("--models must name at least one model")

    rcon = get_rcon(args.rcon_host, args.rcon_port, args.rcon_pass)
    print(f"Bake-off: {' vs '.join(models)}")
    print(f"Ollama: {args.ollama_url}")
    print(f"RCON: {args.rcon_host}:{args.rcon_port}")
    print()

    results = run_bakeoff(models, args.ollama_url, rcon)
    print_summary(results, models)

    # Save results
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    # Model tags may contain ':' or '/', neither of which belongs in a file
    # name; collapse anything outside a conservative character set to '_'.
    slug = "-vs-".join(re.sub(r"[^A-Za-z0-9._-]+", "_", m) for m in models)
    out_path = OUTPUT_DIR / f"bakeoff_{slug}_{int(time.time())}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({
            "models": models,
            # The trailing 'Z' denotes UTC, so format from gmtime rather than
            # the local clock.
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "results": {m: {k: v for k, v in r.items() if k != "details"} for m, r in results.items()},
            "details": {m: r["details"] for m, r in results.items()},
        }, f, indent=2, default=str)
    print(f"Results saved to {out_path}")
if __name__ == "__main__":
main()
+411
View File
@@ -0,0 +1,411 @@
#!/usr/bin/env python3
"""
Exploration Self-Play — model uses wiki_lookup to explore Minecraft knowledge,
then validates its understanding through RCON commands.
Unlike canned self-play, the model drives its own curiosity:
1. Gets a broad topic ("explore enchantments", "learn about 1.21 items")
2. Uses minecraft.wiki_lookup to research
3. Generates commands based on what it learned
4. RCON validates correctness
5. If wrong, researches more and corrects
Produces gold-standard knowledge-grounded training data.
Usage:
python3 exploration_self_play.py --ollama-url http://localhost:11434 \
--model mortdecai:0.5.0 --rcon-host 192.168.0.244 --rcon-port 25578
"""
import argparse
import json
import random
import re
import sys
import time
from pathlib import Path
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
import requests
from agent.tools.persistent_rcon import get_rcon
OUTPUT_DIR = PROJECT_ROOT / "data" / "raw" / "exploration_selfplay"
PLAYERS = ["slingshooter08", "Ace13245", "TheBigBoss", "xXDragonSlayerXx"]
# Topics for the model to explore — broad enough that it needs to look things up
EXPLORATION_TOPICS_PLUGINS = [
# WorldGuard deep dive
"Research all WorldGuard region flags. Create a region and test each flag one at a time for {p}.",
"Look up how WorldGuard region priorities work. Create overlapping regions with different rules.",
"Research WorldGuard's __global__ region. What flags can you set globally? Test a few.",
"Look up WorldGuard entry/exit deny flags. Create a VIP-only zone and test it.",
"Research how to make a WorldGuard region that heals players. Set it up near {p}.",
"What WorldGuard flags control explosions? Research and create a blast-proof zone.",
"Look up how to block specific commands in a WorldGuard region. Test with /home.",
"Research WorldGuard greeting and farewell messages. Set up regions with welcome messages.",
# CoreProtect deep dive
"Research all CoreProtect action types (block, container, chat, command). Test /co lookup with each.",
"Look up CoreProtect time format syntax. Test rollbacks with different time ranges (1h, 30m, 7d).",
"Research how CoreProtect handles container logging. Place a chest, add items, then lookup the history.",
"What CoreProtect parameters filter by block type? Test rolling back only specific blocks.",
"Look up how to use CoreProtect radius parameter. Test different radius values.",
"Research CoreProtect restore vs rollback — what's the difference? Demonstrate both.",
# EssentialsX deep dive
"Research all EssentialsX economy commands. Set up a working economy with /eco, /balance, /pay.",
"Look up EssentialsX kit creation syntax. Create a starter kit and a VIP kit.",
"Research EssentialsX warp system. Create 5 warps at interesting locations.",
"What EssentialsX commands exist for player management? Test /nick, /seen, /whois.",
"Look up EssentialsX home system. Set multiple named homes for {p}.",
"Research EssentialsX god mode, fly mode, and speed commands. Test all three.",
"What EssentialsX commands modify the world? Test /sun, /storm, /day, /night.",
# LuckPerms deep dive
"Research LuckPerms group inheritance. Create parent and child groups and test permission flow.",
"Look up LuckPerms temporary permissions. Give {p} temp fly access for 5 minutes.",
"Research LuckPerms meta (prefix/suffix). Set up colored chat prefixes for different groups.",
"What LuckPerms commands check a user's permissions? Audit {p}'s current permissions.",
"Look up how to create a LuckPerms permission ladder (default -> member -> vip -> admin).",
"Research LuckPerms weight system. How do group priorities work?",
# FAWE/WorldEdit deep dive
"Research all WorldEdit shape commands (sphere, cyl, pyramid). Build one of each near {p}.",
"Look up WorldEdit brush types. What brushes exist beyond sphere brush?",
"Research WorldEdit mask syntax. How do masks work with //replace?",
"What WorldEdit clipboard operations exist? Test //copy, //paste, //rotate, //flip.",
"Look up WorldEdit pattern syntax. Can you mix multiple blocks in one command?",
"Research WorldEdit //generate command. Can it make mathematical surfaces?",
"What WorldEdit selection modes exist? Test //sel cuboid vs poly vs sphere.",
# Script writing exploration
"Research Minecraft datapack function syntax. Write a mcfunction that creates a parkour course.",
"Look up how Minecraft tick functions work. Write one that makes particles at spawn.",
"Research how to chain mcfunctions together. Write a main function that calls sub-functions.",
"What Minecraft datapack tags control function scheduling? Test tick.json and load.json.",
"Look up execute command syntax for mcfunctions. Write a script using execute at/as/if.",
"Research scoreboard objectives. Write a script that tracks player kills and announces leaders.",
# Multi-plugin combos
"Research how to combine WorldEdit builds with WorldGuard protection. Build and protect an arena.",
"Look up how to use CoreProtect to undo WorldEdit operations specifically.",
"Research combining LuckPerms with WorldGuard — can you tie region access to permission groups?",
"Create a complete server setup: spawn area (WE), protected (WG), with warps (Ess) and perms (LP).",
"Research how to build a minigame arena: WE for building, WG for rules, scoreboards for tracking.",
]
EXPLORATION_TOPICS = [
# Items and crafting
"What are all the new items added in 1.21? Look them up and give one of each to {p}.",
"Research every type of arrow (tipped arrows) and give {p} one of each.",
"Look up all the banner patterns available and create a cool banner for {p}.",
"What suspicious stew effects exist? Research and give {p} the best one.",
"Research all the different types of potions and give {p} the three most useful ones.",
"What are all the different horse armor types? Look them up and give one of each to {p}.",
"Research all smithing templates and give {p} the rarest ones.",
"Look up every type of spawn egg and give {p} five interesting ones.",
# Enchantments
"Research the best enchantment setup for a full netherite armor set. Give it to {p}.",
"What enchantments are exclusive to each other? Look them up and explain while giving {p} examples.",
"Research the difference between Protection, Fire Protection, Blast Protection, and Projectile Protection. Which is best for general use? Give {p} the optimal set.",
"Look up what Thorns does exactly — is it worth using? Give {p} armor with and without it to test.",
"Research Sweeping Edge — does it still exist in 1.21? Give {p} a sword with the correct enchantments.",
"What's the maximum level for each enchantment? Research and give {p} a tool with impossible levels vs correct levels.",
# Effects and potions
"Research all status effects in 1.21. Which ones are new? Apply the 3 newest ones to {p}.",
"Look up the Ominous Bottle effect — what does it do? Give one to {p}.",
"What's the difference between Strength and Haste? Research and apply the right one for mining.",
"Research what Wind Charged does. Apply it to {p}.",
"Look up all negative effects and their max safe durations. Apply a brief demonstration.",
"What effect does a Beacon give? Research all beacon effects and apply them.",
# Mobs and entities
"Research all tameable mobs in 1.21. Summon one of each near {p}.",
"What mobs were added or changed in 1.21? Look them up and summon the new ones.",
"Research the Breeze mob — what does it drop? Summon one for {p}.",
"Look up all rideable mobs and summon one for {p} with a saddle.",
"What's the strongest mob in the game? Research its stats and summon it (carefully).",
"Research all fish types and summon them in water near {p}.",
# Blocks and building
"Research all copper block variants and their oxidation states. Place examples near {p}.",
"What blocks emit light? Look up all light-emitting blocks and demonstrate.",
"Research all types of stairs, slabs, and walls available in 1.21.",
"Look up how to make colored concrete powder and place a rainbow near {p}.",
"What are all the glazed terracotta patterns? Research and place one of each.",
"Research redstone components — what's the difference between a comparator and repeater?",
# Commands and mechanics
"Research the /place command. What can it place? Demonstrate with a structure.",
"Look up the /damage command syntax and demonstrate different damage types on a mob.",
"Research /attribute — what attributes can be modified? Give {p} double health.",
"What does the /ride command do? Research and demonstrate.",
"Look up /fillbiome — can you change the biome? Try it near {p}.",
"Research the /random command added in 1.21. What can it do?",
# Worldgen and structures
"Research all structure types that /locate can find. Find the 3 nearest to {p}.",
"What biomes exist in 1.21? Look up any new ones and locate them.",
"Research Trial Chambers — where do they spawn? Locate one for {p}.",
# Plugin-specific research
"Research WorldGuard region flags — what flags exist? Set up a demo region with interesting flags.",
"Look up CoreProtect rollback syntax — what parameters does it accept?",
"Research LuckPerms group inheritance — how do child groups work?",
"What WorldEdit brushes are available? Research and describe them.",
"Look up EssentialsX economy commands — set up a basic economy demonstration.",
]
def wiki_lookup(query, timeout=15):
    """Look up *query* on minecraft.wiki through its MediaWiki API.

    An ``opensearch`` request finds up to three matching page titles. Each
    title is then tried in order with a ``prop=extracts`` query until one
    yields a non-empty plain-text intro.

    Args:
        query: Free-text search string.
        timeout: Per-request timeout in seconds.

    Returns:
        dict with ``content`` (intro text cut to 1500 chars, or an
        explanatory message), ``url`` (page URL, empty on failure) and
        ``ok``. Never raises: any failure is reported with ``ok`` False.
    """
    from urllib.parse import quote  # only needed for the fallback page URL

    api_url = "https://minecraft.wiki/api.php"
    try:
        r = requests.get(
            api_url,
            params={"action": "opensearch", "search": query, "limit": 3, "format": "json"},
            timeout=timeout,
        )
        r.raise_for_status()
        results = r.json()
        # opensearch replies with [query, [titles], [descriptions], [urls]].
        if isinstance(results, list) and len(results) >= 4 and results[1]:
            titles = results[1][:3]
            urls = results[3][:3]
            for idx, title in enumerate(titles):
                r2 = requests.get(
                    api_url,
                    params={
                        "action": "query", "prop": "extracts",
                        "exintro": True, "explaintext": True,
                        # Search hits can be redirect titles, which carry no
                        # extract of their own; resolve them to the target.
                        "redirects": 1,
                        "titles": title, "format": "json",
                    },
                    timeout=timeout,
                )
                r2.raise_for_status()
                pages = r2.json().get("query", {}).get("pages", {})
                for page in pages.values():
                    extract = page.get("extract", "")
                    if extract:
                        if idx < len(urls) and urls[idx]:
                            page_url = urls[idx]
                        else:
                            page_url = f"https://minecraft.wiki/w/{quote(title.replace(' ', '_'))}"
                        return {
                            "content": extract[:1500],
                            "url": page_url,
                            "ok": True,
                        }
        return {"content": f"No wiki results for: {query}", "url": "", "ok": False}
    except Exception as e:
        # Best-effort tool: the caller feeds this message back to the model.
        return {"content": f"Wiki lookup failed: {e}", "url": "", "ok": False}
def run_exploration(topic, player, ollama_url, model, rcon):
    """Run one exploration round: the model researches a topic and acts on it.

    The model is prompted with the topic and may answer with
    ``<tool_call>`` blocks. Each call is dispatched (wiki / docs lookups or
    an RCON command), the result is appended to the conversation, and the
    loop repeats for at most 10 steps or until a reply contains no tool call.

    Args:
        topic: Topic template; every ``{p}`` is replaced with *player*.
        player: Player name used in the prompt.
        ollama_url: Base URL; ``/api/chat`` is appended.
        model: Model tag for the request.
        rcon: Object exposing ``command(str)`` that returns the server reply.

    Returns:
        A training-example dict with ``id``, ``source``, ``type``, ``input``,
        ``output`` (commands, reasoning, wiki_topics), ``tool_trace``,
        ``messages`` and ``metadata`` (model, steps, lookup and command
        counts, and the success rate of executed RCON calls).
        ``metadata["rcon_commands"]`` counts RCON calls that did not raise
        plus any commands listed in the final JSON answer.
    """
    system = (
        "/no_think\n"
        "You are a Minecraft 1.21 expert on a Paper server with plugins: "
        "WorldGuard, CoreProtect, EssentialsX, LuckPerms, FastAsyncWorldEdit.\n\n"
        "You have these lookup tools:\n"
        "- minecraft.wiki_lookup: {\"query\": \"...\"} — Minecraft Wiki for items, mobs, commands\n"
        "- plugin.docs_lookup: {\"plugin\": \"worldguard|worldedit|coreprotect|essentialsx|luckperms\", \"query\": \"...\"} — plugin documentation\n"
        "- minecraft.changelog_lookup: {\"query\": \"...\", \"version\": \"1.21\"} — version changes\n"
        "- paper.docs_lookup: {\"query\": \"...\"} — Paper server docs\n"
        "- rcon.execute: {\"command\": \"...\"} — execute a Minecraft command\n\n"
        "WORKFLOW:\n"
        "1. Research the topic using the appropriate lookup tool\n"
        "2. For plugin commands, use plugin.docs_lookup instead of minecraft.wiki_lookup\n"
        "3. Generate and execute commands via rcon.execute\n"
        "4. If a command fails, look up the correct syntax and try again\n\n"
        "To call a tool, respond with:\n"
        "<tool_call>\n{\"name\": \"tool_name\", \"arguments\": {...}}\n</tool_call>\n\n"
        "When done, respond with final JSON:\n"
        "{\"commands\": [...], \"reasoning\": \"what you learned\", \"wiki_topics\": [\"topics you looked up\"]}\n\n"
        "Be curious. ALWAYS look things up before guessing. Verify your knowledge."
    )
    topic_resolved = topic.replace("{p}", player)
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": f"Player {player}: {topic_resolved}"},
    ]
    tool_trace = []
    all_commands = []
    wiki_topics = []
    max_steps = 10
    # Pre-bind so the final-answer parsing below is safe even when the very
    # first LLM request fails.
    raw = ""
    step = 0

    for step in range(max_steps):
        try:
            r = requests.post(f"{ollama_url}/api/chat", json={
                "model": model,
                "messages": messages,
                "stream": False,
                "options": {"temperature": 0.6, "num_predict": 800},
            }, timeout=120)
            raw = r.json()["message"]["content"]
        except Exception as e:
            print(f" LLM error: {e}")
            # Do not let an earlier tool-call reply pose as the final answer.
            raw = ""
            break
        raw = re.sub(r'<think>[\s\S]*?</think>\s*', '', raw)

        # Check for tool calls
        tool_matches = re.findall(r'<tool_call>\s*(\{.*?\})\s*</tool_call>', raw, re.DOTALL)
        if not tool_matches:
            # Final response — done exploring
            break

        for tc_json in tool_matches:
            try:
                tc = json.loads(tc_json)
            except json.JSONDecodeError:
                continue
            if not isinstance(tc, dict):
                continue
            tool_name = tc.get("name", "")
            tool_args = tc.get("arguments", {})
            if not isinstance(tool_args, dict):
                # Malformed arguments: treat as "no arguments" instead of
                # crashing on .get() below.
                tool_args = {}

            if tool_name == "minecraft.wiki_lookup":
                query = str(tool_args.get("query", ""))
                wiki_topics.append(query)
                result = wiki_lookup(query)
                print(f" wiki: {query[:60]} -> {len(result.get('content',''))} chars")
            elif tool_name in ("plugin.docs_lookup", "minecraft.changelog_lookup", "paper.docs_lookup"):
                try:
                    from agent.tools.knowledge_lookup import handle_knowledge_tool
                    result = handle_knowledge_tool(tool_name, tool_args)
                except ImportError:
                    # Knowledge module unavailable: fall back to the wiki.
                    result = wiki_lookup(tool_args.get("query", tool_args.get("plugin", "")))
                except Exception as e:
                    # A failed lookup is reported to the model, not fatal.
                    result = {"content": f"{tool_name} failed: {e}", "ok": False}
                if not isinstance(result, dict):
                    # NOTE(review): handle_knowledge_tool is assumed to return
                    # a dict; wrap anything else so the trace code still works.
                    result = {"content": str(result), "ok": bool(result)}
                query = str(tool_args.get("query", ""))
                wiki_topics.append(f"{tool_name}:{query}")
                print(f" {tool_name}: {query[:50]} -> {len(result.get('content',''))} chars")
            elif tool_name == "rcon.execute":
                cmd = str(tool_args.get("command", ""))
                try:
                    rcon_result = rcon.command(cmd)
                    is_err = any(e in rcon_result for e in ("<--[HERE]", "Unknown", "Incorrect"))
                    result = {"success": not is_err, "result": rcon_result[:300]}
                    all_commands.append(cmd)
                    status = "OK" if not is_err else "ERR"
                    print(f" rcon: {cmd[:60]} -> {status}")
                except Exception as e:
                    result = {"success": False, "result": str(e)}
                    print(f" rcon: {cmd[:60]} -> FAIL")
            else:
                result = {"ok": False, "error": f"unknown tool: {tool_name}"}

            tool_trace.append({
                "tool": tool_name,
                "input": str(tool_args)[:200],
                # Lookups report "ok", RCON calls report "success".
                "ok": result.get("ok", result.get("success", False)),
                "step": step,
            })
            messages.append({"role": "assistant", "content": f"<tool_call>\n{json.dumps(tc)}\n</tool_call>"})
            messages.append({"role": "tool", "content": json.dumps(result)[:3000]})
        time.sleep(0.1)

    # Parse final response if present
    reasoning = ""
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        reasoning = raw[:200]
    else:
        if isinstance(parsed, dict):
            reasoning = parsed.get("reasoning", "")
            final_commands = parsed.get("commands")
            if isinstance(final_commands, str):
                # extend() on a string would add one entry per character.
                all_commands.append(final_commands)
            elif isinstance(final_commands, list):
                all_commands.extend(final_commands)
        else:
            reasoning = raw[:200]

    rcon_calls = [t for t in tool_trace if t["tool"] == "rcon.execute"]
    return {
        "id": f"explore-{int(time.time())}-{random.randint(0,9999):04d}",
        "source": "exploration_self_play",
        "type": "exploration",
        "input": {"user_message": topic_resolved, "player": player},
        "output": {
            "commands": all_commands,
            "reasoning": reasoning,
            "wiki_topics": wiki_topics,
        },
        "tool_trace": tool_trace,
        "messages": messages,
        "metadata": {
            "model": model,
            "steps": min(step + 1, max_steps),
            "wiki_lookups": len(wiki_topics),
            "rcon_commands": len(all_commands),
            "success_rate": (
                sum(1 for t in rcon_calls if t["ok"]) / max(len(rcon_calls), 1)
            ),
        },
    }
def main():
    """CLI entry point: run exploration rounds and append each to a JSONL file.

    Each round picks a random topic and player, runs ``run_exploration`` and
    appends the resulting example as one JSON line to
    ``OUTPUT_DIR/exploration_<focus>_<unix time>.jsonl``. A progress line is
    printed every 10 rounds. Ctrl-C stops the loop cleanly; rounds already
    written stay on disk.
    """
    parser = argparse.ArgumentParser(description="Exploration self-play")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--model", default="mortdecai:0.5.0")
    parser.add_argument("--rcon-host", default="192.168.0.244")
    parser.add_argument("--rcon-port", type=int, default=25578)
    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
    parser.add_argument("--rounds", type=int, default=999999)
    parser.add_argument("--focus", default="general", choices=["general", "plugins", "all"],
                        help="Topic focus: general (vanilla MC), plugins (WG/CP/Ess/LP/FAWE/scripts), all (both)")
    args = parser.parse_args()

    if args.focus == "plugins":
        topics = EXPLORATION_TOPICS_PLUGINS
    elif args.focus == "all":
        topics = EXPLORATION_TOPICS + EXPLORATION_TOPICS_PLUGINS
    else:
        topics = EXPLORATION_TOPICS

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    output_path = OUTPUT_DIR / f"exploration_{args.focus}_{int(time.time())}.jsonl"
    rcon = get_rcon(args.rcon_host, args.rcon_port, args.rcon_pass)

    print("Exploration Self-Play")
    print(f" Model: {args.model} on {args.ollama_url}")
    print(f" RCON: {args.rcon_host}:{args.rcon_port}")
    print(f" Focus: {args.focus} ({len(topics)} topics)")
    print(f" Output: {output_path}")
    print()

    # rcon_commands mirrors the per-example metadata (executed + proposed);
    # rcon_executed / rcon_success are counted from the tool trace so the
    # reported success percentage only covers commands that actually ran.
    stats = {"total": 0, "wiki_lookups": 0, "rcon_commands": 0,
             "rcon_executed": 0, "rcon_success": 0}
    try:
        for round_num in range(args.rounds):
            topic = random.choice(topics)
            player = random.choice(PLAYERS)
            print(f"\n── Round {round_num+1} ──")
            print(f" Topic: {topic[:80].replace('{p}', player)}")

            example = run_exploration(topic, player, args.ollama_url, args.model, rcon)
            rcon_calls = [t for t in example["tool_trace"] if t["tool"] == "rcon.execute"]

            stats["total"] += 1
            stats["wiki_lookups"] += example["metadata"]["wiki_lookups"]
            stats["rcon_commands"] += example["metadata"]["rcon_commands"]
            stats["rcon_executed"] += len(rcon_calls)
            stats["rcon_success"] += sum(1 for t in rcon_calls if t["ok"])

            print(f" Result: {example['metadata']['wiki_lookups']} lookups, "
                  f"{example['metadata']['rcon_commands']} commands, "
                  f"{example['metadata']['success_rate']:.0%} success")

            # Re-open per round so every finished example is on disk even if
            # a later round dies. UTF-8 is explicit because ensure_ascii=False
            # emits raw non-ASCII characters.
            with open(output_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(example, ensure_ascii=False) + "\n")

            if (round_num + 1) % 10 == 0:
                rate = stats["rcon_success"] / max(stats["rcon_executed"], 1) * 100
                print(f"\n Progress: {stats['total']} explorations, "
                      f"{stats['wiki_lookups']} wiki lookups, "
                      f"{stats['rcon_commands']} commands ({rate:.0f}% success)")
            time.sleep(0.5)
    except KeyboardInterrupt:
        # The default round count is effectively "run until stopped".
        print("\nInterrupted — stopping early.")

    print(f"\nExploration complete: {stats['total']} topics explored")
if __name__ == "__main__":
main()
+149
View File
@@ -0,0 +1,149 @@
#!/usr/bin/env python3
"""
Filter exploration and self-play data for quality.
Keeps:
- Successful interactions (model looked up info AND executed correctly)
- First instance of each unique error pattern (for error correction training)
- High wiki-lookup-to-command ratios (model actually used the knowledge)
Removes:
- Duplicate topics (keeps first occurrence only)
- Empty responses (no commands, no lookups)
- Repeated failures on the same command pattern
- Rounds where model ignored wiki results
Output: data/processed/filtered_exploration.jsonl
"""
import json
import sys
from pathlib import Path
from collections import defaultdict
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
EXPLORATION_DIR = PROJECT_ROOT / "data" / "raw" / "exploration_selfplay"
TOOL_SELFPLAY_DIR = PROJECT_ROOT / "data" / "raw" / "tool_selfplay"
OUTPUT_PATH = PROJECT_ROOT / "data" / "processed" / "filtered_exploration.jsonl"
def load_all_examples():
    """Load every example from the exploration and tool self-play directories.

    Reads all ``*.jsonl`` files (sorted by path) in ``EXPLORATION_DIR`` first
    and then in ``TOOL_SELFPLAY_DIR``, one JSON object per non-blank line.

    Returns:
        list of dicts in file order. Lines that are not valid JSON, or that
        decode to something other than an object, are skipped; the number
        skipped is reported on stderr.
    """
    examples = []
    skipped = 0
    for directory in (EXPLORATION_DIR, TOOL_SELFPLAY_DIR):
        for jsonl in sorted(directory.glob("*.jsonl")):
            with open(jsonl, encoding="utf-8") as f:
                for line in f:
                    if not line.strip():
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError:
                        skipped += 1
                        continue
                    # Downstream filtering calls .get() on every example.
                    if isinstance(record, dict):
                        examples.append(record)
                    else:
                        skipped += 1
    if skipped:
        print(f"  Skipped {skipped} malformed line(s)", file=sys.stderr)
    return examples
def filter_examples(examples):
    """Filter raw self-play examples down to the ones worth training on.

    Rules, applied in order to each example:
      1. No RCON commands and no lookups -> dropped as empty.
      2. Topic (first 80 chars of ``input.user_message``) already kept
         -> dropped as duplicate.
      3. No RCON commands (lookups only) -> dropped as empty.
      4. Success (``success_rate`` > 0.7, or a true boolean) -> kept.
      5. Not successful but at least one lookup -> kept as wiki-grounded.
      6. Otherwise a failure: kept only if its error pattern (first 40 chars
         of the first command, else the first ``rcon_results`` entry) has
         not been seen before.

    A topic is marked as seen only when its example is actually kept, so a
    dropped attempt never blocks a later, better attempt at the same topic.

    Args:
        examples: Iterable of example dicts; must support ``len()``.

    Returns:
        ``(kept, stats)`` where *kept* preserves input order and *stats*
        maps each keep/drop reason to its count (plus ``total``).
    """
    kept = []
    seen_topics = set()
    seen_error_patterns = set()
    stats = {
        "total": len(examples),
        "kept_success": 0,
        "kept_error_correction": 0,
        "kept_wiki_grounded": 0,
        "dropped_duplicate": 0,
        "dropped_empty": 0,
        "dropped_repeat_failure": 0,
    }

    for ex in examples:
        if not isinstance(ex, dict):
            stats["dropped_empty"] += 1
            continue

        # "or {}" / "or 0" also cover keys that are present but null.
        meta = ex.get("metadata") or {}
        inp = ex.get("input") or {}
        topic = str(inp.get("user_message") or "")[:80]
        success_rate = meta.get("success_rate", meta.get("all_success", False))
        wiki_lookups = meta.get("wiki_lookups") or 0
        rcon_commands = meta.get("rcon_commands") or 0

        # Skip empty
        if rcon_commands == 0 and wiki_lookups == 0:
            stats["dropped_empty"] += 1
            continue

        # Deduplicate against topics that were actually kept
        if topic in seen_topics:
            stats["dropped_duplicate"] += 1
            continue

        # Lookups without any command give nothing to validate.
        if rcon_commands <= 0:
            stats["dropped_empty"] += 1
            continue

        # bool must be tested first: it is a subclass of int.
        if isinstance(success_rate, bool):
            is_success = success_rate
        elif isinstance(success_rate, (int, float)):
            is_success = success_rate > 0.7
        else:
            is_success = False

        if is_success:
            bucket = "kept_success"
        elif wiki_lookups > 0:
            # Looked things up before acting — keep even with some failures.
            bucket = "kept_wiki_grounded"
        else:
            commands = (ex.get("output") or {}).get("commands") or []
            if isinstance(commands, list) and commands:
                # Use first command as error pattern key
                pattern = commands[0][:40] if isinstance(commands[0], str) else ""
            else:
                rcon_results = meta.get("rcon_results") or []
                pattern = str(rcon_results[:1])[:60] if isinstance(rcon_results, list) and rcon_results else ""
            if not pattern or pattern in seen_error_patterns:
                stats["dropped_repeat_failure"] += 1
                continue
            seen_error_patterns.add(pattern)
            bucket = "kept_error_correction"

        seen_topics.add(topic)
        stats[bucket] += 1
        kept.append(ex)

    return kept, stats
def main():
    """Load all raw examples, filter them, report counts and write the result.

    The kept examples are written one JSON object per line to
    ``OUTPUT_PATH``, replacing any previous file.
    """
    print("Loading examples...")
    examples = load_all_examples()
    print(f" Loaded {len(examples)} raw examples")

    print("Filtering...")
    filtered, stats = filter_examples(examples)

    print("\nFilter results:")
    for k, v in stats.items():
        print(f" {k}: {v}")
    # Integer percentage; max(..., 1) avoids dividing by zero on empty input.
    print(f"\nKept: {len(filtered)} ({100*len(filtered)//max(stats['total'],1)}%)")

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False writes raw non-ASCII characters, so pin the file
    # encoding instead of relying on the platform default.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        for ex in filtered:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
    print(f"Written to {OUTPUT_PATH}")
if __name__ == "__main__":
main()
+123
View File
@@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Generate SVG training history chart for the Gitea README.
X-axis: Model version
Y-axis: Training examples (bar) and inverse loss (line)
"""
import json
from pathlib import Path
OUTPUT = Path(__file__).resolve().parent.parent.parent / "branding" / "training_progress.svg"
# Historical data from training runs: one entry per model version.
VERSIONS = [
    {"version": "0.1.0", "examples": 500, "loss": 2.10, "label": "v1 (seed)"},
    {"version": "0.2.0", "examples": 1200, "loss": 1.45, "label": "v2 (+entities)"},
    {"version": "0.3.0", "examples": 2100, "loss": 0.82, "label": "v3 (+errors)"},
    {"version": "0.4.0", "examples": 3175, "loss": 0.35, "label": "v4 (+tools)"},
    {"version": "0.5.0", "examples": 4358, "loss": 0.16, "label": "v5 (+plugins)"},
]

# Chart dimensions: overall canvas size and the padding around the plot area.
W = 700
H = 400
PAD_L = 70
PAD_R = 30
PAD_T = 40
PAD_B = 80
PLOT_W = W - PAD_L - PAD_R
PLOT_H = H - PAD_T - PAD_B

# Colors
BG = "#111111"
GRID = "#252525"
TEXT = "#999999"
BAR_COLOR = "#D35400"
LINE_COLOR = "#4caf50"
LABEL_COLOR = "#e0e0e0"


def generate_svg(versions=None):
    """Render the training-progress chart and return it as an SVG string.

    Each version is drawn as a bar (number of training examples) with a
    polyline on top showing inverse loss (1/loss). Both series are scaled so
    the largest value sits at 1/1.15 of the plot height.

    Args:
        versions: Optional iterable of dicts with the keys ``"version"``,
            ``"examples"``, ``"loss"`` and ``"label"``. Defaults to the
            module-level ``VERSIONS``.

    Returns:
        The complete SVG document as a ``str``.

    Raises:
        ValueError: If no versions are given, if any loss is not positive
            (1/loss would be undefined), or if no version has a positive
            example count (bars could not be scaled).
    """
    from xml.sax.saxutils import escape

    data = VERSIONS if versions is None else list(versions)
    if not data:
        raise ValueError("at least one version entry is required")
    if any(v["loss"] <= 0 for v in data):
        raise ValueError("every version needs a positive loss to plot 1/loss")

    # 15% headroom so the tallest bar / highest point leaves room for its label.
    max_examples = max(v["examples"] for v in data) * 1.15
    if max_examples <= 0:
        raise ValueError("at least one version needs a positive example count")
    max_inv_loss = max(1.0 / v["loss"] for v in data) * 1.15

    n = len(data)
    bar_w = PLOT_W / n * 0.6
    gap = PLOT_W / n
    baseline = PAD_T + PLOT_H  # y coordinate of the plot's bottom edge

    # Horizontal center of each version's slot, and the y of its 1/loss point;
    # computed once and shared by the bars, the polyline and the dots.
    centers = [PAD_L + gap * i + gap / 2 for i in range(n)]
    line_ys = [
        baseline - ((1.0 / v["loss"]) / max_inv_loss) * PLOT_H for v in data
    ]

    parts = [
        '<?xml version="1.0" encoding="UTF-8"?>\n',
        f'<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {W} {H}" width="{W}" height="{H}">\n',
        f'<rect width="{W}" height="{H}" fill="{BG}" rx="8"/>\n',
        '<!-- Title -->\n',
        f'<text x="{W/2}" y="25" fill="{LABEL_COLOR}" font-family="monospace" font-size="16" text-anchor="middle" font-weight="bold">Mortdecai Training Progress</text>\n',
        '<!-- Grid lines -->\n',
    ]

    # Y-axis grid (examples): five evenly spaced lines with their tick values.
    for i in range(5):
        y = baseline - (i / 4 * PLOT_H)
        val = int(max_examples * i / 4)
        parts.append(f'<line x1="{PAD_L}" y1="{y}" x2="{W-PAD_R}" y2="{y}" stroke="{GRID}" stroke-width="0.5"/>\n')
        parts.append(f'<text x="{PAD_L-5}" y="{y+4}" fill="{TEXT}" font-family="monospace" font-size="10" text-anchor="end">{val:,}</text>\n')

    # Bars (training examples) with the per-version x-axis captions.
    for cx, v in zip(centers, data):
        bh = (v["examples"] / max_examples) * PLOT_H
        by = baseline - bh
        # Version and label are free text, so escape them for XML.
        version_text = escape(str(v["version"]))
        label_text = escape(str(v["label"]))
        parts.append(f'<rect x="{cx - bar_w/2}" y="{by}" width="{bar_w}" height="{bh}" fill="{BAR_COLOR}" rx="3" opacity="0.85"/>\n')
        parts.append(f'<text x="{cx}" y="{by - 8}" fill="{BAR_COLOR}" font-family="monospace" font-size="11" text-anchor="middle" font-weight="bold">{v["examples"]:,}</text>\n')
        parts.append(f'<text x="{cx}" y="{baseline + 20}" fill="{LABEL_COLOR}" font-family="monospace" font-size="12" text-anchor="middle">{version_text}</text>\n')
        parts.append(f'<text x="{cx}" y="{baseline + 35}" fill="{TEXT}" font-family="monospace" font-size="9" text-anchor="middle">{label_text}</text>\n')

    # Line (inverse loss = quality)
    polyline = " ".join(f"{cx},{ly}" for cx, ly in zip(centers, line_ys))
    parts.append(f'<polyline points="{polyline}" fill="none" stroke="{LINE_COLOR}" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round"/>\n')

    # Dots on the line, each annotated with the raw loss value.
    for cx, ly, v in zip(centers, line_ys, data):
        parts.append(f'<circle cx="{cx}" cy="{ly}" r="4" fill="{LINE_COLOR}"/>\n')
        parts.append(f'<text x="{cx}" y="{ly - 10}" fill="{LINE_COLOR}" font-family="monospace" font-size="10" text-anchor="middle">loss={v["loss"]}</text>\n')

    # Y-axis title (rotated to read bottom-to-top)
    parts.append(f'<text x="{PAD_L - 45}" y="{PAD_T + PLOT_H/2}" fill="{BAR_COLOR}" font-family="monospace" font-size="11" text-anchor="middle" transform="rotate(-90,{PAD_L-45},{PAD_T+PLOT_H/2})">Training Examples</text>\n')

    # Legend
    parts.append(f'<rect x="{W-180}" y="{PAD_T+5}" width="12" height="12" fill="{BAR_COLOR}" rx="2"/>\n')
    parts.append(f'<text x="{W-163}" y="{PAD_T+15}" fill="{TEXT}" font-family="monospace" font-size="10">Training Examples</text>\n')
    parts.append(f'<line x1="{W-180}" y1="{PAD_T+28}" x2="{W-168}" y2="{PAD_T+28}" stroke="{LINE_COLOR}" stroke-width="2.5"/>\n')
    parts.append(f'<text x="{W-163}" y="{PAD_T+32}" fill="{TEXT}" font-family="monospace" font-size="10">Model Quality (1/loss)</text>\n')

    # X-axis title
    parts.append(f'<text x="{W/2}" y="{H-10}" fill="{TEXT}" font-family="monospace" font-size="11" text-anchor="middle">Model Version</text>\n')

    parts.append("</svg>")
    return "".join(parts)
def main():
    """Generate the chart and write it to ``OUTPUT``, creating the directory if needed."""
    svg = generate_svg()
    OUTPUT.parent.mkdir(parents=True, exist_ok=True)
    # The document declares encoding="UTF-8" in its XML header, so write it as
    # UTF-8 explicitly rather than with the platform's default encoding.
    with open(OUTPUT, "w", encoding="utf-8") as f:
        f.write(svg)
    print(f"Chart saved to {OUTPUT}")
    print("Embed in README: ![Training Progress](branding/training_progress.svg)")


if __name__ == "__main__":
    main()
@@ -0,0 +1,370 @@
#!/usr/bin/env python3
"""
Regenerate tool-calling training data using mortdecai:0.5.0.
Uses the model-driven tool loop: sends prompts to 0.5.0, lets it decide
which tools to call, executes via RCON, and captures the full multi-turn
conversation as training data. Only keeps examples where all commands succeed.
This produces "distilled" data — the model's best outputs, validated by RCON.
"""
import json
import random
import re
import sys
import time
from pathlib import Path
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
import requests
from agent.tools.persistent_rcon import get_rcon
from agent.tools.tool_schemas import qwen3_tools_block
from agent.prompts.system_prompts import SYNTAX_RULES, RISK_GRADIENT
OUTPUT_PATH = PROJECT_ROOT / "data" / "processed" / "tool_training_v05.jsonl"
TOOLS_BLOCK = qwen3_tools_block()
# System prompt for the default ("sudo") mode. It is assembled from a fixed
# preamble describing the tool-call format and the expected final JSON shape,
# followed by the shared SYNTAX_RULES and RISK_GRADIENT prompt fragments.
# NOTE(review): the "Available:" list below does not mention
# paper.docs_lookup, although query_model_with_tools handles that tool name —
# confirm whether the omission is intentional.
SYSTEM = (
"/no_think\n"
"You are a Minecraft 1.21 command translator for a Paper server.\n"
"Plugins: FastAsyncWorldEdit, WorldGuard, CoreProtect, EssentialsX, Vault, LuckPerms.\n\n"
"You have tools. To call one:\n"
"<tool_call>\n{\"name\": \"tool_name\", \"arguments\": {...}}\n</tool_call>\n\n"
"Available: rcon.execute, minecraft.wiki_lookup, plugin.docs_lookup, "
"minecraft.changelog_lookup, world.player_info, world.server_state, "
"world.nearby_entities, memory.read, memory.write, "
"script.write, script.validate, script.execute, script.read, script.list, "
"script.delete, script.schedule.\n\n"
"After tool calls, respond with JSON:\n"
"{\"risk_level\": <0-5>, \"commands\": [...], \"reasoning\": \"...\"}\n\n"
"PERMISSION LEVEL: 4 (generous).\n" + SYNTAX_RULES + RISK_GRADIENT
)
# System prompt selected by query_model_with_tools when mode == "god". Unlike
# SYSTEM, its JSON shape includes a "message" field and it appends TOOLS_BLOCK
# (the output of qwen3_tools_block()) instead of a hand-written tool list.
SYSTEM_GOD = (
"/no_think\n"
"You are God in a Minecraft server with full tool access.\n"
"Return JSON: {\"risk_level\": <0-5>, \"message\": \"...\", \"commands\": [...], \"reasoning\": \"...\"}\n\n"
+ SYNTAX_RULES + "\n" + TOOLS_BLOCK
)
# Player names; main() picks one at random for each prompt and it is sent to
# the model as the "Player <name>: <prompt>" user message.
PLAYERS = ["slingshooter08", "Ace13245", "TheBigBoss", "xXDragonSlayerXx"]
# Prompt set, keyed by category. main() iterates every category, and the
# category name ends up in each kept example's "type" ("tool_<category>") and
# in its metadata. Prompts that start with "pray " are run in "god" mode; all
# others run in "sudo" mode.
PROMPTS = {
"basic_commands": [
"sudo give me a diamond sword",
"sudo give me 64 golden apples",
"sudo give me a stack of oak planks",
"sudo give me an elytra",
"sudo give me a spyglass",
"sudo give me a recovery compass",
"sudo give me a bundle",
"sudo set time to noon",
"sudo set time to midnight",
"sudo clear weather for a week",
"sudo make it thunder",
"sudo kill all hostile mobs",
"sudo kill all items on the ground",
"sudo gamemode creative",
"sudo gamemode survival",
"sudo gamemode spectator",
],
"enchanted_gear": [
"sudo give me a diamond sword with sharpness 5, unbreaking 3, mending, and looting 3",
"sudo give me a netherite pickaxe with efficiency 5, fortune 3, unbreaking 3, mending",
"sudo give me a bow with power 5, infinity, flame, punch 2",
"sudo full netherite armor with protection 4, unbreaking 3, mending on every piece",
"sudo give me boots with feather falling 4, depth strider 3, soul speed 3",
"sudo give me a trident with loyalty 3 and channeling",
"sudo give me a trident with riptide 3",
"sudo give me a crossbow with multishot and quick charge 3",
"sudo give me a mace with density 5 and wind burst 3",
"sudo best fishing rod possible",
"sudo give me a shield with unbreaking 3 and mending",
],
"effects": [
"sudo give me speed 3 for 10 minutes",
"sudo night vision permanently",
"sudo make me invisible for 5 minutes",
"sudo give me fire resistance for an hour",
"sudo give everyone online regeneration 2",
"sudo give me haste 2 for 10 minutes",
"sudo slow falling for 60 seconds",
"sudo give me water breathing forever",
"sudo give me strength 2 and resistance 2 for 5 minutes",
"sudo clear all my effects",
],
"teleport_position": [
"sudo tp me to 0 100 0",
"sudo tp me to the nether",
"sudo tp everyone to spawn",
"sudo teleport me 100 blocks north",
"sudo tp me up 50 blocks",
"sudo set my spawn point here",
],
"building": [
"sudo fill a 10x10 platform of stone under me",
"sudo place a beacon at my location",
"sudo build a small cobblestone room around me",
"sudo fill the area below me with water",
"sudo make a glass dome over me",
"sudo place 4 lanterns around me",
"sudo clear a 20 block area above me",
],
"entities": [
"sudo summon a horse with a saddle",
"sudo summon 5 cows near me",
"sudo summon a villager",
"sudo spawn an iron golem",
"sudo summon a warden 20 blocks away",
"sudo summon a wither",
"sudo kill all zombies within 50 blocks",
"sudo kill all creepers near me",
],
"worldguard": [
"sudo create a region called my-base and set pvp deny",
"sudo prevent mob spawning in the spawn region",
"sudo set a greeting message for spawn: Welcome to the server!",
"sudo deny entry to non-members in the vault region",
"sudo list all regions",
"sudo allow TNT in the arena",
"sudo prevent fire spread globally",
"sudo make a healing zone at spawn",
],
"coreprotect": [
"sudo enable block inspector",
"sudo rollback the last hour of changes",
"sudo rollback what TheBigBoss did in the last 30 minutes",
"sudo lookup who placed blocks near me today",
"sudo rollback TNT damage from the last 2 hours",
"sudo check coreprotect status",
"sudo restore what was rolled back",
],
"essentialsx": [
"sudo set my home here",
"sudo create a warp called arena",
"sudo give Ace13245 1000 coins",
"sudo check my balance",
"sudo heal me",
"sudo feed me",
"sudo repair my held item",
"sudo set my nickname to DragonLord",
"sudo broadcast Welcome to the server!",
"sudo god mode on",
"sudo fly mode on",
],
"luckperms": [
"sudo create a VIP group",
"sudo add Ace13245 to VIP",
"sudo give VIP permission to fly",
"sudo give me temporary VIP for 24 hours",
"sudo set VIP prefix to gold [VIP]",
"sudo list all permission groups",
"sudo create a builder group with worldedit access",
],
"fawe": [
"sudo make a glass sphere radius 8",
"sudo hollow stone sphere radius 10",
"sudo cylinder of quartz 5 wide 12 tall",
"sudo replace all stone with deepslate in selection",
"sudo smooth the terrain 5 iterations",
"sudo drain water within 20 blocks",
"sudo sandstone pyramid 8 tall",
"sudo undo my last worldedit operation",
],
# The only category whose prompts start with "pray " (god mode).
"god_prayers": [
"pray oh great one, bless me with diamonds",
"pray lord, protect me from the monsters of the night",
"pray I offer this sacrifice of 64 wheat, grant me your favor",
"pray god please make it stop raining",
"pray smite the wicked TheBigBoss for griefing my base",
"pray heal me, I am near death",
"pray give me the strength to slay the ender dragon",
"pray I am lost in a cave, guide me to the surface",
],
# NOTE(review): presumably prompts whose naive translation tends to produce
# invalid commands (ambiguous item names, missing arguments) — confirm intent.
"error_prone": [
"sudo give me a bed",
"sudo give me steak",
"sudo give me cooked beef",
"sudo effect give me speed",
"sudo give me a log",
"sudo fill with stone 10",
"sudo tp me to spawn",
"sudo give @s diamond 1",
],
"complex_multi": [
"sudo gear me up for the nether: armor, weapons, food, fire resistance",
"sudo prepare me for the end fight: bow, arrows, blocks, pearls, slow falling",
"sudo set up a new player kit: stone tools, food, bed, torches",
"sudo create a mob farm: platform, water channels, collection hopper",
],
}
def query_model_with_tools(prompt, player, ollama_url, model, rcon, mode="sudo", max_steps=6):
    """Send a prompt to the model, run the tools it calls, and capture the chain.

    The conversation starts with a system prompt (``SYSTEM_GOD`` when *mode*
    is ``"god"``, otherwise ``SYSTEM``) and a ``"Player <player>: <prompt>"``
    user message. On every step the model's reply is scanned for
    ``<tool_call>`` blocks. Each call is executed, and both the call and its
    result are appended to the conversation. A reply without tool calls is
    treated as the final answer and must be a JSON object.

    Args:
        prompt: The player's request text.
        player: Player name used in the user message.
        ollama_url: Base URL of the Ollama server (``/api/chat`` is appended).
        model: Model name passed to Ollama.
        rcon: Object whose ``command(cmd)`` method runs a server command.
        mode: ``"god"`` selects ``SYSTEM_GOD``; anything else selects ``SYSTEM``.
        max_steps: Maximum number of model round-trips.

    Returns:
        A dict with the keys ``messages``, ``commands``, ``message``,
        ``reasoning``, ``tool_trace`` and ``rcon_results``, or ``None`` when
        the request fails, the final reply is not a JSON object, or the step
        budget runs out.
    """
    system = SYSTEM_GOD if mode == "god" else SYSTEM
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": f"Player {player}: {prompt}"},
    ]
    tool_trace = []
    all_rcon_results = []

    for step in range(max_steps):
        try:
            r = requests.post(f"{ollama_url}/api/chat", json={
                "model": model,
                "messages": messages,
                "stream": False,
                "options": {"temperature": 0.2, "num_predict": 800},
            }, timeout=90)
            raw = r.json()["message"]["content"]
        except Exception:
            # Deliberately best-effort: any transport or payload problem just
            # means this prompt yields no example.
            return None
        if not isinstance(raw, str):
            # A missing or null content field would otherwise crash re.sub.
            return None

        # Drop any reasoning block the model emitted despite /no_think.
        raw = re.sub(r'<think>[\s\S]*?</think>\s*', '', raw)

        # Check for tool calls
        tool_matches = re.findall(r'<tool_call>\s*(\{.*?\})\s*</tool_call>', raw, re.DOTALL)

        if not tool_matches:
            # Final response
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                return None
            if not isinstance(parsed, dict):
                # Valid JSON that is not an object (list, string, number...)
                # has no fields to read; treat it like an unparseable reply.
                return None
            return {
                "messages": messages + [{"role": "assistant", "content": raw}],
                "commands": parsed.get("commands", []),
                "message": parsed.get("message", ""),
                "reasoning": parsed.get("reasoning", ""),
                "tool_trace": tool_trace,
                "rcon_results": all_rcon_results,
            }

        for tc_json in tool_matches:
            try:
                tc = json.loads(tc_json)
            except json.JSONDecodeError:
                continue
            if not isinstance(tc, dict):
                continue
            tool_name = tc.get("name", "")
            tool_args = tc.get("arguments", {})
            if not isinstance(tool_args, dict):
                # Malformed call (e.g. "arguments": null); skip it the same
                # way as a call whose JSON does not parse.
                continue

            # Execute tool
            if tool_name == "rcon.execute":
                cmd = tool_args.get("command", "")
                try:
                    result_text = rcon.command(cmd)
                    # Heuristic: these substrings mark a rejected command.
                    is_err = any(e in result_text for e in ("<--[HERE]", "Unknown", "Incorrect", "Expected"))
                    result = {"success": not is_err, "result": result_text[:300]}
                    all_rcon_results.append({"cmd": cmd, "ok": not is_err, "result": result_text[:200]})
                except Exception as e:
                    result = {"success": False, "result": str(e)}
                    all_rcon_results.append({"cmd": cmd, "ok": False, "result": str(e)})
            elif tool_name in (
                "minecraft.wiki_lookup",
                "plugin.docs_lookup",
                "minecraft.changelog_lookup",
                "paper.docs_lookup",
            ):
                try:
                    # Imported lazily so a missing or broken knowledge module
                    # degrades to the fallback result instead of failing.
                    from agent.tools.knowledge_lookup import handle_knowledge_tool
                    result = handle_knowledge_tool(tool_name, tool_args)
                except Exception:
                    unavailable = (
                        "Wiki unavailable"
                        if tool_name == "minecraft.wiki_lookup"
                        else "Docs unavailable"
                    )
                    result = {"content": unavailable, "url": "", "ok": False}
            else:
                # Every other tool is not executed; the model gets a stub result.
                result = {"ok": True, "result": "simulated"}

            tool_trace.append({"tool": tool_name, "args": str(tool_args)[:100], "step": step})
            messages.append({"role": "assistant", "content": f"<tool_call>\n{json.dumps(tc)}\n</tool_call>"})
            messages.append({"role": "tool", "content": json.dumps(result)[:2000]})

    return None  # Ran out of steps
def main():
    """Run every prompt through the model and write the kept examples as JSONL.

    Each prompt in ``PROMPTS`` is sent through ``query_model_with_tools`` with
    a randomly chosen player. Prompts starting with ``"pray "`` use god mode.
    An example is kept when every RCON command succeeded, or when at least one
    succeeded and the success ratio is at least 70%. Results with no RCON
    commands and no tool calls are counted as empty. Kept examples are written
    to ``OUTPUT_PATH``, one JSON object per line.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--model", default="mortdecai:0.5.0")
    parser.add_argument("--rcon-host", default="192.168.0.244")
    parser.add_argument("--rcon-port", type=int, default=25578)
    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
    args = parser.parse_args()

    rcon = get_rcon(args.rcon_host, args.rcon_port, args.rcon_pass)
    print(f"Regenerating tool data with {args.model}")
    print(f"RCON: {args.rcon_host}:{args.rcon_port}")

    all_examples = []
    # "no_response" counts both failed queries and results with nothing to judge.
    stats = {"total": 0, "kept": 0, "failed": 0, "no_response": 0}

    for category, prompts in PROMPTS.items():
        print(f"\n── {category} ({len(prompts)} prompts) ──")
        for prompt in prompts:
            player = random.choice(PLAYERS)
            mode = "god" if prompt.startswith("pray ") else "sudo"
            result = query_model_with_tools(prompt, player, args.ollama_url, args.model, rcon, mode)
            stats["total"] += 1

            if not result:
                stats["no_response"] += 1
                print(f" SKIP: {prompt[:50]} (no response)")
                continue

            rcon_ok = sum(1 for r in result["rcon_results"] if r["ok"])
            rcon_total = len(result["rcon_results"])
            tools_used = len(result["tool_trace"])

            if rcon_total == 0 and tools_used == 0:
                stats["no_response"] += 1
                print(f" SKIP: {prompt[:50]} (empty)")
                continue

            all_success = rcon_total > 0 and all(r["ok"] for r in result["rcon_results"])
            if all_success or (rcon_ok > 0 and rcon_ok >= rcon_total * 0.7):
                stats["kept"] += 1
                example = {
                    "id": f"v05-regen-{stats['total']:04d}",
                    "source": "model_distillation_v05",
                    "type": f"tool_{category}",
                    "messages": result["messages"],
                    "metadata": {
                        "model": args.model,
                        "category": category,
                        "tools_used": tools_used,
                        "rcon_total": rcon_total,
                        "rcon_success": rcon_ok,
                        "all_success": all_success,
                    },
                }
                all_examples.append(example)
                print(f" KEPT: {prompt[:50]} ({rcon_ok}/{rcon_total} cmds, {tools_used} tools)")
            else:
                stats["failed"] += 1
                print(f" FAIL: {prompt[:50]} ({rcon_ok}/{rcon_total} cmds)")

            # Short pause between prompts.
            time.sleep(0.2)

    print(f"\n{'='*60}")
    print(f"Total: {stats['total']}, Kept: {stats['kept']}, Failed: {stats['failed']}, Empty: {stats['no_response']}")
    print(f"Quality: {100*stats['kept']//max(stats['total'],1)}%")

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly instead of relying on the locale default.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        for ex in all_examples:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
    print(f"Written to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()