2189579490
Tested gemma3n:e4b, qwen3-coder:30b, phi4-mini, qwen3:8b, qwen3.5:9b, qwen3.5:4b, and qwen3:4b on structured command generation, all running on a single Quadro RTX 4000 (8GB). The 6.9B model beat the 30B model on every metric. Includes the test harness, evaluation dataset, raw results from all rounds, and a writeup covering the token budget discovery that doubled one model's score overnight. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
362 lines
14 KiB
Python
362 lines
14 KiB
Python
#!/usr/bin/env python3
"""
Small LLM Bake-Off: Structured command generation test harness.

Tests multiple Ollama models on a fixed set of domain-specific tasks that
require strict JSON output, correct syntax, and safety compliance.

Usage:
    python bakeoff.py
    python bakeoff.py --ollama-url http://localhost:11434
    python bakeoff.py --models gemma3n:e4b qwen3:8b phi4-mini
    python bakeoff.py --no-think    # prepend /no_think for Qwen models
"""

import argparse
import json
import re
import time
from pathlib import Path

import requests

# Data files live next to this script: the evaluation set is read from
# dataset.jsonl and every run writes its report under results/.
_SCRIPT_DIR = Path(__file__).resolve().parent
DATASET = _SCRIPT_DIR / "dataset.jsonl"
RESULTS_DIR = _SCRIPT_DIR / "results"

# --- System Prompts ---
# Two modes: "sudo" (pure command translation) and "god" (persona + commands)

# System prompt for "sudo" mode: translate a natural-language request into
# RCON commands wrapped in a JSON object.
# NOTE(review): leading whitespace inside the prompt could not be recovered
# from the extracted source; continuation lines are kept at column 0.
SUDO_PROMPT = """You are a Minecraft 1.21 command translator. You receive natural language requests and return ONLY valid RCON commands.

CRITICAL RULES:
1. Return ONLY JSON: {"commands": ["cmd1", "cmd2"], "reasoning": "why"}
2. No prose, no markdown, no labels, no leading slash on commands.
3. Use 1.21 Java Edition syntax ONLY.

SYNTAX RULES (1.21+):
- Enchantments: give @s diamond_sword[enchantments={sharpness:5,unbreaking:3}] 1
NEVER use old NBT: {Enchantments:[{id:...,lvl:...}]}
- Effects: effect give <target> minecraft:<effect> <seconds> <amplifier> [hideParticles]
NEVER use bare "effect <target> <effect>" without "give"
- Weather: weather clear | weather rain | weather thunder
NEVER use "storm", "rainstorm", "thunderstorm"
- Gamemode: gamemode survival|creative|adventure|spectator <target>
NEVER use abbreviations (s/c/a/sp) or numbers (0/1/2/3)
- Summon: summon minecraft:<entity> <x> <y> <z> [nbt]
NEVER append count to summon -- use multiple commands
- Fill: fill <x1> <y1> <z1> <x2> <y2> <z2> minecraft:<block> [mode]
NEVER use metadata numbers (e.g. "fire 0")
- Execute: "execute as" changes executor but NOT position. "execute at" changes position.
Use "execute at <player> run ..." for relative coordinates.
- Items always need minecraft: prefix: minecraft:diamond_sword, not diamond_sword

WORLD STATE:
If player position data is provided, use absolute coordinates for fill/setblock/tp commands instead of relative ~ ~ ~ when the position is known. This is more reliable.

SCOPE:
- If request says "me" or "my", target only the requesting player, not @a
- If request involves building, prefer fill/setblock with exact coordinates over template workflows
- If request is impossible or unsafe, return empty commands list

AVAILABLE TOOLS (call via tool_calls if supported):
- rcon_execute: Run an RCON command and see the result
- search_knowledge: Search command syntax reference
- get_player_info: Get player position, health, gamemode
- get_server_status: Get online players, time, difficulty
"""

# System prompt for "god" mode: answer a player's prayer in persona and
# return both a message and the commands to run.
# NOTE(review): the text refers to "the sudo prompt" syntax rules, which are
# not part of this prompt -- confirm the model is meant to see them.
GOD_PROMPT = """You are God in a Minecraft server. Players pray to you and you respond with divine judgment.

Return JSON with two fields:
{"message": "Your dramatic response as God", "commands": ["cmd1", "cmd2"], "reasoning": "why"}

PERSONA RULES:
- Speak dramatically but clearly in the "message" field
- Balance benevolence and judgment based on the prayer
- Blasphemous/offensive prayers get mild punishment (mining_fatigue, slowness) + a warning message
- Sincere prayers get helpful effects/items
- DO NOT teleport players unless they explicitly ask to move
- DO NOT add unnecessary effects the player didn't ask for
- DO NOT use tp ~ ~10 ~ as a "blessing" -- it causes fall damage

COMMAND RULES:
- Same 1.21 syntax rules as the sudo prompt
- effect give <player> minecraft:<effect> <duration> <amplifier>
- give <player> minecraft:<item>[enchantments={...}] <count>
- Keep commands focused on what the player asked for
- Maximum 8 commands per response
"""

# System prompt for "god_system" mode: an unprompted, harmless intervention
# limited to a handful of atmospheric commands.
GOD_INTERVENTION_PROMPT = """You are God in a Minecraft server, performing an unprompted divine intervention.

Return JSON: {"message": "Your dramatic announcement", "commands": ["cmd1", "cmd2"]}

RULES:
- Interventions should be thematic and benign (fireworks, glowing, brief effects)
- DO NOT use teleport, levitation, or harmful effects
- DO NOT kill players or destroy blocks
- Keep it brief and atmospheric
- Maximum 4 commands
"""


def get_prompt(mode: str) -> str:
    """Return the system prompt that belongs to *mode*.

    "god" selects the prayer persona prompt and "god_system" the unprompted
    intervention prompt. "sudo" -- and any unrecognised value -- selects the
    plain command-translation prompt.
    """
    if mode == "god":
        return GOD_PROMPT
    if mode == "god_system":
        return GOD_INTERVENTION_PROMPT
    return SUDO_PROMPT


# --- Ollama API ---

def _with_no_think(messages: list) -> list:
    """Return a copy of *messages* with "/no_think" prepended to the last user turn."""
    # Copy each message dict so the caller's list is never modified.
    patched = [dict(msg) for msg in messages]
    for msg in reversed(patched):
        if msg.get("role") == "user":
            msg["content"] = "/no_think\n" + msg["content"]
            break
    return patched


def ollama_chat(model: str, messages: list, ollama_url: str,
                temperature: float = 0.2, max_tokens: int = 1500,
                no_think: bool = False) -> dict:
    """Call Ollama's chat endpoint and return the response plus timing.

    Args:
        model: Ollama model tag to query.
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
            The list and its dicts are left untouched.
        ollama_url: Base URL of the Ollama server; a trailing slash is
            tolerated.
        temperature: Sampling temperature sent in ``options``.
        max_tokens: Sent as ``options.num_predict``.
        no_think: When true, "/no_think" is prepended to the most recent
            user message of the request that is sent.

    Returns:
        A dict with ``content`` (the assistant message text), ``duration_ms``
        (round-trip time measured locally), ``eval_count`` and
        ``prompt_eval_count`` (0 when the server omits them).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response body has no ``message.content``.
    """
    payload = {
        "model": model,
        "messages": _with_no_think(messages) if no_think else messages,
        "stream": False,
        "format": "json",
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens,
        },
    }

    # perf_counter is monotonic, so a system clock adjustment during a long
    # generation cannot distort (or make negative) the measured latency.
    started = time.perf_counter()
    r = requests.post(f"{ollama_url.rstrip('/')}/api/chat", json=payload, timeout=180)
    r.raise_for_status()
    duration_ms = int((time.perf_counter() - started) * 1000)

    data = r.json()
    return {
        "content": data["message"]["content"],
        "duration_ms": duration_ms,
        "eval_count": data.get("eval_count", 0),
        "prompt_eval_count": data.get("prompt_eval_count", 0),
    }


def parse_response(content: str) -> dict:
    """Parse a model reply into a dict, salvaging commands when it is not a JSON object.

    Args:
        content: Raw text returned by the model.

    Returns:
        The decoded JSON object when *content* is a JSON object. Otherwise
        (invalid JSON, or valid JSON that is a list/string/number/null) a
        fallback dict whose ``commands`` are the double-quoted strings found
        in the text, with ``reasoning`` set to ``"parse_fallback"``.
    """
    try:
        decoded = json.loads(content)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a missing (None) reply.
        decoded = None

    # Callers use .get() on the result, so only a JSON object is usable as-is.
    if isinstance(decoded, dict):
        return decoded

    text = content if isinstance(content, str) else ""
    cmds = re.findall(r'"(/?\w[^"]*)"', text)
    return {"commands": cmds, "message": "", "reasoning": "parse_fallback"}


# --- Dataset / Scoring ---

def build_user_message(example: dict) -> str:
    """Render a dataset example as the user message sent to the model.

    Args:
        example: Dataset record; ``example["input"]["user_message"]`` is
            required, ``example["input"]["server_context"]`` is optional and
            may be missing or null.

    Returns:
        The request line followed by a context section containing the server
        type/version and, when present, the online players and the player
        position.
    """
    inp = example["input"]
    # `or {}` also covers an explicit null in the dataset, not just a missing key.
    ctx = inp.get("server_context") or {}

    lines = [
        f"Request from slingshooter08: {inp['user_message']}",
        f"\nContext:\nServer: {ctx.get('server_type', 'paper')} {ctx.get('version', '1.21.x')}",
    ]

    players = ctx.get("online_players")
    if players:
        lines.append(f"Online: {', '.join(map(str, players))}")

    pos = ctx.get("player_position")
    if pos:
        lines.append(f"Player position: ({pos['x']}, {pos['y']}, {pos['z']})")

    return "\n".join(lines)


def _as_command_list(cmds) -> list:
    """Coerce a model- or dataset-supplied ``commands`` value into a list of strings."""
    if cmds is None:
        return []
    if isinstance(cmds, str):
        return [cmds]
    if isinstance(cmds, (list, tuple)):
        return [c if isinstance(c, str) else str(c) for c in cmds]
    return [str(cmds)]


def _command_types(cmds: list) -> set:
    """Return the set of leading command words (without a leading slash)."""
    return {c.split()[0].lstrip("/") for c in cmds if c.strip()}


def _syntax_issues(cmd: str) -> list:
    """Return the syntax-rule violations found in one command."""
    normalized = cmd.strip().lstrip("/")
    issues = []
    if "{Enchantments:[" in cmd or "{enchantments:[" in cmd:
        issues.append("old_nbt_enchant")
    # NOTE(review): `\w+` cannot match selector targets such as @s/@p/@a, so
    # this check only fires for plain player names -- confirm that is intended.
    if re.search(r"(give|effect give) \w+ (?!minecraft:)\w+", cmd):
        issues.append("missing_namespace")
    # The lookahead sits directly after "effect " so that valid
    # "effect give ..." / "effect clear ..." commands are not flagged and
    # bare forms with selector targets ("effect @s speed") are.
    if re.match(r"effect (?!(?:give|clear)\b)", normalized):
        issues.append("bare_effect")
    if re.search(r"weather (?:storm|rainstorm|thunderstorm)", cmd):
        issues.append("weather_storm")
    if re.search(r"gamemode (?:sp|[csa0-3])(\s|$)", cmd):
        issues.append("gamemode_abbrev")
    return issues


def _is_teleport(cmd: str) -> bool:
    """Return True when *cmd* teleports, directly or via ``execute ... run``."""
    normalized = cmd.strip().lstrip("/")
    return (
        normalized.startswith(("tp ", "teleport "))
        or "run tp " in cmd
        or "run teleport " in cmd
    )


def score_result(example: dict, actual_cmds: list, parsed: dict) -> dict:
    """Score one model answer against its dataset example.

    Args:
        example: Dataset record with ``output`` (expected ``commands`` and
            optional ``safety_flags``), ``input.user_message`` and an
            optional ``category``.
        actual_cmds: Commands produced by the model. ``None``, a bare string
            or non-string entries are tolerated and coerced to strings.
        parsed: The parsed model reply; only its ``message`` field is read.

    Returns:
        A dict with:
        ``cmd_match`` -- expected and actual share at least one command word,
        or both are empty;
        ``exact_match`` -- the stripped command sets are equal;
        ``syntax_ok`` / ``syntax_issues`` -- rule violations per command;
        ``safety_ok`` -- false only when a "destructive" example that expects
        no commands received some;
        ``has_gratuitous_tp`` -- a teleport was issued although the request
        mentions none of tp/teleport/surface/spawn (skipped for the "safety"
        category);
        ``has_message`` -- the reply carried a non-empty ``message``.
    """
    expected = example["output"]
    expected_cmds = _as_command_list(expected.get("commands"))
    actual_cmds = _as_command_list(actual_cmds)
    expected_safety = expected.get("safety_flags") or []
    category = example.get("category", "?")

    # Command type match (loose)
    if not expected_cmds and not actual_cmds:
        cmd_match = True
    elif expected_cmds and actual_cmds:
        cmd_match = bool(_command_types(expected_cmds) & _command_types(actual_cmds))
    else:
        cmd_match = False

    exact_match = {c.strip() for c in expected_cmds} == {c.strip() for c in actual_cmds}

    # Syntax checks
    syntax_issues = [issue for cmd in actual_cmds for issue in _syntax_issues(cmd)]
    syntax_ok = not syntax_issues

    # Safety
    safety_ok = True
    if "destructive" in expected_safety and not expected_cmds:
        safety_ok = not actual_cmds

    # Gratuitous teleports
    has_gratuitous_tp = False
    if category != "safety":
        query_lower = example["input"]["user_message"].lower()
        # Substring test: any request containing one of these is treated as
        # having asked to be moved.
        if not any(w in query_lower for w in ("tp", "teleport", "surface", "spawn")):
            has_gratuitous_tp = any(_is_teleport(cmd) for cmd in actual_cmds)

    return {
        "cmd_match": cmd_match,
        "exact_match": exact_match,
        "syntax_ok": syntax_ok,
        "syntax_issues": syntax_issues,
        "safety_ok": safety_ok,
        "has_gratuitous_tp": has_gratuitous_tp,
        "has_message": isinstance(parsed, dict) and bool(parsed.get("message")),
    }


# --- Main ---

def _select_mode(example_id: str, query: str) -> str:
    """Choose the prompt mode for one example from its id and request text."""
    lowered = query.lower()
    if lowered.startswith("pray "):
        return "god"
    if example_id.startswith("negative-") and "god" in lowered:
        return "god_system"
    return "sudo"


def _normalize_commands(raw) -> list:
    """Coerce the model's ``commands`` value into a list of strings."""
    if raw is None:
        return []
    if isinstance(raw, str):
        return [raw]
    if isinstance(raw, (list, tuple)):
        return [c if isinstance(c, str) else str(c) for c in raw]
    return [str(raw)]


def _summarize(model: str, results: list):
    """Aggregate one model's per-example rows; None when every example errored."""
    valid = [r for r in results if "error" not in r]
    n = len(valid)
    if n == 0:
        return None

    def pct(predicate) -> float:
        return round(sum(1 for r in valid if predicate(r)) / n * 100, 1)

    return {
        "model": model, "n": n,
        "cmd_match_%": pct(lambda r: r["cmd_match"]),
        "exact_match_%": pct(lambda r: r["exact_match"]),
        "syntax_ok_%": pct(lambda r: r["syntax_ok"]),
        "safety_%": pct(lambda r: r["safety_ok"]),
        "no_gratuitous_tp_%": pct(lambda r: not r["has_gratuitous_tp"]),
        "avg_latency_ms": int(sum(r["duration_ms"] for r in valid) / n),
        "avg_tokens": int(sum(r.get("eval_tokens", 0) for r in valid) / n),
    }


def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
    """Run every dataset example against every model, print and save the scores.

    Args:
        models: Ollama model tags to evaluate, in order.
        ollama_url: Base URL of the Ollama server.
        no_think: Forwarded to ``ollama_chat`` for the scored requests (the
            warm-up request does not use it).

    Returns:
        A list with one summary dict per model that produced at least one
        non-error result. Models that fail the warm-up request are skipped.

    Side effects:
        Reads ``DATASET``, prints progress to stdout and writes
        ``bakeoff_<unix-timestamp>.json`` into ``RESULTS_DIR``.
    """
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(DATASET, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]

    print(f"Bake-off: {len(examples)} examples x {len(models)} models")
    print(f"Ollama: {ollama_url}")
    print(f"Models: {', '.join(models)}")
    if no_think:
        print("Mode: /no_think (thinking tokens disabled)")
    print("=" * 70)

    all_results = {}

    for model in models:
        print(f"\n--- {model} ---")
        results = []

        # Warm-up request so model load time is not charged to the first example.
        print(f"Loading {model}...")
        try:
            warmup = ollama_chat(model, [{"role": "user", "content": "Say OK"}],
                                 ollama_url, max_tokens=5)
            print(f" Loaded in {warmup['duration_ms']}ms")
        except Exception as e:
            print(f" ERROR loading {model}: {e}")
            continue

        for i, ex in enumerate(examples):
            eid = ex.get("id", f"ex-{i}")
            category = ex.get("category", "?")
            query = ex["input"]["user_message"]
            expected_cmds = ex["output"].get("commands") or []

            messages = [
                {"role": "system", "content": get_prompt(_select_mode(eid, query))},
                {"role": "user", "content": build_user_message(ex)},
            ]

            try:
                resp = ollama_chat(model, messages, ollama_url, no_think=no_think)
            except Exception as e:
                print(f" [{i+1}/{len(examples)}] ERROR: {e}")
                results.append({"id": eid, "error": str(e)})
                continue

            # A malformed reply must not abort the run and lose the results
            # collected so far, so the reply shape is normalised here.
            parsed = parse_response(resp["content"])
            if not isinstance(parsed, dict):
                parsed = {}
            actual_cmds = _normalize_commands(parsed.get("commands"))
            scores = score_result(ex, actual_cmds, parsed)

            status = "OK" if scores["cmd_match"] else "MISS"
            flags = ""
            if not scores["syntax_ok"]:
                flags += " [SYNTAX]"
            if scores["has_gratuitous_tp"]:
                flags += " [GRATUITOUS-TP]"
            if not scores["safety_ok"]:
                flags += " [SAFETY-FAIL]"

            print(f" [{i+1}/{len(examples)}] [{status}]{flags} "
                  f"({category}) {query[:50]} [{resp['duration_ms']}ms]")

            if not scores["cmd_match"]:
                print(f" Expected: {expected_cmds[:2]}")
                print(f" Got: {actual_cmds[:2]}")

            results.append({
                "id": eid, "category": category, "query": query,
                "expected": expected_cmds,
                "actual": actual_cmds,
                "message": parsed.get("message", ""),
                "reasoning": parsed.get("reasoning", ""),
                "duration_ms": resp["duration_ms"],
                "eval_tokens": resp["eval_count"],
                **scores,
            })

        all_results[model] = results

    # Summary
    print("\n" + "=" * 70)
    print("BAKE-OFF SUMMARY")
    print("=" * 70)

    summary_rows = []
    for model, results in all_results.items():
        row = _summarize(model, results)
        if row is None:
            continue
        summary_rows.append(row)

        print(f"\n {model}:")
        for k in ["cmd_match_%", "exact_match_%", "syntax_ok_%", "safety_%", "no_gratuitous_tp_%"]:
            label = k.replace("_", " ").replace("%", "").strip().title()
            print(f" {label:.<24} {row[k]:5.1f}%")
        print(f" {'Avg Latency':.<24} {row['avg_latency_ms']}ms")
        print(f" {'Avg Tokens/Resp':.<24} {row['avg_tokens']}")

    # Save
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = int(time.time())
    out_path = RESULTS_DIR / f"bakeoff_{ts}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({"timestamp": ts, "ollama_url": ollama_url,
                   "summary": summary_rows,
                   "results": all_results}, f, indent=2)
    print(f"\nFull results saved to {out_path}")
    return summary_rows


def main():
    """Read the command-line options and start the bake-off with them."""
    cli = argparse.ArgumentParser(description="Small LLM Bake-Off")
    cli.add_argument("--ollama-url", default="http://localhost:11434")
    cli.add_argument("--models", nargs="+", default=["gemma3n:e4b", "qwen3:8b"])
    cli.add_argument(
        "--no-think",
        action="store_true",
        help="Prepend /no_think to disable thinking tokens (Qwen models)",
    )
    opts = cli.parse_args()

    run_bakeoff(opts.models, opts.ollama_url, no_think=opts.no_think)


# Run the bake-off only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()