Files
Mortdecai/eval/live_bakeoff.py
Seth 9d789d2524 Three-tier constraint model, mode-aware eval, boundary examples, playtest tooling
Eval harness:
- Mode-aware scoring: sudo=strict (exact match), pray/god=soft (category match,
  in-character, appropriate intensity)
- New metrics: cmd_category_match, appropriate_intensity, scoring_mode breakdown
- Eval defaults to steel141 (192.168.0.141) — prod GPU reserved for serving

Dataset (213 examples):
- Added 31 boundary/adversarial examples (safety edges, abstention, near-boundary)
- Updated pray example reasoning: character-driven logic, not prescriptive outputs
- Tagged pray examples with scoring_mode=soft

Playtest tooling:
- whitelist.sh: add/remove/list across all 3 servers
- FRIENDS_INVITE.md + Discord version: playtester recruitment docs
- Server addresses and implementation details for both training servers

PLAN.md:
- Three-tier constraint model documented (sudo/pray/god_system)
- Success criteria split by scoring mode
- All session decisions logged

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 15:57:01 -04:00

582 lines
21 KiB
Python

#!/usr/bin/env python3
"""
Live Bake-off: Compare two Ollama models on a real Minecraft Paper server via RCON.
Sends each test example to both models, executes the returned commands on the
live server via RCON, and scores results including a new "rcon_success" metric.
Usage:
python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b
python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b --max-examples 5
python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b --categories command_gen
"""
import argparse
import json
import re
import sys
import time
from collections import defaultdict
from pathlib import Path
import requests
from mcrcon import MCRcon
# Repository root: the parent of the directory that contains this file.
ROOT = Path(__file__).resolve().parent.parent
# Put the repository root first on sys.path so the project-local imports just
# below resolve when this file is executed directly as a script.
sys.path.insert(0, str(ROOT))
from agent.prompts.system_prompts import get_prompt
from eval.harness import score_result, build_user_message, parse_response, determine_mode, ollama_chat
# JSONL dataset read by run_live_bakeoff (one JSON object per non-blank line).
DATASET = ROOT / "data" / "processed" / "seed_dataset.jsonl"
# Directory that main() creates (if needed) and writes the result JSON into.
RESULTS_DIR = ROOT / "eval" / "results"
# Regular-expression fragments; a server reply containing any of them is
# treated as a failed command. Order is kept stable because the list is joined
# verbatim into one alternation below.
RCON_ERROR_PATTERNS = [
    r"Unknown or incomplete command",
    r"No entity was found",
    r"Incorrect argument",
    r"Expected whitespace",
    r"Invalid or unknown",
    r"An unexpected error occurred",
    r"That position is not loaded",
    r"Could not set the block",
    r"Nothing changed",
    r"No player was found",
    r"Expected block",
    r"Expected.*but got",
    r"Unknown item",
    r"Unknown effect",
    r"Unexpected.*at position",
]

# Single case-insensitive matcher built from every fragment above.
_RCON_ERROR_ALTERNATION = "|".join(RCON_ERROR_PATTERNS)
RCON_ERROR_RE = re.compile(_RCON_ERROR_ALTERNATION, flags=re.IGNORECASE)
def rcon_execute(cmd: str, host: str, port: int, password: str) -> dict:
    """Send one command over a fresh RCON connection and classify the reply.

    Returns a dict with the keys "command", "response", "success" and "error".
    "success" is False when the reply matches RCON_ERROR_RE or when any
    exception is raised; in the exception case "response" is empty and "error"
    holds the exception text.
    """
    try:
        with MCRcon(host, password, port=port) as connection:
            reply = connection.command(cmd)
        error_hit = RCON_ERROR_RE.search(reply)
    except Exception as exc:
        # Connection problems and bad replies are reported, never raised.
        return dict(command=cmd, response="", success=False, error=str(exc))

    return dict(
        command=cmd,
        response=reply.strip(),
        success=error_hit is None,
        error=None,
    )
def rcon_execute_batch(commands: list, host: str, port: int, password: str) -> list:
    """Execute a list of commands sequentially inside one MCRcon context.

    Args:
        commands: Command strings to send, in order. A falsy value (empty list,
            None) returns an empty list without opening a connection.
        host: RCON host.
        port: RCON port.
        password: RCON password.

    Returns:
        Exactly one result dict per command, in input order, each with the keys
        "command", "response", "success" and "error". A command fails when its
        reply matches RCON_ERROR_RE or when sending it raises.
    """
    results = []
    if not commands:
        return results
    try:
        with MCRcon(host, password, port=port) as mcr:
            for cmd in commands:
                try:
                    response = mcr.command(cmd)
                    is_error = bool(RCON_ERROR_RE.search(response))
                    results.append({
                        "command": cmd,
                        "response": response.strip(),
                        "success": not is_error,
                        "error": None,
                    })
                except Exception as e:
                    # A single bad command must not abort the rest of the batch.
                    results.append({
                        "command": cmd,
                        "response": "",
                        "success": False,
                        "error": str(e),
                    })
    except Exception as e:
        # Connection-level failure (connect or context exit). Only commands
        # that have no result yet are marked as failed; re-adding every command
        # here would duplicate entries for those already executed.
        for cmd in commands[len(results):]:
            results.append({
                "command": cmd,
                "response": "",
                "success": False,
                "error": f"RCON connection failed: {e}",
            })
    return results
def rcon_reset(host: str, port: int, password: str):
    """Send "effect clear @a" over RCON as a reset between model runs.

    The reset is best-effort: a failure never raises. It is reported on stderr
    so that a run executed against un-reset server state is visible in the log
    instead of silently skewing the comparison.
    """
    try:
        with MCRcon(host, password, port=port) as mcr:
            mcr.command("effect clear @a")
    except Exception as exc:
        print(f"  WARNING: RCON reset failed: {exc}", file=sys.stderr)
def should_skip_example(example: dict) -> tuple:
    """Decide whether an example is excluded from live testing.

    An example is skipped when it expects no commands and is either in the
    "safety" category or carries the "destructive" safety flag.

    Returns a pair (should_skip, reason); reason is "" when not skipped.
    """
    expected_output = example.get("output", {})
    flags = expected_output.get("safety_flags", [])
    expects_nothing = not expected_output.get("commands", [])

    # Refusal-style safety examples have nothing to execute on the server.
    if expects_nothing and example.get("category", "") == "safety":
        return True, "safety refusal (empty commands)"

    # Same for refusals that are flagged destructive in any other category.
    if "destructive" in flags and expects_nothing:
        return True, "destructive refusal"

    return False, ""
def compute_rcon_score(rcon_results: list) -> dict:
    """Summarise per-command RCON results into success metrics.

    Returns a dict with "rcon_success" (True when nothing failed, including
    the case of no commands at all), "rcon_total", "rcon_succeeded",
    "rcon_failed" and "rcon_errors" (command/response/error of each failure).
    """
    executed = rcon_results or []
    failures = [entry for entry in executed if not entry["success"]]
    total = len(executed)
    failed_count = len(failures)

    error_details = []
    for entry in failures:
        error_details.append({
            "command": entry["command"],
            "response": entry["response"],
            "error": entry.get("error"),
        })

    return {
        # With zero commands there are zero failures, so this is True.
        "rcon_success": failed_count == 0,
        "rcon_total": total,
        "rcon_succeeded": total - failed_count,
        "rcon_failed": failed_count,
        "rcon_errors": error_details,
    }
def run_model_on_example(model: str, example: dict, ollama_url: str,
                         rcon_host: str, rcon_port: int, rcon_password: str,
                         max_tokens: int = 1500) -> dict:
    """Query one model for one example, run its commands via RCON, and score it.

    Returns {"model", "error"} when the chat call raises. Otherwise returns a
    record holding the model output, timing fields and raw RCON results,
    overlaid first with the score_result fields and then with the
    compute_rcon_score fields.
    """
    mode = determine_mode(example)
    system_turn = {"role": "system", "content": get_prompt(mode)}
    user_turn = {"role": "user", "content": build_user_message(example)}

    # Only the chat call itself is guarded; later failures propagate.
    try:
        reply = ollama_chat(model, [system_turn, user_turn], ollama_url, max_tokens=max_tokens)
    except Exception as exc:
        return {"model": model, "error": str(exc)}

    raw_text = reply["content"]
    parsed = parse_response(raw_text)
    commands = parsed.get("commands", [])

    # Static scoring against the expected output, then live execution.
    static_scores = score_result(example, commands, parsed)
    executions = rcon_execute_batch(commands, rcon_host, rcon_port, rcon_password)
    live_scores = compute_rcon_score(executions)

    record = {
        "model": model,
        "mode": mode,
        "actual_cmds": commands,
        "message": parsed.get("message", ""),
        "reasoning": parsed.get("reasoning", ""),
        "raw_content": raw_text,
        "duration_ms": reply["duration_ms"],
        "eval_tokens": reply.get("eval_count", 0),
        "done_reason": reply.get("done_reason", ""),
        "rcon_results": executions,
    }
    # Later updates win on key collisions, matching the original merge order.
    record.update(static_scores)
    record.update(live_scores)
    return record
def _print_model_result(result: dict) -> None:
    """Print the console report for one model's result on one example."""
    if "error" in result:
        print(f" ERROR: {result['error']}")
        return
    status = "OK" if result["cmd_match"] else "MISS"
    rcon = f"{result['rcon_succeeded']}/{result['rcon_total']} RCON ok"
    flags = ""
    if not result["syntax_ok"]:
        flags += " [SYNTAX]"
    if not result["rcon_success"]:
        flags += " [RCON-FAIL]"
    if result.get("hallucinated"):
        flags += " [HALLUC]"
    print(f" [{status}] {rcon}{flags} [{result['duration_ms']}ms]")
    print(f" Cmds: {result['actual_cmds'][:3]}")
    # Show at most two failures to keep the per-example output short.
    for err in result["rcon_errors"][:2]:
        print(f" RCON err: {err['command'][:50]} -> {err['response'][:60]}")


def run_live_bakeoff(models: list, ollama_url: str,
                     rcon_host: str, rcon_port: int, rcon_password: str,
                     max_examples: int = 0, categories: list = None,
                     max_tokens: int = 1500) -> dict:
    """Run the full live bake-off comparing two models.

    Args:
        models: Sequence whose first two entries are the models to compare.
        ollama_url: Base URL passed through to ollama_chat.
        rcon_host, rcon_port, rcon_password: RCON connection settings.
        max_examples: Keep only the first N examples after filtering; 0 = all.
        categories: If given, keep only examples whose "category" is listed.
        max_tokens: Token limit forwarded to each model call.

    Returns:
        {"error": ...} if the RCON connectivity check or a model warm-up fails.
        Otherwise a dict with "models", "ollama_url", "rcon_host", "rcon_port",
        "timestamp", "dataset_size", "skipped" and "results"; each result entry
        holds the example id/category/query/expected commands plus one
        sub-dict per model keyed by the model name.
    """
    # Load dataset (JSONL; explicit UTF-8 so the read does not depend on locale).
    with open(DATASET, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]

    # Filter by categories
    if categories:
        examples = [ex for ex in examples if ex.get("category") in categories]

    # Filter out skippable examples
    filtered = []
    skipped = []
    for ex in examples:
        skip, reason = should_skip_example(ex)
        if skip:
            skipped.append({"id": ex.get("id", "?"), "reason": reason})
        else:
            filtered.append(ex)
    examples = filtered

    # Limit examples
    if max_examples > 0:
        examples = examples[:max_examples]

    total = len(examples)
    model_a, model_b = models[0], models[1]
    contenders = (model_a, model_b)

    print(f"Live Bake-off: {model_a} vs {model_b}")
    print(f" Dataset: {total} examples ({len(skipped)} skipped)")
    print(f" Ollama: {ollama_url}")
    print(f" RCON: {rcon_host}:{rcon_port}")
    print("=" * 80)

    # Test RCON connectivity first
    print("Testing RCON connection...")
    test_result = rcon_execute("list", rcon_host, rcon_port, rcon_password)
    if test_result["error"]:
        print(f" RCON connection FAILED: {test_result['error']}")
        print(" Aborting live bake-off.")
        return {"error": f"RCON connection failed: {test_result['error']}"}
    print(f" RCON OK: {test_result['response']}")

    # Warm up both models so load time is not charged to the first example.
    for model in contenders:
        print(f"Loading {model}...")
        try:
            warmup = ollama_chat(model, [{"role": "user", "content": "Say OK"}],
                                 ollama_url, max_tokens=5)
            print(f" Loaded in {warmup['duration_ms']}ms")
        except Exception as e:
            print(f" ERROR loading {model}: {e}")
            return {"error": f"Failed to load {model}: {e}"}

    print("\n" + "=" * 80)
    all_results = []
    for i, ex in enumerate(examples):
        eid = ex.get("id", f"ex-{i}")
        category = ex.get("category", "?")
        query = ex["input"]["user_message"]
        print(f"\n[{i+1}/{total}] ({category}) {query[:60]}")
        print("-" * 70)

        per_model = {}
        for model in contenders:
            print(f" {model}:")
            result = run_model_on_example(
                model, ex, ollama_url, rcon_host, rcon_port, rcon_password, max_tokens
            )
            _print_model_result(result)
            per_model[model] = result
            # Wait, then reset so the next run starts from cleared effects.
            time.sleep(2)
            rcon_reset(rcon_host, rcon_port, rcon_password)

        all_results.append({
            "id": eid,
            "category": category,
            "query": query,
            "expected": ex["output"].get("commands", []),
            **per_model,
        })

    return {
        "models": [model_a, model_b],
        "ollama_url": ollama_url,
        "rcon_host": rcon_host,
        "rcon_port": rcon_port,
        "timestamp": int(time.time()),
        "dataset_size": total,
        "skipped": skipped,
        "results": all_results,
    }
def compute_model_summary(results: list, model: str) -> dict:
    """Compute aggregate metrics for a single model across all results.

    Args:
        results: Per-example entries; each may hold a sub-dict under `model`.
        model: Key of the model whose sub-dicts are aggregated.

    Returns:
        {"n": 0} when no entry has an error-free result for `model`.
        Otherwise a dict with "model", "n", "overall" (percentages, average
        latency and tokens) and "by_category" (percentages per category).
        Entries whose model result contains "error" are excluded everywhere.
    """
    valid = [r for r in results if model in r and "error" not in r[model]]
    n = len(valid)
    if n == 0:
        return {"n": 0}

    def pct(rows, key, negate=False):
        # Share of rows (in %) whose flag `key` is truthy (or falsy when
        # negate=True). A missing key counts as False, so a result that lacks
        # a score field lowers the rate instead of raising KeyError.
        hits = sum(1 for r in rows if bool(r[model].get(key, False)) != negate)
        return round(hits / len(rows) * 100, 1)

    # Per-category (rows are already error-free, so every group is non-empty).
    cats = defaultdict(list)
    for r in valid:
        cats[r["category"]].append(r)
    cat_scores = {}
    for cat, cat_rows in sorted(cats.items()):
        cat_scores[cat] = {
            "n": len(cat_rows),
            "cmd_match_%": pct(cat_rows, "cmd_match"),
            "exact_match_%": pct(cat_rows, "exact_match"),
            "syntax_ok_%": pct(cat_rows, "syntax_ok"),
            "safety_%": pct(cat_rows, "safety_ok"),
            "rcon_success_%": pct(cat_rows, "rcon_success"),
        }

    avg_latency = int(sum(r[model].get("duration_ms", 0) for r in valid) / n)
    avg_tokens = int(sum(r[model].get("eval_tokens", 0) for r in valid) / n)
    total_rcon_cmds = sum(r[model].get("rcon_total", 0) for r in valid)
    total_rcon_ok = sum(r[model].get("rcon_succeeded", 0) for r in valid)

    return {
        "model": model,
        "n": n,
        "overall": {
            "cmd_match_%": pct(valid, "cmd_match"),
            "exact_match_%": pct(valid, "exact_match"),
            "syntax_ok_%": pct(valid, "syntax_ok"),
            "safety_%": pct(valid, "safety_ok"),
            "rcon_success_%": pct(valid, "rcon_success"),
            "no_gratuitous_tp_%": pct(valid, "has_gratuitous_tp", negate=True),
            "no_hallucination_%": pct(valid, "hallucinated", negate=True),
            "empty_%": pct(valid, "is_empty"),
            # With no commands executed at all the rate is reported as 100%.
            "rcon_cmd_success_%": round(total_rcon_ok / total_rcon_cmds * 100, 1) if total_rcon_cmds > 0 else 100.0,
            "avg_latency_ms": avg_latency,
            "avg_tokens": avg_tokens,
        },
        "by_category": cat_scores,
    }
def print_comparison(bakeoff_data: dict):
    """Print a side-by-side comparison table and return both summaries.

    Args:
        bakeoff_data: Dict with "models" (exactly two names), "results" and
            optionally "timestamp", as returned by run_live_bakeoff.

    Returns:
        (summary_a, summary_b) as produced by compute_model_summary. When
        either model has no valid results only the header is printed.
    """
    models = bakeoff_data["models"]
    results = bakeoff_data["results"]
    model_a, model_b = models
    summary_a = compute_model_summary(results, model_a)
    summary_b = compute_model_summary(results, model_b)

    print("\n" + "=" * 80)
    print("LIVE BAKE-OFF RESULTS")
    print(f" {model_a} vs {model_b}")
    print(f" {summary_a['n']} examples evaluated on live server")
    ts = bakeoff_data.get("timestamp", 0)
    print(f" {time.strftime('%Y-%m-%d %H:%M', time.localtime(ts))}")
    print("=" * 80)

    if summary_a["n"] == 0 or summary_b["n"] == 0:
        print(" Insufficient results for comparison.")
        return summary_a, summary_b

    ov_a = summary_a["overall"]
    ov_b = summary_b["overall"]

    # Side-by-side overall metrics: (label, summary key, higher_is_better)
    metrics = [
        ("Command match", "cmd_match_%", True),
        ("Exact match", "exact_match_%", True),
        ("Syntax correct", "syntax_ok_%", True),
        ("Safety compliance", "safety_%", True),
        ("RCON success", "rcon_success_%", True),
        ("RCON cmd success", "rcon_cmd_success_%", True),
        ("No gratuitous tp", "no_gratuitous_tp_%", True),
        ("No hallucination", "no_hallucination_%", True),
        ("Empty responses", "empty_%", False),
        ("Avg latency (ms)", "avg_latency_ms", False),
        ("Avg tokens", "avg_tokens", False),
    ]
    hdr_a = model_a[:20]
    hdr_b = model_b[:20]
    print(f"\n {'Metric':<22} {hdr_a:>14} {hdr_b:>14} Winner")
    print(f" {'-'*22} {'-'*14} {'-'*14} {'-'*10}")
    wins = {model_a: 0, model_b: 0}
    for label, key, higher_is_better in metrics:
        val_a = ov_a.get(key, 0)
        val_b = ov_b.get(key, 0)
        # Format values
        if "%" in key:
            s_a = f"{val_a:>6.1f}%"
            s_b = f"{val_b:>6.1f}%"
        else:
            s_a = f"{val_a:>7}"
            s_b = f"{val_b:>7}"
        # Determine winner; differences below 0.5 count as a tie.
        diff = val_a - val_b
        if abs(diff) < 0.5:
            winner = "TIE"
        elif (diff > 0) == higher_is_better:
            winner = "<-"
            wins[model_a] += 1
        else:
            winner = "->"
            wins[model_b] += 1
        print(f" {label:<22} {s_a:>14} {s_b:>14} {winner}")
    print(f"\n Score: {model_a} {wins[model_a]} wins, {model_b} {wins[model_b]} wins")

    def category_cell(cat_stats: dict) -> str:
        # A missing category or missing rate renders as N/A. Formatting a
        # non-numeric placeholder with ".1f" would raise ValueError.
        rate = cat_stats.get("rcon_success_%")
        if rate is None:
            return " N/A"
        return f"{rate:>6.1f}%"

    # Per-category comparison
    by_cat_a = summary_a.get("by_category", {})
    by_cat_b = summary_b.get("by_category", {})
    all_cats = sorted(set(by_cat_a) | set(by_cat_b))
    if all_cats:
        print(f"\n Per-Category RCON Success Rate:")
        print(f" {'Category':<16} {hdr_a:>14} {hdr_b:>14}")
        print(f" {'-'*16} {'-'*14} {'-'*14}")
        for cat in all_cats:
            rcon_a = category_cell(by_cat_a.get(cat, {}))
            rcon_b = category_cell(by_cat_b.get(cat, {}))
            print(f" {cat:<16} {rcon_a:>14} {rcon_b:>14}")

    # Per-example comparison for disagreements (one model's commands all ran
    # cleanly while the other's did not).
    disagreements = [
        r for r in results
        if model_a in r and model_b in r
        and "error" not in r[model_a] and "error" not in r[model_b]
        and r[model_a]["rcon_success"] != r[model_b]["rcon_success"]
    ]
    if disagreements:
        print(f"\n RCON Disagreements ({len(disagreements)} examples):")
        print(f" {'-'*70}")
        for r in disagreements[:10]:
            rcon_a_ok = "OK" if r[model_a]["rcon_success"] else "FAIL"
            rcon_b_ok = "OK" if r[model_b]["rcon_success"] else "FAIL"
            print(f" [{r['id']}] {r['query'][:50]}")
            print(f" {model_a}: RCON {rcon_a_ok} | {model_b}: RCON {rcon_b_ok}")
    return summary_a, summary_b
def main():
    """CLI entry point: parse arguments, run the bake-off, print and save results.

    Exits with status 1 when run_live_bakeoff reports an error. On success the
    summaries and raw data are written as JSON into RESULTS_DIR.
    """
    parser = argparse.ArgumentParser(
        description="Live bake-off: compare two models on a real Minecraft server via RCON"
    )
    parser.add_argument("--models", nargs=2, default=["gemma3n:e4b", "qwen3:8b"],
                        metavar=("MODEL_A", "MODEL_B"),
                        help="Two models to compare (default: gemma3n:e4b qwen3:8b)")
    parser.add_argument("--ollama-url", default="http://192.168.0.141:11434",
                        help="Ollama API URL")
    parser.add_argument("--rcon-host", default="192.168.0.244",
                        help="RCON host (default: 192.168.0.244)")
    parser.add_argument("--rcon-port", type=int, default=25577,
                        help="RCON port (default: 25577)")
    # NOTE(review): a credential as a literal default is worth moving to an
    # environment variable or a required argument.
    parser.add_argument("--rcon-password", default="REDACTED_RCON",
                        help="RCON password")
    parser.add_argument("--max-examples", type=int, default=0,
                        help="Limit number of examples (0 = all)")
    parser.add_argument("--max-tokens", type=int, default=1500,
                        help="Max tokens per model response")
    parser.add_argument("--categories", nargs="+", default=None,
                        help="Filter to specific categories (e.g. command_gen safety)")
    args = parser.parse_args()

    # Run bake-off
    bakeoff_data = run_live_bakeoff(
        models=args.models,
        ollama_url=args.ollama_url,
        rcon_host=args.rcon_host,
        rcon_port=args.rcon_port,
        rcon_password=args.rcon_password,
        max_examples=args.max_examples,
        categories=args.categories,
        max_tokens=args.max_tokens,
    )
    if "error" in bakeoff_data:
        print(f"\nBake-off failed: {bakeoff_data['error']}")
        sys.exit(1)

    # Print comparison
    summary_a, summary_b = print_comparison(bakeoff_data)

    # Save results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = int(time.time())
    # Replace every character that is unsafe in a file name, not just ":".
    # A model name containing "/" would otherwise point into a subdirectory
    # that does not exist and the write would fail after the whole run.
    # Names that only contain ":" produce the same slug as before.
    model_a_slug = re.sub(r"[^A-Za-z0-9._-]", "_", args.models[0])
    model_b_slug = re.sub(r"[^A-Za-z0-9._-]", "_", args.models[1])
    out_path = RESULTS_DIR / f"live_bakeoff_{model_a_slug}_vs_{model_b_slug}_{ts}.json"
    save_data = {
        "summary": {
            args.models[0]: summary_a,
            args.models[1]: summary_b,
        },
        "bakeoff_data": bakeoff_data,
    }
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(save_data, f, indent=2, default=str)
    print(f"\nResults saved to {out_path}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()