Mortdecai/eval/bakeoff.py

#!/usr/bin/env python3
"""
Model Bake-Off: Compare models on seed dataset without RCON dependency.

Tests pure LLM command generation quality by sending each seed example
through multiple models on the same Ollama instance and scoring results.

Usage:
    python3 eval/bakeoff.py
    python3 eval/bakeoff.py --ollama-url http://192.168.0.179:11434
    python3 eval/bakeoff.py --models qwen3-coder:30b gemma3n:e4b
"""

import argparse
import json
import re
import sys
import time
from pathlib import Path

import requests

ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

from agent.prompts.system_prompts import get_prompt
from agent.guardrails.command_filter import validate_command

DATASET = ROOT / "data" / "processed" / "seed_dataset.jsonl"
RESULTS_DIR = ROOT / "eval" / "results"


def ollama_chat(model: str, messages: list, ollama_url: str,
                temperature: float = 0.2, max_tokens: int = 1500,
                no_think: bool = False) -> dict:
    """Call Ollama and return response + timing."""
    payload = {
        "model": model,
        "messages": messages,
        "stream": False,
        "format": "json",
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens,
        },
    }
    if no_think:
        # Prepend /no_think to the last user message to disable thinking tokens
        for msg in reversed(payload["messages"]):
            if msg["role"] == "user":
                msg["content"] = "/no_think\n" + msg["content"]
                break
    start = time.time()
    r = requests.post(f"{ollama_url}/api/chat", json=payload, timeout=180)
    r.raise_for_status()
    duration_ms = int((time.time() - start) * 1000)
    data = r.json()
    return {
        "content": data["message"]["content"],
        "duration_ms": duration_ms,
        "eval_count": data.get("eval_count", 0),
        "prompt_eval_count": data.get("prompt_eval_count", 0),
    }


def parse_response(content: str) -> dict:
    """Parse LLM JSON response."""
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        cmds = re.findall(r'"(/?\w[^"]*)"', content)
        return {"commands": cmds, "message": "", "reasoning": "parse_fallback"}


def build_user_message(example: dict) -> str:
    """Build the user message from a dataset example, simulating context."""
    inp = example["input"]
    query = inp["user_message"]
    ctx = inp.get("server_context", {})

    parts = [f"Request from slingshooter08: {query}"]
    parts.append("\nContext:")
    parts.append(f"Server: {ctx.get('server_type', 'paper')} {ctx.get('version', '1.21.x')}")

    if ctx.get("online_players"):
        parts.append(f"Online: {', '.join(ctx['online_players'])}")

    pos = ctx.get("player_position")
    if pos:
        parts.append(f"Player position: ({pos['x']}, {pos['y']}, {pos['z']})")

    return "\n".join(parts)


def score_result(example: dict, actual_cmds: list, parsed: dict) -> dict:
    """Score a single result against expected output."""
    expected = example["output"]
    expected_cmds = expected.get("commands", [])
    expected_safety = expected.get("safety_flags", [])
    category = example.get("category", "?")

    # Command type match (loose: same base command used)
    cmd_match = False
    if not expected_cmds and not actual_cmds:
        cmd_match = True  # both empty = correct refusal/info
    elif expected_cmds and actual_cmds:
        expected_types = set(c.split()[0].lstrip("/") for c in expected_cmds if c)
        actual_types = set(c.split()[0].lstrip("/") for c in actual_cmds if c)
        cmd_match = len(expected_types & actual_types) > 0

    # Exact command match (strict)
    exact_match = set(c.strip() for c in expected_cmds) == set(c.strip() for c in actual_cmds)

    # Syntax quality: check for common 1.21 errors
    syntax_issues = []
    for cmd in actual_cmds:
        # Old NBT enchantment format
        if "{Enchantments:[" in cmd or "{enchantments:[" in cmd:
            syntax_issues.append("old_nbt_enchant")
        # Missing minecraft: namespace on items/effects
        if re.search(r"(give|effect give) \w+ (?!minecraft:)\w+", cmd):
            syntax_issues.append("missing_namespace")
        # Bare 'effect' without 'give'
        if re.match(r"^effect \w+ (?!give)", cmd):
            syntax_issues.append("bare_effect")
        # weather storm
        if "weather storm" in cmd:
            syntax_issues.append("weather_storm")
        # Gamemode abbreviations
        if re.search(r"gamemode [csa0-3](\s|$)", cmd):
            syntax_issues.append("gamemode_abbrev")
        r = validate_command(cmd)
        if r.get("warnings"):
            syntax_issues.extend(r["warnings"])

    syntax_ok = len(syntax_issues) == 0

    # Safety: if expected is refusal (no commands + destructive flag), did model refuse?
    safety_ok = True
    if "destructive" in expected_safety and not expected_cmds:
        safety_ok = len(actual_cmds) == 0

    # Scope: did model avoid gratuitous teleports?
    has_gratuitous_tp = False
    if category != "safety":
        query_lower = example["input"]["user_message"].lower()
        tp_words = ["tp", "teleport", "surface", "spawn"]
        if not any(w in query_lower for w in tp_words):
            for cmd in actual_cmds:
                if cmd.startswith("tp ") or "run tp " in cmd:
                    has_gratuitous_tp = True

    # Message quality (for prayer/god mode)
    has_message = bool(parsed.get("message"))

    return {
        "cmd_match": cmd_match,
        "exact_match": exact_match,
        "syntax_ok": syntax_ok,
        "syntax_issues": syntax_issues,
        "safety_ok": safety_ok,
        "has_gratuitous_tp": has_gratuitous_tp,
        "has_message": has_message,
    }


def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
    """Run all models against the dataset and compare."""
    # Load dataset
    with open(DATASET) as f:
        examples = [json.loads(line) for line in f if line.strip()]

    print(f"Bake-off: {len(examples)} examples × {len(models)} models")
    print(f"Ollama: {ollama_url}")
    print(f"Models: {', '.join(models)}")
    if no_think:
        print("Mode: /no_think (thinking tokens disabled)")
    print("=" * 70)

    all_results = {}

    for model in models:
        print(f"\n--- {model} ---")
        results = []

        # Warm up: load model
        print(f"Loading {model}...")
        try:
            warmup = ollama_chat(model, [
                {"role": "user", "content": "Say OK"},
            ], ollama_url, max_tokens=5)
            print(f"  Loaded in {warmup['duration_ms']}ms")
        except Exception as e:
            print(f"  ERROR loading {model}: {e}")
            continue

        for i, ex in enumerate(examples):
            eid = ex.get("id", f"ex-{i}")
            category = ex.get("category", "?")
            query = ex["input"]["user_message"]

            # Determine mode
            mode = "sudo"
            if query.lower().startswith("pray "):
                mode = "god"
                query_stripped = query[5:]
            else:
                query_stripped = query

            # Build prompt
            system_prompt = get_prompt(mode)
            user_msg = build_user_message(ex)
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_msg},
            ]

            # Call LLM
            try:
                resp = ollama_chat(model, messages, ollama_url, no_think=no_think)
            except Exception as e:
                print(f"  [{i+1}/{len(examples)}] ERROR: {e}")
                results.append({"id": eid, "error": str(e)})
                continue

            parsed = parse_response(resp["content"])
            actual_cmds = parsed.get("commands", [])

            # Score
            scores = score_result(ex, actual_cmds, parsed)

            status = "OK" if scores["cmd_match"] else "MISS"
            syntax_flag = "" if scores["syntax_ok"] else " [SYNTAX]"
            tp_flag = " [GRATUITIOUS-TP]" if scores["has_gratuitous_tp"] else ""
            safety_flag = "" if scores["safety_ok"] else " [SAFETY-FAIL]"

            print(f"  [{i+1}/{len(examples)}] [{status}]{syntax_flag}{tp_flag}{safety_flag} "
                  f"({category}) {query[:50]}  [{resp['duration_ms']}ms]")

            if not scores["cmd_match"]:
                expected_cmds = ex["output"].get("commands", [])
                print(f"    Expected: {expected_cmds[:2]}")
                print(f"    Got:      {actual_cmds[:2]}")

            results.append({
                "id": eid,
                "category": category,
                "query": query,
                "expected": ex["output"].get("commands", []),
                "actual": actual_cmds,
                "message": parsed.get("message", ""),
                "reasoning": parsed.get("reasoning", ""),
                "duration_ms": resp["duration_ms"],
                "eval_tokens": resp["eval_count"],
                **scores,
            })

        all_results[model] = results

    # Summary
    print("\n" + "=" * 70)
    print("BAKE-OFF SUMMARY")
    print("=" * 70)

    summary_rows = []
    for model, results in all_results.items():
        valid = [r for r in results if "error" not in r]
        n = len(valid)
        if n == 0:
            continue

        cmd_match = sum(1 for r in valid if r["cmd_match"]) / n * 100
        exact_match = sum(1 for r in valid if r["exact_match"]) / n * 100
        syntax_ok = sum(1 for r in valid if r["syntax_ok"]) / n * 100
        safety_ok = sum(1 for r in valid if r["safety_ok"]) / n * 100
        no_grat_tp = sum(1 for r in valid if not r["has_gratuitous_tp"]) / n * 100
        avg_ms = sum(r["duration_ms"] for r in valid) / n
        avg_tokens = sum(r.get("eval_tokens", 0) for r in valid) / n

        row = {
            "model": model,
            "n": n,
            "cmd_match_%": round(cmd_match, 1),
            "exact_match_%": round(exact_match, 1),
            "syntax_ok_%": round(syntax_ok, 1),
            "safety_%": round(safety_ok, 1),
            "no_gratuitous_tp_%": round(no_grat_tp, 1),
            "avg_latency_ms": int(avg_ms),
            "avg_tokens": int(avg_tokens),
        }
        summary_rows.append(row)

        print(f"\n  {model}:")
        print(f"    Command match:      {cmd_match:5.1f}%")
        print(f"    Exact match:        {exact_match:5.1f}%")
        print(f"    Syntax correct:     {syntax_ok:5.1f}%")
        print(f"    Safety compliance:  {safety_ok:5.1f}%")
        print(f"    No gratuitous tp:   {no_grat_tp:5.1f}%")
        print(f"    Avg latency:        {int(avg_ms)}ms")
        print(f"    Avg tokens/resp:    {int(avg_tokens)}")

    # Save full results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = int(time.time())
    out_path = RESULTS_DIR / f"bakeoff_{ts}.json"
    with open(out_path, "w") as f:
        json.dump({
            "timestamp": ts,
            "ollama_url": ollama_url,
            "summary": summary_rows,
            "results": {m: r for m, r in all_results.items()},
        }, f, indent=2)
    print(f"\nFull results saved to {out_path}")

    return summary_rows


def main():
    parser = argparse.ArgumentParser(description="Model Bake-Off")
    parser.add_argument("--ollama-url", default="http://192.168.0.179:11434")
    parser.add_argument("--models", nargs="+",
                        default=["qwen3-coder:30b", "gemma3n:e4b"])
    parser.add_argument("--no-think", action="store_true",
                        help="Prepend /no_think to disable thinking tokens (helps Qwen models)")
    args = parser.parse_args()

    run_bakeoff(args.models, args.ollama_url, no_think=args.no_think)


if __name__ == "__main__":
    main()