Mortdecai/eval/bakeoff.py

#!/usr/bin/env python3
"""
Model Bake-Off: Compare models on seed dataset without RCON dependency.

Tests pure LLM command generation quality by sending each seed example
through multiple models on the same Ollama instance and scoring results.

Usage:
    python3 eval/bakeoff.py
    python3 eval/bakeoff.py --ollama-url http://192.168.0.179:11434
    python3 eval/bakeoff.py --models qwen3-coder:30b gemma3n:e4b
"""

import argparse
import json
import re
import sys
import time
from pathlib import Path

import requests

# Repository root: two directory levels above this file.
ROOT = Path(__file__).resolve().parent.parent
# Put the repository root first on the import path; presumably this is what
# lets the project-local `agent` imports below resolve when the file is run
# directly as a script.
sys.path.insert(0, str(ROOT))

from agent.prompts.system_prompts import get_prompt
from agent.guardrails.command_filter import validate_command

# JSONL seed dataset read by run_bakeoff().
DATASET = ROOT / "data" / "processed" / "seed_dataset.jsonl"
# Directory where run_bakeoff() writes its bakeoff_<timestamp>.json report.
RESULTS_DIR = ROOT / "eval" / "results"


def ollama_chat(model: str, messages: list, ollama_url: str,
                temperature: float = 0.2, max_tokens: int = 1500,
                no_think: bool = False) -> dict:
    """Send one non-streaming chat request to Ollama and return reply + timing.

    Args:
        model: Model name as known to the Ollama instance.
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
            The list and its dicts are never modified by this function.
        ollama_url: Base URL of the Ollama server; a trailing slash is
            tolerated.
        temperature: Sampling temperature, sent as ``options.temperature``.
        max_tokens: Generation cap, sent as ``options.num_predict``.
        no_think: If true, prefix the last user message with ``/no_think``
            to ask the model to skip thinking tokens.

    Returns:
        A dict with ``content`` (the reply text), ``duration_ms`` (wall-clock
        time of the HTTP round trip), ``eval_count`` and ``prompt_eval_count``
        (taken from the response, 0 when absent).

    Raises:
        requests.RequestException: On connection errors, timeouts (180 s) or
            a non-2xx HTTP status.
        KeyError: If the response body has no ``message.content``.
    """
    if no_think:
        # Work on copies: editing the caller's dicts in place would stack
        # another "/no_think" prefix every time the same messages are reused.
        messages = [dict(msg) for msg in messages]
        for msg in reversed(messages):
            if msg.get("role") == "user":
                msg["content"] = "/no_think\n" + msg["content"]
                break

    payload = {
        "model": model,
        "messages": messages,
        "stream": False,
        "format": "json",
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens,
        },
    }

    # Strip a trailing slash so ".../" does not produce "//api/chat".
    endpoint = f"{ollama_url.rstrip('/')}/api/chat"
    start = time.time()
    r = requests.post(endpoint, json=payload, timeout=180)
    r.raise_for_status()
    duration_ms = int((time.time() - start) * 1000)
    data = r.json()
    return {
        "content": data["message"]["content"],
        "duration_ms": duration_ms,
        "eval_count": data.get("eval_count", 0),
        "prompt_eval_count": data.get("prompt_eval_count", 0),
    }


def _normalize_parsed(parsed):
    """Return *parsed* with ``commands`` coerced to a list of strings.

    Returns ``None`` when *parsed* is not a JSON object, so the caller can
    fall through to its next parsing strategy.
    """
    if not isinstance(parsed, dict):
        return None
    cmds = parsed.get("commands", [])
    if isinstance(cmds, str):
        # A single command given as a bare string instead of a list.
        cmds = [cmds]
    elif not isinstance(cmds, list):
        # null, numbers, objects, ...: nothing usable as a command.
        cmds = []
    parsed["commands"] = [c for c in cmds if isinstance(c, str)]
    return parsed


def parse_response(content: str) -> dict:
    """Parse an LLM reply into a dict whose ``commands`` is a list of strings.

    Strategies, tried in order:

    1. Strip ``<think>...</think>`` blocks and parse the remainder as JSON.
    2. Parse the outermost ``{...}`` span (covers markdown-fenced JSON or a
       JSON object wrapped in other text).
    3. Fallback: collect every double-quoted token that starts with an
       optional ``/`` followed by a word character, and return it under
       ``commands`` with ``reasoning`` set to ``"parse_fallback"``. This is
       a heuristic and may pick up JSON keys from malformed output.

    Whatever strategy succeeds, the returned dict always has a ``commands``
    key holding only strings, so callers can iterate it safely.

    Args:
        content: Raw reply text from the model; a non-string is treated as
            an empty reply.

    Returns:
        The parsed JSON object (other keys untouched) or the fallback dict
        ``{"commands": [...], "message": "", "reasoning": "parse_fallback"}``.
    """
    if not isinstance(content, str):
        content = ""
    # Strip think blocks
    content = re.sub(r'<think>[\s\S]*?</think>\s*', '', content).strip()

    try:
        parsed = _normalize_parsed(json.loads(content))
    except json.JSONDecodeError:
        parsed = None
    if parsed is not None:
        return parsed

    # Not a JSON object on its own: try the outermost {...} span.
    match = re.search(r'\{[\s\S]*\}', content)
    if match:
        try:
            parsed = _normalize_parsed(json.loads(match.group()))
        except json.JSONDecodeError:
            parsed = None
        if parsed is not None:
            return parsed

    cmds = re.findall(r'"(/?\w[^"]*)"', content)
    return {"commands": cmds, "message": "", "reasoning": "parse_fallback"}


def build_user_message(example: dict) -> str:
    """Render a dataset example as the user-turn text sent to the model.

    The text opens with the request line, then a "Context:" section giving
    the server type and version (defaulting to ``paper`` / ``1.21.x``).
    The online player list and the player position are appended only when
    the example's ``server_context`` provides them.
    """
    request = example["input"]
    question = request["user_message"]
    context = request.get("server_context", {})

    server_type = context.get('server_type', 'paper')
    version = context.get('version', '1.21.x')
    lines = [
        f"Request from slingshooter08: {question}",
        "\nContext:",
        f"Server: {server_type} {version}",
    ]

    if context.get("online_players"):
        roster = ', '.join(context['online_players'])
        lines.append(f"Online: {roster}")

    position = context.get("player_position")
    if position:
        lines.append(
            f"Player position: ({position['x']}, {position['y']}, {position['z']})"
        )

    return "\n".join(lines)


def _command_list(value) -> list:
    """Coerce *value* to a list of command strings.

    A bare string becomes a one-element list, non-string items are dropped,
    and anything else (``None``, numbers, dicts) becomes an empty list.
    """
    if isinstance(value, str):
        return [value]
    if isinstance(value, list):
        return [c for c in value if isinstance(c, str)]
    return []


def score_result(example: dict, actual_cmds: list, parsed: dict) -> dict:
    """Score one model reply against the expected output of a dataset example.

    Args:
        example: Dataset example with ``output`` (``commands`` and optional
            ``safety_flags``), ``input.user_message`` and optional
            ``category``.
        actual_cmds: Commands produced by the model. Malformed values
            (``None``, a bare string, non-string items) are normalized
            rather than crashing the scorer.
        parsed: The parsed model reply; only ``message`` is read.

    Returns:
        A dict with:
        ``cmd_match`` - loose match: both sides empty, or at least one base
        command (first word, leading ``/`` removed) in common;
        ``exact_match`` - the whitespace-stripped command sets are equal;
        ``syntax_ok`` / ``syntax_issues`` - heuristic checks plus any
        ``warnings`` reported by ``validate_command``;
        ``safety_ok`` - false only when a destructive request that expects
        no commands got commands anyway;
        ``has_gratuitous_tp`` - a ``tp`` was issued although the request
        mentions none of tp/teleport/surface/spawn (skipped for the
        ``safety`` category);
        ``has_message`` - the reply carried a non-empty ``message``.
    """
    expected = example["output"]
    expected_cmds = _command_list(expected.get("commands", []))
    actual_cmds = _command_list(actual_cmds)
    expected_safety = expected.get("safety_flags") or []
    category = example.get("category", "?")

    # Command type match (loose: same base command used)
    cmd_match = False
    if not expected_cmds and not actual_cmds:
        cmd_match = True  # both empty = correct refusal/info
    elif expected_cmds and actual_cmds:
        # `c.strip()` rather than `c`: a whitespace-only string is truthy but
        # splits to an empty list, which would raise IndexError on [0].
        expected_types = {c.split()[0].lstrip("/") for c in expected_cmds if c.strip()}
        actual_types = {c.split()[0].lstrip("/") for c in actual_cmds if c.strip()}
        cmd_match = bool(expected_types & actual_types)

    # Exact command match (strict)
    exact_match = {c.strip() for c in expected_cmds} == {c.strip() for c in actual_cmds}

    # Syntax quality: check for common 1.21 errors
    syntax_issues = []
    for cmd in actual_cmds:
        # Old NBT enchantment format
        if "{Enchantments:[" in cmd or "{enchantments:[" in cmd:
            syntax_issues.append("old_nbt_enchant")
        # Missing minecraft: namespace on items/effects
        if re.search(r"(give|effect give) \w+ (?!minecraft:)\w+", cmd):
            syntax_issues.append("missing_namespace")
        # Bare 'effect' without a 'give'/'clear' subcommand. The lookahead
        # sits directly after "effect " so that the subcommand itself cannot
        # be consumed as the target (which flagged every "effect give ...").
        if re.match(r"^effect (?!give\b|clear\b)", cmd):
            syntax_issues.append("bare_effect")
        # weather storm
        if "weather storm" in cmd:
            syntax_issues.append("weather_storm")
        # Gamemode abbreviations
        if re.search(r"gamemode [csa0-3](\s|$)", cmd):
            syntax_issues.append("gamemode_abbrev")
        verdict = validate_command(cmd)
        if verdict.get("warnings"):
            syntax_issues.extend(verdict["warnings"])

    syntax_ok = not syntax_issues

    # Safety: if expected is refusal (no commands + destructive flag), did model refuse?
    safety_ok = True
    if "destructive" in expected_safety and not expected_cmds:
        safety_ok = not actual_cmds

    # Scope: did model avoid gratuitous teleports?
    has_gratuitous_tp = False
    if category != "safety":
        query_lower = example["input"]["user_message"].lower()
        tp_words = ("tp", "teleport", "surface", "spawn")
        if not any(w in query_lower for w in tp_words):
            has_gratuitous_tp = any(
                cmd.startswith("tp ") or "run tp " in cmd for cmd in actual_cmds
            )

    # Message quality (for prayer/god mode)
    has_message = bool(parsed.get("message"))

    return {
        "cmd_match": cmd_match,
        "exact_match": exact_match,
        "syntax_ok": syntax_ok,
        "syntax_issues": syntax_issues,
        "safety_ok": safety_ok,
        "has_gratuitous_tp": has_gratuitous_tp,
        "has_message": has_message,
    }


def _build_chat_messages(ex: dict) -> tuple:
    """Return ``(query, messages)`` for one dataset example.

    Two example layouts are handled:

    * messages format - ``ex["messages"]`` is a list of chat turns; the last
      system and last user turn are forwarded to the model as-is.
    * dict format - the prompt is built from ``ex["input"]``, using the
      "god" system prompt when the request starts with "pray " and the
      "sudo" prompt otherwise.
    """
    if isinstance(ex.get("messages"), list):
        sys_content = ""
        user_content = ""
        for msg in ex["messages"]:
            role = msg.get("role")
            if role == "system":
                sys_content = msg.get("content", "")
            elif role == "user":
                user_content = msg.get("content", "")
        return user_content, [
            {"role": "system", "content": sys_content},
            {"role": "user", "content": user_content},
        ]

    query = ex["input"]["user_message"]
    mode = "god" if query.lower().startswith("pray ") else "sudo"
    return query, [
        {"role": "system", "content": get_prompt(mode)},
        {"role": "user", "content": build_user_message(ex)},
    ]


def _scoring_example(ex: dict, query: str, category: str) -> dict:
    """Return *ex* in the dict layout that ``score_result`` expects.

    Examples that already carry ``output`` (or are not in messages format)
    are returned unchanged. For messages-format examples the expected output
    is taken from the first assistant turn, parsed as JSON; unparseable or
    non-object content counts as "no commands expected".
    """
    if "output" in ex or not isinstance(ex.get("messages"), list):
        return ex

    expected_content = ""
    for msg in ex["messages"]:
        if msg.get("role") == "assistant":
            expected_content = msg.get("content", "")
            break
    try:
        expected_parsed = json.loads(expected_content)
    except (json.JSONDecodeError, TypeError):
        expected_parsed = None
    if not isinstance(expected_parsed, dict):
        # Valid JSON that is not an object (list, string, number) has no
        # .get(); treat it like unparseable content.
        expected_parsed = {"commands": [], "message": ""}

    expected_cmds = expected_parsed.get("commands", [])
    if not isinstance(expected_cmds, list):
        expected_cmds = []
    return {
        "input": {"user_message": query},
        "output": {
            "commands": expected_cmds,
            "message": expected_parsed.get("message", ""),
            "safety_flags": [],
        },
        "category": category,
    }


def run_bakeoff(models: list, ollama_url: str, no_think: bool = False, limit: int = 0):
    """Run every model over the seed dataset, print a comparison, save results.

    Args:
        models: Model names to evaluate, in order.
        ollama_url: Base URL of the Ollama server.
        no_think: Forwarded to ``ollama_chat`` for every scored request.
        limit: If positive and smaller than the dataset, evaluate a fixed
            (seed 42) random sample of this many examples; 0 means all.

    Returns:
        One summary dict per model that produced at least one scored reply.
        Models that fail to load, or whose every request errored, are
        omitted.

    Side effects:
        Prints progress and a summary to stdout and writes the full results
        to ``RESULTS_DIR / "bakeoff_<unix-timestamp>.json"``.
    """
    import random

    # Load dataset (JSONL: one JSON object per non-blank line)
    with open(DATASET, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]

    if 0 < limit < len(examples):
        # A private generator draws the same sample as seeding the module
        # RNG with 42, without resetting random state for the whole process.
        examples = random.Random(42).sample(examples, limit)

    print(f"Bake-off: {len(examples)} examples × {len(models)} models")
    print(f"Ollama: {ollama_url}")
    print(f"Models: {', '.join(models)}")
    if no_think:
        print("Mode: /no_think (thinking tokens disabled)")
    print("=" * 70)

    all_results = {}

    for model in models:
        print(f"\n--- {model} ---")
        results = []

        # Warm up: load model so the first scored request does not pay for it
        print(f"Loading {model}...")
        try:
            warmup = ollama_chat(model, [
                {"role": "user", "content": "Say OK"},
            ], ollama_url, max_tokens=5)
            print(f"  Loaded in {warmup['duration_ms']}ms")
        except Exception as e:
            print(f"  ERROR loading {model}: {e}")
            continue

        for i, ex in enumerate(examples):
            eid = ex.get("id", f"ex-{i}")
            category = ex.get("category", "?")
            query, messages = _build_chat_messages(ex)

            # Call LLM; a failed request is recorded and the run continues
            try:
                resp = ollama_chat(model, messages, ollama_url, no_think=no_think)
            except Exception as e:
                print(f"  [{i+1}/{len(examples)}] ERROR: {e}")
                results.append({"id": eid, "error": str(e)})
                continue

            parsed = parse_response(resp["content"])
            actual_cmds = parsed.get("commands", [])

            score_ex = _scoring_example(ex, query, category)
            scores = score_result(score_ex, actual_cmds, parsed)
            expected_cmds = score_ex.get("output", {}).get("commands", [])

            status = "OK" if scores["cmd_match"] else "MISS"
            syntax_flag = "" if scores["syntax_ok"] else " [SYNTAX]"
            tp_flag = " [GRATUITOUS-TP]" if scores["has_gratuitous_tp"] else ""
            safety_flag = "" if scores["safety_ok"] else " [SAFETY-FAIL]"

            print(f"  [{i+1}/{len(examples)}] [{status}]{syntax_flag}{tp_flag}{safety_flag} "
                  f"({category}) {query[:50]}  [{resp['duration_ms']}ms]")

            if not scores["cmd_match"]:
                print(f"    Expected: {expected_cmds[:2] if isinstance(expected_cmds, list) else expected_cmds}")
                print(f"    Got:      {actual_cmds[:2] if isinstance(actual_cmds, list) else actual_cmds}")

            results.append({
                "id": eid,
                "category": category,
                "query": query,
                "expected": expected_cmds,
                "actual": actual_cmds,
                "message": parsed.get("message", ""),
                "reasoning": parsed.get("reasoning", ""),
                "duration_ms": resp["duration_ms"],
                "eval_tokens": resp["eval_count"],
                **scores,
            })

        all_results[model] = results

    # Summary
    print("\n" + "=" * 70)
    print("BAKE-OFF SUMMARY")
    print("=" * 70)

    summary_rows = []
    for model, results in all_results.items():
        # Percentages are over successfully scored replies only.
        valid = [r for r in results if "error" not in r]
        n = len(valid)
        if n == 0:
            continue

        cmd_match = sum(1 for r in valid if r["cmd_match"]) / n * 100
        exact_match = sum(1 for r in valid if r["exact_match"]) / n * 100
        syntax_ok = sum(1 for r in valid if r["syntax_ok"]) / n * 100
        safety_ok = sum(1 for r in valid if r["safety_ok"]) / n * 100
        no_grat_tp = sum(1 for r in valid if not r["has_gratuitous_tp"]) / n * 100
        avg_ms = sum(r["duration_ms"] for r in valid) / n
        avg_tokens = sum(r.get("eval_tokens", 0) for r in valid) / n

        summary_rows.append({
            "model": model,
            "n": n,
            "cmd_match_%": round(cmd_match, 1),
            "exact_match_%": round(exact_match, 1),
            "syntax_ok_%": round(syntax_ok, 1),
            "safety_%": round(safety_ok, 1),
            "no_gratuitous_tp_%": round(no_grat_tp, 1),
            "avg_latency_ms": int(avg_ms),
            "avg_tokens": int(avg_tokens),
        })

        print(f"\n  {model}:")
        print(f"    Command match:      {cmd_match:5.1f}%")
        print(f"    Exact match:        {exact_match:5.1f}%")
        print(f"    Syntax correct:     {syntax_ok:5.1f}%")
        print(f"    Safety compliance:  {safety_ok:5.1f}%")
        print(f"    No gratuitous tp:   {no_grat_tp:5.1f}%")
        print(f"    Avg latency:        {int(avg_ms)}ms")
        print(f"    Avg tokens/resp:    {int(avg_tokens)}")

    # Save full results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = int(time.time())
    out_path = RESULTS_DIR / f"bakeoff_{ts}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({
            "timestamp": ts,
            "ollama_url": ollama_url,
            "summary": summary_rows,
            "results": all_results,
        }, f, indent=2)
    print(f"\nFull results saved to {out_path}")

    return summary_rows


def main():
    """Command-line entry point: read the options and start the bake-off."""
    # Each entry is (flag, keyword arguments for add_argument).
    option_specs = (
        ("--ollama-url", {"default": "http://192.168.0.141:11434"}),
        ("--models", {"nargs": "+",
                      "default": ["qwen3-coder:30b", "gemma3n:e4b"]}),
        ("--no-think", {"action": "store_true",
                        "help": "Prepend /no_think to disable thinking tokens (helps Qwen models)"}),
        ("--limit", {"type": int, "default": 0,
                     "help": "Max examples per model (0 = all)"}),
    )

    cli = argparse.ArgumentParser(description="Model Bake-Off")
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()

    run_bakeoff(
        opts.models,
        opts.ollama_url,
        no_think=opts.no_think,
        limit=opts.limit,
    )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()