# Mortdecai/eval/live_bakeoff.py

#!/usr/bin/env python3
"""
Live Bake-off: Compare two Ollama models on a real Minecraft Paper server via RCON.

Sends each test example to both models, executes the returned commands on the
live server via RCON, and scores results including a new "rcon_success" metric.

Usage:
    python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b
    python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b --max-examples 5
    python3 eval/live_bakeoff.py --models gemma3n:e4b qwen3:8b --categories command_gen
"""

import argparse
import json
import re
import sys
import time
from collections import defaultdict
from pathlib import Path

import requests
from mcrcon import MCRcon

ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

from agent.prompts.system_prompts import get_prompt
from eval.harness import score_result, build_user_message, parse_response, determine_mode, ollama_chat

# Seed examples, one JSON object per line, read by run_live_bakeoff().
DATASET = ROOT.joinpath("data", "processed", "seed_dataset.jsonl")
# Directory that main() creates on demand and writes result JSON files into.
RESULTS_DIR = ROOT.joinpath("eval", "results")

# Fragments of server feedback that mark an RCON command as failed.
# Entries are regular expressions (a few contain ".*"), not plain substrings.
RCON_ERROR_PATTERNS = [
    r"Unknown or incomplete command",
    r"No entity was found",
    r"Incorrect argument",
    r"Expected whitespace",
    r"Invalid or unknown",
    r"An unexpected error occurred",
    r"That position is not loaded",
    r"Could not set the block",
    r"Nothing changed",  # a no-op reply is counted as a failure as well
    r"No player was found",
    r"Expected block",
    r"Expected.*but got",
    r"Unknown item",
    r"Unknown effect",
    r"Unexpected.*at position",
]

# Single case-insensitive alternation over all patterns, so one search per
# response is enough to classify it.
_RCON_ERROR_ALTERNATION = "|".join(RCON_ERROR_PATTERNS)
RCON_ERROR_RE = re.compile(_RCON_ERROR_ALTERNATION, flags=re.IGNORECASE)


def rcon_execute(cmd: str, host: str, port: int, password: str) -> dict:
    """Run one command over its own RCON connection and describe the outcome.

    Never raises. The returned dict has the keys ``command``, ``response``,
    ``success`` and ``error``:

    * on a completed round-trip, ``response`` is the stripped server reply,
      ``success`` is False when the reply matches ``RCON_ERROR_RE`` and
      ``error`` is None;
    * on any exception, ``response`` is empty, ``success`` is False and
      ``error`` carries the exception text.
    """
    try:
        with MCRcon(host, password, port=port) as connection:
            reply = connection.command(cmd)
        error_match = RCON_ERROR_RE.search(reply)
        reply_text = reply.strip()
    except Exception as exc:
        return {"command": cmd, "response": "", "success": False, "error": str(exc)}

    return {
        "command": cmd,
        "response": reply_text,
        "success": error_match is None,
        "error": None,
    }


def rcon_execute_batch(commands: list, host: str, port: int, password: str) -> list:
    """Execute a list of commands sequentially over one RCON connection.

    Args:
        commands: Command strings to run, in order.
        host: RCON host.
        port: RCON port.
        password: RCON password.

    Returns:
        Exactly one result dict per command, in input order, each with the
        keys ``command``, ``response``, ``success`` and ``error``. A command
        whose reply matches ``RCON_ERROR_RE`` has ``success`` False; a command
        that raised has ``error`` set to the exception text. An empty
        ``commands`` yields an empty list without opening a connection.
    """
    results = []
    if not commands:
        return results
    # Snapshot so the "not yet recorded" slice below is well defined.
    commands = list(commands)
    try:
        with MCRcon(host, password, port=port) as mcr:
            for cmd in commands:
                try:
                    response = mcr.command(cmd)
                    is_error = bool(RCON_ERROR_RE.search(response))
                    results.append({
                        "command": cmd,
                        "response": response.strip(),
                        "success": not is_error,
                        "error": None,
                    })
                except Exception as e:
                    results.append({
                        "command": cmd,
                        "response": "",
                        "success": False,
                        "error": str(e),
                    })
    except Exception as e:
        # Connection-level failure. This handler also runs when closing the
        # connection raises after commands were already recorded, so only the
        # commands without a result are marked failed; otherwise every command
        # would appear twice and the RCON totals would be inflated.
        for cmd in commands[len(results):]:
            results.append({
                "command": cmd,
                "response": "",
                "success": False,
                "error": f"RCON connection failed: {e}",
            })
    return results


def rcon_reset(host: str, port: int, password: str):
    """Clear all effects from all players (test reset between models).

    Best-effort: a failure never aborts the run, but it is reported on stderr
    because an unreset server means the next model is evaluated against state
    left behind by the previous one.
    """
    try:
        with MCRcon(host, password, port=port) as mcr:
            mcr.command("effect clear @a")
    except Exception as e:
        print(f"  WARNING: RCON reset failed: {e}", file=sys.stderr)


def should_skip_example(example: dict) -> tuple:
    """Decide whether an example is excluded from live execution.

    Returns a ``(should_skip, reason)`` pair; ``reason`` is an empty string
    when the example is kept. Only examples whose expected command list is
    empty can be skipped: those in the ``safety`` category, and those flagged
    ``destructive``.
    """
    expected_output = example.get("output", {})
    expects_nothing = not expected_output.get("commands", [])

    # Expected refusals in the safety category are not exercised on the
    # live server.
    if expects_nothing and example.get("category", "") == "safety":
        return True, "safety refusal (empty commands)"

    # Same for refusals flagged destructive, whatever their category.
    flagged_destructive = "destructive" in expected_output.get("safety_flags", [])
    if flagged_destructive and expects_nothing:
        return True, "destructive refusal"

    return False, ""


def compute_rcon_score(rcon_results: list) -> dict:
    """Summarise a list of RCON execution results.

    Returns a dict with ``rcon_success`` (True when nothing failed, including
    the vacuous case of no commands at all), the counters ``rcon_total``,
    ``rcon_succeeded`` and ``rcon_failed``, and ``rcon_errors``: the command,
    response and error of every failed entry.
    """
    outcomes = rcon_results or []  # no commands -> all-zero, vacuously successful
    failures = [entry for entry in outcomes if not entry["success"]]
    total = len(outcomes)
    failed = len(failures)

    error_details = []
    for entry in failures:
        error_details.append({
            "command": entry["command"],
            "response": entry["response"],
            "error": entry.get("error"),
        })

    return {
        "rcon_success": failed == 0,
        "rcon_total": total,
        "rcon_succeeded": total - failed,
        "rcon_failed": failed,
        "rcon_errors": error_details,
    }


def run_model_on_example(model: str, example: dict, ollama_url: str,
                         rcon_host: str, rcon_port: int, rcon_password: str,
                         max_tokens: int = 1500) -> dict:
    """Generate commands for one example with one model, run them, score them.

    The example is sent to ``model`` through ``ollama_chat``, the reply is
    parsed into commands, the commands are scored against the example with
    ``score_result`` and then executed on the server via RCON.

    Returns ``{"model": ..., "error": ...}`` when the model call raises.
    Otherwise returns a dict holding the model output and timing fields,
    the per-command ``rcon_results``, and every key from ``score_result``
    and ``compute_rcon_score`` merged in.
    """
    mode = determine_mode(example)
    system_turn = {"role": "system", "content": get_prompt(mode)}
    user_turn = {"role": "user", "content": build_user_message(example)}

    try:
        reply = ollama_chat(model, [system_turn, user_turn], ollama_url,
                            max_tokens=max_tokens)
    except Exception as exc:
        return {"model": model, "error": str(exc)}

    raw_content = reply["content"]
    parsed = parse_response(raw_content)
    generated = parsed.get("commands", [])

    # Offline scoring first, then live execution on the server.
    static_scores = score_result(example, generated, parsed)
    executions = rcon_execute_batch(generated, rcon_host, rcon_port, rcon_password)
    live_scores = compute_rcon_score(executions)

    record = {
        "model": model,
        "mode": mode,
        "actual_cmds": generated,
        "message": parsed.get("message", ""),
        "reasoning": parsed.get("reasoning", ""),
        "raw_content": raw_content,
        "duration_ms": reply["duration_ms"],
        "eval_tokens": reply.get("eval_count", 0),
        "done_reason": reply.get("done_reason", ""),
        "rcon_results": executions,
    }
    record.update(static_scores)
    record.update(live_scores)
    return record


def _print_model_result(result: dict) -> None:
    """Print the per-example console report for one model's result dict."""
    if "error" in result:
        print(f"    ERROR: {result['error']}")
        return

    status = "OK" if result["cmd_match"] else "MISS"
    rcon = f"{result['rcon_succeeded']}/{result['rcon_total']} RCON ok"
    flags = ""
    if not result["syntax_ok"]:
        flags += " [SYNTAX]"
    if not result["rcon_success"]:
        flags += " [RCON-FAIL]"
    if result.get("hallucinated"):
        flags += " [HALLUC]"
    print(f"    [{status}] {rcon}{flags} [{result['duration_ms']}ms]")
    print(f"    Cmds: {result['actual_cmds'][:3]}")
    if result["rcon_errors"]:
        for err in result["rcon_errors"][:2]:
            print(f"    RCON err: {err['command'][:50]} -> {err['response'][:60]}")


def run_live_bakeoff(models: list, ollama_url: str,
                     rcon_host: str, rcon_port: int, rcon_password: str,
                     max_examples: int = 0, categories: list = None,
                     max_tokens: int = 1500) -> dict:
    """Run the full live bake-off comparing two models.

    Args:
        models: Model names; the first two entries are compared.
        ollama_url: Base URL of the Ollama API.
        rcon_host: RCON host of the server.
        rcon_port: RCON port of the server.
        rcon_password: RCON password.
        max_examples: Keep only the first N examples after filtering
            (0 keeps all of them).
        categories: When given, keep only examples whose ``category`` is
            in this list.
        max_tokens: Token limit passed to each model call.

    Returns:
        ``{"error": ...}`` when the RCON connectivity check or a model
        warm-up fails. Otherwise a dict with ``models``, ``ollama_url``,
        ``rcon_host``, ``rcon_port``, ``timestamp``, ``dataset_size``,
        ``skipped`` and ``results``; each entry of ``results`` carries the
        example ``id``, ``category``, ``query``, ``expected`` commands and
        one result dict per model, keyed by the model name.
    """
    # Load dataset. The encoding is pinned to UTF-8 rather than left to the
    # platform's locale default.
    with open(DATASET, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]

    # Filter by categories
    if categories:
        examples = [ex for ex in examples if ex.get("category") in categories]

    # Filter out skippable examples
    filtered = []
    skipped = []
    for ex in examples:
        skip, reason = should_skip_example(ex)
        if skip:
            skipped.append({"id": ex.get("id", "?"), "reason": reason})
        else:
            filtered.append(ex)
    examples = filtered

    # Limit examples
    if max_examples > 0:
        examples = examples[:max_examples]

    total = len(examples)
    model_a, model_b = models[0], models[1]

    print(f"Live Bake-off: {model_a} vs {model_b}")
    print(f"  Dataset: {total} examples ({len(skipped)} skipped)")
    print(f"  Ollama:  {ollama_url}")
    print(f"  RCON:    {rcon_host}:{rcon_port}")
    print("=" * 80)

    # Test RCON connectivity first
    print("Testing RCON connection...")
    test_result = rcon_execute("list", rcon_host, rcon_port, rcon_password)
    if test_result["error"]:
        print(f"  RCON connection FAILED: {test_result['error']}")
        print("  Aborting live bake-off.")
        return {"error": f"RCON connection failed: {test_result['error']}"}
    print(f"  RCON OK: {test_result['response']}")

    # Warm up both models
    for model in [model_a, model_b]:
        print(f"Loading {model}...")
        try:
            warmup = ollama_chat(model, [{"role": "user", "content": "Say OK"}],
                                 ollama_url, max_tokens=5)
            print(f"  Loaded in {warmup['duration_ms']}ms")
        except Exception as e:
            print(f"  ERROR loading {model}: {e}")
            return {"error": f"Failed to load {model}: {e}"}

    print("\n" + "=" * 80)

    all_results = []
    for i, ex in enumerate(examples):
        eid = ex.get("id", f"ex-{i}")
        category = ex.get("category", "?")
        query = ex["input"]["user_message"]

        print(f"\n[{i+1}/{total}] ({category}) {query[:60]}")
        print("-" * 70)

        # Both models go through the identical run / report / reset sequence.
        per_model = {}
        for model in (model_a, model_b):
            print(f"  {model}:")
            result = run_model_on_example(
                model, ex, ollama_url, rcon_host, rcon_port, rcon_password, max_tokens
            )
            _print_model_result(result)

            # Wait, then clear player effects before the next run.
            time.sleep(2)
            rcon_reset(rcon_host, rcon_port, rcon_password)

            per_model[model] = result

        all_results.append({
            "id": eid,
            "category": category,
            "query": query,
            "expected": ex["output"].get("commands", []),
            **per_model,
        })

    return {
        "models": [model_a, model_b],
        "ollama_url": ollama_url,
        "rcon_host": rcon_host,
        "rcon_port": rcon_port,
        "timestamp": int(time.time()),
        "dataset_size": total,
        "skipped": skipped,
        "results": all_results,
    }


def compute_model_summary(results: list, model: str) -> dict:
    """Compute aggregate metrics for a single model across all results.

    Args:
        results: Per-example entries; each has a ``category`` and, under the
            model's name, that model's result dict.
        model: Name of the model to summarise.

    Returns:
        ``{"n": 0}`` when the model has no error-free result. Otherwise a
        dict with ``model``, ``n`` (number of error-free results),
        ``overall`` (percentages rounded to one decimal, plus integer
        average latency and token count) and ``by_category`` (the same
        core percentages per category, in sorted category order).
    """
    # Entries where the model is missing or errored are excluded everywhere.
    valid = [r for r in results if model in r and "error" not in r[model]]
    n = len(valid)
    if n == 0:
        return {"n": 0}

    def pct(key, rows=valid, negate=False):
        # Share of rows whose flag is truthy (or falsy when negate is set).
        # A missing flag counts as False, for overall and per-category alike.
        hits = sum(1 for r in rows if bool(r[model].get(key, False)) != negate)
        return round(hits / len(rows) * 100, 1)

    # Per-category
    cats = defaultdict(list)
    for r in valid:
        cats[r["category"]].append(r)

    cat_scores = {
        cat: {
            "n": len(rows),
            "cmd_match_%": pct("cmd_match", rows),
            "exact_match_%": pct("exact_match", rows),
            "syntax_ok_%": pct("syntax_ok", rows),
            "safety_%": pct("safety_ok", rows),
            "rcon_success_%": pct("rcon_success", rows),
        }
        for cat, rows in sorted(cats.items())
    }

    avg_latency = int(sum(r[model]["duration_ms"] for r in valid) / n)
    avg_tokens = int(sum(r[model].get("eval_tokens", 0) for r in valid) / n)

    # Command-level success rate, as opposed to the example-level
    # rcon_success_% above.
    total_rcon_cmds = sum(r[model].get("rcon_total", 0) for r in valid)
    total_rcon_ok = sum(r[model].get("rcon_succeeded", 0) for r in valid)

    return {
        "model": model,
        "n": n,
        "overall": {
            "cmd_match_%": pct("cmd_match"),
            "exact_match_%": pct("exact_match"),
            "syntax_ok_%": pct("syntax_ok"),
            "safety_%": pct("safety_ok"),
            "rcon_success_%": pct("rcon_success"),
            "no_gratuitous_tp_%": pct("has_gratuitous_tp", negate=True),
            "no_hallucination_%": pct("hallucinated", negate=True),
            "empty_%": pct("is_empty"),
            "rcon_cmd_success_%": round(total_rcon_ok / total_rcon_cmds * 100, 1) if total_rcon_cmds > 0 else 100.0,
            "avg_latency_ms": avg_latency,
            "avg_tokens": avg_tokens,
        },
        "by_category": cat_scores,
    }


def print_comparison(bakeoff_data: dict):
    """Print a side-by-side comparison table.

    Prints the overall metrics with a per-metric winner, the per-category
    RCON success rates, and up to ten examples where exactly one of the two
    models had a fully successful RCON run.

    Args:
        bakeoff_data: Dict with ``models`` (two names) and ``results``;
            ``timestamp`` is optional.

    Returns:
        ``(summary_a, summary_b)``, the ``compute_model_summary`` output for
        each model. Both are returned even when there are too few results to
        print a comparison.
    """
    models = bakeoff_data["models"]
    results = bakeoff_data["results"]
    model_a, model_b = models

    summary_a = compute_model_summary(results, model_a)
    summary_b = compute_model_summary(results, model_b)

    print("\n" + "=" * 80)
    print("LIVE BAKE-OFF RESULTS")
    print(f"  {model_a} vs {model_b}")
    print(f"  {summary_a['n']} examples evaluated on live server")
    ts = bakeoff_data.get("timestamp", 0)
    print(f"  {time.strftime('%Y-%m-%d %H:%M', time.localtime(ts))}")
    print("=" * 80)

    if summary_a["n"] == 0 or summary_b["n"] == 0:
        print("  Insufficient results for comparison.")
        return summary_a, summary_b

    ov_a = summary_a["overall"]
    ov_b = summary_b["overall"]

    # (label, key in the "overall" dict, whether a higher value wins)
    metrics = [
        ("Command match",     "cmd_match_%",        True),
        ("Exact match",       "exact_match_%",      True),
        ("Syntax correct",    "syntax_ok_%",        True),
        ("Safety compliance", "safety_%",           True),
        ("RCON success",      "rcon_success_%",     True),
        ("RCON cmd success",  "rcon_cmd_success_%", True),
        ("No gratuitous tp",  "no_gratuitous_tp_%", True),
        ("No hallucination",  "no_hallucination_%", True),
        ("Empty responses",   "empty_%",            False),
        ("Avg latency (ms)",  "avg_latency_ms",     False),
        ("Avg tokens",        "avg_tokens",         False),
    ]

    hdr_a = model_a[:20]
    hdr_b = model_b[:20]
    print(f"\n  {'Metric':<22} {hdr_a:>14} {hdr_b:>14}   Winner")
    print(f"  {'-'*22} {'-'*14} {'-'*14}   {'-'*10}")

    wins = {model_a: 0, model_b: 0}

    for label, key, higher_is_better in metrics:
        val_a = ov_a.get(key, 0)
        val_b = ov_b.get(key, 0)

        # Format values
        if "%" in key:
            s_a = f"{val_a:>6.1f}%"
            s_b = f"{val_b:>6.1f}%"
        else:
            s_a = f"{val_a:>7}"
            s_b = f"{val_b:>7}"

        # Determine winner; differences below 0.5 count as a tie.
        diff = val_a - val_b
        if abs(diff) < 0.5:
            winner = "TIE"
        elif (diff > 0) == higher_is_better:
            winner = "<-"
            wins[model_a] += 1
        else:
            winner = "->"
            wins[model_b] += 1

        print(f"  {label:<22} {s_a:>14} {s_b:>14}   {winner}")

    print(f"\n  Score: {model_a} {wins[model_a]} wins, {model_b} {wins[model_b]} wins")

    # Per-category comparison
    all_cats = sorted(set(list(summary_a.get("by_category", {}).keys()) +
                          list(summary_b.get("by_category", {}).keys())))

    def category_rate(summary, cat):
        # A missing category or a missing rate renders as N/A. Formatting a
        # non-numeric placeholder with ".1f" would raise ValueError.
        value = summary.get("by_category", {}).get(cat, {}).get("rcon_success_%")
        if value is None:
            return "     N/A"
        return f"{value:>6.1f}%"

    if all_cats:
        print("\n  Per-Category RCON Success Rate:")
        print(f"  {'Category':<16} {hdr_a:>14} {hdr_b:>14}")
        print(f"  {'-'*16} {'-'*14} {'-'*14}")
        for cat in all_cats:
            rcon_a = category_rate(summary_a, cat)
            rcon_b = category_rate(summary_b, cat)
            print(f"  {cat:<16} {rcon_a:>14} {rcon_b:>14}")

    # Per-example comparison for disagreements
    disagreements = [
        r for r in results
        if model_a in r and model_b in r
        and "error" not in r[model_a] and "error" not in r[model_b]
        and r[model_a]["rcon_success"] != r[model_b]["rcon_success"]
    ]

    if disagreements:
        print(f"\n  RCON Disagreements ({len(disagreements)} examples):")
        print(f"  {'-'*70}")
        for r in disagreements[:10]:
            rcon_a_ok = "OK" if r[model_a]["rcon_success"] else "FAIL"
            rcon_b_ok = "OK" if r[model_b]["rcon_success"] else "FAIL"
            print(f"    [{r['id']}] {r['query'][:50]}")
            print(f"      {model_a}: RCON {rcon_a_ok} | {model_b}: RCON {rcon_b_ok}")

    return summary_a, summary_b


def main():
    """Command-line entry point.

    Parses the arguments, runs the live bake-off, prints the comparison and
    writes the summaries plus the raw bake-off data to a timestamped JSON
    file under ``RESULTS_DIR``. Exits with status 1 when the bake-off
    reports an error.
    """
    parser = argparse.ArgumentParser(
        description="Live bake-off: compare two models on a real Minecraft server via RCON"
    )
    parser.add_argument("--models", nargs=2, default=["gemma3n:e4b", "qwen3:8b"],
                        metavar=("MODEL_A", "MODEL_B"),
                        help="Two models to compare (default: gemma3n:e4b qwen3:8b)")
    parser.add_argument("--ollama-url", default="http://192.168.0.141:11434",
                        help="Ollama API URL")
    parser.add_argument("--rcon-host", default="192.168.0.244",
                        help="RCON host (default: 192.168.0.244)")
    parser.add_argument("--rcon-port", type=int, default=25577,
                        help="RCON port (default: 25577)")
    parser.add_argument("--rcon-password", default="REDACTED_RCON",
                        help="RCON password")
    parser.add_argument("--max-examples", type=int, default=0,
                        help="Limit number of examples (0 = all)")
    parser.add_argument("--max-tokens", type=int, default=1500,
                        help="Max tokens per model response")
    parser.add_argument("--categories", nargs="+", default=None,
                        help="Filter to specific categories (e.g. command_gen safety)")
    args = parser.parse_args()

    # Run bake-off
    bakeoff_data = run_live_bakeoff(
        models=args.models,
        ollama_url=args.ollama_url,
        rcon_host=args.rcon_host,
        rcon_port=args.rcon_port,
        rcon_password=args.rcon_password,
        max_examples=args.max_examples,
        categories=args.categories,
        max_tokens=args.max_tokens,
    )

    if "error" in bakeoff_data:
        print(f"\nBake-off failed: {bakeoff_data['error']}")
        sys.exit(1)

    # Print comparison
    summary_a, summary_b = print_comparison(bakeoff_data)

    def slug(model_name):
        # Model names may contain "/" as well as ":". Every character that
        # is not filename-safe becomes "_", so the result file always lands
        # directly in RESULTS_DIR instead of a non-existent subdirectory.
        return re.sub(r"[^A-Za-z0-9._-]", "_", model_name)

    # Save results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = int(time.time())
    model_a_slug = slug(args.models[0])
    model_b_slug = slug(args.models[1])
    out_path = RESULTS_DIR / f"live_bakeoff_{model_a_slug}_vs_{model_b_slug}_{ts}.json"

    save_data = {
        "summary": {
            args.models[0]: summary_a,
            args.models[1]: summary_b,
        },
        "bakeoff_data": bakeoff_data,
    }

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(save_data, f, indent=2, default=str)
    print(f"\nResults saved to {out_path}")


if __name__ == "__main__":
    main()