Swarm bots, RCON validation, Haiku distillation complete

Swarm bots (ingame/swarm_bots.js):
- 10 survival bots with generated names (SwiftWolf, DarkWolf, etc.)
- All bots wander, take damage, auto-respawn, pray when hurt
- Gemini + Dolphin (5%) + Multilingual (3%) prompt generation
- 20-60s interaction interval per bot

Distillation results:
- 222 sudo examples via Haiku ($0.28)
- 122 god examples via Haiku ($0.37) — with God Soul personality
- Total: 344 distilled, $0.65 spent of $5 budget
- RCON validation: 74.7% fully valid, 30 real errors out of ~1000 commands

validate_distilled.py:
- Executes distilled commands on live server via RCON
- Distinguishes real errors from benign ones (no player online)
- Tags each example with validation status

Dev server switched to Claude Haiku via Anthropic API:
- llm_provider: anthropic with $5 budget cap
- Auto-fallback to Ollama when budget exhausted
- Cost tracking with logging

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 19:18:19 -04:00
parent 961f53ea7d
commit 65ee146043
5 changed files with 1224 additions and 2 deletions
@@ -0,0 +1,235 @@
+#!/usr/bin/env python3
+"""
+validate_distilled.py — Execute distilled Claude responses on a live server via RCON.
+
+Takes distilled.jsonl, executes each example's commands on the dev server,
+captures RCON results, and writes validated training pairs to output.
+
+This creates the strongest training signal: input → Claude's commands → actual server result.
+
+Usage:
+    python3 training/scripts/validate_distilled.py                    # run all
+    python3 training/scripts/validate_distilled.py --dry-run          # preview
+    python3 training/scripts/validate_distilled.py --max 10           # first 10 only
+    python3 training/scripts/validate_distilled.py --rcon-host 192.168.0.244 --rcon-port 25578
+"""
+
+import argparse
+import json
+import re
+import sys
+import time
+from pathlib import Path
+
+from mcrcon import MCRcon
+
+# Directory three levels above this script's resolved location; the default
+# input and output paths below are built relative to it.
+ROOT = Path(__file__).resolve().parent.parent.parent
+DISTILLED = ROOT.joinpath("data", "processed", "distilled.jsonl")
+OUTPUT = ROOT.joinpath("data", "processed", "validated_distilled.jsonl")
+
+# Case-insensitive patterns that mark an RCON reply as a command/syntax error.
+RCON_ERRORS = [
+    re.compile(pattern, re.IGNORECASE)
+    for pattern in (
+        r'Unknown or incomplete command',
+        r'Incorrect argument',
+        r'Expected .+ at position',
+        r'Unknown item',
+        r'Unknown item component',
+        r'Invalid or unknown',
+        r"Can't find element",
+        r'Expected whitespace',
+    )
+]
+
+# Case-insensitive patterns for expected "failures" that are actually fine
+# (no player online, no entity, unloaded chunks).
+BENIGN_ERRORS = [
+    re.compile(pattern, re.IGNORECASE)
+    for pattern in (
+        r'No player was found',
+        r'No entity was found',
+        r'That position is not loaded',
+    )
+]
+
+
+def is_real_error(result: str) -> bool:
+    """Tell whether an RCON reply is a genuine command/syntax failure.
+
+    The reply counts as a real error only when it matches at least one
+    pattern in RCON_ERRORS and none of the patterns in BENIGN_ERRORS.
+    """
+    if not any(pattern.search(result) for pattern in RCON_ERRORS):
+        return False
+    # A benign message anywhere in the reply overrides the error match.
+    return not any(pattern.search(result) for pattern in BENIGN_ERRORS)
+
+
+def is_benign_error(result: str) -> bool:
+    """Report whether the RCON reply matches any pattern in BENIGN_ERRORS.
+
+    Those patterns cover replies such as a missing player or entity, i.e.
+    failures that do not point at a malformed command.
+    """
+    return any(pattern.search(result) is not None for pattern in BENIGN_ERRORS)
+
+
+def execute_commands(commands: list, rcon_host: str, rcon_port: int, rcon_pass: str) -> list:
+    """Run each command over a single RCON connection and collect the outcomes.
+
+    Returns a list of dicts, one per attempted command, with the keys
+    "command", "result" (reply or exception text, cut to 200 characters),
+    "success", "benign_error" and "real_error". If an exception escapes the
+    connection block itself, a placeholder entry whose command is
+    "(connection failed)" is appended instead of raising.
+    """
+
+    def failure(command, exc):
+        # Shared shape for anything that raised: always counted as a real error.
+        return {
+            "command": command,
+            "result": str(exc)[:200],
+            "success": False,
+            "benign_error": False,
+            "real_error": True,
+        }
+
+    outcomes = []
+    try:
+        with MCRcon(rcon_host, rcon_pass, port=rcon_port) as rcon:
+            for cmd in commands:
+                try:
+                    reply = rcon.command(cmd)
+                    is_real = is_real_error(reply)
+                    is_benign = is_benign_error(reply)
+                    outcomes.append({
+                        "command": cmd,
+                        "result": reply[:200],
+                        # Benign replies still count as success.
+                        "success": not is_real,
+                        "benign_error": is_benign,
+                        "real_error": is_real,
+                    })
+                    # Short pause after each command that completed.
+                    time.sleep(0.2)
+                except Exception as exc:
+                    outcomes.append(failure(cmd, exc))
+    except Exception as exc:
+        outcomes.append(failure("(connection failed)", exc))
+    return outcomes
+
+
+def main():
+    """Validate distilled examples over RCON and write the tagged results.
+
+    Reads the JSONL file given by --input, optionally keeps only the first
+    --max examples, and for each example runs its output.commands through
+    execute_commands(). Every example is tagged with an "rcon_validation"
+    entry whose status is "no_commands", "valid", "partial" or "invalid",
+    then all examples are written as JSONL to --output and a summary is
+    printed.
+
+    With --dry-run only the command count is printed and nothing is executed.
+    Exits with status 1 when the initial RCON connectivity check fails.
+    """
+    parser = argparse.ArgumentParser(description="Validate distilled responses via RCON")
+    parser.add_argument("--input", default=str(DISTILLED))
+    parser.add_argument("--output", default=str(OUTPUT))
+    parser.add_argument("--rcon-host", default="192.168.0.244")
+    parser.add_argument("--rcon-port", type=int, default=25578)
+    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
+    parser.add_argument("--max", type=int, default=0, help="Max examples to process (0=all)")
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--reset-between", action="store_true", default=True,
+                        help="Clear effects between examples")
+    # --reset-between defaults to True, so on its own it could never be turned
+    # off; this companion flag makes the option actually controllable.
+    parser.add_argument("--no-reset-between", dest="reset_between", action="store_false",
+                        help="Do not clear effects between examples")
+    args = parser.parse_args()
+
+    # Explicit UTF-8: the output is written with ensure_ascii=False, so relying
+    # on the locale's default encoding can fail on non-ASCII text.
+    with open(args.input, encoding="utf-8") as f:
+        examples = [json.loads(line) for line in f if line.strip()]
+
+    if args.max > 0:
+        examples = examples[:args.max]
+
+    print(f"Validating {len(examples)} distilled examples")
+    print(f"RCON: {args.rcon_host}:{args.rcon_port}")
+    print(f"Output: {args.output}")
+
+    if args.dry_run:
+        total_cmds = sum(len(ex.get("output", {}).get("commands", [])) for ex in examples)
+        print(f"\n[DRY RUN] Would execute {total_cmds} commands across {len(examples)} examples")
+        return
+
+    # Test RCON once up front so a bad host/password aborts before any work.
+    try:
+        with MCRcon(args.rcon_host, args.rcon_pass, port=args.rcon_port) as rcon:
+            print(f"RCON OK: {rcon.command('list')}")
+    except Exception as e:
+        print(f"RCON FAILED: {e}")
+        sys.exit(1)
+
+    validated = []
+    stats = {"total": 0, "all_success": 0, "partial": 0, "all_fail": 0, "no_cmds": 0,
+             "real_errors": 0, "benign_errors": 0}
+
+    for i, ex in enumerate(examples):
+        commands = ex.get("output", {}).get("commands", [])
+        user_message = ex.get("input", {}).get("user_message", "")
+        # Truncated copy is for display only; mode detection looks at the whole
+        # message so a "pray" past the 50th character is not missed.
+        msg = user_message[:50]
+        mode = "god" if "pray" in user_message.lower() or ex.get("source") == "prayer_log" else "sudo"
+
+        stats["total"] += 1
+
+        if not commands:
+            stats["no_cmds"] += 1
+            # Still valid — refusal or info-only response
+            ex["rcon_validation"] = {"status": "no_commands", "results": []}
+            validated.append(ex)
+            print(f"  [{i+1}/{len(examples)}] ({mode}) {msg:50} [no cmds — kept]")
+            continue
+
+        # Execute
+        results = execute_commands(commands, args.rcon_host, args.rcon_port, args.rcon_pass)
+
+        real_errors = sum(1 for r in results if r["real_error"])
+        benign = sum(1 for r in results if r["benign_error"])
+        successes = sum(1 for r in results if r["success"])
+
+        stats["real_errors"] += real_errors
+        stats["benign_errors"] += benign
+
+        if real_errors == 0:
+            stats["all_success"] += 1
+            status = "valid"
+        elif real_errors < len(results):
+            stats["partial"] += 1
+            status = "partial"
+        else:
+            stats["all_fail"] += 1
+            status = "invalid"
+
+        # Tag the example with validation results
+        ex["rcon_validation"] = {
+            "status": status,
+            "results": results,
+            "real_errors": real_errors,
+            "benign_errors": benign,
+            "successes": successes,
+        }
+        validated.append(ex)
+
+        flag = ""
+        if real_errors > 0:
+            flag = f" [FAIL:{real_errors}]"
+            # Show first real error
+            first_bad = next((r for r in results if r["real_error"]), None)
+            if first_bad is not None:
+                flag += f" {first_bad['command'][:30]}→{first_bad['result'][:40]}"
+        elif benign > 0:
+            flag = f" [benign:{benign}]"
+
+        print(f"  [{i+1}/{len(examples)}] ({mode}) {msg:50} [{successes}/{len(results)} ok]{flag}")
+
+        # Reset effects between examples
+        if args.reset_between and mode == "god":
+            try:
+                with MCRcon(args.rcon_host, args.rcon_pass, port=args.rcon_port) as rcon:
+                    rcon.command("effect clear @a")
+            except Exception as e:
+                # Best-effort cleanup: report and carry on, but let
+                # KeyboardInterrupt/SystemExit propagate (a bare except hid them).
+                print(f"  [warn] effect clear failed: {e}")
+            time.sleep(0.5)
+
+        time.sleep(0.3)
+
+    # Write output
+    with open(args.output, "w", encoding="utf-8") as f:
+        for ex in validated:
+            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+
+    print(f"\n{'='*60}")
+    print("Validation complete")
+    print(f"  Total:          {stats['total']}")
+    print(f"  All valid:      {stats['all_success']} ({stats['all_success']/max(stats['total'],1)*100:.1f}%)")
+    print(f"  Partial:        {stats['partial']}")
+    print(f"  All failed:     {stats['all_fail']}")
+    print(f"  No commands:    {stats['no_cmds']}")
+    print(f"  Real errors:    {stats['real_errors']}")
+    print(f"  Benign errors:  {stats['benign_errors']} (would work with player online)")
+    print(f"  Output:         {args.output}")
+
+
+# Run the validation only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()