Swarm bots, RCON validation, Haiku distillation complete
Swarm bots (ingame/swarm_bots.js):
- 10 survival bots with generated names (SwiftWolf, DarkWolf, etc.)
- All bots wander, take damage, auto-respawn, pray when hurt
- Gemini + Dolphin(5%) + Multilingual(3%) prompt generation
- 20-60s interaction interval per bot

Distillation results:
- 222 sudo examples via Haiku ($0.28)
- 122 god examples via Haiku ($0.37) — with God Soul personality
- Total: 344 distilled, $0.65 spent of $5 budget
- RCON validation: 74.7% fully valid, 30 real errors out of ~1000 commands

validate_distilled.py:
- Executes distilled commands on live server via RCON
- Distinguishes real errors from benign (no player online)
- Tags each example with validation status

Dev server switched to Claude Haiku via Anthropic API:
- llm_provider: anthropic with $5 budget cap
- Auto-fallback to Ollama when budget exhausted
- Cost tracking with logging

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,235 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
validate_distilled.py — Execute distilled Claude responses on a live server via RCON.
|
||||
|
||||
Takes distilled.jsonl, executes each example's commands on the dev server,
|
||||
captures RCON results, and writes validated training pairs to output.
|
||||
|
||||
This creates the strongest training signal: input → Claude's commands → actual server result.
|
||||
|
||||
Usage:
|
||||
python3 training/scripts/validate_distilled.py # run all
|
||||
python3 training/scripts/validate_distilled.py --dry-run # preview
|
||||
python3 training/scripts/validate_distilled.py --max 10 # first 10 only
|
||||
python3 training/scripts/validate_distilled.py --rcon-host 192.168.0.244 --rcon-port 25578
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from mcrcon import MCRcon
|
||||
|
||||
# Three directory levels above this script file.
ROOT = Path(__file__).resolve().parent.parent.parent
_PROCESSED_DIR = ROOT / "data" / "processed"
DISTILLED = _PROCESSED_DIR / "distilled.jsonl"
OUTPUT = _PROCESSED_DIR / "validated_distilled.jsonl"

# RCON error patterns: reply fragments treated as a rejected command.
# Matched case-insensitively anywhere in the reply.
_RCON_ERROR_SOURCES = (
    r'Unknown or incomplete command',
    r'Incorrect argument',
    r'Expected .+ at position',
    r'Unknown item',
    r'Unknown item component',
    r'Invalid or unknown',
    r"Can't find element",
    r'Expected whitespace',
)
RCON_ERRORS = [re.compile(source, re.I) for source in _RCON_ERROR_SOURCES]

# Expected "failures" that are actually fine (no player online, no entity,
# unloaded chunks).
_BENIGN_ERROR_SOURCES = (
    r'No player was found',
    r'No entity was found',
    r'That position is not loaded',
)
BENIGN_ERRORS = [re.compile(source, re.I) for source in _BENIGN_ERROR_SOURCES]
|
||||
|
||||
|
||||
def is_real_error(result: str) -> bool:
    """Check if an RCON result is a real syntax/command error.

    A reply is a real error only when it matches at least one pattern in
    ``RCON_ERRORS`` and matches none of the patterns in ``BENIGN_ERRORS``
    (for example, a reply that only complains about a missing player).

    Args:
        result: Text returned by the server for a single command.

    Returns:
        True if the reply is a real error, False otherwise.
    """
    if not any(pattern.search(result) for pattern in RCON_ERRORS):
        return False
    # A benign marker anywhere in the reply overrides the error match.
    return not any(pattern.search(result) for pattern in BENIGN_ERRORS)
|
||||
|
||||
|
||||
def is_benign_error(result: str) -> bool:
    """Check if an RCON result is a benign error (would work with a player online).

    Args:
        result: Text returned by the server for a single command.

    Returns:
        True if the reply matches any pattern in ``BENIGN_ERRORS``.
    """
    return any(pattern.search(result) for pattern in BENIGN_ERRORS)
|
||||
|
||||
|
||||
def _command_record(command: str, result: str, real_error: bool, benign_error: bool) -> dict:
    """Build one per-command result record; the result text is capped at 200 chars."""
    return {
        "command": command,
        "result": result[:200],
        "success": not real_error,
        "benign_error": benign_error,
        "real_error": real_error,
    }


def execute_commands(commands: list, rcon_host: str, rcon_port: int, rcon_pass: str) -> list:
    """Execute commands via RCON and return one result dict per command.

    All commands are sent over a single RCON connection, with a 0.2 s pause
    after each successful send.

    Args:
        commands: Command strings to send, in order.
        rcon_host: RCON server host.
        rcon_port: RCON server port.
        rcon_pass: RCON password.

    Returns:
        A list of dicts with keys ``command``, ``result`` (truncated to 200
        characters), ``success``, ``benign_error`` and ``real_error``.
        ``success`` is always ``not real_error``. If a command raises, its
        record carries the exception text and is flagged as a real error.
        If the connection itself fails, a record whose ``command`` is
        ``"(connection failed)"`` is appended and the function returns.
    """
    results = []
    try:
        with MCRcon(rcon_host, rcon_pass, port=rcon_port) as rcon:
            for cmd in commands:
                try:
                    reply = rcon.command(cmd)
                    results.append(
                        _command_record(cmd, reply, is_real_error(reply), is_benign_error(reply))
                    )
                    time.sleep(0.2)
                except Exception as e:
                    # A failure on one command must not abort the remaining ones.
                    results.append(_command_record(cmd, str(e), True, False))
    except Exception as e:
        # Connecting/authenticating failed, or the connection broke on teardown.
        results.append(_command_record("(connection failed)", str(e), True, False))
    return results
|
||||
|
||||
|
||||
def main():
    """Validate distilled examples against a live server and write tagged output.

    Reads JSONL examples from ``--input``, runs each example's
    ``output.commands`` through ``execute_commands``, attaches an
    ``rcon_validation`` entry to every example, writes all examples to
    ``--output`` as JSONL, and prints a summary.

    With ``--dry-run`` only the number of commands that would be executed is
    printed. Exits with status 1 if the initial RCON connectivity check fails.
    """
    parser = argparse.ArgumentParser(description="Validate distilled responses via RCON")
    parser.add_argument("--input", default=str(DISTILLED))
    parser.add_argument("--output", default=str(OUTPUT))
    parser.add_argument("--rcon-host", default="192.168.0.244")
    parser.add_argument("--rcon-port", type=int, default=25578)
    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
    parser.add_argument("--max", type=int, default=0, help="Max examples to process (0=all)")
    parser.add_argument("--dry-run", action="store_true")
    # The reset is on by default; --no-reset-between is the only way to turn it
    # off (a store_true flag that defaults to True could never be disabled).
    parser.add_argument("--reset-between", dest="reset_between", action="store_true", default=True,
                        help="Clear effects between examples")
    parser.add_argument("--no-reset-between", dest="reset_between", action="store_false",
                        help="Do not clear effects between examples")
    args = parser.parse_args()

    # Explicit UTF-8: the output is written with ensure_ascii=False, so the
    # platform default encoding must not be relied on.
    with open(args.input, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]

    if args.max > 0:
        examples = examples[:args.max]

    print(f"Validating {len(examples)} distilled examples")
    print(f"RCON: {args.rcon_host}:{args.rcon_port}")
    print(f"Output: {args.output}")

    if args.dry_run:
        total_cmds = sum(len(ex.get("output", {}).get("commands", [])) for ex in examples)
        print(f"\n[DRY RUN] Would execute {total_cmds} commands across {len(examples)} examples")
        return

    # Test RCON
    try:
        with MCRcon(args.rcon_host, args.rcon_pass, port=args.rcon_port) as rcon:
            print(f"RCON OK: {rcon.command('list')}")
    except Exception as e:
        print(f"RCON FAILED: {e}")
        sys.exit(1)

    validated = []
    stats = {"total": 0, "all_success": 0, "partial": 0, "all_fail": 0, "no_cmds": 0,
             "real_errors": 0, "benign_errors": 0}

    for i, ex in enumerate(examples):
        commands = ex.get("output", {}).get("commands", [])
        # `or ""` guards against an explicit null user_message in the input.
        message = ex.get("input", {}).get("user_message", "") or ""
        preview = message[:50]
        # Detect the mode from the full message, not the 50-character preview.
        is_prayer = "pray" in message.lower() or ex.get("source") == "prayer_log"
        mode = "god" if is_prayer else "sudo"

        stats["total"] += 1

        if not commands:
            stats["no_cmds"] += 1
            # Still valid — refusal or info-only response
            ex["rcon_validation"] = {"status": "no_commands", "results": []}
            validated.append(ex)
            print(f" [{i+1}/{len(examples)}] ({mode}) {preview:50} [no cmds — kept]")
            continue

        # Execute
        # NOTE(review): a failed connection comes back as a single record
        # flagged real_error, so the example is tagged "invalid" — confirm
        # that is the intended treatment of infrastructure failures.
        results = execute_commands(commands, args.rcon_host, args.rcon_port, args.rcon_pass)

        real_errors = sum(1 for r in results if r["real_error"])
        benign = sum(1 for r in results if r["benign_error"])
        successes = sum(1 for r in results if r["success"])

        stats["real_errors"] += real_errors
        stats["benign_errors"] += benign

        if real_errors == 0:
            stats["all_success"] += 1
            status = "valid"
        elif real_errors < len(results):
            stats["partial"] += 1
            status = "partial"
        else:
            stats["all_fail"] += 1
            status = "invalid"

        # Tag the example with validation results
        ex["rcon_validation"] = {
            "status": status,
            "results": results,
            "real_errors": real_errors,
            "benign_errors": benign,
            "successes": successes,
        }
        validated.append(ex)

        flag = ""
        if real_errors > 0:
            flag = f" [FAIL:{real_errors}]"
            # Show first real error
            first_bad = next((r for r in results if r["real_error"]), None)
            if first_bad is not None:
                flag += f" {first_bad['command'][:30]}→{first_bad['result'][:40]}"
        elif benign > 0:
            flag = f" [benign:{benign}]"

        print(f" [{i+1}/{len(examples)}] ({mode}) {preview:50} [{successes}/{len(results)} ok]{flag}")

        # Reset effects between examples
        if args.reset_between and mode == "god":
            try:
                with MCRcon(args.rcon_host, args.rcon_pass, port=args.rcon_port) as rcon:
                    rcon.command("effect clear @a")
            except Exception as e:
                # Best-effort cleanup: report and carry on. Catching Exception
                # (not a bare except) keeps Ctrl-C working.
                print(f" (warning: effect reset failed: {e})")
            time.sleep(0.5)

        time.sleep(0.3)

    # Write output
    with open(args.output, "w", encoding="utf-8") as f:
        for ex in validated:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    print(f"\n{'='*60}")
    print("Validation complete")
    print(f" Total: {stats['total']}")
    print(f" All valid: {stats['all_success']} ({stats['all_success']/max(stats['total'],1)*100:.1f}%)")
    print(f" Partial: {stats['partial']}")
    print(f" All failed: {stats['all_fail']}")
    print(f" No commands: {stats['no_cmds']}")
    print(f" Real errors: {stats['real_errors']}")
    print(f" Benign errors: {stats['benign_errors']} (would work with player online)")
    print(f" Output: {args.output}")
|
||||
|
||||
|
||||
# Run the validator only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user