#!/usr/bin/env python3
"""
self_play.py — Self-play training data generator.

The fine-tuned model generates its own training data by:
  1. Generating diverse edge-case prompts it's uncertain about
  2. Attempting commands via RCON
  3. Self-correcting on errors
  4. Saving successful sequences as training examples

This creates a closed-loop learning system with RCON as ground truth.
No API cost — runs entirely on the local model.

Usage:
    python3 training/scripts/self_play.py --rounds 100 --model qwen3.5-9b-mc-v4
    python3 training/scripts/self_play.py --rounds 50 --dry-run
    python3 training/scripts/self_play.py --rounds 200 --focus enchantments
"""

import argparse
import json
import os
import re
import random
import sys
import time
from pathlib import Path

try:
    import requests
except ImportError:  # keeps the pure helpers importable; main() refuses to run without it
    requests = None

ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))

OUTPUT = ROOT / "data" / "processed" / "self_play.jsonl"


# --- RCON ---

# NOTE(review): RCON carries no status code, so success has to be inferred from
# the response text. The reviewed copy of this file had lost the original
# classification logic; this marker list is an assumption based on common
# Minecraft error messages — confirm against the original implementation.
_RCON_ERROR_MARKERS = (
    "unknown or incomplete command",
    "incorrect argument",
    "expected ",
    "invalid ",
    "unknown item",
    "unknown block",
    "unknown entity",
    "unknown enchantment",
    "unknown effect",
    "could not parse",
    "no player was found",
    "no entity was found",
    "<--[here]",
)

# Source RCON packet types.
_RCON_TYPE_AUTH = 3
_RCON_TYPE_COMMAND = 2


def rcon_command(cmd, host, port, password):
    """Execute via RCON, return (success, result_text).

    Opens a fresh connection, authenticates, sends ``cmd`` and reads a single
    response packet. Connection, protocol and authentication problems are
    reported as ``(False, "<description>")`` rather than raised.

    NOTE(review): everything after the first ``struct.pack`` call was missing
    from the reviewed copy of this function. The body below follows the
    standard Source RCON framing (little-endian length / request id / type,
    NUL-terminated payload) — verify it against the original before relying
    on it.
    """
    import socket
    import struct

    try:
        # Context manager guarantees the socket is closed on every path.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(5)
            s.connect((host, port))

            def send(rid, ptype, payload):
                data = struct.pack("<ii", rid, ptype) + payload.encode("utf-8") + b"\x00\x00"
                s.sendall(struct.pack("<i", len(data)) + data)

            def recv_exact(count):
                chunks = []
                remaining = count
                while remaining > 0:
                    chunk = s.recv(remaining)
                    if not chunk:
                        raise ConnectionError("RCON connection closed mid-packet")
                    chunks.append(chunk)
                    remaining -= len(chunk)
                return b"".join(chunks)

            def recv():
                (length,) = struct.unpack("<i", recv_exact(4))
                # 10 = request id + type + two NUL terminators; the upper bound
                # only protects against reading garbage as a length.
                if length < 10 or length > 1 << 16:
                    raise ValueError(f"unexpected RCON packet length {length}")
                body = recv_exact(length)
                rid, ptype = struct.unpack("<ii", body[:8])
                return rid, ptype, body[8:-2].decode("utf-8", errors="replace")

            send(1, _RCON_TYPE_AUTH, password)
            auth_id, _, _ = recv()
            if auth_id == -1:
                return False, "RCON authentication failed"

            send(2, _RCON_TYPE_COMMAND, cmd)
            _, _, text = recv()
    except (OSError, struct.error, ValueError, AttributeError) as exc:
        # AttributeError covers a non-string command/password reaching .encode().
        return False, f"RCON error: {exc}"

    text = text.strip()
    lowered = text.lower()
    failed = any(marker in lowered for marker in _RCON_ERROR_MARKERS)
    return (not failed), text


# --- LLM ---

def llm_call(model, system, user, ollama_url, temperature=0.3, max_tokens=300, fmt=None):
    """Send one system+user exchange to the model and return its text reply.

    Reasoning blocks wrapped in ``<think>`` tags are removed and the result is
    stripped of surrounding whitespace. Raises on HTTP/transport errors.

    NOTE(review): only the final ``re.sub(...)`` / ``return content.strip()``
    of this function survived in the reviewed copy. The signature is taken
    from the call sites in this file; the default values, the Ollama
    ``/api/chat`` request shape and the ``<think>`` tag name are assumptions —
    confirm against the original.
    """
    if requests is None:
        raise RuntimeError("the 'requests' package is required for LLM calls")

    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        "stream": False,
        "options": {"temperature": temperature, "num_predict": max_tokens},
    }
    if fmt:
        payload["format"] = fmt

    resp = requests.post(f"{ollama_url.rstrip('/')}/api/chat", json=payload, timeout=120)
    resp.raise_for_status()
    content = (resp.json().get("message") or {}).get("content") or ""
    content = re.sub(r'<think>[\s\S]*?</think>\s*', '', content)
    return content.strip()


# --- Prompt generation categories ---

# Shared closing sentences of the category instructions.
_RETURN_JSON_ARRAY = "Return a JSON array of strings."
_START_SUDO = 'Every message must start with "sudo ". ' + _RETURN_JSON_ARRAY
_START_SUDO_OR_PRAY = 'Every message must start with "sudo " or "pray ". ' + _RETURN_JSON_ARRAY

# Each category maps to the instruction given to the model when it is asked to
# invent test prompts, plus the sampling temperature used for that request.
EXPLORATION_CATEGORIES = {
    "enchantment_combos": {
        "instruction": (
            "Generate 5 Minecraft chat messages that test unusual or edge-case "
            "enchantment combinations. "
            "Include: mutually exclusive enchants, max level exceeded, enchants on "
            "wrong items, multi-enchant syntax. "
            + _START_SUDO_OR_PRAY
        ),
        "temperature": 1.0,
    },
    "entity_nbt": {
        "instruction": (
            "Generate 5 Minecraft chat messages that test entity spawning with "
            "unusual NBT data. "
            "Include: custom names, baby variants, colored sheep, armored mobs, "
            "riding/passengers, powered creepers. "
            + _START_SUDO_OR_PRAY
        ),
        "temperature": 1.0,
    },
    "execute_chains": {
        "instruction": (
            "Generate 5 Minecraft chat messages that require complex execute "
            "command chains. "
            "Include: nested execute, conditional execution, store results, "
            "dimension switching, targeting by gamemode/team. "
            + _START_SUDO
        ),
        "temperature": 1.0,
    },
    "edge_items": {
        "instruction": (
            "Generate 5 Minecraft chat messages requesting obscure or "
            "easily-confused items. "
            "Include: items with color variants, items that changed names between "
            "versions, items with underscores, items people misspell "
            '(like "wooden_sword" vs "wood_sword", "cooked_beef" vs "steak"). '
            + _START_SUDO
        ),
        "temperature": 1.0,
    },
    "worldedit": {
        "instruction": (
            "Generate 5 Minecraft chat messages requesting WorldEdit operations. "
            "Include: shapes, selections, replacements, brushes, stacking, "
            "clipboard operations. "
            + _START_SUDO
        ),
        "temperature": 1.0,
    },
    "multiplayer": {
        "instruction": (
            "Generate 5 Minecraft chat messages involving multiple players or "
            "complex selectors. "
            "Include: @a with exclusions, team commands, scoreboard operations, "
            "targeting by distance/gamemode. "
            "Use player names like: slingshooter08, SwiftWolf, DarkWolf, BraveWolf. "
            + _START_SUDO
        ),
        "temperature": 1.0,
    },
    "boundary_testing": {
        "instruction": (
            "Generate 5 Minecraft chat messages that test safety boundaries. "
            "Include: requests for forbidden items, mass destruction, OP commands, "
            "but also requests that SEEM dangerous but are actually fine "
            "(like giving TNT to yourself, or killing your own mobs). "
            + _START_SUDO_OR_PRAY
        ),
        "temperature": 1.0,
    },
    "natural_language": {
        "instruction": (
            "Generate 5 Minecraft chat messages phrased in unusual or creative "
            "natural language. "
            "Include: typos, slang, roleplay, indirect requests, questions, "
            "sarcasm, mixed languages. "
            "The AI should still be able to figure out what the player wants. "
            + _START_SUDO_OR_PRAY
        ),
        "temperature": 1.2,
    },
    "cosmetic_effects": {
        "instruction": (
            "Generate 5 Minecraft chat messages requesting cosmetic or dramatic "
            "effects. "
            "Include: particles, sounds, titles, tellraw formatting, fireworks, "
            "combination effects. "
            + _START_SUDO_OR_PRAY
        ),
        "temperature": 1.0,
    },
}

# System prompts
# NOTE(review): "effect give minecraft: ." looks like it lost angle-bracket
# placeholders somewhere upstream; the text is kept exactly as found — restore
# the placeholders from the original if they existed.
SUDO_SYSTEM = (
    "You are a Minecraft 1.21 command translator. "
    'Return JSON: {"commands": ["cmd1", ...], "reasoning": "why"} '
    "Commands use minecraft: prefix. "
    "Enchantments: item[enchantments={name:level}]. "
    "Effects: effect give minecraft: . "
    "Do NOT start commands with /. "
    "Player name: slingshooter08."
)

GOD_SYSTEM = (
    "You are God in a Minecraft server. "
    'Return JSON: {"message": "dramatic response", "commands": ["cmd1", ...], '
    '"reasoning": "why"} '
    "Commands use minecraft: prefix. "
    "Be dramatic but use valid 1.21 syntax. "
    "Player: slingshooter08."
)

RETRY_SYSTEM = (
    "You are a Minecraft 1.21 command translator. "
    "Your previous command failed with an error. "
    "Analyze the error and return a corrected command. "
    'Return JSON: {"commands": ["corrected_cmd"], '
    '"reasoning": "what was wrong and how you fixed it"}'
)


def generate_prompts(model, ollama_url, category=None):
    """Use the model to generate edge-case prompts for itself.

    Args:
        model: Model name passed through to ``llm_call``.
        ollama_url: Base URL passed through to ``llm_call``.
        category: Key of ``EXPLORATION_CATEGORIES`` to restrict generation to;
            when falsy, up to three categories are sampled at random.

    Returns:
        A list of ``{"prompt": str, "category": str}`` dicts. A category whose
        generation or parsing fails is reported on stdout and contributes
        nothing; it does not abort the other categories.

    Raises:
        KeyError: If ``category`` is given but is not a known category.
    """
    if category:
        cats = {category: EXPLORATION_CATEGORIES[category]}
    else:
        # Pick up to 3 random categories per round
        keys = random.sample(list(EXPLORATION_CATEGORIES), min(3, len(EXPLORATION_CATEGORIES)))
        cats = {k: EXPLORATION_CATEGORIES[k] for k in keys}

    prompts = []
    for cat_name, cat_config in cats.items():
        try:
            raw = llm_call(
                model=model,
                system="You are a Minecraft test case generator. Generate diverse edge cases for an AI training pipeline.",
                user=cat_config["instruction"],
                ollama_url=ollama_url,
                temperature=cat_config["temperature"],
                max_tokens=400,
            )
            # Parse JSON array (the model may wrap it in a markdown fence)
            cleaned = raw.replace("```json", "").replace("```", "").strip()
            match = re.search(r'\[[\s\S]*\]', cleaned)
            if not match:
                continue
            items = json.loads(match.group())
            if not isinstance(items, list):
                continue
            prompts.extend(
                {"prompt": item.strip(), "category": cat_name}
                for item in items
                if isinstance(item, str) and item.strip()
            )
        except Exception as e:  # best-effort: one bad category must not stop the round
            print(f" [!] Prompt generation failed for {cat_name}: {e}")

    return prompts


def _parse_json_object(raw):
    """Return the JSON object contained in ``raw``, or None if there is none."""
    if not raw:
        return None
    candidates = [raw]
    match = re.search(r'\{[\s\S]*\}', raw)
    if match and match.group() != raw:
        candidates.append(match.group())
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, TypeError):
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def _normalize_commands(value):
    """Coerce a model-supplied "commands" value into a list of non-empty strings."""
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, list):
        return []
    return [c for c in value if isinstance(c, str) and c.strip()]


def _run_commands(commands, rcon_host, rcon_port, rcon_pass):
    """Execute each command via RCON; return (per-command results, all_success)."""
    rcon_results = []
    for cmd in commands:
        success, rcon_result = rcon_command(cmd, rcon_host, rcon_port, rcon_pass)
        rcon_results.append({"command": cmd, "success": success, "result": rcon_result})
    return rcon_results, all(r["success"] for r in rcon_results)


def attempt_command(model, ollama_url, prompt, rcon_host, rcon_port, rcon_pass, max_retries=2):
    """
    Model generates a command for the prompt, executes via RCON.
    On error, model self-corrects up to max_retries times.
    Returns the full interaction trace.

    The trace is a dict with keys ``prompt``, ``mode`` ("god" when the prompt
    starts with "pray ", otherwise "sudo"), ``attempts`` (one record per model
    attempt), ``final_success`` and ``self_corrected``. A reply that contains
    no commands counts as success (refusal / informational answer). LLM and
    parse failures are recorded in the trace instead of being raised.
    """
    mode = "god" if prompt.lower().startswith("pray ") else "sudo"
    system = GOD_SYSTEM if mode == "god" else SUDO_SYSTEM

    trace = {
        "prompt": prompt,
        "mode": mode,
        "attempts": [],
        "final_success": False,
        "self_corrected": False,
    }

    # First attempt
    try:
        raw = llm_call(model, system, prompt, ollama_url,
                       temperature=0.3, max_tokens=300, fmt="json")
    except Exception as e:  # network/HTTP failure: record it and give up on this prompt
        trace["attempts"].append({"commands": [], "error": f"LLM failed: {e}"})
        return trace

    result = _parse_json_object(raw)
    if result is None:
        trace["attempts"].append({
            "commands": [],
            "error": "JSON parse failed: no JSON object in model output",
        })
        return trace

    commands = _normalize_commands(result.get("commands"))
    message = result.get("message") or ""
    reasoning = result.get("reasoning") or ""

    if not commands:
        trace["attempts"].append({
            "commands": [], "reasoning": reasoning, "message": message,
            "rcon_results": [], "all_success": True,
        })
        trace["final_success"] = True  # Refusal/info is valid
        return trace

    # Execute commands via RCON
    rcon_results, all_success = _run_commands(commands, rcon_host, rcon_port, rcon_pass)
    trace["attempts"].append({
        "commands": commands,
        "reasoning": reasoning,
        "message": message,
        "rcon_results": rcon_results,
        "all_success": all_success,
    })

    if all_success:
        trace["final_success"] = True
        return trace

    # Self-correction loop
    for retry in range(max_retries):
        # Build error context for the model from the most recent failures
        error_context = "\n".join(
            f"Command: {r['command']}\nError: {r['result']}"
            for r in rcon_results
            if not r["success"]
        )
        retry_prompt = f"Original request: {prompt}\n\nFailed commands:\n{error_context}\n\nPlease fix the commands."

        # A failed call must not fall back to the previous attempt's output,
        # otherwise the same failing commands would be executed again.
        try:
            raw = llm_call(model, RETRY_SYSTEM, retry_prompt, ollama_url,
                           temperature=0.2, max_tokens=300, fmt="json")
        except Exception:
            break

        result = _parse_json_object(raw)
        if result is None:
            break

        commands = _normalize_commands(result.get("commands"))
        reasoning = result.get("reasoning") or ""
        if not commands:
            break

        # Execute corrected commands
        rcon_results, all_success = _run_commands(commands, rcon_host, rcon_port, rcon_pass)
        trace["attempts"].append({
            "commands": commands,
            "reasoning": reasoning,
            "rcon_results": rcon_results,
            "all_success": all_success,
            "retry": retry + 1,
        })

        if all_success:
            trace["final_success"] = True
            trace["self_corrected"] = True
            break

    return trace


def _tool_exchange(rcon_results):
    """Build the assistant tool-call / tool-result message pairs for one attempt."""
    messages = []
    for r in rcon_results:
        call = {"name": "rcon.execute", "arguments": {"command": r["command"]}}
        # json.dumps escapes quotes/backslashes inside the command (NBT is full
        # of them); for plain commands the text is identical to a hand-built one.
        # NOTE(review): the surrounding "\n" may originally have been wrapped in
        # tool-call tags that were lost upstream — confirm the expected format.
        messages.append({
            "role": "assistant",
            "content": "\n" + json.dumps(call, ensure_ascii=False) + "\n",
        })
        messages.append({
            "role": "tool",
            "content": json.dumps({"success": r["success"], "result": r["result"]}),
        })
    return messages


def trace_to_training(trace):
    """Convert a self-play trace to training examples.

    Returns a list with at most one example:
      * a trace that succeeded in a single attempt becomes a standard
        input/output training pair;
      * a self-corrected trace (two or more attempts) becomes a multi-turn
        example showing the failed first attempt, the successful last attempt
        and the final JSON answer;
      * anything else (no attempts, or never succeeded) yields an empty list.
    """
    examples = []
    prompt = trace["prompt"]
    mode = trace["mode"]
    attempts = trace["attempts"]

    if not attempts:
        return examples

    # Single successful attempt → standard training pair
    if trace["final_success"] and len(attempts) == 1:
        att = attempts[0]
        examples.append({
            "id": f"selfplay-{int(time.time())}-{random.randint(0,999):03d}",
            "source": "self_play",
            "category": "command_gen",
            "input": {
                "user_message": prompt,
                "server_context": {"server_type": "paper", "version": "1.21.x"},
            },
            "output": {
                "reasoning": att.get("reasoning", ""),
                "commands": att.get("commands", []),
                "message": att.get("message", "") if mode == "god" else "",
                "safety_flags": [],
            },
            "metadata": {
                "difficulty": "medium",
                "validated": True,
                "risk_level": 3,
                "rcon_verified": True,
                "self_play": True,
            },
        })

    # Self-corrected → multi-turn tool-calling training example
    elif trace["self_corrected"] and len(attempts) >= 2:
        first = attempts[0]
        last = attempts[-1]

        messages = [
            {"role": "system", "content": GOD_SYSTEM if mode == "god" else SUDO_SYSTEM},
            {"role": "user", "content": prompt},
        ]
        # First attempt (failed), then the successful retry
        messages.extend(_tool_exchange(first.get("rcon_results", [])))
        messages.extend(_tool_exchange(last.get("rcon_results", [])))

        # Final response
        final_response = {
            "commands": last.get("commands", []),
            "reasoning": f"Self-corrected: {first.get('reasoning', '')} → {last.get('reasoning', '')}",
        }
        if mode == "god":
            # Retry replies carry no message, so the first attempt's one is reused.
            final_response["message"] = first.get("message", "")
        messages.append({"role": "assistant", "content": json.dumps(final_response)})

        examples.append({
            "id": f"selfplay-correction-{int(time.time())}-{random.randint(0,999):03d}",
            "source": "self_play",
            "type": "error_correction",
            "messages": messages,
            "metadata": {
                "self_play": True,
                "rcon_verified": True,
                "attempts": len(attempts),
            },
        })

    return examples


def _append_examples(output_path, examples):
    """Append examples to the JSONL output file, creating parent directories."""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: records are written with ensure_ascii=False.
    with open(output_path, "a", encoding="utf-8") as f:
        f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in examples)


def main():
    """Command-line entry point: run self-play rounds and append examples to a JSONL file.

    Examples are appended to the output file after every round (and when a
    round is interrupted), so an aborted run keeps the work done so far.
    """
    parser = argparse.ArgumentParser(description="Self-play training data generator")
    parser.add_argument("--model", default="qwen3-8b-mc-lora-v3")
    parser.add_argument("--ollama-url", default="http://192.168.0.141:11434")
    parser.add_argument("--rcon-host", default="192.168.0.244")
    parser.add_argument("--rcon-port", type=int, default=25578)
    parser.add_argument("--rcon-pass", default="REDACTED_RCON")
    parser.add_argument("--rounds", type=int, default=50)
    parser.add_argument("--focus", default=None,
                        choices=list(EXPLORATION_CATEGORIES.keys()))
    parser.add_argument("--output", default=str(OUTPUT))
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--max-retries", type=int, default=2)
    args = parser.parse_args()

    if requests is None:
        sys.exit("self_play.py requires the 'requests' package")

    print("Self-play training data generator")
    print(f" Model: {args.model}")
    print(f" RCON: {args.rcon_host}:{args.rcon_port}")
    print(f" Rounds: {args.rounds}")
    print(f" Focus: {args.focus or 'all categories'}")
    print(f" Max retries: {args.max_retries}")
    print(f" Output: {args.output}")

    output_path = Path(args.output)
    stats = {
        "rounds": 0,
        "prompts_generated": 0,
        "attempts": 0,
        "success_first_try": 0,
        "self_corrected": 0,
        "failed": 0,
        "training_examples": 0,
        "by_category": {},
    }

    try:
        for round_num in range(args.rounds):
            print(f"\n--- Round {round_num + 1}/{args.rounds} ---")

            # Generate prompts
            prompts = generate_prompts(args.model, args.ollama_url, args.focus)
            if not prompts:
                print(" No prompts generated, skipping round")
                continue

            stats["prompts_generated"] += len(prompts)
            print(f" Generated {len(prompts)} prompts")

            round_examples = []
            try:
                for p in prompts:
                    prompt = p["prompt"]
                    cat = p["category"]
                    cat_stats = stats["by_category"].setdefault(
                        cat, {"total": 0, "success": 0, "corrected": 0})
                    cat_stats["total"] += 1
                    stats["attempts"] += 1

                    print(f" [{cat}] {prompt[:60]:60}", end="", flush=True)

                    if args.dry_run:
                        print(" [DRY RUN]")
                        continue

                    trace = attempt_command(
                        args.model, args.ollama_url, prompt,
                        args.rcon_host, args.rcon_port, args.rcon_pass,
                        max_retries=args.max_retries,
                    )

                    if trace["final_success"] and not trace["self_corrected"]:
                        stats["success_first_try"] += 1
                        cat_stats["success"] += 1
                        n_cmds = len(trace["attempts"][0].get("commands", []))
                        print(f" OK ({n_cmds} cmds)")
                    elif trace["self_corrected"]:
                        stats["self_corrected"] += 1
                        cat_stats["corrected"] += 1
                        print(f" CORRECTED ({len(trace['attempts'])} attempts)")
                    else:
                        stats["failed"] += 1
                        print(" FAILED")

                    # Convert to training examples
                    examples = trace_to_training(trace)
                    round_examples.extend(examples)
                    stats["training_examples"] += len(examples)

                    # Brief pause between attempts
                    time.sleep(1)
            finally:
                # Save what this round produced even if it was cut short.
                if round_examples:
                    _append_examples(output_path, round_examples)

            stats["rounds"] += 1
    except KeyboardInterrupt:
        print("\n Interrupted")

    # Summary
    print(f"\n{'='*60}")
    print("Self-play complete")
    print(f" Rounds: {stats['rounds']}")
    print(f" Prompts generated:{stats['prompts_generated']}")
    print(f" Attempts: {stats['attempts']}")
    print(f" Success (1st try):{stats['success_first_try']}")
    print(f" Self-corrected: {stats['self_corrected']}")
    print(f" Failed: {stats['failed']}")
    print(f" Training examples:{stats['training_examples']}")

    if stats["by_category"]:
        print("\n By category:")
        for cat, s in sorted(stats["by_category"].items()):
            total = s["total"]
            ok = s["success"]
            corr = s["corrected"]
            fail = total - ok - corr
            print(f" {cat:25} total={total} ok={ok} corrected={corr} failed={fail}")

    print(f"\n Output: {args.output}")


if __name__ == "__main__":
    main()