#!/usr/bin/env python3
"""
Clean training data — fix known bad patterns before 0.6.0 training.

Fixes:
  - @s selector → @p (RCON has no executor entity)
  - Leading slash on commands
  - Template commands (remove entire example)
  - Old NBT enchant syntax (NOTE: a counter exists in ``stats`` but no
    rewrite for it is implemented in this file)
  - fill with trailing count
  - Generic bed/log → specific variants
  - steak → cooked_beef

Usage:
    python3 training/scripts/clean_training_data.py
"""

import json
import re
import sys
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent

FILES = [
    PROJECT_ROOT / "data" / "processed" / "seed_dataset.jsonl",
    PROJECT_ROOT / "data" / "processed" / "tool_training.jsonl",
    PROJECT_ROOT / "data" / "processed" / "tool_training_v05.jsonl",
    PROJECT_ROOT / "data" / "processed" / "filtered_exploration.jsonl",
]

# Running counters, mutated by the functions below and reported by main().
stats = {
    "files_processed": 0,
    "examples_in": 0,
    "examples_out": 0,
    "removed_template": 0,
    "fixed_at_s": 0,
    "fixed_leading_slash": 0,
    "fixed_old_nbt": 0,  # never incremented anywhere in this file
    "fixed_fill_count": 0,
    "fixed_generic_items": 0,
    "skipped_malformed": 0,
}

# Keys whose value is a list of command strings.
_COMMAND_LIST_KEYS = ("commands", "commands_generated", "commands_executed")

# "@s" only when it is the whole selector name: "@s", "@s[...]", "@s " match,
# but "@steve" or "@something" do not.
_SELF_SELECTOR_RE = re.compile(r"@s(?!\w)")

# Generic ids and their concrete replacements.  The negative lookahead keeps
# longer ids that merely share the prefix (e.g. minecraft:bedrock) untouched.
_GENERIC_ITEM_FIXES = (
    (re.compile(r"minecraft:bed(?!\w)"), "minecraft:white_bed"),
    (re.compile(r"minecraft:log(?!\w)"), "minecraft:oak_log"),
    (re.compile(r"minecraft:steak(?!\w)"), "minecraft:cooked_beef"),
)

# "fill ... minecraft:<block>[states] <count>" — the trailing count is bogus.
_FILL_TRAILING_COUNT_RE = re.compile(
    r'^(fill .+ minecraft:\w+(?:\[.*?\])?)\s+\d+$'
)

_TEMPLATE_MARKERS = ("template search", "template pick", "template build")


def fix_command(cmd: str, player: str = "slingshooter08") -> str:
    """Fix a single command string.

    Applies, in order: leading-slash removal, ``@s`` → ``@p``, generic item
    id replacement, and removal of a trailing count on ``fill`` commands.
    Each fix that changes the command bumps its counter in ``stats`` once.

    Args:
        cmd: The command to clean. Non-string values are returned unchanged.
        player: Unused; kept so existing callers keep working.

    Returns:
        The cleaned command (or ``cmd`` itself when it is not a string).
    """
    if not isinstance(cmd, str):
        return cmd

    # Leading slash
    if cmd.startswith("/"):
        cmd = cmd[1:]
        stats["fixed_leading_slash"] += 1

    # @s → @p (RCON has no executor)
    cmd, hits = _SELF_SELECTOR_RE.subn("@p", cmd)
    if hits:
        stats["fixed_at_s"] += 1

    # Generic items
    for pattern, replacement in _GENERIC_ITEM_FIXES:
        cmd, hits = pattern.subn(replacement, cmd)
        if hits:
            stats["fixed_generic_items"] += 1

    # Fill with trailing count (e.g. "fill ... minecraft:stone 1")
    match = _FILL_TRAILING_COUNT_RE.match(cmd)
    if match:
        cmd = match.group(1)
        stats["fixed_fill_count"] += 1

    return cmd


def fix_commands_in_obj(obj):
    """Recursively fix commands in any dict/list structure.

    Handling by position in the structure:
      * values under ``commands`` / ``commands_generated`` /
        ``commands_executed`` that are lists: every element goes through
        ``fix_command``; non-list values are kept as they are;
      * a string under ``command``: goes through ``fix_command``;
      * a string under ``content``: ``@s`` → ``@p`` only when the text
        mentions "rcon" (case-insensitive);
      * any other string: ``@s`` → ``@p``;
      * lists and other dict values: processed recursively;
      * everything else: returned unchanged.

    Returns a new structure; the input is not mutated.
    """
    if isinstance(obj, str):
        # Fix @s in any string content (including tool call JSON)
        return _SELF_SELECTOR_RE.sub("@p", obj)

    if isinstance(obj, list):
        return [fix_commands_in_obj(item) for item in obj]

    if isinstance(obj, dict):
        result = {}
        for key, value in obj.items():
            if key in _COMMAND_LIST_KEYS:
                if isinstance(value, list):
                    result[key] = [fix_command(c) for c in value]
                else:
                    result[key] = value
            elif key == "command" and isinstance(value, str):
                result[key] = fix_command(value)
            elif key == "content" and isinstance(value, str):
                # Fix @s in message content (tool calls, system prompts)
                if "rcon" in value.lower():
                    value = _SELF_SELECTOR_RE.sub("@p", value)
                result[key] = value
            else:
                result[key] = fix_commands_in_obj(value)
        return result

    return obj


def has_template_commands(obj) -> bool:
    """Check if this example contains template commands.

    The example is serialized to JSON and searched case-insensitively for
    "template search", "template pick" or "template build".
    """
    text = json.dumps(obj).lower()
    return any(marker in text for marker in _TEMPLATE_MARKERS)


def process_file(path: Path):
    """Clean one JSONL file in place.

    Blank lines are ignored. Lines that are not valid JSON are dropped, but
    each one is reported and counted in ``stats["skipped_malformed"]`` so the
    loss is visible. Examples containing template commands are removed; all
    others are passed through ``fix_commands_in_obj``.

    The result is written to a sibling ``<name>.tmp`` file and then renamed
    over the original, so an interrupted run cannot leave a truncated
    dataset behind. A missing file is reported and skipped.
    """
    if not path.exists():
        print(f" SKIP: {path.name} (not found)")
        return

    examples = []
    # Explicit UTF-8: the output is written with ensure_ascii=False, so the
    # locale default encoding must not be relied on for either direction.
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            if not line.strip():
                continue
            try:
                examples.append(json.loads(line))
            except json.JSONDecodeError as err:
                stats["skipped_malformed"] += 1
                print(
                    f" WARN: {path.name}:{lineno}: "
                    f"dropping malformed JSON line ({err.msg})"
                )

    stats["examples_in"] += len(examples)
    stats["files_processed"] += 1

    cleaned = []
    for ex in examples:
        # Remove template command examples entirely
        if has_template_commands(ex):
            stats["removed_template"] += 1
            continue
        # Fix all commands recursively
        cleaned.append(fix_commands_in_obj(ex))

    stats["examples_out"] += len(cleaned)

    # Write back via a temp file + rename rather than truncating in place.
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(ex, ensure_ascii=False) + "\n" for ex in cleaned
        )
    tmp_path.replace(path)

    removed = len(examples) - len(cleaned)
    print(f" {path.name}: {len(examples)} → {len(cleaned)} ({removed} removed)")


def main():
    """Clean every file in ``FILES`` and print a summary of ``stats``."""
    print("Cleaning training data...\n")
    for path in FILES:
        process_file(path)

    print(f"\n{'='*50}")
    print(f"Files processed: {stats['files_processed']}")
    print(
        f"Examples: {stats['examples_in']} → {stats['examples_out']} "
        f"({stats['examples_in'] - stats['examples_out']} removed)"
    )
    print("\nFixes applied:")
    print(f" @s → @p: {stats['fixed_at_s']}")
    print(f" Leading slash: {stats['fixed_leading_slash']}")
    print(f" Template removed: {stats['removed_template']}")
    print(f" Fill trailing count: {stats['fixed_fill_count']}")
    print(f" Generic items: {stats['fixed_generic_items']}")
    print(f" Malformed lines skipped: {stats['skipped_malformed']}")


if __name__ == "__main__":
    main()