Mortdecai/training/scripts/clean_training_data.py

#!/usr/bin/env python3
"""
Clean training data — fix known bad patterns before 0.6.0 training.

Fixes:
- @s selector → @p (RCON has no executor entity)
- Leading slash on commands
- Template commands (remove entire example)
- Old NBT enchant syntax (planned: a `fixed_old_nbt` counter exists, but no rewrite is implemented yet)
- fill with trailing count
- Generic bed/log → specific variants
- steak → cooked_beef

Usage:
    python3 training/scripts/clean_training_data.py
"""

import json
import re
import sys
from pathlib import Path

# Repository root, found by climbing three directory levels from this
# script's resolved location.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent

# Directory holding the processed JSONL datasets.
_PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

# JSONL datasets that main() cleans, in processing order.
FILES = [
    _PROCESSED_DIR / filename
    for filename in (
        "seed_dataset.jsonl",
        "tool_training.jsonl",
        "tool_training_v05.jsonl",
        "filtered_exploration.jsonl",
    )
]

# Running totals for the whole cleaning run. The functions in this module
# update these counters in place; main() prints them at the end.
stats = {
    "files_processed": 0,
    "examples_in": 0,
    "examples_out": 0,
    "removed_template": 0,
    "fixed_at_s": 0,
    "fixed_leading_slash": 0,
    # NOTE(review): nothing in this file increments or reports this counter.
    "fixed_old_nbt": 0,
    "fixed_fill_count": 0,
    "fixed_generic_items": 0,
}

# `@s` only as a complete selector token. The trailing word boundary keeps
# longer tokens such as the player mention "@slingshooter08" intact.
_SELF_SELECTOR_RE = re.compile(r"@s\b")

# Generic item IDs and their concrete replacements. The trailing word
# boundary stops "minecraft:bed" from matching inside "minecraft:bedrock"
# while still matching at end of string or before " ", "]", "[" and "{".
_GENERIC_ITEM_FIXES = (
    (re.compile(r"minecraft:bed\b"), "minecraft:white_bed"),
    (re.compile(r"minecraft:log\b"), "minecraft:oak_log"),
    (re.compile(r"minecraft:steak\b"), "minecraft:cooked_beef"),
)

# A fill command whose block argument is followed by a stray integer,
# e.g. "fill ... minecraft:stone 1"; group 1 is the command without it.
_FILL_TRAILING_COUNT_RE = re.compile(
    r'^(fill .+ minecraft:\w+(?:\[.*?\])?)\s+\d+$'
)


def fix_command(cmd: str, player: str = "slingshooter08") -> str:
    """Return *cmd* with the known bad patterns repaired.

    Fixes are applied in this order, and each one that fires bumps the
    matching counter in ``stats`` once per command:

    1. Strip a single leading ``/``.
    2. Rewrite the ``@s`` selector to ``@p`` (RCON has no executor).
    3. Replace generic item IDs (``minecraft:bed``, ``minecraft:log``,
       ``minecraft:steak``) with concrete ones; ``fixed_generic_items`` is
       bumped once per item kind found.
    4. Drop a trailing integer from a ``fill`` command.

    Args:
        cmd: The command string. Non-string values are returned unchanged.
        player: Unused; kept so existing callers that pass it keep working.

    Returns:
        The cleaned command, or *cmd* itself when it is not a string.
    """
    if not isinstance(cmd, str):
        return cmd

    # Leading slash. This runs first so "/fill ..." is seen by the fill rule.
    if cmd.startswith("/"):
        cmd = cmd[1:]
        stats["fixed_leading_slash"] += 1

    # @s -> @p, matching whole selector tokens only.
    cmd, selector_hits = _SELF_SELECTOR_RE.subn("@p", cmd)
    if selector_hits:
        stats["fixed_at_s"] += 1

    # Generic items -> specific variants.
    for pattern, replacement in _GENERIC_ITEM_FIXES:
        cmd, item_hits = pattern.subn(replacement, cmd)
        if item_hits:
            stats["fixed_generic_items"] += 1

    # Fill with trailing count (e.g. "fill ... minecraft:stone 1").
    fill_match = _FILL_TRAILING_COUNT_RE.match(cmd)
    if fill_match:
        cmd = fill_match.group(1)
        stats["fixed_fill_count"] += 1

    return cmd


def fix_commands_in_obj(obj):
    """Return a cleaned copy of a decoded JSON value, walking it recursively.

    Rules by value type and dict key:

    * Plain strings: the ``@s`` selector becomes ``@p``. Only the whole
      token is matched, so mentions such as ``@slingshooter08`` are kept.
    * Lists: every item is cleaned recursively.
    * Dict key ``commands`` / ``commands_generated`` / ``commands_executed``
      holding a list: string items go through ``fix_command``; other items
      (e.g. nested dicts) are cleaned recursively instead of being skipped.
    * Dict key ``command`` holding a string: goes through ``fix_command``.
    * Dict key ``content`` holding a string: ``@s`` is rewritten only when
      the text mentions "rcon" (case-insensitive).
    * Anything else under a dict key is cleaned recursively.

    Values that are not str, list or dict are returned unchanged.
    """
    if isinstance(obj, str):
        # Fix @s in any string content (including tool call JSON).
        return re.sub(r"@s\b", "@p", obj)
    if isinstance(obj, list):
        return [fix_commands_in_obj(item) for item in obj]
    if not isinstance(obj, dict):
        return obj

    command_list_keys = ("commands", "commands_generated", "commands_executed")
    result = {}
    for k, v in obj.items():
        if k in command_list_keys and isinstance(v, list):
            result[k] = [
                fix_command(c) if isinstance(c, str) else fix_commands_in_obj(c)
                for c in v
            ]
        elif k == "command" and isinstance(v, str):
            result[k] = fix_command(v)
        elif k == "content" and isinstance(v, str):
            # Message content (tool calls, system prompts) is only touched
            # when it is about RCON.
            if "rcon" in v.lower():
                v = re.sub(r"@s\b", "@p", v)
            result[k] = v
        else:
            result[k] = fix_commands_in_obj(v)
    return result


def has_template_commands(obj) -> bool:
    """Return True if the example mentions any template command.

    The example is serialized to JSON and lower-cased, then searched for the
    phrases "template search", "template pick" and "template build".
    """
    serialized = json.dumps(obj).lower()
    for marker in ("template search", "template pick", "template build"):
        if marker in serialized:
            return True
    return False


def process_file(path: Path) -> None:
    """Clean one JSONL file in place.

    Reads *path* line by line, drops examples that contain template
    commands, runs the rest through ``fix_commands_in_obj`` and writes the
    result back to the same path. Counters in ``stats`` are updated as a
    side effect and a one-line summary is printed.

    Blank lines are ignored. Lines that are not valid JSON are dropped from
    the output, as before, but they are now counted and reported instead of
    disappearing silently.

    Args:
        path: JSONL file to clean. A missing file is reported and skipped.
    """
    if not path.exists():
        print(f"  SKIP: {path.name} (not found)")
        return

    examples = []
    malformed = 0
    # Explicit UTF-8: output is written with ensure_ascii=False, so relying
    # on the platform default encoding could fail or corrupt non-ASCII text.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                examples.append(json.loads(line))
            except json.JSONDecodeError:
                malformed += 1

    stats["examples_in"] += len(examples)
    stats["files_processed"] += 1

    cleaned = []
    for ex in examples:
        # Remove template command examples entirely
        if has_template_commands(ex):
            stats["removed_template"] += 1
            continue

        # Fix all commands recursively
        cleaned.append(fix_commands_in_obj(ex))

    stats["examples_out"] += len(cleaned)

    # Write to a sibling temp file and swap it in, so a failure part-way
    # through serializing or writing cannot leave a truncated dataset.
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in cleaned)
    tmp_path.replace(path)

    if malformed:
        print(f"  WARNING: {path.name}: dropped {malformed} malformed line(s)")

    removed = len(examples) - len(cleaned)
    print(f"  {path.name}: {len(examples)} → {len(cleaned)} ({removed} removed)")


def main():
    """Clean every dataset listed in FILES, then print the run summary."""
    print("Cleaning training data...\n")

    for jsonl_path in FILES:
        process_file(jsonl_path)

    total_in = stats["examples_in"]
    total_out = stats["examples_out"]

    print("\n" + "=" * 50)
    print(f"Files processed: {stats['files_processed']}")
    print(f"Examples: {total_in} → {total_out} ({total_in - total_out} removed)")
    print("\nFixes applied:")

    # (label, stats key) pairs; the labels carry their own padding.
    summary_rows = (
        ("  @s → @p:           ", "fixed_at_s"),
        ("  Leading slash:      ", "fixed_leading_slash"),
        ("  Template removed:   ", "removed_template"),
        ("  Fill trailing count: ", "fixed_fill_count"),
        ("  Generic items:      ", "fixed_generic_items"),
    )
    for label, counter in summary_rows:
        print(f"{label}{stats[counter]}")


# Run the cleaner only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()