Mortdecai/training/scripts/validate_all_training.py

#!/usr/bin/env python3
"""
Unified training data validator for Mortdecai 0.6.0.

Validates ALL training data files:
- RCON command syntax (live server when available, pattern matching always)
- System prompt correctness (24-tool list)
- Format consistency (messages[] chat format)
- Known bad patterns (@s, enchantment syntax, leading slashes, generic items)
- Tool call schema compliance
- Duplicate detection

Modes:
    --check     Dry run, report issues only (default)
    --fix       Auto-fix known issues in place
    --rcon      Enable live RCON validation (requires dev server)

Usage:
    python3 training/scripts/validate_all_training.py --check
    python3 training/scripts/validate_all_training.py --fix --rcon
"""

import argparse
import glob
import json
import logging
import os
import re
import socket
import struct
import sys
import time
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Directory three levels above this file's resolved location.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
# Put PROJECT_ROOT first on the import search path
# (presumably so project packages resolve when run as a script — confirm).
sys.path.insert(0, str(PROJECT_ROOT))

# --- Current 24-tool list ---
# One tool per line, grouped by namespace, so additions and removals
# show up as single-line diffs.
CURRENT_TOOLS = [
    # Command execution and reference lookups
    "rcon.execute",
    "minecraft.lookup",
    "plugin.docs_lookup",
    # World inspection
    "world.player_info",
    "world.server_state",
    "world.nearby_entities",
    "world.scan_area",
    "world.redstone_trace",
    "world.render",
    # Server configuration
    "server.config",
    # Memory and journal
    "memory.read",
    "memory.write",
    "journal.read",
    "journal.write",
    # Logs and user interaction
    "log.query",
    "user.ask",
    # Scripts
    "script.write",
    "script.validate",
    "script.execute",
    "script.read",
    "script.list",
    "script.delete",
    "script.schedule",
    # Training capture
    "training.save",
]

# Legacy tool names that should be updated.
# Every retired name currently maps to the same replacement tool.
LEGACY_TOOLS = dict.fromkeys(
    (
        "minecraft.wiki_lookup",
        "minecraft.changelog_lookup",
        "paper.docs_lookup",
    ),
    "minecraft.lookup",
)

# Known bad patterns in commands.
# Each entry is (compiled regex, issue name, human-readable description).
# The regexes are searched against individual command strings, so they must
# describe the command text itself, not its JSON-serialized (quoted) form.
BAD_PATTERNS = [
    (re.compile(r'^/'), "leading_slash", "Commands should not start with /"),
    (re.compile(r'@s\b'), "at_s", "@s invalid via RCON (no executor). Auto-fix: @p (imprecise — selects nearest player, not necessarily the requester)"),
    (re.compile(r'\[enchantments=\{'), "enchantment_syntax", "Paper RCON rejects [enchantments={...}] component syntax"),
    (re.compile(r'\[potion_contents='), "potion_syntax", "Paper RCON rejects [potion_contents={...}] syntax"),
    (re.compile(r'give \S+ minecraft:bed\b'), "generic_bed", "minecraft:bed doesn't exist, use minecraft:white_bed"),
    (re.compile(r'give \S+ minecraft:log\b'), "generic_log", "minecraft:log doesn't exist, use minecraft:oak_log"),
    (re.compile(r'give \S+ minecraft:wood\b'), "generic_wood", "minecraft:wood doesn't exist, use minecraft:oak_planks"),
    (re.compile(r'give \S+ minecraft:boat\b'), "generic_boat", "minecraft:boat doesn't exist, use minecraft:oak_boat"),
    (re.compile(r'give \S+ minecraft:steak\b'), "generic_steak", "minecraft:steak doesn't exist, use minecraft:cooked_beef"),
    # Matches a world-less "weather clear" at the start of the command (with an
    # optional leading slash) or directly after "run ". The earlier form
    # required literal double quotes around the text, which a bare command
    # string never contains, while quoted chat text triggered it by accident.
    (re.compile(r'(?:^/?|\brun /?)weather clear\b'), "weather_no_world", "Paper needs world name: weather devworld clear"),
    (re.compile(r'template (search|pick|build)'), "template_cmd", "Template commands removed in 0.5.0+"),
]

# Fix map for auto-repair
def _boundary_swap(old: str, new: str):
    """Build a fixer that replaces `old` with `new` only at a word boundary.

    Detection uses a trailing ``\\b`` on these tokens, so the repair uses the
    same boundary. A plain substring replace would also rewrite longer
    identifiers that merely start with the token (for example
    ``minecraft:bedrock`` when repairing ``minecraft:bed``).
    """
    pattern = re.compile(re.escape(old) + r'\b')
    # A callable replacement keeps `new` literal (no backslash/group handling).
    return lambda cmd: pattern.sub(lambda _match: new, cmd)


# Maps an issue name to a callable taking a command string and returning
# the repaired command string.
FIXES = {
    "leading_slash": lambda cmd: cmd.lstrip("/"),
    "at_s": _boundary_swap("@s", "@p"),
    "generic_bed": _boundary_swap("minecraft:bed", "minecraft:white_bed"),
    "generic_log": _boundary_swap("minecraft:log", "minecraft:oak_log"),
    "generic_wood": _boundary_swap("minecraft:wood", "minecraft:oak_planks"),
    "generic_boat": _boundary_swap("minecraft:boat", "minecraft:oak_boat"),
    "generic_steak": _boundary_swap("minecraft:steak", "minecraft:cooked_beef"),
}

# Enchantment fix: replace enchanted give with plain give
_GIVE_COMPONENT_RE = re.compile(
    r'give (\S+) (minecraft:[^\s\[]+)(\[(enchantments|potion_contents)=\{)'
)


def _bracket_group_end(text: str, start: int) -> int:
    """Return the index just past the bracket group opening at text[start].

    Counts nesting of both [] and {} so nested component values are skipped
    as a unit. Returns -1 when the group never closes. Brackets inside quoted
    strings are counted too; that is a known simplification.
    """
    depth = 0
    for pos in range(start, len(text)):
        ch = text[pos]
        if ch in "[{":
            depth += 1
        elif ch in "]}":
            depth -= 1
            if depth == 0:
                return pos + 1
    return -1


def fix_enchantment(cmd: str) -> List[str]:
    """Strip an enchantments/potion_contents component from a give command.

    Args:
        cmd: Command text; only commands starting with
            ``give <target> minecraft:<item>[enchantments={`` or
            ``...[potion_contents={`` are rewritten.

    Returns:
        A one-element list. For an enchantments component the result is
        ``give <target> <item> <count>``; for potion_contents the item is
        replaced with ``minecraft:potion``. The count defaults to "1" when no
        number follows the component. Commands that do not match, or whose
        component never closes, are returned unchanged.
    """
    m = _GIVE_COMPONENT_RE.match(cmd)
    if not m:
        return [cmd]

    # Skip the whole component, including nested {...} such as
    # {levels:{sharpness:5}}, which a "[^}]+" regex cannot span.
    end = _bracket_group_end(cmd, m.start(3))
    if end < 0:
        return [cmd]

    count_match = re.match(r'\s*(\d+)', cmd[end:])
    count = count_match.group(1) if count_match else "1"

    # The target is captured on its own so selectors with arguments
    # (e.g. @p[distance=..5]) survive intact.
    target = m.group(1)
    item = m.group(2) if m.group(4) == "enchantments" else "minecraft:potion"
    return [f"give {target} {item} {count}"]


# --- RCON helper ---
def rcon_connect(host: str, port: int, password: str) -> Optional[socket.socket]:
    """Open an RCON connection and authenticate.

    Sends one login packet (request id 1, packet type 3) carrying the
    password and inspects the reply.

    Args:
        host: Server host name or address.
        port: RCON TCP port.
        password: RCON password.

    Returns:
        The connected socket (10 second timeout) on success, or None when the
        connection fails, the peer closes without replying, or the server
        rejects the password. The socket is closed before None is returned.
    """
    s = None
    try:
        s = socket.socket()
        s.settimeout(10)
        s.connect((host, port))
        data = password.encode() + b'\x00\x00'
        s.sendall(struct.pack('<iii', len(data)+8, 1, 3) + data)
        time.sleep(0.1)
        reply = s.recv(4096)
        if not reply:
            raise ConnectionError("RCON server closed the connection during login")
        if len(reply) >= 12:
            _length, reply_id, _reply_type = struct.unpack('<iii', reply[:12])
            # The RCON protocol answers a rejected login with request id -1.
            if reply_id == -1:
                raise PermissionError("RCON authentication rejected")
        return s
    except Exception:
        # Best effort: any failure means "no live RCON". Release the socket
        # so a failed attempt does not leak a file descriptor.
        if s is not None:
            try:
                s.close()
            except OSError:
                pass
        return None


def _recv_exact(sock: socket.socket, size: int) -> bytes:
    """Read up to `size` bytes, looping until done or the peer closes."""
    chunks = []
    remaining = size
    while remaining > 0:
        chunk = sock.recv(remaining)
        if not chunk:
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b"".join(chunks)


def rcon_cmd(sock: socket.socket, cmd: str) -> str:
    """Send one command over RCON and return the response body.

    The request is sent with request id 2 and packet type 2. Exactly one
    response packet is read, using its 4-byte little-endian length prefix,
    so a reply split across TCP segments is reassembled and extra bytes
    already in the socket buffer are left for the next call.

    Args:
        sock: Connected, authenticated RCON socket.
        cmd: Command text to execute.

    Returns:
        The decoded response body; '' when the peer closes early or sends a
        malformed packet; 'ERROR: <message>' when sending or receiving raises.

    Note:
        Only the first packet of a reply is consumed. A server that splits a
        long reply over several packets leaves the rest in the buffer.
    """
    try:
        data = cmd.encode() + b'\x00\x00'
        sock.sendall(struct.pack('<iii', len(data)+8, 2, 2) + data)

        header = _recv_exact(sock, 4)
        if len(header) < 4:
            return ''
        (length,) = struct.unpack('<i', header)
        if length < 10:
            # Too short to hold id + type + two terminators: drop what was
            # announced and report an empty result.
            _recv_exact(sock, max(length, 0))
            return ''

        payload = _recv_exact(sock, length)
        if len(payload) < length:
            return ''
        # payload = request id (4) + type (4) + body + two NUL terminators
        return payload[8:-2].decode('utf-8', errors='replace')
    except Exception as e:
        return f'ERROR: {e}'


def rcon_validate_cmd(sock: socket.socket, cmd: str) -> Tuple[bool, str]:
    """Run `cmd` through RCON and classify the server's reply.

    Returns (valid, result), where result is the raw reply text. A reply that
    only reports a missing target counts as valid; otherwise a reply containing
    an error keyword counts as invalid. The match is case-insensitive.
    """
    response = rcon_cmd(sock, cmd)
    lowered = response.lower()

    # Replies meaning "syntax accepted, nothing to act on" win over error words.
    harmless_markers = ("no player", "no entity", "not loaded", "not online")
    for marker in harmless_markers:
        if marker in lowered:
            return True, response

    failure_markers = ("unknown", "invalid", "incorrect", "expected whitespace", "error")
    rejected = False
    for marker in failure_markers:
        if marker in lowered:
            rejected = True
            break
    return (not rejected), response


# --- Extract commands from training examples ---
def extract_commands(rec: dict) -> List[str]:
    """Extract all RCON commands from a training example (any format).

    Sources, in order:
      1. Assistant messages in ``messages[]``: every ``<tool_call>`` block
         whose tool name is ``rcon.execute`` contributes its ``command``
         argument. A message without tool calls is parsed as JSON and
         contributes the strings of its ``commands`` list.
      2. The old dict format: ``output["commands"]``, falling back to
         ``output["commands_generated"]`` when the first key is absent.

    Malformed pieces (invalid JSON, non-object payloads, non-list command
    containers, non-string or empty commands) are skipped, not raised.

    Args:
        rec: One parsed training record.

    Returns:
        Non-empty command strings in the order they were found.
    """
    commands: List[str] = []
    if not isinstance(rec, dict):
        return commands

    # Format 1: messages[] chat format
    messages = rec.get("messages", [])
    if not isinstance(messages, list):
        messages = []

    for msg in messages:
        if not isinstance(msg, dict) or msg.get("role") != "assistant":
            continue
        content = msg.get("content", "")
        if not isinstance(content, str):
            continue

        if "<tool_call>" in content:
            # Tool calls with rcon.execute: inspect every block, not only the first.
            for segment in content.split("<tool_call>")[1:]:
                payload = segment.split("</tool_call>")[0].strip()
                try:
                    call = json.loads(payload)
                except json.JSONDecodeError:
                    continue
                if not isinstance(call, dict) or call.get("name") != "rcon.execute":
                    continue
                arguments = call.get("arguments")
                if not isinstance(arguments, dict):
                    continue
                cmd = arguments.get("command", "")
                if isinstance(cmd, str) and cmd:
                    commands.append(cmd)
            continue

        # Final JSON with commands array
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            continue
        # Valid JSON is not necessarily an object ("42", "[1, 2]", "true").
        if not isinstance(parsed, dict):
            continue
        listed = parsed.get("commands")
        if isinstance(listed, list):
            commands.extend(c for c in listed if isinstance(c, str) and c)

    # Format 2: old dict format
    output = rec.get("output", {})
    if isinstance(output, dict):
        legacy = output.get("commands", output.get("commands_generated", []))
        if isinstance(legacy, list):
            commands.extend(c for c in legacy if isinstance(c, str) and c)

    return commands


def extract_tool_calls(rec: dict) -> List[str]:
    """Extract tool names called in a training example.

    Scans every ``<tool_call>`` block in each assistant message of
    ``rec["messages"]`` and collects the ``name`` field of each JSON object
    payload ("unknown" when the field is absent). Messages whose content is
    not a string, and payloads that are not valid JSON objects, are skipped.

    Args:
        rec: One parsed training record.

    Returns:
        Tool names in the order they appear.
    """
    tools: List[str] = []
    messages = rec.get("messages", []) if isinstance(rec, dict) else []
    if not isinstance(messages, list):
        return tools

    for msg in messages:
        if not isinstance(msg, dict) or msg.get("role") != "assistant":
            continue
        content = msg.get("content", "")
        # Content may be None or a non-string value; nothing to scan then.
        if not isinstance(content, str) or "<tool_call>" not in content:
            continue
        for segment in content.split("<tool_call>")[1:]:
            payload = segment.split("</tool_call>")[0].strip()
            try:
                call = json.loads(payload)
            except json.JSONDecodeError:
                continue
            if isinstance(call, dict):
                tools.append(call.get("name", "unknown"))
    return tools


def check_system_prompt(rec: dict) -> List[str]:
    """Report tool-list problems in a record's system prompt.

    Returns an empty list when the record has no messages, and
    ["no_system_prompt"] when the first system message is missing or empty.
    Otherwise returns "legacy_tool:<name>" for each legacy tool name found in
    the prompt, followed by "missing_tool:<name>" for each newer tool that is
    absent from a prompt mentioning "rcon.execute".
    """
    conversation = rec.get("messages", [])
    if not conversation:
        return []

    # Content of the first system message, or "" when there is none.
    prompt = next(
        (
            turn.get("content", "")
            for turn in conversation
            if isinstance(turn, dict) and turn.get("role") == "system"
        ),
        "",
    )
    if not prompt:
        return ["no_system_prompt"]

    # Outdated tool names still mentioned in the prompt
    found = [f"legacy_tool:{legacy}" for legacy in LEGACY_TOOLS if legacy in prompt]

    # Newer tools that should appear; only flagged if the prompt lists tools at all
    newer_tools = (
        "world.scan_area",
        "world.redstone_trace",
        "world.render",
        "training.save",
        "server.config",
    )
    lists_tools = "rcon.execute" in prompt
    found.extend(
        f"missing_tool:{name}"
        for name in newer_tools
        if lists_tools and name not in prompt
    )
    return found


# --- Main validation ---
def validate_file(filepath: str, fix_mode: bool, rcon_sock: Optional[socket.socket]) -> dict:
    """Validate a single JSONL file. Returns stats dict.

    Each non-blank line is parsed as one JSON record and checked for duplicate
    ids, record format, system-prompt problems, legacy tool calls and known
    bad command patterns. With `rcon_sock`, commands that are not
    player-targeted are also sent to the server.

    Args:
        filepath: Path of the JSONL file.
        fix_mode: When true, repair fixable commands and rewrite the file in
            place if anything changed. Lines that could not be parsed as a
            JSON object are written back unchanged rather than dropped.
        rcon_sock: Authenticated RCON socket, or None to skip live checks.

    Returns:
        Dict with keys "file", "total", "valid", "issues", "cmd_issues",
        "format_issues", "fixed", "examples_with_issues", "rcon_tested",
        "rcon_passed" and "rcon_failed". "total" counts parsed records;
        "fixed" counts repairs that actually changed a command.
    """
    stats = {
        "file": os.path.basename(filepath),
        "total": 0,
        "valid": 0,
        "issues": Counter(),
        "cmd_issues": Counter(),
        "format_issues": Counter(),
        "fixed": 0,
        "examples_with_issues": 0,
        "rcon_tested": 0,
        "rcon_passed": 0,
        "rcon_failed": 0,
    }

    # Each entry is (record, raw_text); record is None for a line that is not
    # a JSON object, so it can be written back untouched.
    entries: List[Tuple[Optional[dict], str]] = []
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            text = line.strip()
            if not text:
                continue  # blank lines are separators, not malformed records
            try:
                rec = json.loads(text)
            except json.JSONDecodeError:
                stats["format_issues"]["bad_json"] += 1
                entries.append((None, text))
                continue
            if not isinstance(rec, dict):
                stats["format_issues"]["not_an_object"] += 1
                entries.append((None, text))
                continue
            entries.append((rec, text))

    output_lines: List[Any] = []
    seen_ids = set()
    # Skip player-targeted commands (bots not guaranteed online)
    player_cmds = ("give ", "tp ", "effect ", "tellraw ", "execute at ")
    unfixable = ("enchantment_syntax", "potion_syntax", "template_cmd")

    for rec, raw_text in entries:
        if rec is None:
            output_lines.append(raw_text)
            continue

        stats["total"] += 1
        example_issues = []

        # Duplicate check
        rec_id = rec.get("id", "")
        if rec_id:
            try:
                if rec_id in seen_ids:
                    example_issues.append("duplicate_id")
                    stats["issues"]["duplicate_id"] += 1
                seen_ids.add(rec_id)
            except TypeError:
                pass  # unhashable id (list/dict) cannot be tracked

        # Format check
        has_messages = "messages" in rec and isinstance(rec.get("messages"), list)
        has_old_format = "input" in rec and "output" in rec
        if not has_messages and not has_old_format:
            example_issues.append("unknown_format")
            stats["format_issues"]["unknown_format"] += 1

        if has_old_format and not has_messages:
            stats["format_issues"]["old_format"] += 1

        # System prompt check
        for prompt_issue in check_system_prompt(rec):
            stats["issues"][prompt_issue] += 1
            example_issues.append(prompt_issue)

        # Tool call check
        for tool_name in extract_tool_calls(rec):
            if tool_name in LEGACY_TOOLS:
                stats["issues"][f"legacy_tool_call:{tool_name}"] += 1
                example_issues.append(f"legacy_tool_call:{tool_name}")

        # Command validation
        commands = extract_commands(rec)
        fixed_commands = {}  # index → fixed command(s)

        for i, cmd in enumerate(commands):
            if not isinstance(cmd, str):
                continue
            matched_issues = []
            # Repairs accumulate: each fixer runs on the output of the
            # previous one, so a command with several issues gets all of them.
            repaired = [cmd]
            for pattern, issue_name, _desc in BAD_PATTERNS:
                if not pattern.search(cmd):
                    continue
                stats["cmd_issues"][issue_name] += 1
                example_issues.append(issue_name)
                matched_issues.append(issue_name)

                # Auto-fix if in fix mode
                if not fix_mode:
                    continue
                if issue_name in FIXES:
                    candidate = [FIXES[issue_name](c) for c in repaired]
                elif issue_name in ("enchantment_syntax", "potion_syntax"):
                    candidate = [out for c in repaired for out in fix_enchantment(c)]
                else:
                    continue
                if candidate != repaired:
                    repaired = candidate
                    stats["fixed"] += 1

            if repaired != [cmd]:
                fixed_commands[i] = repaired

            # RCON validation — skip commands with unfixable syntax issues
            skip_rcon = any(mi in unfixable for mi in matched_issues)
            if rcon_sock and not skip_rcon:
                test_cmd = fixed_commands[i][0] if i in fixed_commands else cmd
                if not test_cmd.startswith(player_cmds):
                    valid, result = rcon_validate_cmd(rcon_sock, test_cmd)
                    stats["rcon_tested"] += 1
                    if valid:
                        stats["rcon_passed"] += 1
                    else:
                        stats["rcon_failed"] += 1
                        example_issues.append(f"rcon_fail:{result[:60]}")

        if example_issues:
            stats["examples_with_issues"] += 1
        else:
            stats["valid"] += 1

        # Apply fixes to the record if in fix mode
        if fix_mode and fixed_commands:
            rec = _apply_fixes_to_record(rec, commands, fixed_commands)

        output_lines.append(rec)

    # Write back if fixing and changes were made
    if fix_mode and stats["fixed"] > 0:
        with open(filepath, 'w', encoding='utf-8') as f:
            for item in output_lines:
                if isinstance(item, str):
                    f.write(item + '\n')  # unparseable line, kept verbatim
                else:
                    f.write(json.dumps(item, ensure_ascii=True) + '\n')
        log.info("Wrote %d fixed examples to %s", stats["fixed"], filepath)

    return stats


def _apply_fixes_to_record(rec: dict, original_commands: List[str],
                           fixed_commands: Dict[int, List[str]]) -> dict:
    """Apply command fixes to all locations in a training record."""
    # Build old→new mapping
    fix_map = {}
    for i, new_cmds in fixed_commands.items():
        if i < len(original_commands):
            fix_map[original_commands[i]] = new_cmds

    if not fix_map:
        return rec

    # Fix messages[] format
    for msg in rec.get("messages", []):
        if not isinstance(msg, dict) or msg.get("role") != "assistant":
            continue
        content = msg.get("content", "")

        # Fix tool_call blocks
        if "<tool_call>" in content and "rcon.execute" in content:
            try:
                tc = json.loads(content.split("<tool_call>")[1].split("</tool_call>")[0].strip())
                if tc.get("name") == "rcon.execute":
                    cmd = tc["arguments"].get("command", "")
                    if cmd in fix_map:
                        tc["arguments"]["command"] = fix_map[cmd][0]
                        msg["content"] = f'<tool_call>\n{json.dumps(tc)}\n</tool_call>'
            except:
                pass

        # Fix final JSON commands array
        if "<tool_call>" not in content:
            try:
                parsed = json.loads(content)
                if "commands" in parsed:
                    new_cmds = []
                    for cmd in parsed["commands"]:
                        if cmd in fix_map:
                            new_cmds.extend(fix_map[cmd])
                        else:
                            new_cmds.append(cmd)
                    parsed["commands"] = new_cmds
                    msg["content"] = json.dumps(parsed, ensure_ascii=True)
            except:
                pass

    # Fix old dict format
    output = rec.get("output", {})
    if isinstance(output, dict):
        for key in ("commands", "commands_generated", "commands_executed"):
            if key in output and isinstance(output[key], list):
                new_cmds = []
                for cmd in output[key]:
                    if isinstance(cmd, str) and cmd in fix_map:
                        new_cmds.extend(fix_map[cmd])
                    else:
                        new_cmds.append(cmd)
                output[key] = new_cmds

    return rec


def main():
    """Command-line entry point: validate training files and print a report.

    Validates either the files given with --files or every ``*.jsonl`` under
    ``data/raw`` and ``data/processed`` in the project root, skipping files
    whose name contains "quarantine" or "queue". With --rcon, one RCON
    connection is opened for the whole run and always closed afterwards. The
    endpoint defaults to the dev server and can be overridden with
    --rcon-host, --rcon-port and --rcon-password (or the RCON_PASSWORD
    environment variable).
    """
    parser = argparse.ArgumentParser(description="Validate all training data")
    parser.add_argument("--check", action="store_true", default=True, help="Dry run (default)")
    parser.add_argument("--fix", action="store_true", help="Auto-fix known issues")
    parser.add_argument("--rcon", action="store_true", help="Enable live RCON validation")
    parser.add_argument("--rcon-host", default="192.168.0.244", help="RCON host (default: dev server)")
    parser.add_argument("--rcon-port", type=int, default=25578, help="RCON port (default: 25578)")
    parser.add_argument("--rcon-password", default=os.environ.get("RCON_PASSWORD", "REDACTED_RCON"),
                        help="RCON password (default: $RCON_PASSWORD or the built-in dev value)")
    parser.add_argument("--files", nargs="*", help="Specific files to validate (default: all)")
    args = parser.parse_args()

    # Without a configured handler the log.info lines emitted while rewriting
    # files would never be shown.
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    # Find all training files
    if args.files:
        files = args.files
    else:
        files = sorted(
            glob.glob(str(PROJECT_ROOT / "data/raw/*.jsonl"))
            + glob.glob(str(PROJECT_ROOT / "data/processed/*.jsonl"))
        )
        # Exclude quarantine and queue files. Only the file name is tested, so
        # a checkout directory containing either word does not hide everything.
        files = [
            f for f in files
            if "quarantine" not in os.path.basename(f) and "queue" not in os.path.basename(f)
        ]

    print(f"Validating {len(files)} files...")

    # RCON connection
    rcon_sock = None
    if args.rcon:
        print(f"Connecting to dev RCON ({args.rcon_host}:{args.rcon_port})...")
        rcon_sock = rcon_connect(args.rcon_host, args.rcon_port, args.rcon_password)
        if rcon_sock:
            print("  RCON connected")
        else:
            print("  RCON connection failed — running without live validation")

    # Validate each file
    all_stats = []
    total_issues = Counter()
    total_cmd_issues = Counter()

    try:
        for filepath in files:
            stats = validate_file(filepath, args.fix, rcon_sock)
            all_stats.append(stats)
            total_issues.update(stats["issues"])
            total_cmd_issues.update(stats["cmd_issues"])
    finally:
        # Close RCON even when a file fails to validate
        if rcon_sock:
            try:
                rcon_sock.close()
            except OSError:
                pass

    # Report
    print(f"\n{'='*70}")
    print("VALIDATION REPORT")
    print(f"{'='*70}")

    total_examples = sum(s["total"] for s in all_stats)
    total_valid = sum(s["valid"] for s in all_stats)
    total_with_issues = sum(s["examples_with_issues"] for s in all_stats)
    total_fixed = sum(s["fixed"] for s in all_stats)

    # Guard the percentages: an empty run must not divide by zero.
    valid_pct = total_valid / total_examples * 100 if total_examples else 0.0
    issues_pct = total_with_issues / total_examples * 100 if total_examples else 0.0

    print(f"\nTotal examples: {total_examples}")
    print(f"Valid: {total_valid} ({valid_pct:.1f}%)")
    print(f"With issues: {total_with_issues} ({issues_pct:.1f}%)")
    if args.fix:
        print(f"Fixed: {total_fixed}")

    # Per-file summary
    print(f"\n{'File':<45} {'Total':>6} {'Valid':>6} {'Issues':>6} {'Rate':>6}")
    print("-" * 75)
    for s in sorted(all_stats, key=lambda x: -x["examples_with_issues"]):
        if s["total"] == 0:
            continue
        rate = f"{s['valid']/s['total']*100:.0f}%"
        print(f"{s['file']:<45} {s['total']:>6} {s['valid']:>6} {s['examples_with_issues']:>6} {rate:>6}")

    # Issue breakdown
    if total_cmd_issues:
        print("\nCommand issues:")
        for issue, count in total_cmd_issues.most_common():
            print(f"  {count:>5} {issue}")

    if total_issues:
        print("\nOther issues:")
        for issue, count in total_issues.most_common(20):
            print(f"  {count:>5} {issue}")

    # RCON stats
    rcon_tested = sum(s["rcon_tested"] for s in all_stats)
    if rcon_tested:
        rcon_passed = sum(s["rcon_passed"] for s in all_stats)
        rcon_failed = sum(s["rcon_failed"] for s in all_stats)
        print(f"\nRCON validation: {rcon_tested} tested, {rcon_passed} passed, {rcon_failed} failed")

    # Format breakdown
    format_issues = Counter()
    for s in all_stats:
        format_issues.update(s["format_issues"])
    if format_issues:
        print("\nFormat issues:")
        for issue, count in format_issues.most_common():
            print(f"  {count:>5} {issue}")


# Run the validator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()