9c2c9a2310
Distilled Training Data (1,203 examples): - 341 initial gold (plugins, enchantments, builds, effects, god, errors) - 165 buildings + pipeline (100 structures built on dev, 65 request→query→act) - 24 safety-aware (worldborder, safe tp, intentional harm, gamemode checks) - 17 advanced logic (decanonized items, redstone gates, iterative builds) - 12 redstone mastery (NOT/OR/AND/XOR/RS-latch/T-flip-flop/comparator/clock) - 7 circuit verification and diagnosis - 1 compact comparator gates - 10 redstone methodology (build→test→save→recall→learn from mistakes) - 8 player journal usage - 29 creative+uncommon+pipeline+god with full tool chains Player Journal System: - agent/tools/player_journal.py — per-player text files (1-10 lines) - journal.read + journal.write tool schemas added - Cross-contaminated: God and Sudo share same journal per player - Includes sentiment, relationship, builds, preferences, skill level Redstone Engineering: - agent/prompts/redstone_rules.md — baked-in wall torch, dedicated lead, repeater rules - Learned from 4 iterations of 8-switch circuit: wall_torch on back face, not top - T-junction bypass prevention: dedicated lead wire between merge and NOT block - RCON limitation: can build circuits but cannot test them (lever toggle doesn't propagate) Training Data Cleaning: - 466 @s→@p fixes, 10 template commands removed - 12 outdated refusals replaced with correct plugin commands - Data de-duped across all sources Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
173 lines
5.2 KiB
Python
173 lines
5.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Clean training data — fix known bad patterns before 0.6.0 training.
|
|
|
|
Fixes:
|
|
- @s selector → @p (RCON has no executor entity)
|
|
- Leading slash on commands
|
|
- Template commands (remove entire example)
|
|
- Old NBT enchant syntax (listed for completeness; no fix for it is implemented in this script yet)
|
|
- fill with trailing count
|
|
- Generic bed/log → specific variants
|
|
- steak → cooked_beef
|
|
|
|
Usage:
|
|
python3 training/scripts/clean_training_data.py
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Repository root: three directory levels above this script's resolved path.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent

# JSONL datasets that get cleaned; every one lives under data/processed/.
FILES = [
    PROJECT_ROOT / "data" / "processed" / filename
    for filename in (
        "seed_dataset.jsonl",
        "tool_training.jsonl",
        "tool_training_v05.jsonl",
        "filtered_exploration.jsonl",
    )
]

# Run-wide counters, all starting at zero. They are incremented by the
# cleaning functions below and reported by main().
stats = dict.fromkeys(
    (
        "files_processed",
        "examples_in",
        "examples_out",
        "removed_template",
        "fixed_at_s",
        "fixed_leading_slash",
        "fixed_old_nbt",
        "fixed_fill_count",
        "fixed_generic_items",
    ),
    0,
)
|
|
|
|
|
|
def fix_command(cmd: str, player: str = "slingshooter08") -> str:
    """Clean a single command string and count each applied fix in ``stats``.

    Fixes are applied in this order:

    1. Strip one leading ``/``.
    2. Rewrite the ``@s`` selector to ``@p`` (RCON has no executor entity).
    3. Replace generic item ids with concrete ones
       (``minecraft:bed`` -> ``minecraft:white_bed``,
       ``minecraft:log`` -> ``minecraft:oak_log``,
       ``minecraft:steak`` -> ``minecraft:cooked_beef``).
    4. Drop a trailing numeric count from a ``fill`` command.

    Each counter is bumped at most once per command per fix category,
    regardless of how many occurrences were rewritten.

    Args:
        cmd: The command to clean. Non-string values are returned unchanged.
        player: Not used by the function body; kept so existing callers that
            pass it keep working.

    Returns:
        The cleaned command string (or ``cmd`` itself when it is not a str).
    """
    if not isinstance(cmd, str):
        return cmd

    # Leading slash
    if cmd.startswith("/"):
        cmd = cmd[1:]
        stats["fixed_leading_slash"] += 1

    # @s -> @p. The lookahead restricts the match to a complete "@s" token, so
    # text such as "me@server.net" is left alone while "@s", "@s[...]" and
    # "@s " are all rewritten.
    cmd, hits = re.subn(r"@s(?!\w)", "@p", cmd)
    if hits:
        stats["fixed_at_s"] += 1

    # Generic items. The lookahead makes the id match as a whole token: it
    # catches an id at the very end of the command (previously missed) and
    # never touches longer ids such as minecraft:bedrock or #minecraft:logs.
    generic_items = (
        ("bed", "white_bed"),
        ("log", "oak_log"),
        ("steak", "cooked_beef"),
    )
    for generic, specific in generic_items:
        cmd, hits = re.subn(
            rf"minecraft:{generic}(?!\w)", f"minecraft:{specific}", cmd
        )
        if hits:
            stats["fixed_generic_items"] += 1

    # Fill with trailing count (e.g. "fill ... minecraft:stone 1")
    m = re.match(r'^(fill .+ minecraft:\w+(?:\[.*?\])?)\s+\d+$', cmd)
    if m:
        cmd = m.group(1)
        stats["fixed_fill_count"] += 1

    return cmd
|
|
|
|
|
|
def _replace_self_selector(text: str) -> str:
    """Rewrite whole ``@s`` tokens in *text* to ``@p`` and count the fix once."""
    # The lookahead keeps strings like "admin@server.net" intact.
    fixed, hits = re.subn(r"@s(?!\w)", "@p", text)
    if hits:
        stats["fixed_at_s"] += 1
    return fixed


def fix_commands_in_obj(obj):
    """Recursively clean commands inside a decoded JSON structure.

    Handling by value type / key:

    * str: every ``@s`` selector becomes ``@p``.
    * list: each item is processed recursively.
    * dict: a new dict is built, with per-key rules:
        - ``commands`` / ``commands_generated`` / ``commands_executed``:
          when the value is a list, each element goes through
          ``fix_command``; any other value is copied as is.
        - ``command`` holding a str: passed through ``fix_command``.
        - ``content`` holding a str: ``@s`` is rewritten only when the text
          mentions "rcon" (case-insensitive); otherwise it is copied as is.
        - anything else: processed recursively.
    * other types are returned unchanged.

    Every string whose ``@s`` selector was rewritten here increments
    ``stats["fixed_at_s"]`` so the final summary reflects these fixes too.

    Args:
        obj: Any value produced by ``json.loads``.

    Returns:
        The cleaned value; lists and dicts are returned as new containers.
    """
    if isinstance(obj, str):
        # Fix @s in any string content (including tool call JSON)
        return _replace_self_selector(obj)

    if isinstance(obj, list):
        return [fix_commands_in_obj(item) for item in obj]

    if not isinstance(obj, dict):
        return obj

    result = {}
    for k, v in obj.items():
        if k in ("commands", "commands_generated", "commands_executed"):
            result[k] = [fix_command(c) for c in v] if isinstance(v, list) else v
        elif k == "command" and isinstance(v, str):
            result[k] = fix_command(v)
        elif k == "content" and isinstance(v, str):
            # Message content (tool calls, system prompts) is only touched
            # when it talks about RCON; other prose keeps "@s" verbatim.
            result[k] = _replace_self_selector(v) if "rcon" in v.lower() else v
        else:
            result[k] = fix_commands_in_obj(v)
    return result
|
|
|
|
|
|
def has_template_commands(obj) -> bool:
    """Report whether the serialised form of *obj* mentions a template command.

    The example is dumped to JSON text and lower-cased, so the search is
    case-insensitive and spans every key and value of the structure.
    """
    markers = ("template search", "template pick", "template build")
    serialized = json.dumps(obj).lower()
    for marker in markers:
        if marker in serialized:
            return True
    return False
|
|
|
|
|
|
def process_file(path: Path):
    """Clean one JSONL file in place.

    Each non-blank line is parsed as JSON. Examples containing template
    commands are dropped; the rest go through ``fix_commands_in_obj``. The
    result is written back over ``path`` and the run-wide ``stats`` counters
    are updated.

    Lines that are not valid JSON are left out of the rewritten file; the
    number of such lines is reported on stderr rather than being dropped
    silently.

    The file is read and written as UTF-8 explicitly, because the output uses
    ``ensure_ascii=False`` and must not depend on the platform's locale
    encoding. Output goes to a sibling ``.tmp`` file first and is then moved
    over the original, so a failure mid-write does not truncate the dataset.

    Args:
        path: JSONL file to clean. A missing file is reported and skipped.
    """
    if not path.exists():
        print(f" SKIP: {path.name} (not found)")
        return

    examples = []
    malformed = 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                examples.append(json.loads(line))
            except json.JSONDecodeError:
                malformed += 1

    stats["examples_in"] += len(examples)
    stats["files_processed"] += 1

    cleaned = []
    for ex in examples:
        # Remove template command examples entirely
        if has_template_commands(ex):
            stats["removed_template"] += 1
            continue

        # Fix all commands recursively
        cleaned.append(fix_commands_in_obj(ex))

    stats["examples_out"] += len(cleaned)

    # Write to a temporary sibling first, then swap it into place.
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(ex, ensure_ascii=False) + "\n" for ex in cleaned
            )
        tmp_path.replace(path)
    finally:
        # Only still present when the write or the swap failed.
        if tmp_path.exists():
            tmp_path.unlink()

    removed = len(examples) - len(cleaned)
    print(f" {path.name}: {len(examples)} → {len(cleaned)} ({removed} removed)")
    if malformed:
        print(
            f" WARNING: {path.name}: dropped {malformed} malformed JSON line(s)",
            file=sys.stderr,
        )
|
|
|
|
|
|
def main():
    """Clean every dataset listed in FILES, then print the run summary."""
    print("Cleaning training data...\n")

    for dataset_path in FILES:
        process_file(dataset_path)

    # Build the whole report first, then emit it one line per entry.
    report_lines = [
        f"\n{'='*50}",
        f"Files processed: {stats['files_processed']}",
        f"Examples: {stats['examples_in']} → {stats['examples_out']} ({stats['examples_in'] - stats['examples_out']} removed)",
        f"\nFixes applied:",
        f" @s → @p: {stats['fixed_at_s']}",
        f" Leading slash: {stats['fixed_leading_slash']}",
        f" Template removed: {stats['removed_template']}",
        f" Fill trailing count: {stats['fixed_fill_count']}",
        f" Generic items: {stats['fixed_generic_items']}",
    ]
    print(*report_lines, sep="\n")


# Run the cleaner only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|