#!/usr/bin/env python3
"""
Convert IGLU dataset to Mortdecai build training examples.

IGLU provides natural language instructions paired with block placement
coordinates. We convert these to:

1. Direct setblock/fill commands (for simple builds)
2. script.write + script.execute flows (for complex builds)

Source: microsoft/iglu-datasets singleturn dataset
Output: data/raw/iglu_build_training.jsonl

Usage:
    python3 training/scripts/convert_iglu_to_training.py
"""

import csv
import json
import os
import random
import sys
from collections import defaultdict
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
# The project packages below are resolved relative to the repository root,
# so the root has to be on sys.path before they are imported.
sys.path.insert(0, str(PROJECT_ROOT))

from agent.tools.tool_schemas import qwen3_tools_block
from agent.prompts.system_prompts import SYNTAX_RULES, RISK_GRADIENT

IGLU_DIR = PROJECT_ROOT / "data" / "external" / "iglu-repo" / "datasets" / "singleturn"
OUTPUT_PATH = PROJECT_ROOT / "data" / "raw" / "iglu_build_training.jsonl"

TOOLS_BLOCK = qwen3_tools_block()

SYSTEM = (
    "You are a Minecraft 1.21 command translator with script writing abilities.\n"
    "For complex builds (4+ blocks), write a mcfunction script. Validate first.\n"
    "For simple builds (1-3 blocks), use rcon.execute directly.\n"
    "PERMISSION LEVEL: 4 (generous).\n\n"
    "Return JSON: {\"risk_level\": <0-5>, \"commands\": [...], \"reasoning\": \"...\"}\n\n"
    + SYNTAX_RULES + RISK_GRADIENT + "\n" + TOOLS_BLOCK
)

# Largest number of block changes that is still answered with direct
# rcon.execute calls. Kept in sync with the SYSTEM prompt above, which tells
# the model that 1-3 blocks are "simple" and 4+ blocks require a script.
DIRECT_BUILD_MAX_CHANGES = 3

# IGLU uses color IDs for blocks. Each ID is mapped to a Minecraft color name.
_IGLU_COLOR_NAMES = {
    57: "blue",
    58: "light_blue",
    59: "green",
    60: "red",
    61: "orange",
    62: "purple",
    63: "yellow",
}

# Default mapping: IGLU color ID -> Minecraft wool block.
IGLU_BLOCK_MAP = {
    color_id: f"minecraft:{color}_wool"
    for color_id, color in _IGLU_COLOR_NAMES.items()
}

# For variety, the same colors are also available as concrete and terracotta.
BLOCK_VARIANTS = {
    material: {
        color_id: f"minecraft:{color}_{material}"
        for color_id, color in _IGLU_COLOR_NAMES.items()
    }
    for material in ("wool", "concrete", "terracotta")
}

PLAYERS = ["slingshooter08", "Ace13245", "TheBigBoss", "xXDragonSlayerXx", "CreeperKing99"]


def sys_msg():
    """Return the system message shared by every training example."""
    return {"role": "system", "content": SYSTEM}


def user_msg(text):
    """Wrap *text* as a user chat message."""
    return {"role": "user", "content": text}


def tool_call(name, args):
    """Return an assistant message that invokes tool *name* with *args*.

    The call is serialized as a JSON object with ``name`` and ``arguments``
    keys, surrounded by newlines.
    """
    # NOTE(review): the payload is wrapped in bare newlines only. If the
    # target chat template expects explicit tool-call tags around the JSON,
    # they are missing here -- confirm against the template before training.
    return {"role": "assistant", "content": f"\n{json.dumps({'name': name, 'arguments': args})}\n"}


def tool_result(data):
    """Wrap *data* (JSON-serializable) as a tool-result chat message."""
    return {"role": "tool", "content": json.dumps(data)}


def final_response(resp):
    """Wrap the final structured response *resp* as an assistant message."""
    return {"role": "assistant", "content": json.dumps(resp)}


def blocks_to_commands(blocks_to_place, blocks_to_remove, block_map,
                       use_relative=True, offset=(0, 64, 0)):
    """Convert block coordinate lists to setblock/fill commands.

    Args:
        blocks_to_place: iterable of ``(x, y, z, color_id)`` to place.
        blocks_to_remove: iterable of ``(x, y, z, color_id)`` to clear; the
            color is ignored and the position is set to ``minecraft:air``.
        block_map: mapping of color ID to block name. Unknown IDs fall back
            to ``minecraft:white_wool``.
        use_relative: emit ``~``-relative coordinates (with *offset*
            subtracted) when true, raw absolute coordinates otherwise.
        offset: ``(x, y, z)`` origin subtracted from every coordinate in
            relative mode. With the default only ``y`` is shifted (by 64).

    Returns:
        A list of command strings: placements first (grouped by block type
        in first-seen order), then removals.
    """
    off_x, off_y, off_z = offset

    def fmt(x, y, z):
        # Single place where a coordinate triple is rendered, so relative
        # and absolute modes cannot drift apart between command kinds.
        if use_relative:
            return f"~{x - off_x} ~{y - off_y} ~{z - off_z}"
        return f"{x} {y} {z}"

    commands = []

    # Group placed blocks by resolved block name for potential fill optimization.
    by_block = defaultdict(list)
    for x, y, z, color_id in blocks_to_place:
        block = block_map.get(color_id, "minecraft:white_wool")
        by_block[block].append((x, y, z))

    for block, coords in by_block.items():
        # De-duplicate (order-preserving) so a repeated position can neither
        # emit a redundant command nor fool the solid-box test below.
        coords = list(dict.fromkeys(coords))

        if len(coords) > 3:
            xs, ys, zs = zip(*coords)
            min_x, max_x = min(xs), max(xs)
            min_y, max_y = min(ys), max(ys)
            min_z, max_z = min(zs), max(zs)
            box_volume = (
                (max_x - min_x + 1) * (max_y - min_y + 1) * (max_z - min_z + 1)
            )
            # Every cell of the bounding box is occupied -> one fill suffices.
            if box_volume == len(coords):
                commands.append(
                    f"fill {fmt(min_x, min_y, min_z)} "
                    f"{fmt(max_x, max_y, max_z)} {block}"
                )
                continue

        # Small group or not a clean box: individual setblocks.
        commands.extend(f"setblock {fmt(x, y, z)} {block}" for x, y, z in coords)

    # Removals are always individual setblocks to air.
    commands.extend(
        f"setblock {fmt(x, y, z)} minecraft:air" for x, y, z, _ in blocks_to_remove
    )

    return commands


def load_iglu_pairs():
    """Load instruction-to-build pairs from the IGLU dataset.

    Reads ``clarifying_questions_train.csv`` under ``IGLU_DIR`` and keeps the
    rows that have a non-empty instruction, are marked
    ``IsInstructionClear == 'Yes'``, whose initial world file exists, and
    whose game has at least one file under
    ``target_world_states/builder-data/<game>``.

    Returns:
        A list of dicts with ``instruction``, ``init_path``, ``target_path``
        and ``game_id`` keys; empty when the CSV is missing.
    """
    csv_path = IGLU_DIR / "clarifying_questions_train.csv"
    if not csv_path.exists():
        print(f"CSV not found: {csv_path}")
        return []

    # Build target state index: game directory name -> its step files.
    target_dir = IGLU_DIR / "target_world_states" / "builder-data"
    targets = {}
    if target_dir.exists():
        for game_dir in target_dir.iterdir():
            if not game_dir.is_dir():
                continue
            # Sorted so the chosen target does not depend on directory
            # listing order, which varies between filesystems.
            step_files = sorted(p for p in game_dir.iterdir() if p.is_file())
            if step_files:
                targets[game_dir.name] = step_files

    pairs = []
    # newline="" is what the csv module requires for correct handling of
    # quoted fields that contain line breaks.
    with open(csv_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            # DictReader fills missing trailing cells with None; treat them
            # as empty instead of crashing on .lower()/.strip().
            game_id = (row['GameId'] or '').lower().replace(' ', '')
            instruction = (row['InputInstruction'] or '').strip()
            if not instruction or row.get('IsInstructionClear') != 'Yes':
                continue

            init_rel = row['InitializedWorldPath']
            if not init_rel:
                continue
            init_path = IGLU_DIR / init_rel

            if game_id in targets and init_path.exists():
                pairs.append({
                    'instruction': instruction,
                    'init_path': str(init_path),
                    # NOTE(review): every row of a game uses the game's first
                    # step file as its target -- confirm this is the intended
                    # pairing rather than a per-step lookup.
                    'target_path': str(targets[game_id][0]),
                    'game_id': game_id,
                })

    return pairs


def convert_pair_to_example(pair, idx, block_variant="wool"):
    """Convert one IGLU pair to a training example.

    Args:
        pair: dict produced by :func:`load_iglu_pairs`.
        idx: running index, used for the example ID and the script name.
        block_variant: key into ``BLOCK_VARIANTS``; unknown keys fall back
            to ``"wool"``.

    Returns:
        The example dict (``id``, ``source``, ``type``, ``block_changes``,
        ``messages``), or ``None`` when the initial and target world states
        do not differ.
    """
    block_map = BLOCK_VARIANTS.get(block_variant, BLOCK_VARIANTS["wool"])
    player = random.choice(PLAYERS)

    with open(pair['init_path'], encoding="utf-8") as f:
        init = json.load(f)
    with open(pair['target_path'], encoding="utf-8") as f:
        target = json.load(f)

    # Blocks are compared as whole tuples, so a position whose color changed
    # shows up both as a removal and as a placement.
    init_blocks = {tuple(b) for b in init.get('worldEndingState', {}).get('blocks', [])}
    target_blocks = {tuple(b) for b in target.get('worldEndingState', {}).get('blocks', [])}

    to_place = sorted(target_blocks - init_blocks)
    to_remove = sorted(init_blocks - target_blocks)

    if not to_place and not to_remove:
        return None

    total_changes = len(to_place) + len(to_remove)
    commands = blocks_to_commands(to_place, to_remove, block_map)

    if not commands:
        return None

    instruction = pair['instruction']

    # Make it sound like a Minecraft player request.
    prefixes = [
        f"sudo {instruction}",
        f"sudo can you {instruction.lower()}",
        f"sudo please {instruction.lower()}",
        f"sudo I need you to {instruction.lower()}",
    ]
    prompt = random.choice(prefixes)

    msgs = [sys_msg(), user_msg(f"Player {player}: {prompt}")]

    # The cut-off mirrors the SYSTEM prompt (1-3 blocks direct, 4+ scripted)
    # so that no example contradicts the instructions it is trained under.
    is_direct = total_changes <= DIRECT_BUILD_MAX_CHANGES

    if is_direct:
        # Direct rcon.execute for small builds: one call per command.
        for cmd in commands:
            msgs.append(tool_call("rcon.execute", {"command": cmd}))
            msgs.append(tool_result({"success": True, "result": "Changed the block"}))

        reasoning = f"Direct block placement: {len(to_place)} placed, {len(to_remove)} removed."
        resp = {"risk_level": 3, "commands": commands, "reasoning": reasoning}
    else:
        # Script workflow for larger builds: validate -> write -> execute.
        script_name = f"build_{idx:04d}"
        desc = instruction[:80]

        # Validate
        msgs.append(tool_call("script.validate", {"commands": commands}))
        msgs.append(tool_result({
            "valid": True,
            "total": len(commands),
            "passed": len(commands),
            "errors": [],
        }))

        # Write
        msgs.append(tool_call("script.write", {
            "name": script_name,
            "commands": commands,
            "description": desc,
        }))
        msgs.append(tool_result({
            "ok": True,
            "path": f"mortdecai:{script_name}",
            "lines": len(commands),
        }))

        # Execute at player
        msgs.append(tool_call("script.execute", {
            "name": script_name,
            "as_player": player,
        }))
        msgs.append(tool_result({
            "ok": True,
            "result": f"Executed {len(commands)} commands from function mortdecai:{script_name}",
        }))

        reasoning = (f"Complex build ({total_changes} block changes). "
                     f"Wrote script '{script_name}' with {len(commands)} commands. "
                     f"Placed {len(to_place)}, removed {len(to_remove)}.")
        resp = {
            "risk_level": 3,
            "commands": [f"function mortdecai:{script_name}"],
            "reasoning": reasoning,
        }

    msgs.append(final_response(resp))

    return {
        "id": f"iglu-build-{idx:05d}",
        "source": "iglu_dataset",
        "type": "build_direct" if is_direct else "build_script",
        "block_changes": total_changes,
        "messages": msgs,
    }


def main():
    """Convert every usable IGLU pair and write the examples as JSONL."""
    print("Loading IGLU dataset...")
    pairs = load_iglu_pairs()
    print(f"Found {len(pairs)} instruction-build pairs")

    if not pairs:
        print("No data found. Make sure iglu-repo is cloned in data/external/")
        return

    examples = []
    skipped = 0

    # Process with variety -- cycle through the block variants.
    variants = list(BLOCK_VARIANTS)

    for idx, pair in enumerate(pairs):
        variant = variants[idx % len(variants)]
        ex = convert_pair_to_example(pair, idx, variant)
        if ex:
            examples.append(ex)
        else:
            skipped += 1

        if (idx + 1) % 500 == 0:
            print(f"  Processed {idx+1}/{len(pairs)}, generated {len(examples)}")

    # Stats
    direct = sum(1 for e in examples if e['type'] == 'build_direct')
    script = sum(1 for e in examples if e['type'] == 'build_script')
    avg_blocks = sum(e['block_changes'] for e in examples) / max(len(examples), 1)

    print(f"\nGenerated {len(examples)} examples (skipped {skipped} empty)")
    print(f"  Direct (1-{DIRECT_BUILD_MAX_CHANGES} blocks): {direct}")
    print(f"  Script ({DIRECT_BUILD_MAX_CHANGES + 1}+ blocks): {script}")
    print(f"  Avg block changes: {avg_blocks:.1f}")

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: records are dumped with ensure_ascii=False, so a
    # locale-default encoding could fail on non-ASCII instruction text.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    print(f"\nWritten to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()