0.6.0 training session: Oracle Bot, RL combat, Mind's Eye, multilingual pipeline

Major changes from this session: Training: - 0.6.0 training running: 9B on steel141 3090 Ti, 27B on rented H100 NVL - 7,256 merged training examples (up from 3,183) - New training data: failure modes (85), midloop messaging (27), prompt injection defense (29), personality (32), gold from quarantine bank (232), new tool examples (30), claude's own experience (10) - All training data RCON-validated at 100% pass rate - Bake-off: gemma3:27b 66%, qwen3.5:27b 61%, translategemma:27b 56% Oracle Bot (Mind's Eye): - Invisible spectator bot (mineflayer) streams world state via WebSocket - HTML5 Canvas frontend at mind.mortdec.ai - Real-time tool trace visualization with expandable entries - Streaming model tokens during inference - Gateway integration: fire-and-forget POST /trace on every tool call Reinforcement Learning: - Gymnasium environment wrapping mineflayer bot (minecraft_env.py) - PPO training via Stable Baselines3 (10K param policy network) - Behavioral cloning pretraining (97.5% accuracy on expert policy) - Infinite training loop with auto-restart and checkpoint resume - Bot learns combat, survival, navigation from raw experience Bot Army: - 8-soldier marching formation with autonomous combat - Combat bots using mineflayer-pvp, pathfinder, armor-manager - Multilingual prayer bots via translategemma:27b (18 languages) - Frame-based AI architecture: LLM planner + reactive micro-scripts Infrastructure: - Fixed mattpc.sethpc.xyz billing gateway (API key + player list parser) - Billing gateway now tracks all LAN traffic (LAN auto-auth) - Gateway fallback for empty god-mode responses - Updated mortdec.ai landing page Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-22 20:22:50 -04:00
parent baab24f8b1
commit 5b28002001
44 changed files with 20873 additions and 4352 deletions
@@ -0,0 +1,342 @@
+#!/usr/bin/env python3
+"""
+minecraft_env.py — Gymnasium environment wrapping a mineflayer bot.
+
+The bot runs in a Node.js subprocess, communicating via stdin/stdout JSON.
+The Python Gym env sends actions and receives observations at ~600ms ticks.
+
+Usage:
+    from minecraft_env import MinecraftCombatEnv
+    env = MinecraftCombatEnv()
+    obs, info = env.reset()
+    while True:
+        action = env.action_space.sample()  # or policy(obs)
+        obs, reward, terminated, truncated, info = env.step(action)
+"""
+
+import json
+import os
+import queue
+import signal
+import subprocess
+import threading
+import time
+from pathlib import Path
+
+import gymnasium as gym
+import numpy as np
+from gymnasium import spaces
+
+# Directory that holds the Node.js bot scripts: "ingame" under the third
+# ancestor directory of this (symlink-resolved) module file.
+INGAME_DIR = Path(__file__).resolve().parent.parent.parent.joinpath("ingame")
+
+
+class MinecraftCombatEnv(gym.Env):
+    """Minecraft combat survival environment driven by a mineflayer bot.
+
+    The bot runs as a Node.js child process (``rl_bot.js``).  Commands are
+    written to its stdin, one per line, and the environment expects one JSON
+    object per line on its stdout.  ``reset`` (re)starts the bot; ``step``
+    sends one action name, waits ``tick_rate`` seconds and scores the state.
+
+    Observation (13 float32 values, each clipped to [0, 1]):
+        hp, food, nearest_mob_dist, nearest_mob_angle, mob_count,
+        has_sword, has_armor, has_food, is_day, on_water,
+        y_level_norm, damage_taken_this_tick, is_fleeing
+
+    Action: an index into ``ACTIONS``.
+    """
+
+    metadata = {"render_modes": ["human"], "render_fps": 2}
+
+    # Discrete actions: the action index is the position in this list and the
+    # string is the command line written to the bot.
+    ACTIONS = ["forward", "fight", "flee", "eat", "sprint", "idle"]
+
+    # Hostile mob types.
+    # NOTE(review): not referenced inside this class; hostility is read from
+    # each mob's own "hostile" flag. Kept because it is a public attribute.
+    HOSTILE = {
+        "zombie", "husk", "skeleton", "creeper", "spider", "cave_spider",
+        "witch", "enderman", "drowned", "stray", "phantom", "parched",
+        "camel_husk", "slime", "magma_cube",
+    }
+
+    # Substrings that mark an inventory entry as edible.
+    _FOOD_KEYWORDS = ("beef", "bread", "pork", "chicken", "apple", "potato", "cod")
+
+    _OBS_SIZE = 13
+    _REPLY_TIMEOUT = 5.0   # seconds to wait for the answer to one command
+    _SPAWN_TIMEOUT = 30.0  # seconds to wait for the bot's first message
+
+    def __init__(
+        self,
+        host="192.168.0.244",
+        port=25568,
+        username="RLBot",
+        max_steps=600,       # 600 ticks × 0.6s = 6 minutes per episode
+        tick_rate=0.6,       # seconds per tick (sword cooldown rate)
+        render_mode=None,
+    ):
+        """Store connection settings and define the spaces.
+
+        No process is started here; the bot is launched by ``reset``.
+
+        Args:
+            host: Minecraft server address passed to the bot script.
+            port: Minecraft server port passed to the bot script.
+            username: Player name passed to the bot script.
+            max_steps: Steps after which an episode is truncated.
+            tick_rate: Seconds slept in every ``step`` after sending the action.
+            render_mode: ``"human"`` prints a status line after every step.
+        """
+        super().__init__()
+        self.host = host
+        self.port = port
+        self.username = username
+        self.max_steps = max_steps
+        self.tick_rate = tick_rate
+        self.render_mode = render_mode
+
+        # Observation space: 13 floats normalized to [0, 1]
+        self.observation_space = spaces.Box(
+            low=0.0, high=1.0, shape=(self._OBS_SIZE,), dtype=np.float32
+        )
+
+        # Action space: one discrete choice per entry of ACTIONS
+        self.action_space = spaces.Discrete(len(self.ACTIONS))
+
+        # Internal state
+        self.proc = None
+        self.step_count = 0
+        self.total_reward = 0
+        self.kills = 0
+        self.prev_hp = 20.0
+        self.prev_food = 20
+        self.alive = False
+        self.last_obs = None
+
+        # Lines read from the bot's stdout by a background thread; a ``None``
+        # entry marks end-of-stream.
+        self._lines = None
+        self._reader = None
+
+    # ------------------------------------------------------------------
+    # Small helpers
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _num(value, default):
+        """Return ``value`` as a float, or ``default`` if it is unusable.
+
+        Unlike ``value or default`` this keeps a legitimate 0 (dead bot,
+        empty hunger bar, y == 0) instead of replacing it with the default.
+        ``None``, non-numeric values and NaN/inf fall back to ``default``.
+        """
+        try:
+            number = float(value)
+        except (TypeError, ValueError, OverflowError):
+            return default
+        return number if np.isfinite(number) else default
+
+    @classmethod
+    def _hostile_distances(cls, data):
+        """Return the distance of every mob flagged ``hostile`` in ``data``.
+
+        A hostile mob without a usable ``dist`` is reported at infinity so it
+        still counts as present but never as nearby.
+        """
+        mobs = data.get("mobs")
+        if not isinstance(mobs, list):
+            return []
+        return [
+            cls._num(mob.get("dist"), float("inf"))
+            for mob in mobs
+            if isinstance(mob, dict) and mob.get("hostile")
+        ]
+
+    @staticmethod
+    def _pump_stdout(stream, sink):
+        """Thread target: copy lines from ``stream`` into the queue ``sink``.
+
+        Always finishes by queueing ``None`` so readers can tell that the
+        stream ended, and closes ``stream`` on the way out.
+        """
+        try:
+            with stream:
+                for line in stream:
+                    sink.put(line)
+        except (OSError, ValueError):
+            # Pipe broke or was closed underneath us: treat as end-of-stream.
+            pass
+        finally:
+            sink.put(None)
+
+    def _read_json(self, timeout, accept=None):
+        """Return the next JSON object from the bot, or ``None``.
+
+        Blank lines, lines that are not valid JSON, JSON values that are not
+        objects and objects rejected by ``accept`` are skipped.
+
+        Args:
+            timeout: Maximum number of seconds to wait in total.
+            accept: Optional predicate; only objects it approves are returned.
+
+        Returns:
+            The decoded dict, or ``None`` on timeout, end-of-stream or when no
+            bot is running.
+        """
+        lines = self._lines
+        if lines is None:
+            return None
+
+        deadline = time.monotonic() + timeout
+        while True:
+            remaining = deadline - time.monotonic()
+            if remaining <= 0:
+                return None
+            try:
+                line = lines.get(timeout=remaining)
+            except queue.Empty:
+                return None
+            if line is None:
+                # End-of-stream: put the marker back so later reads also
+                # return immediately instead of waiting for the timeout.
+                lines.put(None)
+                return None
+
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            if isinstance(data, dict) and (accept is None or accept(data)):
+                return data
+
+    # ------------------------------------------------------------------
+    # Bot process management
+    # ------------------------------------------------------------------
+
+    def _start_bot(self):
+        """Start the mineflayer bot subprocess and its stdout reader thread.
+
+        Raises:
+            FileNotFoundError: if ``rl_bot.js`` is missing from INGAME_DIR.
+        """
+        # Always clean up the previous bot, even one that already exited, so
+        # its pipes are released before the handle is replaced.
+        self._stop_bot()
+
+        bot_script = INGAME_DIR / "rl_bot.js"
+        if not bot_script.is_file():
+            # stderr is discarded below, so without this check a missing
+            # script would only show up as an environment that never answers.
+            raise FileNotFoundError(f"mineflayer bot script not found: {bot_script}")
+
+        self.proc = subprocess.Popen(
+            ["node", str(bot_script), self.host, str(self.port), self.username],
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
+            text=True,
+            bufsize=1,  # line buffered
+        )
+
+        # readline() on the pipe cannot time out, so a daemon thread feeds a
+        # queue and every consumer waits on the queue with a real deadline.
+        self._lines = queue.Queue()
+        self._reader = threading.Thread(
+            target=self._pump_stdout,
+            args=(self.proc.stdout, self._lines),
+            name="rl-bot-stdout",
+            daemon=True,
+        )
+        self._reader.start()
+
+    def _stop_bot(self):
+        """Stop the bot subprocess and release its pipes.
+
+        Sends a ``quit`` line, waits up to 3 seconds and kills the process if
+        it is still running.  Safe to call when no bot exists.
+        """
+        proc, self.proc = self.proc, None
+        reader, self._reader = self._reader, None
+        self._lines = None
+        if proc is None:
+            return
+
+        try:
+            if proc.poll() is None:
+                proc.stdin.write("quit\n")
+                proc.stdin.flush()
+                proc.wait(timeout=3)
+        except (OSError, ValueError, subprocess.TimeoutExpired):
+            # Pipe already closed or the bot ignored "quit": kill it below.
+            pass
+        finally:
+            if proc.poll() is None:
+                proc.kill()
+                proc.wait()  # reap the child so it does not linger as a zombie
+            try:
+                proc.stdin.close()
+            except (OSError, ValueError):
+                pass
+            if reader is not None:
+                # The reader thread closes stdout itself once it sees EOF.
+                reader.join(timeout=1.0)
+
+    def _send(self, cmd):
+        """Send one command line to the bot and return its JSON reply.
+
+        Returns:
+            The first JSON object received within ``_REPLY_TIMEOUT`` seconds,
+            or ``None`` if the bot is not running, the pipe is broken or no
+            reply arrived in time.
+        """
+        proc = self.proc
+        if proc is None or proc.poll() is not None:
+            return None
+        try:
+            proc.stdin.write(cmd + "\n")
+            proc.stdin.flush()
+        except (OSError, ValueError):  # BrokenPipeError is an OSError
+            return None
+        return self._read_json(self._REPLY_TIMEOUT)
+
+    # ------------------------------------------------------------------
+    # Observation and reward
+    # ------------------------------------------------------------------
+
+    def _parse_observation(self, data):
+        """Convert a JSON bot state into the 13-element observation vector.
+
+        Returns all zeros when ``data`` is empty or has no ``"hp"`` key.
+        Reads ``self.prev_hp`` for the damage feature, so it must be called
+        before ``_calc_reward`` updates that value.
+        """
+        if not data or "hp" not in data:
+            return np.zeros(self._OBS_SIZE, dtype=np.float32)
+
+        # A null hp is treated as 0, the same way the reward does.
+        current_hp = self._num(data.get("hp"), 0.0)
+        food = self._num(data.get("food"), 0.0)
+
+        # Nearest hostile mob, normalized over 24 blocks
+        distances = self._hostile_distances(data)
+        nearest_dist = min(min(distances) / 24.0, 1.0) if distances else 1.0
+        # Angle is not provided by this code path; 0.5 stands for "forward".
+        nearest_angle = 0.5
+        mob_count = min(len(distances) / 10.0, 1.0)
+
+        # Inventory flags (substring tests on the "inv" field)
+        inv = data.get("inv") or ""
+        has_sword = 1.0 if "sword" in inv else 0.0
+        has_armor = 1.0 if data.get("armor", "none") != "none" else 0.0
+        has_food = 1.0 if any(key in inv for key in self._FOOD_KEYWORDS) else 0.0
+
+        # World state
+        is_day = 1.0 if data.get("day", True) else 0.0
+        on_water = 1.0 if data.get("below", "") == "water" else 0.0
+
+        # Y level, clamped to 0..320 and scaled to 0..1 (64 when unknown)
+        pos = data.get("pos")
+        y = self._num(pos.get("y"), 64.0) if isinstance(pos, dict) else 64.0
+        y_norm = min(max(y, 0.0), 320.0) / 320.0
+
+        # Damage taken since the previous step
+        prev_hp = self._num(self.prev_hp, 20.0)
+        damage = max(0.0, prev_hp - current_hp) / 20.0
+
+        # Low-health flag (HP < 5)
+        is_fleeing = 1.0 if current_hp < 5 else 0.0
+
+        obs = np.array([
+            current_hp / 20.0, food / 20.0, nearest_dist, nearest_angle, mob_count,
+            has_sword, has_armor, has_food, is_day, on_water,
+            y_norm, damage, is_fleeing,
+        ], dtype=np.float32)
+
+        # Keep the vector inside the declared Box even if the bot reports
+        # values above the nominal maximum (e.g. hp > 20).
+        return np.clip(obs, 0.0, 1.0)
+
+    def _calc_reward(self, data, action):
+        """Calculate the reward for one state transition.
+
+        Updates ``self.kills``, ``self.prev_hp`` and ``self.prev_food`` as a
+        side effect (unless ``data`` is unusable).
+
+        Args:
+            data: JSON bot state after the action.
+            action: Index into ``ACTIONS`` that was executed.
+
+        Returns:
+            The reward as a float; -100.0 when ``data`` is empty or has no
+            ``"hp"`` key.
+        """
+        if not data or "hp" not in data:
+            return -100.0  # lost connection = death equivalent
+
+        hp = self._num(data.get("hp"), 0.0)
+        food = int(self._num(data.get("food"), 20))
+        prev_hp = self._num(self.prev_hp, 20.0)
+        prev_food = int(self._num(self.prev_food, 20))
+
+        # Survival reward: +1 per tick alive
+        reward = 1.0
+
+        # Damage penalty: -2 per HP lost
+        damage = max(0.0, prev_hp - hp)
+        if damage > 0:
+            reward -= damage * 2.0
+
+        # Death penalty
+        if hp <= 0 or data.get("died", False):
+            reward -= 100.0
+
+        # Kill reward: +10 per kill since the previous step.  A missing
+        # counter leaves the running total unchanged.
+        new_kills = int(self._num(data.get("kills"), self.kills))
+        kills_this_tick = new_kills - self.kills
+        if kills_this_tick > 0:
+            reward += kills_this_tick * 10.0
+        self.kills = new_kills
+
+        # Eating (action 3) when hungry: good
+        if action == 3 and prev_food < 14 and food > prev_food:
+            reward += 5.0
+
+        # Eating when full: wasted action
+        if action == 3 and prev_food >= 18:
+            reward -= 1.0
+
+        # Fighting (action 1) when no hostile mob is within 6 blocks: wasted
+        hostile_nearby = any(dist < 6 for dist in self._hostile_distances(data))
+        if action == 1 and not hostile_nearby:
+            reward -= 0.5
+
+        # Fleeing (action 2) when HP is low and mobs are nearby: good decision
+        if action == 2 and hp < 8 and hostile_nearby:
+            reward += 3.0
+
+        # Idle (action 5) while threats exist
+        if action == 5 and hostile_nearby:
+            reward -= 2.0
+
+        # Update state for the next transition
+        self.prev_hp = hp
+        self.prev_food = food
+
+        return reward
+
+    # ------------------------------------------------------------------
+    # Gymnasium API
+    # ------------------------------------------------------------------
+
+    def reset(self, seed=None, options=None):
+        """Reset the environment: restart the bot and begin a new episode.
+
+        Waits up to ``_SPAWN_TIMEOUT`` seconds for a message that either has
+        ``event == "ready"`` or carries ``"hp"``.  If none arrives it sleeps 3
+        seconds and asks once with ``observe``.  If the bot still does not
+        answer, the observation is all zeros and ``info["raw"]`` is ``None``.
+
+        Returns:
+            ``(obs, info)`` where ``info["raw"]`` is the JSON state used.
+        """
+        super().reset(seed=seed)
+
+        self._start_bot()
+
+        # Wait for bot to spawn
+        data = self._read_json(
+            self._SPAWN_TIMEOUT,
+            accept=lambda msg: msg.get("event") == "ready" or "hp" in msg,
+        )
+
+        if data is None:
+            # Fallback: give the bot a moment, then ask explicitly
+            time.sleep(3)
+            data = self._send("observe")
+        elif "hp" not in data:
+            # A bare "ready" event has no state; fetch a real snapshot so the
+            # first observation is not all zeros.
+            snapshot = self._send("observe")
+            if snapshot and "hp" in snapshot:
+                data = snapshot
+
+        self.step_count = 0
+        self.total_reward = 0
+        self.kills = int(self._num(data.get("kills"), 0)) if data else 0
+        self.prev_hp = self._num(data.get("hp"), 20.0) if data else 20.0
+        self.prev_food = int(self._num(data.get("food"), 20)) if data else 20
+        self.alive = True
+
+        obs = self._parse_observation(data)
+        self.last_obs = obs
+        info = {"raw": data}
+
+        return obs, info
+
+    def step(self, action):
+        """Execute one action and return (obs, reward, terminated, truncated, info).
+
+        The episode terminates when the bot reports death (``hp <= 0`` or
+        ``died``) or when no state could be obtained at all (bot process gone
+        or silent), so the -100 "lost connection" reward is given only once.
+        It is truncated after ``max_steps`` steps.
+        """
+        action = int(action)  # accept numpy integer scalars from policies
+        self.step_count += 1
+        action_name = self.ACTIONS[action]
+
+        # Send action to bot
+        data = self._send(action_name)
+
+        # Wait for game tick
+        time.sleep(self.tick_rate)
+
+        # NOTE(review): when the action reply already carries "hp" it is used
+        # as the post-tick state although it was read before the sleep;
+        # confirm rl_bot.js replies only after the action has been carried out.
+        if data is None or "hp" not in data:
+            obs_data = self._send("observe")
+        else:
+            obs_data = data
+
+        # Observation first: it reads prev_hp, which _calc_reward overwrites.
+        obs = self._parse_observation(obs_data)
+        reward = self._calc_reward(obs_data, action)
+        self.total_reward += reward
+
+        # Check termination
+        connection_lost = obs_data is None
+        died = bool(obs_data) and (
+            self._num(obs_data.get("hp"), 0.0) <= 0 or bool(obs_data.get("died", False))
+        )
+        terminated = died or connection_lost
+        if terminated:
+            self.alive = False
+
+        # Check truncation (max steps)
+        truncated = self.step_count >= self.max_steps
+
+        info = {
+            "raw": obs_data,
+            "step": self.step_count,
+            "total_reward": self.total_reward,
+            "kills": self.kills,
+            "alive": self.alive,
+            "connection_lost": connection_lost,
+        }
+
+        self.last_obs = obs
+
+        if self.render_mode == "human":
+            self.render()
+
+        return obs, reward, terminated, truncated, info
+
+    def render(self):
+        """Print a one-line summary of the most recent observation."""
+        if self.last_obs is not None:
+            hp = self.last_obs[0] * 20
+            food = self.last_obs[1] * 20
+            mob_dist = self.last_obs[2] * 24
+            # round() guards against float32 error turning e.g. 7 into 6
+            mob_count = int(round(float(self.last_obs[4]) * 10))
+            print(f"  Step {self.step_count}: HP={hp:.0f} Food={food:.0f} "
+                  f"Mobs={mob_count}@{mob_dist:.0f}b Kills={self.kills} "
+                  f"R={self.total_reward:.1f}")
+
+    def close(self):
+        """Clean up: stop the bot subprocess."""
+        self._stop_bot()