0.6.0 training session: Oracle Bot, RL combat, Mind's Eye, multilingual pipeline

Major changes from this session:

Training:
- 0.6.0 training running: 9B on steel141 3090 Ti, 27B on rented H100 NVL
- 7,256 merged training examples (up from 3,183)
- New training data: failure modes (85), midloop messaging (27),
  prompt injection defense (29), personality (32), gold from quarantine
  bank (232), new tool examples (30), Claude's own experience (10)
- All training data RCON-validated at 100% pass rate
- Bake-off: gemma3:27b 66%, qwen3.5:27b 61%, translategemma:27b 56%

Oracle Bot (Mind's Eye):
- Invisible spectator bot (mineflayer) streams world state via WebSocket
- HTML5 Canvas frontend at mind.mortdec.ai
- Real-time tool trace visualization with expandable entries
- Streaming model tokens during inference
- Gateway integration: fire-and-forget POST /trace on every tool call

Reinforcement Learning:
- Gymnasium environment wrapping mineflayer bot (minecraft_env.py)
- PPO training via Stable Baselines3 (10K param policy network)
- Behavioral cloning pretraining (97.5% accuracy on expert policy)
- Infinite training loop with auto-restart and checkpoint resume
- Bot learns combat, survival, navigation from raw experience

Bot Army:
- 8-soldier marching formation with autonomous combat
- Combat bots using mineflayer-pvp, pathfinder, armor-manager
- Multilingual prayer bots via translategemma:27b (18 languages)
- Frame-based AI architecture: LLM planner + reactive micro-scripts

Infrastructure:
- Fixed mattpc.sethpc.xyz billing gateway (API key + player list parser)
- Billing gateway now tracks all LAN traffic (LAN auto-auth)
- Gateway fallback for empty god-mode responses
- Updated mortdec.ai landing page

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Seth
2026-03-22 20:22:50 -04:00
parent baab24f8b1
commit 5b28002001
44 changed files with 20873 additions and 4352 deletions
View File
+342
View File
@@ -0,0 +1,342 @@
#!/usr/bin/env python3
"""
minecraft_env.py — Gymnasium environment wrapping a mineflayer bot.
The bot runs in a Node.js subprocess, communicating via stdin/stdout JSON.
The Python Gym env sends actions and receives observations at ~600ms ticks.
Usage:
    from minecraft_env import MinecraftCombatEnv
    env = MinecraftCombatEnv()
    obs, info = env.reset()
    while True:
        action = env.action_space.sample()  # or policy(obs)
        obs, reward, terminated, truncated, info = env.step(action)
"""
import json
import subprocess
import time
import os
import signal
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from pathlib import Path
# Directory holding the Node.js bot script (rl_bot.js is resolved against it
# in MinecraftCombatEnv._start_bot): the "ingame" folder two levels above the
# directory that contains this file.
INGAME_DIR = Path(__file__).resolve().parent.parent.parent / "ingame"
class MinecraftCombatEnv(gym.Env):
    """Minecraft combat/survival environment driven by a mineflayer bot.

    The bot is a Node.js subprocess (``rl_bot.js`` under ``INGAME_DIR``)
    started with host, port and username as arguments. Commands are written
    to its stdin one per line (an entry of ``ACTIONS``, ``observe`` or
    ``quit``) and JSON objects are read line by line from its stdout.

    Fields this class reads from the bot's JSON messages: ``hp``, ``food``,
    ``mobs`` (objects with ``hostile`` and ``dist``), ``inv``, ``armor``,
    ``day``, ``below``, ``pos.y``, ``kills``, ``died`` and ``event``.
    """

    metadata = {"render_modes": ["human"], "render_fps": 2}

    # Discrete actions: the list index is the Gym action id, the string is
    # the command sent to the bot.
    ACTIONS = ["forward", "fight", "flee", "eat", "sprint", "idle"]

    # Hostile mob types.
    # NOTE(review): nothing in this class reads this set; hostility is taken
    # from each mob's "hostile" flag as reported by the bot.
    HOSTILE = {
        "zombie", "husk", "skeleton", "creeper", "spider", "cave_spider",
        "witch", "enderman", "drowned", "stray", "phantom", "parched",
        "camel_husk", "slime", "magma_cube",
    }

    def __init__(
        self,
        host="192.168.0.244",
        port=25568,
        username="RLBot",
        max_steps=600,  # 600 ticks x 0.6s = 6 minutes per episode
        tick_rate=0.6,  # seconds per tick (sword cooldown rate)
        render_mode=None,
    ):
        """Store connection settings and define the spaces.

        No subprocess is started here; the bot is launched by ``reset()``.
        """
        super().__init__()
        self.host = host
        self.port = port
        self.username = username
        self.max_steps = max_steps
        self.tick_rate = tick_rate
        self.render_mode = render_mode

        # Observation space: 13 floats normalized to [0, 1]
        # [hp, food, nearest_mob_dist, nearest_mob_angle, mob_count,
        #  has_sword, has_armor, has_food, is_day, on_water,
        #  y_level_norm, damage_taken_this_tick, is_fleeing]
        self.observation_space = spaces.Box(
            low=0.0, high=1.0, shape=(13,), dtype=np.float32
        )
        # Action space: one discrete action per entry of ACTIONS
        self.action_space = spaces.Discrete(len(self.ACTIONS))

        # Internal state
        self.proc = None
        self.step_count = 0
        self.total_reward = 0
        self.kills = 0
        self.prev_hp = 20.0
        self.prev_food = 20
        self.alive = False
        self.last_obs = None

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _num(value, default):
        """Return *value* as a float, or *default* if it is None/not numeric.

        Unlike ``value or default`` this keeps a legitimate ``0`` (dead bot,
        empty food bar, y == 0) instead of replacing it with the default.
        """
        if value is None:
            return float(default)
        try:
            return float(value)
        except (TypeError, ValueError):
            return float(default)

    @staticmethod
    def _has_hp(data):
        """Return True when *data* is a JSON object carrying an ``hp`` key."""
        return isinstance(data, dict) and "hp" in data

    @staticmethod
    def _hostile_distances(data):
        """Return the distances of all mobs flagged hostile in *data*.

        Mobs without a usable numeric ``dist`` are skipped rather than
        raising, so one malformed entry cannot abort a training run.
        """
        distances = []
        for mob in data.get("mobs") or []:
            if not isinstance(mob, dict) or not mob.get("hostile", False):
                continue
            try:
                distances.append(float(mob["dist"]))
            except (KeyError, TypeError, ValueError):
                continue
        return distances

    # ------------------------------------------------------------------
    # Subprocess management
    # ------------------------------------------------------------------
    def _start_bot(self):
        """Start the mineflayer bot subprocess, replacing any previous one."""
        if self.proc is not None:
            # Also covers an already-dead process: reap it and close pipes.
            self._stop_bot()
        bot_script = INGAME_DIR / "rl_bot.js"
        self.proc = subprocess.Popen(
            ["node", str(bot_script), self.host, str(self.port), self.username],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            bufsize=1,  # line buffered
        )

    def _stop_bot(self):
        """Stop the bot subprocess (best effort) and release its pipes.

        Asks the bot to ``quit``, waits up to 3 seconds, then kills it. The
        process is always reaped and both pipes are closed so repeated
        resets do not accumulate zombies or file descriptors.
        """
        proc, self.proc = self.proc, None
        if proc is None:
            return
        try:
            if proc.poll() is None:
                proc.stdin.write("quit\n")
                proc.stdin.flush()
                proc.wait(timeout=3)
        except (OSError, ValueError, subprocess.TimeoutExpired):
            pass  # polite shutdown failed; fall through to kill
        if proc.poll() is None:
            try:
                proc.kill()
                proc.wait(timeout=3)
            except (OSError, subprocess.TimeoutExpired):
                pass
        for pipe in (proc.stdin, proc.stdout):
            if pipe is None:
                continue
            try:
                pipe.close()
            except OSError:
                pass  # e.g. broken pipe while flushing stdin

    def _read_json(self, deadline, accept=None):
        """Read stdout lines until a JSON object is accepted.

        Args:
            deadline: ``time.monotonic()`` value after which to give up.
            accept: optional predicate on the decoded dict; ``None`` accepts
                the first JSON object.

        Returns:
            The accepted dict, or ``None`` on timeout, on EOF (the bot closed
            its stdout) or if there is no running bot.

        Note: ``readline()`` itself blocks, so the deadline is only checked
        between lines; a bot that stays silent without exiting can still
        block this call.
        """
        stream = self.proc.stdout if self.proc is not None else None
        if stream is None:
            return None
        while time.monotonic() < deadline:
            try:
                raw = stream.readline()
            except (OSError, ValueError):
                return None
            if raw == "":
                # EOF, not a blank line: the bot exited. Return at once
                # instead of spinning until the deadline.
                return None
            line = raw.strip()
            if not line:
                continue
            try:
                message = json.loads(line)
            except json.JSONDecodeError:
                continue  # ordinary log output from the bot
            if not isinstance(message, dict):
                continue  # bare numbers/strings are not observations
            if accept is None or accept(message):
                return message
        return None

    def _send(self, cmd):
        """Send *cmd* to the bot and return its next JSON object (or None).

        Returns ``None`` when the bot is not running, the pipe is broken, or
        no JSON object arrives within 5 seconds.
        """
        if self.proc is None or self.proc.stdin is None:
            return None
        try:
            self.proc.stdin.write(cmd + "\n")
            self.proc.stdin.flush()
        except (OSError, ValueError):  # broken pipe / pipe already closed
            return None
        return self._read_json(time.monotonic() + 5.0)

    # ------------------------------------------------------------------
    # Observation and reward
    # ------------------------------------------------------------------
    def _parse_observation(self, data):
        """Convert a JSON bot state into the 13-float observation vector.

        Returns all zeros when *data* is missing or has no ``hp`` key. A
        null ``hp`` or ``food`` is treated as 0. The result is clipped to
        the [0, 1] bounds declared by ``observation_space``.
        """
        if not self._has_hp(data):
            return np.zeros(13, dtype=np.float32)

        hp = self._num(data.get("hp"), 0.0)
        food = self._num(data.get("food"), 0.0)

        # Nearest hostile mob (distance normalized over 24 blocks)
        distances = self._hostile_distances(data)
        nearest_dist = min(min(distances) / 24.0, 1.0) if distances else 1.0
        # The bearing is not derived from the bot data; 0.5 is a constant
        # placeholder ("forward").
        nearest_angle = 0.5
        mob_count = min(len(distances) / 10.0, 1.0)

        # Inventory flags
        inv = data.get("inv") or ""
        has_sword = 1.0 if "sword" in inv else 0.0
        has_armor = 1.0 if data.get("armor", "none") != "none" else 0.0
        edible = ("beef", "bread", "pork", "chicken", "apple", "potato", "cod")
        has_food = 1.0 if any(item in inv for item in edible) else 0.0

        # World state
        is_day = 1.0 if data.get("day", True) else 0.0
        on_water = 1.0 if data.get("below", "") == "water" else 0.0

        # Y level, clamped to 0..320 and scaled to 0..1 (default 64)
        pos = data.get("pos")
        y = self._num(pos.get("y") if isinstance(pos, dict) else None, 64.0)
        y_norm = min(max(y, 0.0), 320.0) / 320.0

        # Damage taken since the previously recorded hp
        prev_hp = self._num(self.prev_hp, 20.0)
        damage = max(0.0, prev_hp - hp) / 20.0

        # Low-health flag (HP < 5)
        is_fleeing = 1.0 if hp < 5 else 0.0

        obs = np.array([
            hp / 20.0, food / 20.0, nearest_dist, nearest_angle, mob_count,
            has_sword, has_armor, has_food, is_day, on_water,
            y_norm, damage, is_fleeing,
        ], dtype=np.float32)
        return np.clip(obs, 0.0, 1.0)

    def _calc_reward(self, data, action):
        """Compute the reward for one transition and update tracked state.

        Terms: +1 per tick, -2 per HP lost, -100 on death, +10 per new kill,
        +5 for eating when hungry, -1 for eating when nearly full, -0.5 for
        fighting with no hostile within 6 blocks, +3 for fleeing at low HP
        with a hostile nearby, -2 for idling with a hostile nearby.

        Returns -100.0 when *data* is missing or has no ``hp`` key. Updates
        ``self.kills``, ``self.prev_hp`` and ``self.prev_food``.
        """
        if not self._has_hp(data):
            return -100.0  # lost connection = death equivalent

        hp = self._num(data.get("hp"), 0.0)
        food = int(self._num(data.get("food"), 20.0))
        prev_hp = self._num(self.prev_hp, 20.0)
        prev_food = int(self._num(self.prev_food, 20.0))

        # Survival reward: +1 per tick alive
        reward = 1.0

        # Damage penalty: -2 per HP lost (zero damage subtracts nothing)
        reward -= max(0.0, prev_hp - hp) * 2.0

        # Death penalty
        if hp <= 0 or data.get("died", False):
            reward -= 100.0

        # Kill reward. A message without a kill count leaves the counter
        # untouched so earlier kills are not rewarded again later.
        reported_kills = data.get("kills")
        if reported_kills is not None:
            new_kills = int(self._num(reported_kills, self.kills))
            gained = new_kills - self.kills
            if gained > 0:
                reward += gained * 10.0
            self.kills = new_kills

        # Eating when hungry: good
        if action == 3 and prev_food < 14 and food > prev_food:
            reward += 5.0
        # Eating when full: wasted action
        if action == 3 and prev_food >= 18:
            reward -= 1.0

        hostile_nearby = any(d < 6 for d in self._hostile_distances(data))
        # Fighting when no mobs nearby: wasted
        if action == 1 and not hostile_nearby:
            reward -= 0.5
        # Fleeing when HP is low and mobs nearby: good decision
        if action == 2 and hp < 8 and hostile_nearby:
            reward += 3.0
        # Idle penalty (doing nothing when threats exist)
        if action == 5 and hostile_nearby:
            reward -= 2.0

        # Update state
        self.prev_hp = hp
        self.prev_food = food
        return reward

    # ------------------------------------------------------------------
    # Gymnasium API
    # ------------------------------------------------------------------
    def reset(self, seed=None, options=None):
        """Restart the bot and begin a new episode.

        Waits up to 30 seconds for a ``ready`` event or a message carrying
        ``hp``; if none arrives, waits 3 seconds and requests one with
        ``observe``. Returns ``(obs, info)`` where ``info["raw"]`` is the
        bot message used (possibly ``None``).
        """
        super().reset(seed=seed)
        self._start_bot()

        # Wait for bot to spawn
        data = self._read_json(
            time.monotonic() + 30.0,
            accept=lambda msg: msg.get("event") == "ready" or "hp" in msg,
        )
        if not data:
            # Fallback: send observe
            time.sleep(3)
            data = self._send("observe")

        state = data if isinstance(data, dict) else {}
        self.step_count = 0
        self.total_reward = 0
        self.kills = int(self._num(state.get("kills"), 0))
        self.prev_hp = self._num(state.get("hp"), 20.0)
        self.prev_food = int(self._num(state.get("food"), 20.0))
        self.alive = True

        obs = self._parse_observation(data)
        self.last_obs = obs
        return obs, {"raw": data}

    def step(self, action):
        """Execute one action; return (obs, reward, terminated, truncated, info).

        The episode terminates when the bot reports death or ``hp <= 0``, and
        also when no usable observation could be obtained (bot exited or
        stopped answering), matching the death-equivalent reward given for
        that case.
        """
        action = int(action)  # accept numpy integer scalars / 0-d arrays
        self.step_count += 1

        # Send action to bot
        data = self._send(self.ACTIONS[action])

        # Wait for game tick
        time.sleep(self.tick_rate)

        # Use the action's reply if it carries state, else ask for one
        obs_data = data if self._has_hp(data) else self._send("observe")

        obs = self._parse_observation(obs_data)
        reward = self._calc_reward(obs_data, action)
        self.total_reward += reward

        # Check termination
        if self._has_hp(obs_data):
            hp = self._num(obs_data.get("hp"), 0.0)
            terminated = hp <= 0 or bool(obs_data.get("died", False))
        else:
            terminated = True  # connection lost: end the episode
        if terminated:
            self.alive = False

        # Check truncation (max steps)
        truncated = self.step_count >= self.max_steps

        info = {
            "raw": obs_data,
            "step": self.step_count,
            "total_reward": self.total_reward,
            "kills": self.kills,
            "alive": self.alive,
        }
        self.last_obs = obs

        if self.render_mode == "human":
            self.render()
        return obs, reward, terminated, truncated, info

    def render(self):
        """Print a one-line summary of the last observation."""
        if self.last_obs is not None:
            hp = self.last_obs[0] * 20
            food = self.last_obs[1] * 20
            mob_dist = self.last_obs[2] * 24
            mob_count = int(self.last_obs[4] * 10)
            print(f" Step {self.step_count}: HP={hp:.0f} Food={food:.0f} "
                  f"Mobs={mob_count}@{mob_dist:.0f}b Kills={self.kills} "
                  f"R={self.total_reward:.1f}")

    def close(self):
        """Clean up by stopping the bot subprocess."""
        self._stop_bot()
+184
View File
@@ -0,0 +1,184 @@
#!/usr/bin/env python3
"""
pretrain_policy.py — Give the RL policy a head start via behavioral cloning.
Generates synthetic expert demonstrations from our hand-coded survival rules,
then trains the policy network to imitate them. The resulting weights become
the starting point for PPO (instead of random initialization).
Usage:
python3 training/rl/pretrain_policy.py
# Then run train_combat.py — it will load the pretrained checkpoint
"""
import numpy as np
import torch
import torch.nn as nn
from pathlib import Path
from stable_baselines3 import PPO
from gymnasium import spaces
# Project root: two directories above the one containing this file (the usage
# note in the module docstring runs this script as training/rl/pretrain_policy.py).
ROOT = Path(__file__).resolve().parent.parent.parent
# Where pretrain() writes combat_ppo_pretrained.zip; created on demand there.
CKPT_DIR = ROOT / "training" / "rl" / "checkpoints"
# Actions: 0=forward, 1=fight, 2=flee, 3=eat, 4=sprint, 5=idle
# Obs: [hp, food, nearest_mob_dist, nearest_mob_angle, mob_count,
#       has_sword, has_armor, has_food, is_day, on_water,
#       y_level_norm, damage_taken, is_fleeing]
def expert_action(obs):
    """Return the hand-coded expert's action id for one observation.

    Args:
        obs: indexable observation in the layout
            [hp, food, nearest_mob_dist, nearest_mob_angle, mob_count,
             has_sword, has_armor, has_food, is_day, on_water,
             y_level_norm, damage_taken, is_fleeing], all scaled to 0..1.
            Only indices 0, 1, 2, 4, 5, 7 and 11 are read.

    Returns:
        Action id: 0=forward, 1=fight, 2=flee, 3=eat, 4=sprint, 5=idle.
        Rules are checked in priority order and the first match wins.
    """
    hp = obs[0]         # 0-1 (0=dead, 1=full)
    food = obs[1]       # 0-1
    mob_dist = obs[2]   # 0-1 (0=right here, 1=24+ blocks)
    mob_count = obs[4]  # 0-1 (0=none, 1=10+)
    has_sword = obs[5]  # 0 or 1
    has_food = obs[7]   # 0 or 1
    damage = obs[11]    # 0-1

    # PRIORITY 1: critical HP (< 5 HP) -> eat if that can help, else flee
    if hp < 0.25:
        if has_food and food < 0.7:
            return 3  # eat
        return 2  # flee

    # PRIORITY 2: flee if overwhelmed (3+ mobs and not full HP)
    if mob_count > 0.3 and hp < 0.6:
        return 2  # flee

    # PRIORITY 3: eat if hungry, carrying food and hurt
    if food < 0.7 and has_food and hp < 0.8:
        return 3  # eat

    # PRIORITY 4: fight if a mob is within melee range (< 6 blocks)
    if mob_dist < 0.25 and has_sword:
        return 1  # fight

    # PRIORITY 5: approach a mob that is near but not in melee (< 12 blocks)
    if mob_dist < 0.5 and has_sword:
        return 0  # forward (approach)

    # PRIORITY 6: sprint if taking damage (dodge)
    if damage > 0:
        return 4  # sprint

    # PRIORITY 7: explore when no mob is nearby
    if mob_dist > 0.8:
        return 0  # forward

    # Default: idle
    return 5
def generate_expert_data(n_samples=50000, rng=None, policy=None):
    """Generate random observations labelled with a scripted policy's actions.

    Args:
        n_samples: number of (observation, action) pairs to generate.
        rng: optional random source exposing ``beta``, ``uniform`` and
            ``random`` (e.g. ``np.random.default_rng(seed)`` or a
            ``RandomState``). ``None`` uses NumPy's global ``np.random``.
        policy: optional callable mapping a 13-float observation to an
            action id. ``None`` uses ``expert_action``. The observation it
            receives is a row of the returned array, so it should only read.

    Returns:
        ``(obs, actions)``: a float32 array of shape ``(n_samples, 13)`` and
        an int64 array of shape ``(n_samples,)``; both keep these shapes
        when ``n_samples`` is 0.
    """
    rng = np.random if rng is None else rng
    policy = expert_action if policy is None else policy
    n_samples = max(int(n_samples), 0)

    obs_data = np.zeros((n_samples, 13), dtype=np.float32)
    act_data = np.zeros(n_samples, dtype=np.int64)

    for i in range(n_samples):
        # Random observation (covering the full state space). The draw order
        # is fixed so a seeded run is reproducible.
        row = obs_data[i]
        row[0] = rng.beta(2, 1)                  # hp: skew toward higher
        row[1] = rng.beta(2, 1)                  # food: skew toward higher
        row[2] = rng.uniform(0, 1)               # mob distance
        row[3] = rng.uniform(0, 1)               # mob angle
        row[4] = rng.beta(1, 3)                  # mob count: skew toward fewer
        row[5] = float(rng.random() > 0.3)       # has_sword: 70% chance
        row[6] = float(rng.random() > 0.4)       # has_armor: 60% chance
        row[7] = float(rng.random() > 0.3)       # has_food: 70% chance
        row[8] = float(rng.random() > 0.4)       # is_day: 60% chance
        row[9] = float(rng.random() > 0.85)      # on_water: 15% chance
        row[10] = rng.uniform(0.15, 0.3)         # y_level: surface range
        row[11] = rng.beta(1, 5)                 # damage: skew toward low
        row[12] = float(row[0] < 0.25)           # is_fleeing
        act_data[i] = policy(row)

    return obs_data, act_data
def pretrain(n_samples=50000, n_epochs=20, batch_size=256, lr=1e-3):
    """Behavior-clone the scripted expert into a PPO policy and save it.

    Generates ``n_samples`` expert demonstrations, builds a PPO model with a
    [64, 64] MLP policy on a dummy environment that only supplies the spaces,
    trains the actor head with cross-entropy against the expert actions, and
    saves the model to ``CKPT_DIR / "combat_ppo_pretrained.zip"``.

    Args:
        n_samples: number of demonstrations to generate (must be >= 1).
        n_epochs: passes over the demonstrations.
        batch_size: minibatch size (must be >= 1).
        lr: Adam learning rate.

    Raises:
        ValueError: if ``n_samples`` or ``batch_size`` is smaller than 1.
    """
    if n_samples < 1 or batch_size < 1:
        raise ValueError("n_samples and batch_size must both be >= 1")

    print(f"Generating {n_samples:,} expert demonstrations...")
    obs_data, act_data = generate_expert_data(n_samples)

    # Show action distribution
    unique, counts = np.unique(act_data, return_counts=True)
    action_names = ["forward", "fight", "flee", "eat", "sprint", "idle"]
    print("\nExpert action distribution:")
    for a, c in zip(unique, counts):
        print(f" {action_names[a]:10} {c:6} ({c/len(act_data)*100:.1f}%)")

    # Create a PPO model with the same architecture
    import gymnasium as gym

    class DummyMCEnv(gym.Env):
        """Stand-in env: only provides the spaces PPO needs to build a policy."""

        metadata = {"render_modes": []}

        def __init__(self):
            self.observation_space = spaces.Box(low=0, high=1, shape=(13,), dtype=np.float32)
            self.action_space = spaces.Discrete(6)

        def reset(self, **kw):
            return np.zeros(13, dtype=np.float32), {}

        def step(self, a):
            return np.zeros(13, dtype=np.float32), 0, True, False, {}

    dummy_env = DummyMCEnv()
    model = PPO(
        "MlpPolicy", dummy_env, verbose=0,
        policy_kwargs={"net_arch": [64, 64]},
    )

    # Extract the policy network and train via supervised learning
    policy = model.policy
    optimizer = torch.optim.Adam(policy.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # PPO may have placed the policy on an accelerator; the training tensors
    # must live on the same device or the forward pass fails.
    device = model.device
    obs_tensor = torch.as_tensor(obs_data, dtype=torch.float32, device=device)
    act_tensor = torch.as_tensor(act_data, dtype=torch.long, device=device)

    print(f"\nPretraining policy ({sum(p.numel() for p in policy.parameters()):,} params)...")
    n_total = len(obs_tensor)
    for epoch in range(n_epochs):
        # Shuffle
        perm = torch.randperm(n_total, device=device)
        total_loss = 0
        correct = 0
        n_batches = 0
        for i in range(0, n_total, batch_size):
            idx = perm[i:i+batch_size]
            batch_obs = obs_tensor[idx]
            batch_act = act_tensor[idx]

            # Forward through the actor path of the policy network
            features = policy.extract_features(batch_obs, policy.pi_features_extractor)
            latent = policy.mlp_extractor.forward_actor(features)
            logits = policy.action_net(latent)

            loss = criterion(logits, batch_act)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            correct += (logits.argmax(dim=1) == batch_act).sum().item()
            n_batches += 1

        # Accuracy is accumulated while the weights are still being updated,
        # so it is a running figure for the epoch, not a held-out score.
        accuracy = correct / n_total * 100
        avg_loss = total_loss / n_batches
        print(f" Epoch {epoch+1:2d}/{n_epochs}: loss={avg_loss:.4f} accuracy={accuracy:.1f}%")

    # Save the pretrained model
    CKPT_DIR.mkdir(parents=True, exist_ok=True)
    save_path = CKPT_DIR / "combat_ppo_pretrained.zip"
    model.save(str(save_path))
    print(f"\nPretrained model saved to {save_path}")
    print("PPO will resume from this checkpoint and improve via RL.")
# Script entry point: run the behavioral-cloning pretraining.
if __name__ == "__main__":
pretrain()
+178
View File
@@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
train_combat.py — Train a small policy network for Minecraft combat via PPO.
The agent learns to fight, flee, eat, and survive in a hostile Minecraft world.
Uses the MinecraftCombatEnv gymnasium wrapper which controls a mineflayer bot.
Usage:
# Install deps first:
pip install gymnasium stable-baselines3 torch
# Train (on steel141 with mc-train conda env):
python3 training/rl/train_combat.py
# Train with custom settings:
python3 training/rl/train_combat.py --timesteps 50000 --host 192.168.0.244 --port 25568
# Evaluate a trained model:
python3 training/rl/train_combat.py --eval --model training/rl/checkpoints/combat_ppo.zip
"""
import argparse
import os
import sys
from pathlib import Path
# Add the project root (two directories above the one containing this file)
# to sys.path so the `training.rl.minecraft_env` imports inside train() and
# evaluate() resolve when this file is run as a script.
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))
def train(args):
    """Train (or resume training) the PPO combat policy.

    Reads ``host``, ``port``, ``timesteps``, ``max_steps``, ``save_freq`` and
    ``verbose`` from *args*. If any ``combat_ppo_*.zip`` exists in the
    checkpoint directory, the most recently modified one is loaded and
    training continues from it; otherwise a fresh model is created. The final
    model is written to ``combat_ppo_final.zip``. The environment is always
    closed, even when training fails.
    """
    from stable_baselines3 import PPO
    from stable_baselines3.common.callbacks import CheckpointCallback
    from training.rl.minecraft_env import MinecraftCombatEnv

    learning_rate = 3e-4  # can adjust between runs

    print("=== Minecraft RL Combat Training ===")
    print(f"Host: {args.host}:{args.port}")
    print(f"Timesteps: {args.timesteps}")
    print("Policy: MlpPolicy (3-layer MLP)")
    print()

    # Checkpointing
    ckpt_dir = ROOT / "training" / "rl" / "checkpoints"
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    tb_log_dir = str(ckpt_dir / "tb_logs")
    checkpoint_cb = CheckpointCallback(
        save_freq=args.save_freq,
        save_path=str(ckpt_dir),
        name_prefix="combat_ppo",
    )

    # Check for an existing checkpoint to resume from (newest by mtime)
    newest = max(
        ckpt_dir.glob("combat_ppo_*.zip"),
        key=lambda p: p.stat().st_mtime,
        default=None,
    )
    latest_ckpt = str(newest) if newest is not None else None
    if latest_ckpt:
        print(f"RESUMING from: {latest_ckpt}")

    # Create environment
    env = MinecraftCombatEnv(
        host=args.host,
        port=args.port,
        username=f"RLBot_{os.getpid() % 100}",
        max_steps=args.max_steps,
        render_mode="human" if args.verbose else None,
    )

    try:
        if latest_ckpt:
            # Load existing model and continue training. The learning rate is
            # overridden through custom_objects so it is in place before the
            # schedule is rebuilt; assigning model.learning_rate after load()
            # would leave the already-built schedule untouched.
            model = PPO.load(
                latest_ckpt,
                env=env,
                tensorboard_log=tb_log_dir,
                custom_objects={"learning_rate": learning_rate},
            )
        else:
            # Fresh model
            model = PPO(
                "MlpPolicy",
                env,
                verbose=1,
                learning_rate=learning_rate,
                n_steps=256,      # collect 256 steps before update
                batch_size=64,
                n_epochs=4,
                gamma=0.99,       # discount factor
                gae_lambda=0.95,
                clip_range=0.2,
                ent_coef=0.01,    # entropy bonus for exploration
                policy_kwargs={
                    "net_arch": [64, 64],  # 2 hidden layers of 64 units
                },
                tensorboard_log=tb_log_dir,
            )

        print(f"Policy network params: {sum(p.numel() for p in model.policy.parameters()):,}")
        print(f"Training for {args.timesteps} timesteps...")
        print()

        try:
            model.learn(
                total_timesteps=args.timesteps,
                callback=checkpoint_cb,
                progress_bar=True,
                # When resuming, keep the timestep counter so this run adds
                # args.timesteps on top of the loaded model's count.
                reset_num_timesteps=latest_ckpt is None,
            )
        except KeyboardInterrupt:
            print("\nTraining interrupted.")

        # Save final model
        final_path = ckpt_dir / "combat_ppo_final.zip"
        model.save(str(final_path))
        print(f"\nModel saved to {final_path}")
    finally:
        # Always stop the bot subprocess, including on unexpected errors.
        env.close()
def evaluate(args):
    """Run a saved PPO model for ``args.eval_episodes`` episodes and report.

    Reads ``model``, ``host``, ``port``, ``max_steps`` and ``eval_episodes``
    from *args*. Prints per-episode reward, kills and steps, then the
    averages. The environment is always closed, even on failure.
    """
    from stable_baselines3 import PPO
    from training.rl.minecraft_env import MinecraftCombatEnv

    print(f"=== Evaluating {args.model} ===")

    episodes = args.eval_episodes
    if episodes <= 0:
        # Nothing to run; also avoids dividing by zero in the averages.
        print("No evaluation episodes requested (--eval-episodes must be >= 1).")
        return

    env = MinecraftCombatEnv(
        host=args.host,
        port=args.port,
        username="RLBot_eval",
        max_steps=args.max_steps,
        render_mode="human",
    )
    try:
        model = PPO.load(args.model)

        total_reward = 0
        total_kills = 0
        for ep in range(episodes):
            obs, info = env.reset()
            ep_reward = 0
            done = False
            while not done:
                action, _ = model.predict(obs, deterministic=True)
                # predict() returns a numpy value; pass a plain int action id
                obs, reward, terminated, truncated, info = env.step(int(action))
                ep_reward += reward
                done = terminated or truncated
            total_reward += ep_reward
            total_kills += info.get("kills", 0)
            print(f" Episode {ep+1}: reward={ep_reward:.1f} kills={info.get('kills', 0)} steps={info.get('step', 0)}")

        print(f"\nAverage: reward={total_reward/episodes:.1f} kills={total_kills/episodes:.1f}")
    finally:
        # Always stop the bot subprocess, including on unexpected errors.
        env.close()
def main():
    """Parse the command line and dispatch to evaluation or training.

    ``--eval`` selects ``evaluate``; otherwise ``train`` runs. Options are
    registered in a fixed order so ``--help`` lists them the same way.
    """
    option_specs = (
        ("--host", {"default": "192.168.0.244"}),
        ("--port", {"type": int, "default": 25568}),
        ("--timesteps", {"type": int, "default": 10000}),
        ("--max-steps", {"type": int, "default": 300}),
        ("--save-freq", {"type": int, "default": 2000}),
        ("--verbose", {"action": "store_true"}),
        ("--eval", {"action": "store_true"}),
        ("--eval-episodes", {"type": int, "default": 5}),
        ("--model", {"default": "training/rl/checkpoints/combat_ppo_final.zip"}),
    )

    cli = argparse.ArgumentParser(description="Minecraft RL Combat Training")
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    parsed = cli.parse_args()

    entry_point = evaluate if parsed.eval else train
    entry_point(parsed)
# Script entry point: parse CLI arguments and train or evaluate.
if __name__ == "__main__":
main()