Files
Mortdecai/training/rl/train_combat.py
T
Seth 5b28002001 0.6.0 training session: Oracle Bot, RL combat, Mind's Eye, multilingual pipeline
Major changes from this session:

Training:
- 0.6.0 training running: 9B on steel141 3090 Ti, 27B on rented H100 NVL
- 7,256 merged training examples (up from 3,183)
- New training data: failure modes (85), midloop messaging (27),
  prompt injection defense (29), personality (32), gold from quarantine
  bank (232), new tool examples (30), Claude's own experience (10)
- All training data RCON-validated at 100% pass rate
- Bake-off: gemma3:27b 66%, qwen3.5:27b 61%, translategemma:27b 56%

Oracle Bot (Mind's Eye):
- Invisible spectator bot (mineflayer) streams world state via WebSocket
- HTML5 Canvas frontend at mind.mortdec.ai
- Real-time tool trace visualization with expandable entries
- Streaming model tokens during inference
- Gateway integration: fire-and-forget POST /trace on every tool call

Reinforcement Learning:
- Gymnasium environment wrapping mineflayer bot (minecraft_env.py)
- PPO training via Stable Baselines3 (10K param policy network)
- Behavioral cloning pretraining (97.5% accuracy on expert policy)
- Infinite training loop with auto-restart and checkpoint resume
- Bot learns combat, survival, navigation from raw experience

Bot Army:
- 8-soldier marching formation with autonomous combat
- Combat bots using mineflayer-pvp, pathfinder, armor-manager
- Multilingual prayer bots via translategemma:27b (18 languages)
- Frame-based AI architecture: LLM planner + reactive micro-scripts

Infrastructure:
- Fixed mattpc.sethpc.xyz billing gateway (API key + player list parser)
- Billing gateway now tracks all LAN traffic (LAN auto-auth)
- Gateway fallback for empty god-mode responses
- Updated mortdec.ai landing page

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-22 20:22:50 -04:00

179 lines
5.3 KiB
Python

#!/usr/bin/env python3
"""
train_combat.py — Train a small policy network for Minecraft combat via PPO.

The agent learns to fight, flee, eat, and survive in a hostile Minecraft world.
Uses the MinecraftCombatEnv gymnasium wrapper, which controls a mineflayer bot.

Usage:
    # Install deps first:
    pip install gymnasium stable-baselines3 torch

    # Train (on steel141 with mc-train conda env):
    python3 training/rl/train_combat.py

    # Train with custom settings:
    python3 training/rl/train_combat.py --timesteps 50000 --host 192.168.0.244 --port 25568

    # Evaluate a trained model:
    python3 training/rl/train_combat.py --eval --model training/rl/checkpoints/combat_ppo.zip
"""
import argparse
import os
import sys
from pathlib import Path
# Add project root to path: three .parent hops from this file's resolved
# location give the directory that contains the `training` package, and
# prepending it to sys.path lets the `training.rl.*` imports used by the
# functions in this script resolve.
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT))
def train(args):
    """Train (or resume training) a PPO combat policy and save it to disk.

    Creates a ``MinecraftCombatEnv``, resumes from the most recently modified
    ``combat_ppo_*.zip`` file under ``<ROOT>/training/rl/checkpoints`` when one
    exists (otherwise builds a fresh ``MlpPolicy`` PPO model), runs
    ``model.learn`` with a ``CheckpointCallback``, and finally writes
    ``combat_ppo_final.zip``. A ``KeyboardInterrupt`` during learning is caught
    so the partially trained model is still saved. The environment is closed
    on every exit path, including errors.

    Args:
        args: Parsed CLI namespace. Reads ``host``, ``port``, ``timesteps``,
            ``max_steps``, ``save_freq`` and ``verbose``.
    """
    # Imported inside the function, so importing this module itself only
    # requires the standard library.
    from stable_baselines3 import PPO
    from stable_baselines3.common.callbacks import CheckpointCallback

    from training.rl.minecraft_env import MinecraftCombatEnv

    print("=== Minecraft RL Combat Training ===")
    print(f"Host: {args.host}:{args.port}")
    print(f"Timesteps: {args.timesteps}")
    print("Policy: MlpPolicy (3-layer MLP)")
    print()

    # Create environment. The username is suffixed with the PID modulo 100.
    env = MinecraftCombatEnv(
        host=args.host,
        port=args.port,
        username=f"RLBot_{os.getpid() % 100}",
        max_steps=args.max_steps,
        render_mode="human" if args.verbose else None,
    )

    # Everything after the env exists runs under try/finally so the
    # environment is released even if loading or training raises.
    try:
        # Checkpointing
        ckpt_dir = ROOT / "training" / "rl" / "checkpoints"
        ckpt_dir.mkdir(parents=True, exist_ok=True)
        tb_log_dir = str(ckpt_dir / "tb_logs")

        checkpoint_cb = CheckpointCallback(
            save_freq=args.save_freq,
            save_path=str(ckpt_dir),
            name_prefix="combat_ppo",
        )

        # Resume from the most recently modified checkpoint, if any. The glob
        # also matches combat_ppo_final.zip written at the end of a prior run.
        newest = max(
            ckpt_dir.glob("combat_ppo_*.zip"),
            key=lambda p: p.stat().st_mtime,
            default=None,
        )
        latest_ckpt = str(newest) if newest is not None else None

        if latest_ckpt:
            print(f"RESUMING from: {latest_ckpt}")
            # Load existing model and continue training
            model = PPO.load(
                latest_ckpt,
                env=env,
                tensorboard_log=tb_log_dir,
            )
            # NOTE(review): SB3 appears to build its LR schedule during load,
            # so this assignment may not reach the optimizer. If a different
            # LR is wanted on resume, pass it via PPO.load(custom_objects=...)
            # instead — confirm against the installed SB3 version.
            model.learning_rate = 3e-4  # can adjust between runs
        else:
            # Fresh model
            model = PPO(
                "MlpPolicy",
                env,
                verbose=1,
                learning_rate=3e-4,
                n_steps=256,  # collect 256 steps before update
                batch_size=64,
                n_epochs=4,
                gamma=0.99,  # discount factor
                gae_lambda=0.95,
                clip_range=0.2,
                ent_coef=0.01,  # entropy bonus for exploration
                policy_kwargs={
                    "net_arch": [64, 64],  # 2 hidden layers of 64 units
                },
                tensorboard_log=tb_log_dir,
            )

        n_params = sum(p.numel() for p in model.policy.parameters())
        print(f"Policy network params: {n_params:,}")
        print(f"Training for {args.timesteps} timesteps...")
        print()

        try:
            model.learn(
                total_timesteps=args.timesteps,
                callback=checkpoint_cb,
                progress_bar=True,
                # learn() resets the step counter to 0 by default. When
                # resuming, keep counting so step-named checkpoints are not
                # overwritten and TensorBoard curves continue.
                reset_num_timesteps=latest_ckpt is None,
            )
        except KeyboardInterrupt:
            print("\nTraining interrupted.")

        # Save final model (also reached after Ctrl-C)
        final_path = ckpt_dir / "combat_ppo_final.zip"
        model.save(str(final_path))
        print(f"\nModel saved to {final_path}")
    finally:
        env.close()
def evaluate(args):
    """Run a saved PPO model for a number of episodes and print the results.

    For each episode the environment is reset and the model's deterministic
    action is stepped until the episode reports ``terminated`` or
    ``truncated``. Per-episode reward, kills and step count (taken from the
    last ``info`` dict) are printed, followed by the averages over all
    episodes. The environment is closed on every exit path.

    Args:
        args: Parsed CLI namespace. Reads ``model``, ``host``, ``port``,
            ``max_steps`` and ``eval_episodes``.

    Raises:
        ValueError: If ``args.eval_episodes`` is less than 1 (the averages
            would otherwise divide by zero or be meaningless).
    """
    from stable_baselines3 import PPO

    from training.rl.minecraft_env import MinecraftCombatEnv

    episodes = args.eval_episodes
    # Validate before connecting a bot so a bad flag fails fast and cleanly.
    if episodes < 1:
        raise ValueError(f"--eval-episodes must be >= 1, got {episodes}")

    print(f"=== Evaluating {args.model} ===")
    env = MinecraftCombatEnv(
        host=args.host,
        port=args.port,
        username="RLBot_eval",
        max_steps=args.max_steps,
        render_mode="human",
    )

    try:
        model = PPO.load(args.model)
        total_reward = 0
        total_kills = 0

        for ep in range(episodes):
            obs, info = env.reset()
            ep_reward = 0
            done = False
            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, terminated, truncated, info = env.step(action)
                ep_reward += reward
                done = terminated or truncated

            # `info` here is from the final step (or from reset() if the
            # loop body never ran).
            ep_kills = info.get("kills", 0)
            total_reward += ep_reward
            total_kills += ep_kills
            print(f" Episode {ep+1}: reward={ep_reward:.1f} kills={ep_kills} steps={info.get('step', 0)}")

        print(f"\nAverage: reward={total_reward/episodes:.1f} kills={total_kills/episodes:.1f}")
    finally:
        env.close()
def main(argv=None):
    """Parse command-line options and dispatch to ``train`` or ``evaluate``.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, matching the previous behavior.
    """
    # train() always saves under ROOT, so anchor the default model path there
    # too; a cwd-relative default only works when run from the project root.
    default_model = ROOT / "training" / "rl" / "checkpoints" / "combat_ppo_final.zip"

    parser = argparse.ArgumentParser(description="Minecraft RL Combat Training")
    parser.add_argument("--host", default="192.168.0.244",
                        help="host passed to MinecraftCombatEnv")
    parser.add_argument("--port", type=int, default=25568,
                        help="port passed to MinecraftCombatEnv")
    parser.add_argument("--timesteps", type=int, default=10000,
                        help="total_timesteps for PPO.learn")
    parser.add_argument("--max-steps", type=int, default=300,
                        help="max_steps passed to MinecraftCombatEnv")
    parser.add_argument("--save-freq", type=int, default=2000,
                        help="save_freq for the checkpoint callback")
    parser.add_argument("--verbose", action="store_true",
                        help='train with render_mode="human"')
    parser.add_argument("--eval", action="store_true",
                        help="evaluate a saved model instead of training")
    parser.add_argument("--eval-episodes", type=int, default=5,
                        help="number of evaluation episodes")
    parser.add_argument("--model", default=str(default_model),
                        help="path of the model file to load with --eval")
    args = parser.parse_args(argv)

    if args.eval:
        evaluate(args)
    else:
        train(args)
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()