0.6.0 training session: Oracle Bot, RL combat, Mind's Eye, multilingual pipeline

Major changes from this session:

Training:
- 0.6.0 training running: 9B on steel141 3090 Ti, 27B on rented H100 NVL
- 7,256 merged training examples (up from 3,183)
- New training data: failure modes (85), midloop messaging (27), prompt injection defense (29), personality (32), gold from quarantine bank (232), new tool examples (30), Claude's own experience (10)
- All training data RCON-validated at 100% pass rate
- Bake-off: gemma3:27b 66%, qwen3.5:27b 61%, translategemma:27b 56%

Oracle Bot (Mind's Eye):
- Invisible spectator bot (mineflayer) streams world state via WebSocket
- HTML5 Canvas frontend at mind.mortdec.ai
- Real-time tool trace visualization with expandable entries
- Streaming model tokens during inference
- Gateway integration: fire-and-forget POST /trace on every tool call

Reinforcement Learning:
- Gymnasium environment wrapping mineflayer bot (minecraft_env.py)
- PPO training via Stable Baselines3 (10K param policy network)
- Behavioral cloning pretraining (97.5% accuracy on expert policy)
- Infinite training loop with auto-restart and checkpoint resume
- Bot learns combat, survival, navigation from raw experience

Bot Army:
- 8-soldier marching formation with autonomous combat
- Combat bots using mineflayer-pvp, pathfinder, armor-manager
- Multilingual prayer bots via translategemma:27b (18 languages)
- Frame-based AI architecture: LLM planner + reactive micro-scripts

Infrastructure:
- Fixed mattpc.sethpc.xyz billing gateway (API key + player list parser)
- Billing gateway now tracks all LAN traffic (LAN auto-auth)
- Gateway fallback for empty god-mode responses
- Updated mortdec.ai landing page

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-22 20:22:50 -04:00
parent baab24f8b1
commit 5b28002001
44 changed files with 20873 additions and 4352 deletions
@@ -178,12 +178,17 @@ def score_result(example: dict, actual_cmds: list, parsed: dict) -> dict:
    }


-def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
+def run_bakeoff(models: list, ollama_url: str, no_think: bool = False, limit: int = 0):
    """Run all models against the dataset and compare."""
+    import random
    # Load dataset
    with open(DATASET) as f:
        examples = [json.loads(line) for line in f if line.strip()]

+    if limit > 0 and limit < len(examples):
+        random.seed(42)
+        examples = random.sample(examples, limit)
+
    print(f"Bake-off: {len(examples)} examples × {len(models)} models")
    print(f"Ollama: {ollama_url}")
    print(f"Models: {', '.join(models)}")
@@ -211,23 +216,37 @@ def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
        for i, ex in enumerate(examples):
            eid = ex.get("id", f"ex-{i}")
            category = ex.get("category", "?")
-            query = ex["input"]["user_message"]

-            # Determine mode
-            mode = "sudo"
-            if query.lower().startswith("pray "):
-                mode = "god"
-                query_stripped = query[5:]
+            # Handle both old dict format and new messages[] format
+            if "messages" in ex and isinstance(ex["messages"], list):
+                # Messages format: extract user message and system prompt
+                msgs = ex["messages"]
+                sys_content = ""
+                user_content = ""
+                for msg in msgs:
+                    if msg.get("role") == "system":
+                        sys_content = msg.get("content", "")
+                    elif msg.get("role") == "user":
+                        user_content = msg.get("content", "")
+                query = user_content
+                mode = "god" if "You are God" in sys_content else "sudo"
+                messages = [
+                    {"role": "system", "content": sys_content},
+                    {"role": "user", "content": user_content},
+                ]
            else:
-                query_stripped = query
-
-            # Build prompt
-            system_prompt = get_prompt(mode)
-            user_msg = build_user_message(ex)
-            messages = [
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": user_msg},
-            ]
+                query = ex["input"]["user_message"]
+                # Determine mode
+                mode = "sudo"
+                if query.lower().startswith("pray "):
+                    mode = "god"
+                # Build prompt
+                system_prompt = get_prompt(mode)
+                user_msg = build_user_message(ex)
+                messages = [
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_msg},
+                ]

            # Call LLM
            try:
@@ -240,8 +259,29 @@ def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
            parsed = parse_response(resp["content"])
            actual_cmds = parsed.get("commands", [])

-            # Score
-            scores = score_result(ex, actual_cmds, parsed)
+            # Score — adapt example to old format for scoring if needed
+            score_ex = ex
+            if "messages" in ex and "output" not in ex:
+                # Extract expected output from assistant message
+                expected_content = ""
+                for msg in ex["messages"]:
+                    if msg.get("role") == "assistant":
+                        expected_content = msg.get("content", "")
+                        break
+                try:
+                    expected_parsed = json.loads(expected_content)
+                except (json.JSONDecodeError, TypeError):
+                    expected_parsed = {"commands": [], "message": ""}
+                score_ex = {
+                    "input": {"user_message": query},
+                    "output": {
+                        "commands": expected_parsed.get("commands", []),
+                        "message": expected_parsed.get("message", ""),
+                        "safety_flags": [],
+                    },
+                    "category": category,
+                }
+            scores = score_result(score_ex, actual_cmds, parsed)

            status = "OK" if scores["cmd_match"] else "MISS"
            syntax_flag = "" if scores["syntax_ok"] else " [SYNTAX]"
@@ -252,7 +292,7 @@ def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
                  f"({category}) {query[:50]}  [{resp['duration_ms']}ms]")

            if not scores["cmd_match"]:
-                expected_cmds = ex["output"].get("commands", [])
+                expected_cmds = score_ex.get("output", {}).get("commands", [])
                print(f"    Expected: {expected_cmds[:2] if isinstance(expected_cmds, list) else expected_cmds}")
                print(f"    Got:      {actual_cmds[:2] if isinstance(actual_cmds, list) else actual_cmds}")

@@ -260,7 +300,7 @@ def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
                "id": eid,
                "category": category,
                "query": query,
-                "expected": ex["output"].get("commands", []),
+                "expected": score_ex.get("output", {}).get("commands", []),
                "actual": actual_cmds,
                "message": parsed.get("message", ""),
                "reasoning": parsed.get("reasoning", ""),
@@ -336,9 +376,11 @@ def main():
                        default=["qwen3-coder:30b", "gemma3n:e4b"])
    parser.add_argument("--no-think", action="store_true",
                        help="Prepend /no_think to disable thinking tokens (helps Qwen models)")
+    parser.add_argument("--limit", type=int, default=0,
+                        help="Max examples per model (0 = all)")
    args = parser.parse_args()

-    run_bakeoff(args.models, args.ollama_url, no_think=args.no_think)
+    run_bakeoff(args.models, args.ollama_url, no_think=args.no_think, limit=args.limit)


 if __name__ == "__main__":