0.6.0 training session: Oracle Bot, RL combat, Mind's Eye, multilingual pipeline
Major changes from this session:

Training:
- 0.6.0 training running: 9B on steel141 3090 Ti, 27B on rented H100 NVL
- 7,256 merged training examples (up from 3,183)
- New training data: failure modes (85), midloop messaging (27), prompt injection defense (29), personality (32), gold from quarantine bank (232), new tool examples (30), Claude's own experience (10)
- All training data RCON-validated at 100% pass rate
- Bake-off: gemma3:27b 66%, qwen3.5:27b 61%, translategemma:27b 56%

Oracle Bot (Mind's Eye):
- Invisible spectator bot (mineflayer) streams world state via WebSocket
- HTML5 Canvas frontend at mind.mortdec.ai
- Real-time tool trace visualization with expandable entries
- Streaming model tokens during inference
- Gateway integration: fire-and-forget POST /trace on every tool call

Reinforcement Learning:
- Gymnasium environment wrapping mineflayer bot (minecraft_env.py)
- PPO training via Stable Baselines3 (10K param policy network)
- Behavioral cloning pretraining (97.5% accuracy on expert policy)
- Infinite training loop with auto-restart and checkpoint resume
- Bot learns combat, survival, navigation from raw experience

Bot Army:
- 8-soldier marching formation with autonomous combat
- Combat bots using mineflayer-pvp, pathfinder, armor-manager
- Multilingual prayer bots via translategemma:27b (18 languages)
- Frame-based AI architecture: LLM planner + reactive micro-scripts

Infrastructure:
- Fixed mattpc.sethpc.xyz billing gateway (API key + player list parser)
- Billing gateway now tracks all LAN traffic (LAN auto-auth)
- Gateway fallback for empty god-mode responses
- Updated mortdec.ai landing page

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+63
-21
@@ -178,12 +178,17 @@ def score_result(example: dict, actual_cmds: list, parsed: dict) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
|
||||
def run_bakeoff(models: list, ollama_url: str, no_think: bool = False, limit: int = 0):
|
||||
"""Run all models against the dataset and compare."""
|
||||
import random
|
||||
# Load dataset
|
||||
with open(DATASET) as f:
|
||||
examples = [json.loads(line) for line in f if line.strip()]
|
||||
|
||||
if limit > 0 and limit < len(examples):
|
||||
random.seed(42)
|
||||
examples = random.sample(examples, limit)
|
||||
|
||||
print(f"Bake-off: {len(examples)} examples × {len(models)} models")
|
||||
print(f"Ollama: {ollama_url}")
|
||||
print(f"Models: {', '.join(models)}")
|
||||
@@ -211,23 +216,37 @@ def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
|
||||
for i, ex in enumerate(examples):
|
||||
eid = ex.get("id", f"ex-{i}")
|
||||
category = ex.get("category", "?")
|
||||
query = ex["input"]["user_message"]
|
||||
|
||||
# Determine mode
|
||||
mode = "sudo"
|
||||
if query.lower().startswith("pray "):
|
||||
mode = "god"
|
||||
query_stripped = query[5:]
|
||||
# Handle both old dict format and new messages[] format
|
||||
if "messages" in ex and isinstance(ex["messages"], list):
|
||||
# Messages format: extract user message and system prompt
|
||||
msgs = ex["messages"]
|
||||
sys_content = ""
|
||||
user_content = ""
|
||||
for msg in msgs:
|
||||
if msg.get("role") == "system":
|
||||
sys_content = msg.get("content", "")
|
||||
elif msg.get("role") == "user":
|
||||
user_content = msg.get("content", "")
|
||||
query = user_content
|
||||
mode = "god" if "You are God" in sys_content else "sudo"
|
||||
messages = [
|
||||
{"role": "system", "content": sys_content},
|
||||
{"role": "user", "content": user_content},
|
||||
]
|
||||
else:
|
||||
query_stripped = query
|
||||
|
||||
# Build prompt
|
||||
system_prompt = get_prompt(mode)
|
||||
user_msg = build_user_message(ex)
|
||||
messages = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_msg},
|
||||
]
|
||||
query = ex["input"]["user_message"]
|
||||
# Determine mode
|
||||
mode = "sudo"
|
||||
if query.lower().startswith("pray "):
|
||||
mode = "god"
|
||||
# Build prompt
|
||||
system_prompt = get_prompt(mode)
|
||||
user_msg = build_user_message(ex)
|
||||
messages = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_msg},
|
||||
]
|
||||
|
||||
# Call LLM
|
||||
try:
|
||||
@@ -240,8 +259,29 @@ def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
|
||||
parsed = parse_response(resp["content"])
|
||||
actual_cmds = parsed.get("commands", [])
|
||||
|
||||
# Score
|
||||
scores = score_result(ex, actual_cmds, parsed)
|
||||
# Score — adapt example to old format for scoring if needed
|
||||
score_ex = ex
|
||||
if "messages" in ex and "output" not in ex:
|
||||
# Extract expected output from assistant message
|
||||
expected_content = ""
|
||||
for msg in ex["messages"]:
|
||||
if msg.get("role") == "assistant":
|
||||
expected_content = msg.get("content", "")
|
||||
break
|
||||
try:
|
||||
expected_parsed = json.loads(expected_content)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
expected_parsed = {"commands": [], "message": ""}
|
||||
score_ex = {
|
||||
"input": {"user_message": query},
|
||||
"output": {
|
||||
"commands": expected_parsed.get("commands", []),
|
||||
"message": expected_parsed.get("message", ""),
|
||||
"safety_flags": [],
|
||||
},
|
||||
"category": category,
|
||||
}
|
||||
scores = score_result(score_ex, actual_cmds, parsed)
|
||||
|
||||
status = "OK" if scores["cmd_match"] else "MISS"
|
||||
syntax_flag = "" if scores["syntax_ok"] else " [SYNTAX]"
|
||||
@@ -252,7 +292,7 @@ def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
|
||||
f"({category}) {query[:50]} [{resp['duration_ms']}ms]")
|
||||
|
||||
if not scores["cmd_match"]:
|
||||
expected_cmds = ex["output"].get("commands", [])
|
||||
expected_cmds = score_ex.get("output", {}).get("commands", [])
|
||||
print(f" Expected: {expected_cmds[:2] if isinstance(expected_cmds, list) else expected_cmds}")
|
||||
print(f" Got: {actual_cmds[:2] if isinstance(actual_cmds, list) else actual_cmds}")
|
||||
|
||||
@@ -260,7 +300,7 @@ def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
|
||||
"id": eid,
|
||||
"category": category,
|
||||
"query": query,
|
||||
"expected": ex["output"].get("commands", []),
|
||||
"expected": score_ex.get("output", {}).get("commands", []),
|
||||
"actual": actual_cmds,
|
||||
"message": parsed.get("message", ""),
|
||||
"reasoning": parsed.get("reasoning", ""),
|
||||
@@ -336,9 +376,11 @@ def main():
|
||||
default=["qwen3-coder:30b", "gemma3n:e4b"])
|
||||
parser.add_argument("--no-think", action="store_true",
|
||||
help="Prepend /no_think to disable thinking tokens (helps Qwen models)")
|
||||
parser.add_argument("--limit", type=int, default=0,
|
||||
help="Max examples per model (0 = all)")
|
||||
args = parser.parse_args()
|
||||
|
||||
run_bakeoff(args.models, args.ollama_url, no_think=args.no_think)
|
||||
run_bakeoff(args.models, args.ollama_url, no_think=args.no_think, limit=args.limit)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user