GPU scheduler, 14-tool architecture, plugin deployment, event dispatcher

GPU Scheduler (gpu.sethpc.xyz):
- Live dashboard with 4 GPUs, training monitor, loss sparklines
- Preset-based job scheduler with 3 triggers (time, finish_training, cost)
- Model selection per GPU, pipeline configuration
- Tool self-play and training pipeline types
- Behind Google OAuth, live-refresh without page reload

Tool Architecture (14 tools):
- 3 new tools: world.nearby_entities, memory.read, memory.write
- 7 script.* tools: write, validate, execute, read, list, delete, schedule
- ScriptManager: full mcfunction datapack CRUD with RCON validation
- Training data: 1,430 tool examples (up from 1,159)

Plugin Deployment (paper-ai-25567):
- WorldGuard 7.0.12, CoreProtect CE 23.1, EssentialsX 2.21.2, Vault 1.7.3
- Fresh greenfield world reset
- 104 RCON-validated plugin training examples

Event Dispatcher:
- Watches server log for deaths, joins, advancements, PvP kills
- Configurable trigger probability and cooldowns per event type
- Deployed to dev server, fires god_system prompts on events
- 21 event-response training examples

Training Infrastructure:
- train_lora.py: --save-steps 50, --resume from checkpoint
- run_training.sh: stops Ollama, activates conda, restarts after
- Passwordless sudo for ollama services on steel141
- Dev server added to MCSManager with autoStart

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 03:14:45 -04:00
parent 434589d098
commit da8f557219
34 changed files with 7822 additions and 2 deletions
@@ -0,0 +1,524 @@
+[
+  {
+    "id": "dd43e5ea",
+    "preset_id": "a78c48c1",
+    "preset_name": "Everyone Test",
+    "pipeline": "self_play",
+    "params": {
+      "model": "mortdecai:0.4.0",
+      "tiers": "drills,self_critique,adversarial",
+      "rounds_per_tier": "50",
+      "rcon_host": "192.168.0.244",
+      "rcon_port": "25578"
+    },
+    "gpus": [
+      "3090ti",
+      "2080ti",
+      "rtx4000"
+    ],
+    "status": "failed",
+    "created_at": "2026-03-21T01:19:09Z",
+    "started_at": "2026-03-21T01:19:09Z",
+    "error": "'str' object cannot be interpreted as an integer",
+    "finished_at": "2026-03-21T01:19:09Z"
+  },
+  {
+    "id": "bc8112d9",
+    "preset_id": "a78c48c1",
+    "preset_name": "Everyone Test",
+    "pipeline": "self_play",
+    "params": {
+      "model": "mortdecai:0.4.0",
+      "tiers": "drills,self_critique,adversarial",
+      "rounds_per_tier": "50",
+      "rcon_host": "192.168.0.244",
+      "rcon_port": "25578"
+    },
+    "gpus": [
+      "3090ti",
+      "2080ti",
+      "rtx4000"
+    ],
+    "status": "failed",
+    "created_at": "2026-03-21T01:21:03Z",
+    "started_at": "2026-03-21T01:21:03Z",
+    "error": "'str' object cannot be interpreted as an integer",
+    "finished_at": "2026-03-21T01:21:04Z"
+  },
+  {
+    "id": "be1265be",
+    "preset_id": "a78c48c1",
+    "preset_name": "Everyone Test",
+    "pipeline": "self_play",
+    "params": {
+      "model": "mortdecai:0.4.0",
+      "tiers": "drills,self_critique,adversarial",
+      "rounds_per_tier": "50",
+      "rcon_host": "192.168.0.244",
+      "rcon_port": "25578"
+    },
+    "gpus": [
+      "3090ti",
+      "2080ti",
+      "rtx4000"
+    ],
+    "status": "cancelled",
+    "created_at": "2026-03-21T01:23:34Z",
+    "started_at": "2026-03-21T01:23:34Z"
+  },
+  {
+    "id": "2b895dcf",
+    "preset_id": "a78c48c1",
+    "preset_name": "Everyone Test",
+    "pipeline": "self_play",
+    "params": {
+      "model": "mortdecai:0.4.0",
+      "tiers": "1,2,3",
+      "rounds_per_tier": "50",
+      "rcon_host": "192.168.0.244",
+      "rcon_port": "25578",
+      "rcon_pass": "REDACTED_RCON"
+    },
+    "gpus": [
+      "3090ti",
+      "2080ti",
+      "rtx4000"
+    ],
+    "status": "cancelled",
+    "created_at": "2026-03-21T01:28:31Z",
+    "started_at": "2026-03-21T01:28:31Z",
+    "gpu_assignments": {
+      "3090ti": [
+        "1"
+      ],
+      "2080ti": [
+        "2"
+      ],
+      "rtx4000": [
+        "3"
+      ]
+    }
+  },
+  {
+    "id": "db75e2ba",
+    "preset_id": "06356764",
+    "preset_name": "Infer during training",
+    "pipeline": "self_play",
+    "params": {
+      "model": "mortdecai:0.4.0",
+      "tiers": "1,2,3",
+      "rounds_per_tier": "50",
+      "rcon_host": "192.168.0.244",
+      "rcon_port": "25578",
+      "rcon_pass": "REDACTED_RCON"
+    },
+    "gpus": [
+      "rtx4000"
+    ],
+    "status": "cancelled",
+    "created_at": "2026-03-21T01:38:09Z",
+    "started_at": "2026-03-21T01:38:09Z",
+    "gpu_assignments": {
+      "rtx4000": [
+        "1",
+        "2",
+        "3"
+      ]
+    }
+  },
+  {
+    "id": "d1581da3",
+    "preset_id": "9cc95c0a",
+    "preset_name": "Train",
+    "pipeline": "training",
+    "params": {
+      "base_model": "Qwen/Qwen3.5-9B",
+      "dataset": "auto",
+      "output_name": "mortdecai-0.5.0",
+      "epochs": "1",
+      "lr": "0.0001",
+      "batch_size": "2",
+      "grad_accum": "4",
+      "max_seq_len": "2048",
+      "save_steps": "50"
+    },
+    "gpus": [
+      "3090ti",
+      "2080ti"
+    ],
+    "status": "failed",
+    "created_at": "2026-03-21T01:38:13Z",
+    "started_at": "2026-03-21T01:38:13Z",
+    "log_path": "training/train_run_mortdecai-0.5.0.log",
+    "progress": {
+      "active": false,
+      "loss_history": [
+        0.1309,
+        0.07891,
+        0.03225,
+        0.03791,
+        0.07594,
+        0.07748,
+        0.05243,
+        0.0536,
+        0.05368,
+        0.05622,
+        0.04548,
+        0.07975,
+        0.04655,
+        0.01792,
+        0.08467,
+        0.0151,
+        0.05061,
+        0.04185,
+        0.04518,
+        0.03152,
+        0.084,
+        0.06383,
+        0.04852,
+        0.0456,
+        0.05045,
+        0.05591,
+        0.06717,
+        0.05597,
+        0.04513,
+        0.04979,
+        0.02702,
+        0.04608,
+        0.04292,
+        0.04888,
+        0.09399,
+        0.03988,
+        0.02565,
+        0.05894,
+        0.03941,
+        0.04952,
+        0.0767,
+        0.0494,
+        0.1099,
+        0.03652,
+        0.05015,
+        0.07898,
+        0.05064,
+        0.03833,
+        0.04133,
+        0.03163,
+        0.09881,
+        0.05912,
+        0.05795,
+        0.02599,
+        0.09814,
+        0.04749,
+        0.0284,
+        0.06074,
+        0.04718,
+        0.03789,
+        0.08998,
+        0.04451,
+        0.05937,
+        0.04544,
+        0.06173,
+        0.04686,
+        0.05936,
+        0.0311,
+        0.03927,
+        0.08231,
+        0.02436,
+        0.05194,
+        0.04414,
+        0.03787,
+        0.0383,
+        0.0408,
+        0.04119,
+        0.03175,
+        0.08285,
+        0.05705,
+        0.02964,
+        0.0409,
+        0.03605,
+        0.04664,
+        0.04889,
+        0.03085,
+        0.05376,
+        0.0594,
+        0.0357,
+        0.0965,
+        0.04077,
+        0.07085,
+        0.0476,
+        0.04919,
+        0.03484,
+        0.02473,
+        0.07078,
+        0.08155,
+        0.05989,
+        0.06994,
+        0.07064
+      ],
+      "pct": 47,
+      "current_step": 250,
+      "total_steps": 535,
+      "eta": "2:27:31",
+      "elapsed": "2:09:24",
+      "error": "OOM",
+      "latest_loss": 0.07064,
+      "learning_rate": "6.464e-05"
+    },
+    "error": "OOM",
+    "finished_at": "2026-03-21T01:38:49Z"
+  },
+  {
+    "id": "8e7909c4",
+    "preset_id": "06356764",
+    "preset_name": "Infer during training",
+    "pipeline": "self_play",
+    "params": {
+      "model": "mortdecai:0.4.0",
+      "tiers": "1,2,3",
+      "rounds_per_tier": "50",
+      "rcon_host": "192.168.0.244",
+      "rcon_port": "25578",
+      "rcon_pass": "REDACTED_RCON"
+    },
+    "gpus": [
+      "rtx4000"
+    ],
+    "status": "running",
+    "created_at": "2026-03-21T01:48:09Z",
+    "started_at": "2026-03-21T01:48:09Z",
+    "gpu_assignments": {
+      "rtx4000": [
+        "1",
+        "2",
+        "3"
+      ]
+    }
+  },
+  {
+    "id": "c72dfada",
+    "preset_id": "9cc95c0a",
+    "preset_name": "Train",
+    "pipeline": "training",
+    "params": {
+      "base_model": "Qwen/Qwen3.5-9B",
+      "dataset": "auto",
+      "output_name": "mortdecai-0.5.0",
+      "epochs": "1",
+      "lr": "0.0001",
+      "batch_size": "2",
+      "grad_accum": "4",
+      "max_seq_len": "2048",
+      "save_steps": "50"
+    },
+    "gpus": [
+      "3090ti",
+      "2080ti"
+    ],
+    "status": "failed",
+    "created_at": "2026-03-21T01:48:14Z",
+    "started_at": "2026-03-21T01:48:14Z",
+    "log_path": "training/train_run_mortdecai-0.5.0.log",
+    "progress": {
+      "active": false,
+      "loss_history": [
+        0.1309,
+        0.07891,
+        0.03225,
+        0.03791,
+        0.07594,
+        0.07748,
+        0.05243,
+        0.0536,
+        0.05368,
+        0.05622,
+        0.04548,
+        0.07975,
+        0.04655,
+        0.01792,
+        0.08467,
+        0.0151,
+        0.05061,
+        0.04185,
+        0.04518,
+        0.03152,
+        0.084,
+        0.06383,
+        0.04852,
+        0.0456,
+        0.05045,
+        0.05591,
+        0.06717,
+        0.05597,
+        0.04513,
+        0.04979,
+        0.02702,
+        0.04608,
+        0.04292,
+        0.04888,
+        0.09399,
+        0.03988,
+        0.02565,
+        0.05894,
+        0.03941,
+        0.04952,
+        0.0767,
+        0.0494,
+        0.1099,
+        0.03652,
+        0.05015,
+        0.07898,
+        0.05064,
+        0.03833,
+        0.04133,
+        0.03163,
+        0.09881,
+        0.05912,
+        0.05795,
+        0.02599,
+        0.09814,
+        0.04749,
+        0.0284,
+        0.06074,
+        0.04718,
+        0.03789,
+        0.08998,
+        0.04451,
+        0.05937,
+        0.04544,
+        0.06173,
+        0.04686,
+        0.05936,
+        0.0311,
+        0.03927,
+        0.08231,
+        0.02436,
+        0.05194,
+        0.04414,
+        0.03787,
+        0.0383,
+        0.0408,
+        0.04119,
+        0.03175,
+        0.08285,
+        0.05705,
+        0.02964,
+        0.0409,
+        0.03605,
+        0.04664,
+        0.04889,
+        0.03085,
+        0.05376,
+        0.0594,
+        0.0357,
+        0.0965,
+        0.04077,
+        0.07085,
+        0.0476,
+        0.04919,
+        0.03484,
+        0.02473,
+        0.07078,
+        0.08155,
+        0.05989,
+        0.06994,
+        0.07064
+      ],
+      "pct": 47,
+      "current_step": 250,
+      "total_steps": 535,
+      "eta": "2:27:31",
+      "elapsed": "2:09:24",
+      "error": "OOM",
+      "latest_loss": 0.07064,
+      "learning_rate": "6.464e-05"
+    },
+    "error": "OOM",
+    "finished_at": "2026-03-21T01:48:50Z"
+  },
+  {
+    "id": "28691b1d",
+    "preset_id": "9cc95c0a",
+    "preset_name": "Train",
+    "pipeline": "training",
+    "params": {
+      "base_model": "Qwen/Qwen3.5-9B",
+      "dataset": "auto",
+      "output_name": "mortdecai-0.5.0",
+      "epochs": "1",
+      "lr": "0.0001",
+      "batch_size": "2",
+      "grad_accum": "4",
+      "max_seq_len": "2048",
+      "save_steps": "50"
+    },
+    "gpus": [
+      "3090ti",
+      "2080ti"
+    ],
+    "status": "failed",
+    "created_at": "2026-03-21T01:51:47Z",
+    "started_at": "2026-03-21T01:51:47Z",
+    "log_path": "training/train_run_mortdecai-0.5.0.log",
+    "progress": {
+      "active": false,
+      "loss_history": [],
+      "error": "crashed"
+    },
+    "error": "crashed",
+    "finished_at": "2026-03-21T01:52:23Z"
+  },
+  {
+    "id": "adff373a",
+    "preset_id": "9cc95c0a",
+    "preset_name": "Train",
+    "pipeline": "training",
+    "params": {
+      "base_model": "Qwen/Qwen3.5-9B",
+      "dataset": "auto",
+      "output_name": "mortdecai-0.5.0",
+      "epochs": "1",
+      "lr": "0.0001",
+      "batch_size": "2",
+      "grad_accum": "4",
+      "max_seq_len": "2048",
+      "save_steps": "50"
+    },
+    "gpus": [
+      "3090ti",
+      "2080ti"
+    ],
+    "status": "running",
+    "created_at": "2026-03-21T02:05:09Z",
+    "started_at": "2026-03-21T02:05:09Z",
+    "log_path": "training/train_run_mortdecai-0.5.0.log",
+    "progress": {
+      "active": true,
+      "loss_history": [],
+      "pct": 2,
+      "current_step": 9,
+      "total_steps": 548,
+      "eta": "3:19:36",
+      "elapsed": "03:21"
+    }
+  },
+  {
+    "id": "32cc3363",
+    "preset_id": "06356764",
+    "preset_name": "Infer during training",
+    "pipeline": "self_play",
+    "params": {
+      "model": "mortdecai:0.4.0",
+      "tiers": "1,2,3",
+      "rounds_per_tier": "50",
+      "rcon_host": "192.168.0.244",
+      "rcon_port": "25578",
+      "rcon_pass": "REDACTED_RCON"
+    },
+    "gpus": [
+      "rtx4000"
+    ],
+    "status": "running",
+    "created_at": "2026-03-21T02:10:15Z",
+    "started_at": "2026-03-21T02:10:15Z"
+  }
+]