GPU scheduler, 14-tool architecture, plugin deployment, event dispatcher

GPU Scheduler (gpu.sethpc.xyz):
- Live dashboard with 4 GPUs, training monitor, loss sparklines
- Preset-based job scheduler with 3 triggers (time, finish_training, cost)
- Model selection per GPU, pipeline configuration
- Pipeline types: tool self-play and training
- Behind Google OAuth, live-refresh without page reload

Tool Architecture (14 tools):
- 3 new tools: world.nearby_entities, memory.read, memory.write
- 7 script.* tools: write, validate, execute, read, list, delete, schedule
- ScriptManager: full mcfunction datapack CRUD with RCON validation
- Training data: 1,430 tool examples (up from 1,159)

Plugin Deployment (paper-ai-25567):
- WorldGuard 7.0.12, CoreProtect CE 23.1, EssentialsX 2.21.2, Vault 1.7.3
- Fresh greenfield world reset
- 104 RCON-validated plugin training examples

Event Dispatcher:
- Watches server log for deaths, joins, advancements, PvP kills
- Configurable trigger probability and cooldowns per event type
- Deployed to dev server, fires god_system prompts on events
- 21 event-response training examples

Training Infrastructure:
- train_lora.py: --save-steps 50, --resume from checkpoint
- run_training.sh: stops Ollama, activates conda, restarts Ollama afterwards
- Passwordless sudo for ollama services on steel141
- Dev server added to MCSManager with autoStart

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mortdecai
2026-03-21 03:14:45 -04:00
parent 434589d098
commit da8f557219
34 changed files with 7822 additions and 2 deletions
+524
View File
@@ -0,0 +1,524 @@
[
{
"id": "dd43e5ea",
"preset_id": "a78c48c1",
"preset_name": "Everyone Test",
"pipeline": "self_play",
"params": {
"model": "mortdecai:0.4.0",
"tiers": "drills,self_critique,adversarial",
"rounds_per_tier": "50",
"rcon_host": "192.168.0.244",
"rcon_port": "25578"
},
"gpus": [
"3090ti",
"2080ti",
"rtx4000"
],
"status": "failed",
"created_at": "2026-03-21T01:19:09Z",
"started_at": "2026-03-21T01:19:09Z",
"error": "'str' object cannot be interpreted as an integer",
"finished_at": "2026-03-21T01:19:09Z"
},
{
"id": "bc8112d9",
"preset_id": "a78c48c1",
"preset_name": "Everyone Test",
"pipeline": "self_play",
"params": {
"model": "mortdecai:0.4.0",
"tiers": "drills,self_critique,adversarial",
"rounds_per_tier": "50",
"rcon_host": "192.168.0.244",
"rcon_port": "25578"
},
"gpus": [
"3090ti",
"2080ti",
"rtx4000"
],
"status": "failed",
"created_at": "2026-03-21T01:21:03Z",
"started_at": "2026-03-21T01:21:03Z",
"error": "'str' object cannot be interpreted as an integer",
"finished_at": "2026-03-21T01:21:04Z"
},
{
"id": "be1265be",
"preset_id": "a78c48c1",
"preset_name": "Everyone Test",
"pipeline": "self_play",
"params": {
"model": "mortdecai:0.4.0",
"tiers": "drills,self_critique,adversarial",
"rounds_per_tier": "50",
"rcon_host": "192.168.0.244",
"rcon_port": "25578"
},
"gpus": [
"3090ti",
"2080ti",
"rtx4000"
],
"status": "cancelled",
"created_at": "2026-03-21T01:23:34Z",
"started_at": "2026-03-21T01:23:34Z"
},
{
"id": "2b895dcf",
"preset_id": "a78c48c1",
"preset_name": "Everyone Test",
"pipeline": "self_play",
"params": {
"model": "mortdecai:0.4.0",
"tiers": "1,2,3",
"rounds_per_tier": "50",
"rcon_host": "192.168.0.244",
"rcon_port": "25578",
"rcon_pass": "REDACTED_RCON"
},
"gpus": [
"3090ti",
"2080ti",
"rtx4000"
],
"status": "cancelled",
"created_at": "2026-03-21T01:28:31Z",
"started_at": "2026-03-21T01:28:31Z",
"gpu_assignments": {
"3090ti": [
"1"
],
"2080ti": [
"2"
],
"rtx4000": [
"3"
]
}
},
{
"id": "db75e2ba",
"preset_id": "06356764",
"preset_name": "Infer during training",
"pipeline": "self_play",
"params": {
"model": "mortdecai:0.4.0",
"tiers": "1,2,3",
"rounds_per_tier": "50",
"rcon_host": "192.168.0.244",
"rcon_port": "25578",
"rcon_pass": "REDACTED_RCON"
},
"gpus": [
"rtx4000"
],
"status": "cancelled",
"created_at": "2026-03-21T01:38:09Z",
"started_at": "2026-03-21T01:38:09Z",
"gpu_assignments": {
"rtx4000": [
"1",
"2",
"3"
]
}
},
{
"id": "d1581da3",
"preset_id": "9cc95c0a",
"preset_name": "Train",
"pipeline": "training",
"params": {
"base_model": "Qwen/Qwen3.5-9B",
"dataset": "auto",
"output_name": "mortdecai-0.5.0",
"epochs": "1",
"lr": "0.0001",
"batch_size": "2",
"grad_accum": "4",
"max_seq_len": "2048",
"save_steps": "50"
},
"gpus": [
"3090ti",
"2080ti"
],
"status": "failed",
"created_at": "2026-03-21T01:38:13Z",
"started_at": "2026-03-21T01:38:13Z",
"log_path": "training/train_run_mortdecai-0.5.0.log",
"progress": {
"active": false,
"loss_history": [
0.1309,
0.07891,
0.03225,
0.03791,
0.07594,
0.07748,
0.05243,
0.0536,
0.05368,
0.05622,
0.04548,
0.07975,
0.04655,
0.01792,
0.08467,
0.0151,
0.05061,
0.04185,
0.04518,
0.03152,
0.084,
0.06383,
0.04852,
0.0456,
0.05045,
0.05591,
0.06717,
0.05597,
0.04513,
0.04979,
0.02702,
0.04608,
0.04292,
0.04888,
0.09399,
0.03988,
0.02565,
0.05894,
0.03941,
0.04952,
0.0767,
0.0494,
0.1099,
0.03652,
0.05015,
0.07898,
0.05064,
0.03833,
0.04133,
0.03163,
0.09881,
0.05912,
0.05795,
0.02599,
0.09814,
0.04749,
0.0284,
0.06074,
0.04718,
0.03789,
0.08998,
0.04451,
0.05937,
0.04544,
0.06173,
0.04686,
0.05936,
0.0311,
0.03927,
0.08231,
0.02436,
0.05194,
0.04414,
0.03787,
0.0383,
0.0408,
0.04119,
0.03175,
0.08285,
0.05705,
0.02964,
0.0409,
0.03605,
0.04664,
0.04889,
0.03085,
0.05376,
0.0594,
0.0357,
0.0965,
0.04077,
0.07085,
0.0476,
0.04919,
0.03484,
0.02473,
0.07078,
0.08155,
0.05989,
0.06994,
0.07064
],
"pct": 47,
"current_step": 250,
"total_steps": 535,
"eta": "2:27:31",
"elapsed": "2:09:24",
"error": "OOM",
"latest_loss": 0.07064,
"learning_rate": "6.464e-05"
},
"error": "OOM",
"finished_at": "2026-03-21T01:38:49Z"
},
{
"id": "8e7909c4",
"preset_id": "06356764",
"preset_name": "Infer during training",
"pipeline": "self_play",
"params": {
"model": "mortdecai:0.4.0",
"tiers": "1,2,3",
"rounds_per_tier": "50",
"rcon_host": "192.168.0.244",
"rcon_port": "25578",
"rcon_pass": "REDACTED_RCON"
},
"gpus": [
"rtx4000"
],
"status": "running",
"created_at": "2026-03-21T01:48:09Z",
"started_at": "2026-03-21T01:48:09Z",
"gpu_assignments": {
"rtx4000": [
"1",
"2",
"3"
]
}
},
{
"id": "c72dfada",
"preset_id": "9cc95c0a",
"preset_name": "Train",
"pipeline": "training",
"params": {
"base_model": "Qwen/Qwen3.5-9B",
"dataset": "auto",
"output_name": "mortdecai-0.5.0",
"epochs": "1",
"lr": "0.0001",
"batch_size": "2",
"grad_accum": "4",
"max_seq_len": "2048",
"save_steps": "50"
},
"gpus": [
"3090ti",
"2080ti"
],
"status": "failed",
"created_at": "2026-03-21T01:48:14Z",
"started_at": "2026-03-21T01:48:14Z",
"log_path": "training/train_run_mortdecai-0.5.0.log",
"progress": {
"active": false,
"loss_history": [
0.1309,
0.07891,
0.03225,
0.03791,
0.07594,
0.07748,
0.05243,
0.0536,
0.05368,
0.05622,
0.04548,
0.07975,
0.04655,
0.01792,
0.08467,
0.0151,
0.05061,
0.04185,
0.04518,
0.03152,
0.084,
0.06383,
0.04852,
0.0456,
0.05045,
0.05591,
0.06717,
0.05597,
0.04513,
0.04979,
0.02702,
0.04608,
0.04292,
0.04888,
0.09399,
0.03988,
0.02565,
0.05894,
0.03941,
0.04952,
0.0767,
0.0494,
0.1099,
0.03652,
0.05015,
0.07898,
0.05064,
0.03833,
0.04133,
0.03163,
0.09881,
0.05912,
0.05795,
0.02599,
0.09814,
0.04749,
0.0284,
0.06074,
0.04718,
0.03789,
0.08998,
0.04451,
0.05937,
0.04544,
0.06173,
0.04686,
0.05936,
0.0311,
0.03927,
0.08231,
0.02436,
0.05194,
0.04414,
0.03787,
0.0383,
0.0408,
0.04119,
0.03175,
0.08285,
0.05705,
0.02964,
0.0409,
0.03605,
0.04664,
0.04889,
0.03085,
0.05376,
0.0594,
0.0357,
0.0965,
0.04077,
0.07085,
0.0476,
0.04919,
0.03484,
0.02473,
0.07078,
0.08155,
0.05989,
0.06994,
0.07064
],
"pct": 47,
"current_step": 250,
"total_steps": 535,
"eta": "2:27:31",
"elapsed": "2:09:24",
"error": "OOM",
"latest_loss": 0.07064,
"learning_rate": "6.464e-05"
},
"error": "OOM",
"finished_at": "2026-03-21T01:48:50Z"
},
{
"id": "28691b1d",
"preset_id": "9cc95c0a",
"preset_name": "Train",
"pipeline": "training",
"params": {
"base_model": "Qwen/Qwen3.5-9B",
"dataset": "auto",
"output_name": "mortdecai-0.5.0",
"epochs": "1",
"lr": "0.0001",
"batch_size": "2",
"grad_accum": "4",
"max_seq_len": "2048",
"save_steps": "50"
},
"gpus": [
"3090ti",
"2080ti"
],
"status": "failed",
"created_at": "2026-03-21T01:51:47Z",
"started_at": "2026-03-21T01:51:47Z",
"log_path": "training/train_run_mortdecai-0.5.0.log",
"progress": {
"active": false,
"loss_history": [],
"error": "crashed"
},
"error": "crashed",
"finished_at": "2026-03-21T01:52:23Z"
},
{
"id": "adff373a",
"preset_id": "9cc95c0a",
"preset_name": "Train",
"pipeline": "training",
"params": {
"base_model": "Qwen/Qwen3.5-9B",
"dataset": "auto",
"output_name": "mortdecai-0.5.0",
"epochs": "1",
"lr": "0.0001",
"batch_size": "2",
"grad_accum": "4",
"max_seq_len": "2048",
"save_steps": "50"
},
"gpus": [
"3090ti",
"2080ti"
],
"status": "running",
"created_at": "2026-03-21T02:05:09Z",
"started_at": "2026-03-21T02:05:09Z",
"log_path": "training/train_run_mortdecai-0.5.0.log",
"progress": {
"active": true,
"loss_history": [],
"pct": 2,
"current_step": 9,
"total_steps": 548,
"eta": "3:19:36",
"elapsed": "03:21"
}
},
{
"id": "32cc3363",
"preset_id": "06356764",
"preset_name": "Infer during training",
"pipeline": "self_play",
"params": {
"model": "mortdecai:0.4.0",
"tiers": "1,2,3",
"rounds_per_tier": "50",
"rcon_host": "192.168.0.244",
"rcon_port": "25578",
"rcon_pass": "REDACTED_RCON"
},
"gpus": [
"rtx4000"
],
"status": "running",
"created_at": "2026-03-21T02:10:15Z",
"started_at": "2026-03-21T02:10:15Z"
}
]