#!/usr/bin/env python3 """ Mortdecai GPU Scheduler — preset-based job scheduler with live GPU monitoring. Features: - GPU dashboard with live stats across the homelab - Configuration presets (GPU assignments, model selection, pipeline type) - Job scheduler with 3 trigger types: time, finish_training, cost - Model management: load/unload Ollama models per GPU - Training progress monitor with loss curves Usage: python3 gpu_scheduler.py --port 8098 Serve behind Caddy as gpu.sethpc.xyz with google_auth. """ import argparse import json import os import re import subprocess import threading import time import uuid from http.server import HTTPServer, BaseHTTPRequestHandler from pathlib import Path from urllib.parse import parse_qs, urlparse from datetime import datetime, timedelta PORT = 8098 DATA_DIR = Path(__file__).resolve().parent.parent / "data" / "scheduler" # ── GPU Inventory ────────────────────────────────────────────────────────── GPUS = [ { "id": "3090ti", "name": "RTX 3090 Ti", "vram_gb": 24, "vram_mb": 24564, "host": "seth@192.168.0.141", "gpu_index": 1, "ollama_port": 11434, "ollama_service": "ollama.service", "capabilities": ["training", "inference", "self-play", "pipeline"], "location": "steel141", }, { "id": "2080ti", "name": "RTX 2080 Ti", "vram_gb": 11, "vram_mb": 11264, "host": "seth@192.168.0.141", "gpu_index": 0, "ollama_port": 11435, "ollama_service": "ollama-gpu0.service", "capabilities": ["inference", "self-play", "pipeline", "generator"], "location": "steel141", }, { "id": "rtx4000", "name": "Quadro RTX 4000", "vram_gb": 8, "vram_mb": 8192, "host": "pve197", "gpu_index": 0, "pct_id": 105, "ollama_port": 11434, "ollama_service": "ollama.service", "capabilities": ["inference", "self-play", "pipeline", "prod"], "location": "pve197 → CT 105", }, { "id": "1660s", "name": "GTX 1660 Super", "vram_gb": 6, "vram_mb": 6144, "host": "root@192.168.0.235", "gpu_index": 0, "ollama_port": 11434, "ollama_service": "ollama.service", "capabilities": ["generator", "inference-small"], "location": "bedroom", "ssh_extra": "-o StrictHostKeyChecking=no", "ssh_pass": "REDACTED_PASSWORD", }, ] GPU_MAP = {g["id"]: g for g in GPUS} # ── Pipeline Definitions ────────────────────────────────────────────────── PIPELINE_TYPES = { "training": { "label": "Training (QLoRA)", "description": "Fine-tune model via Unsloth QLoRA", "gpu_req": ["training"], "params": ["base_model", "dataset", "output_name", "epochs", "lr", "batch_size", "grad_accum", "max_seq_len", "save_steps"], "defaults": { "base_model": "Qwen/Qwen3.5-9B", "dataset": "auto", "output_name": "mortdecai-0.5.0", "epochs": 1, "lr": 1e-4, "batch_size": 2, "grad_accum": 4, "max_seq_len": 2048, "save_steps": 50, }, }, "self_play": { "label": "Self-Play", "description": "Model generates edge cases and learns from failures", "gpu_req": ["inference"], "params": ["model", "tiers", "rounds_per_tier", "rcon_host", "rcon_port", "rcon_pass"], "defaults": { "model": "mortdecai:0.4.0", "tiers": "1,2,3", "rounds_per_tier": 50, "rcon_host": "192.168.0.244", "rcon_port": 25578, "rcon_pass": "REDACTED_RCON", }, }, "prompt_pipeline": { "label": "Prompt Pipeline", "description": "Small model generates prompts, big models process + RCON validate", "gpu_req": ["generator", "inference"], "params": ["gen_model", "proc_model", "batch_size", "interval"], "defaults": { "gen_model": "qwen3.5:0.8b", "proc_model": "mortdecai:0.4.0", "batch_size": 30, "interval": 120, }, }, "bakeoff": { "label": "Bake-off", "description": "Compare model versions on standard test prompts", "gpu_req": ["inference"], "params": ["models", "test_set", "rcon_host"], "defaults": { "models": "mortdecai:0.4.0,mortdecai:0.5.0", "test_set": "standard", "rcon_host": "192.168.0.244", }, }, "export_gguf": { "label": "Export GGUF", "description": "Convert LoRA adapter to GGUF for Ollama", "gpu_req": ["training"], "params": ["adapter_path", "output_name", "quant"], "defaults": { "adapter_path": "training/checkpoints/mortdecai-0.5.0", "output_name": "mortdecai:0.5.0", "quant": "q4_k_m", }, }, "tool_self_play": { "label": "Tool Self-Play", "description": "Exercise all 14 tools on the dev server — scripts, memory, entities, wiki", "gpu_req": ["inference"], "params": ["model", "rounds", "categories", "rcon_host", "rcon_port", "rcon_pass"], "defaults": { "model": "mortdecai:0.4.0", "rounds": 10, "categories": "all", "rcon_host": "192.168.0.112", "rcon_port": 25578, "rcon_pass": "REDACTED_RCON", }, }, "load_model": { "label": "Load Model", "description": "Load/switch Ollama model on a GPU", "gpu_req": ["inference"], "params": ["model"], "defaults": {"model": "mortdecai:0.4.0"}, }, } # ── State ────────────────────────────────────────────────────────────────── _lock = threading.Lock() _state = { "gpus": {}, "training": None, "last_refresh": None, } _presets = {} # id -> preset dict _jobs = [] # list of job dicts _schedule = [] # list of scheduled trigger dicts _cost_tracker = {"total_kwh": 0.0, "total_cost": 0.0, "electricity_rate": 0.12} TRAINING_LOG_PATTERN = "/home/seth/mc-ai-training/Minecraft-AI-model/training/train_run_*.log" TRAINING_HOST = "seth@192.168.0.141" # ── Persistence ──────────────────────────────────────────────────────────── def _ensure_data_dir(): DATA_DIR.mkdir(parents=True, exist_ok=True) def _save_presets(): _ensure_data_dir() with open(DATA_DIR / "presets.json", "w") as f: json.dump(_presets, f, indent=2) def _save_jobs(): _ensure_data_dir() with open(DATA_DIR / "jobs.json", "w") as f: json.dump(_jobs, f, indent=2, default=str) def _save_schedule(): _ensure_data_dir() with open(DATA_DIR / "schedule.json", "w") as f: json.dump(_schedule, f, indent=2, default=str) def _load_persisted(): global _presets, _jobs, _schedule _ensure_data_dir() for name, target in [("presets.json", "_presets"), ("jobs.json", "_jobs"), ("schedule.json", "_schedule")]: path = DATA_DIR / name if path.exists(): with open(path) as f: data = json.load(f) if target == "_presets": _presets = data elif target == "_jobs": _jobs = data elif target == "_schedule": _schedule = data # ── SSH Helpers ──────────────────────────────────────────────────────────── def _ssh_cmd(gpu_or_host, cmd, timeout=8): """Run a command over SSH. Accepts a GPU dict or host string.""" if isinstance(gpu_or_host, dict): gpu = gpu_or_host host = gpu["host"] extra = gpu.get("ssh_extra", "").split() if gpu.get("ssh_extra") else [] ssh_pass = gpu.get("ssh_pass") # If pct_id is set, wrap command through proxmox host if "pct_id" in gpu: cmd = f"pct exec {gpu['pct_id']} -- bash -c '{cmd}'" else: host = gpu_or_host extra = [] ssh_pass = None try: if ssh_pass: full_cmd = ["sshpass", "-p", ssh_pass, "ssh", "-o", "ConnectTimeout=4"] + extra + [host, cmd] else: full_cmd = ["ssh", "-o", "ConnectTimeout=4", "-o", "BatchMode=yes"] + extra + [host, cmd] r = subprocess.run(full_cmd, capture_output=True, text=True, timeout=timeout) return r.stdout.strip() if r.returncode == 0 else None except Exception: return None def _ollama_api(gpu, endpoint, method="GET", data=None): """Call Ollama API on a GPU via SSH curl.""" port = gpu["ollama_port"] if method == "GET": cmd = f"curl -s --connect-timeout 3 http://localhost:{port}{endpoint}" else: payload = json.dumps(data).replace("'", "'\\''") if data else "{}" cmd = f"curl -s --connect-timeout 3 -X POST http://localhost:{port}{endpoint} -d '{payload}'" raw = _ssh_cmd(gpu, cmd) if raw: try: return json.loads(raw) except json.JSONDecodeError: pass return None # ── GPU Monitoring ───────────────────────────────────────────────────────── def _fetch_gpu_stats(gpu): query = f"nvidia-smi --id={gpu['gpu_index']} --query-gpu=utilization.gpu,temperature.gpu,power.draw,memory.used,memory.total,fan.speed --format=csv,noheader,nounits" raw = _ssh_cmd(gpu, query) if not raw: return {"online": False, "id": gpu["id"], "name": gpu["name"]} parts = [p.strip() for p in raw.split(",")] try: return { "online": True, "id": gpu["id"], "name": gpu["name"], "vram_gb": gpu["vram_gb"], "location": gpu["location"], "capabilities": gpu["capabilities"], "utilization": int(parts[0]), "temperature": int(parts[1]), "power_watts": float(parts[2]), "vram_used_mb": int(parts[3]), "vram_total_mb": int(parts[4]), "fan_speed": int(parts[5]) if parts[5] not in ("[N/A]", "[Not Supported]") else None, "vram_pct": round(int(parts[3]) / int(parts[4]) * 100, 1), } except (ValueError, IndexError): return {"online": True, "id": gpu["id"], "name": gpu["name"], "error": raw} def _fetch_ollama_info(gpu): """Get running + available models from Ollama.""" ps = _ollama_api(gpu, "/api/ps") or {} tags = _ollama_api(gpu, "/api/tags") or {} running = [] for m in ps.get("models", []): running.append({ "name": m.get("name", "?"), "size_gb": round(m.get("size", 0) / 1e9, 1), "vram_gb": round(m.get("size_vram", 0) / 1e9, 1), }) available = [m.get("name", "?") for m in tags.get("models", [])] return {"running": running, "available": available} def _fetch_training_status(): # Find the most recently modified training log log_path = _ssh_cmd(TRAINING_HOST, f"ls -t {TRAINING_LOG_PATTERN} 2>/dev/null | head -1", timeout=5) if not log_path: return None raw = _ssh_cmd(TRAINING_HOST, f"tail -200 {log_path} 2>/dev/null", timeout=8) if not raw: return None status = {"active": False, "loss_history": []} progress_matches = re.findall(r'(\d+)%\|[^|]*\|\s*(\d+)/(\d+)\s*\[([^\]]+)\]', raw) if progress_matches: last = progress_matches[-1] status["pct"] = int(last[0]) status["current_step"] = int(last[1]) status["total_steps"] = int(last[2]) timing = last[3] eta_match = re.search(r'<([^,]+)', timing) elapsed_match = re.match(r'([^<]+)', timing) if eta_match: status["eta"] = eta_match.group(1).strip() if elapsed_match: status["elapsed"] = elapsed_match.group(1).strip() status["active"] = True if "OutOfMemoryError" in raw: status["active"] = False status["error"] = "OOM" elif "Error" in raw.split("\n")[-1] and "OutOfMemoryError" not in raw: status["active"] = False status["error"] = "crashed" loss_matches = re.findall(r"'loss':\s*'([^']+)'", raw) for lm in loss_matches: try: status["loss_history"].append(float(lm)) except ValueError: pass if status["loss_history"]: status["latest_loss"] = status["loss_history"][-1] lr_matches = re.findall(r"'learning_rate':\s*'([^']+)'", raw) if lr_matches: status["learning_rate"] = lr_matches[-1] return status def _fetch_processes(gpu): cmd = f"nvidia-smi --id={gpu['gpu_index']} --query-compute-apps=pid,name,used_memory --format=csv,noheader,nounits 2>/dev/null" raw = _ssh_cmd(gpu, cmd) if not raw: return [] procs = [] for line in raw.strip().split("\n"): if not line.strip(): continue parts = [p.strip() for p in line.split(",")] if len(parts) >= 3: procs.append({"pid": parts[0], "name": parts[1].split("/")[-1], "vram_mb": parts[2]}) return procs def refresh_state(): new_gpus = {} threads = [] def fetch_one(gpu): stats = _fetch_gpu_stats(gpu) stats["ollama"] = _fetch_ollama_info(gpu) stats["processes"] = _fetch_processes(gpu) # Check if any job is running on this GPU active_jobs = [j for j in _jobs if j.get("status") == "running" and gpu["id"] in j.get("gpus", [])] stats["active_job"] = active_jobs[0]["id"] if active_jobs else None new_gpus[gpu["id"]] = stats for gpu in GPUS: t = threading.Thread(target=fetch_one, args=(gpu,)) t.start() threads.append(t) for t in threads: t.join(timeout=12) with _lock: _state["gpus"] = new_gpus _state["training"] = _fetch_training_status() _state["last_refresh"] = time.strftime("%H:%M:%S") def _bg_refresh_loop(interval=10): while True: try: refresh_state() _check_triggers() except Exception as e: print(f"[scheduler] refresh error: {e}") time.sleep(interval) # ── Job Execution ────────────────────────────────────────────────────────── def _run_job_async(job): """Execute a job in a background thread.""" def _run(): job["status"] = "running" job["started_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ") _save_jobs() print(f"[scheduler] starting job {job['id']}: {job['pipeline']}") try: pipeline = job["pipeline"] params = job["params"] gpus = job["gpus"] if pipeline == "training": _exec_training(job, params) elif pipeline == "self_play": _exec_self_play(job, params, gpus) elif pipeline == "prompt_pipeline": _exec_prompt_pipeline(job, params, gpus) elif pipeline == "load_model": _exec_load_model(job, params, gpus) elif pipeline == "export_gguf": _exec_export_gguf(job, params) elif pipeline == "bakeoff": _exec_bakeoff(job, params, gpus) elif pipeline == "tool_self_play": _exec_tool_self_play(job, params, gpus) else: job["error"] = f"unknown pipeline: {pipeline}" job["status"] = "failed" except Exception as e: job["error"] = str(e) job["status"] = "failed" print(f"[scheduler] job {job['id']} failed: {e}") if job["status"] == "running": job["status"] = "completed" job["finished_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ") _save_jobs() print(f"[scheduler] job {job['id']} → {job['status']}") t = threading.Thread(target=_run, daemon=True) t.start() return job def _exec_training(job, params): """Launch training on the 3090 Ti via SSH.""" output_name = params.get('output_name', 'mortdecai-0.5.0') log_name = f"train_run_{output_name}.log" # Build the training command with conda environment activation train_cmd = ( f"source /home/seth/miniconda3/etc/profile.d/conda.sh && " f"conda activate mc-train && " f"cd /home/seth/mc-ai-training/Minecraft-AI-model && " f"TORCH_COMPILE_DISABLE=1 TORCHDYNAMO_DISABLE=1 CUDA_VISIBLE_DEVICES=0 " f"python3 training/scripts/train_lora.py " f"--model '{params.get('base_model', 'Qwen/Qwen3.5-9B')}' " f"--output 'training/checkpoints/{output_name}' " f"--lr {params.get('lr', 1e-4)} " f"--epochs {int(params.get('epochs', 1))} " f"--batch-size {int(params.get('batch_size', 2))} " f"--grad-accum {int(params.get('grad_accum', 4))} " f"--max-seq-len {int(params.get('max_seq_len', 2048))} " f"--save-steps {int(params.get('save_steps', 50))}" ) if params.get("resume"): train_cmd += " --resume" train_cmd += f" 2>&1 | tee training/{log_name}" # Cancel any running jobs on the 3090 Ti to free VRAM for j in _jobs: if j.get("status") == "running" and "3090ti" in j.get("gpus", []) and j["id"] != job["id"]: j["status"] = "cancelled" print(f"[training] cancelled conflicting job {j['id']} on 3090ti") _save_jobs() # Stop both Ollama services AND prevent auto-restart _ssh_cmd(TRAINING_HOST, "sudo systemctl stop ollama.service 2>/dev/null", timeout=10) _ssh_cmd(TRAINING_HOST, "sudo systemctl stop ollama-gpu0.service 2>/dev/null", timeout=10) time.sleep(2) # Kill any lingering ollama processes holding GPU 1 VRAM _ssh_cmd(TRAINING_HOST, "for pid in $(nvidia-smi --id=1 --query-compute-apps=pid --format=csv,noheader,nounits 2>/dev/null); do kill $pid 2>/dev/null; done", timeout=5) time.sleep(3) # Verify VRAM is free enough (need ~18GB free on 24GB card) vram_check = _ssh_cmd(TRAINING_HOST, "nvidia-smi --id=1 --query-gpu=memory.free --format=csv,noheader,nounits") if vram_check: try: free_mb = int(vram_check.strip()) except ValueError: free_mb = 0 print(f"[training] 3090 Ti free VRAM: {free_mb}MB") if free_mb < 18000: # Last resort: try harder to free VRAM _ssh_cmd(TRAINING_HOST, "sudo systemctl stop ollama.service; sudo systemctl stop ollama-gpu0.service", timeout=10) time.sleep(5) vram_check2 = _ssh_cmd(TRAINING_HOST, "nvidia-smi --id=1 --query-gpu=memory.free --format=csv,noheader,nounits") try: free_mb = int(vram_check2.strip()) if vram_check2 else 0 except ValueError: free_mb = 0 if free_mb < 18000: job["status"] = "failed" job["error"] = f"Not enough VRAM: {free_mb}MB free, need 18000MB" # Restart Ollama since we're not training _ssh_cmd(TRAINING_HOST, "sudo systemctl start ollama.service 2>/dev/null", timeout=10) _ssh_cmd(TRAINING_HOST, "sudo systemctl start ollama-gpu0.service 2>/dev/null", timeout=10) return # Launch training via nohup with bash -l for conda nohup_cmd = f"nohup bash -c '{train_cmd}' > /dev/null 2>&1 &" _ssh_cmd(TRAINING_HOST, nohup_cmd, timeout=10) job["log_path"] = f"training/{log_name}" print(f"[training] launched, logging to {log_name}") # Monitor until done while job["status"] == "running": time.sleep(30) status = _fetch_training_status() if status: job["progress"] = status if status.get("error"): job["status"] = "failed" job["error"] = status["error"] break if not status.get("active") and status.get("current_step", 0) == status.get("total_steps", 0) and status.get("total_steps", 0) > 0: job["status"] = "completed" break # Restart Ollama services after training _ssh_cmd(TRAINING_HOST, "sudo systemctl start ollama.service 2>/dev/null", timeout=10) _ssh_cmd(TRAINING_HOST, "sudo systemctl start ollama-gpu0.service 2>/dev/null", timeout=10) def _exec_self_play(job, params, gpus): resolved_gpus = [GPU_MAP[gid] for gid in gpus if gid in GPU_MAP] if not resolved_gpus: job["error"] = "no GPU assigned" job["status"] = "failed" return model = params.get("model", "mortdecai:0.4.0") tiers = [t.strip() for t in params.get("tiers", "1,2,3").split(",")] rounds = int(params.get("rounds_per_tier", 50)) rcon_host = params.get("rcon_host", "192.168.0.244") rcon_port = int(params.get("rcon_port", 25578)) rcon_pass = params.get("rcon_pass", "REDACTED_RCON") script_path = "/home/seth/mc-ai-training/Minecraft-AI-model/training/scripts/self_play.py" # Distribute tiers round-robin across GPUs, launch all in parallel gpu_assignments = {} # gpu_id -> list of tiers for i, tier in enumerate(tiers): gpu = resolved_gpus[i % len(resolved_gpus)] gpu_assignments.setdefault(gpu["id"], []).append(tier) job["gpu_assignments"] = {gid: ts for gid, ts in gpu_assignments.items()} # Launch all GPU workers in parallel threads errors = [] def run_on_gpu(gpu, assigned_tiers): port = gpu["ollama_port"] for tier in assigned_tiers: if job["status"] != "running": break log_file = f"/tmp/selfplay_{gpu['id']}_{tier}.log" cmd = (f"cd /home/seth/mc-ai-training/Minecraft-AI-model && " f"python3 {script_path} --tier {tier} --rounds {rounds} " f"--ollama-url http://localhost:{port} --model {model} " f"--rcon-host {rcon_host} --rcon-port {rcon_port} --rcon-pass {rcon_pass}") _ssh_cmd(gpu, f"nohup bash -c '{cmd}' > {log_file} 2>&1 &", timeout=10) print(f"[self-play] {gpu['name']}: {tier} x{rounds} started") # Wait for this tier to finish for _ in range(rounds * 3): time.sleep(10) log = _ssh_cmd(gpu, f"tail -5 {log_file} 2>/dev/null") if log and ("Complete" in log or "Error" in log or "Traceback" in log): if "Error" in log or "Traceback" in log: errors.append(f"{gpu['name']}/{tier}: {log[-200:]}") break if job["status"] != "running": break threads = [] for gid, assigned_tiers in gpu_assignments.items(): gpu = GPU_MAP[gid] t = threading.Thread(target=run_on_gpu, args=(gpu, assigned_tiers), daemon=True) t.start() threads.append(t) for t in threads: t.join() if errors: job["error"] = "; ".join(errors[:3]) def _exec_prompt_pipeline(job, params, gpus): gen_gpu = GPU_MAP.get(gpus[0]) if len(gpus) > 0 else None proc_gpu = GPU_MAP.get(gpus[1]) if len(gpus) > 1 else gen_gpu if not gen_gpu: job["error"] = "no GPUs assigned" job["status"] = "failed" return gen_port = gen_gpu["ollama_port"] proc_port = proc_gpu["ollama_port"] if proc_gpu else gen_port gen_host_ip = gen_gpu["host"].split("@")[-1] proc_host_ip = proc_gpu["host"].split("@")[-1] if proc_gpu else gen_host_ip cmd = (f"cd /home/seth/mc-ai-training/Minecraft-AI-model && " f"python3 training/scripts/prompt_pipeline.py --mode all " f"--gen-url http://{gen_host_ip}:{gen_port} " f"--gen-model {params.get('gen_model', 'qwen3.5:0.8b')} " f"--proc-urls http://{proc_host_ip}:{proc_port} " f"--proc-model {params.get('proc_model', 'mortdecai:0.4.0')} " f"--interval {params.get('interval', 120)}") _ssh_cmd(TRAINING_HOST, f"nohup bash -c '{cmd}' > /tmp/pipeline.log 2>&1 &", timeout=10) def _exec_load_model(job, params, gpus): for gid in gpus: gpu = GPU_MAP.get(gid) if not gpu: continue model = params.get("model", "mortdecai:0.4.0") result = _ollama_api(gpu, "/api/generate", method="POST", data={ "model": model, "prompt": "test", "stream": False, "options": {"num_predict": 1}, }) if result and "error" not in result: job["result"] = f"Loaded {model} on {gpu['name']}" else: job["error"] = f"Failed to load {model} on {gpu['name']}: {result}" job["status"] = "failed" def _exec_export_gguf(job, params): adapter = params.get("adapter_path", "training/checkpoints/mortdecai-0.5.0") quant = params.get("quant", "q4_k_m") cmd = (f"cd /home/seth/mc-ai-training/Minecraft-AI-model && " f"python3 -m unsloth.save --model {adapter} --output_type gguf --quantization {quant}") _ssh_cmd(TRAINING_HOST, f"nohup bash -c '{cmd}' > /tmp/export_gguf.log 2>&1 &", timeout=10) # Monitor for _ in range(120): time.sleep(15) log = _ssh_cmd(TRAINING_HOST, "tail -3 /tmp/export_gguf.log 2>/dev/null") if log and ("Saved" in log or "Error" in log or "error" in log): if "Error" in log or "error" in log: job["status"] = "failed" job["error"] = log break def _exec_bakeoff(job, params, gpus): gpu = GPU_MAP.get(gpus[0]) if gpus else None if not gpu: job["error"] = "no GPU assigned" job["status"] = "failed" return models = params.get("models", "mortdecai:0.4.0") cmd = (f"cd /home/seth/mc-ai-training/Minecraft-AI-model && " f"python3 training/scripts/bakeoff.py --models {models}") _ssh_cmd(TRAINING_HOST, f"nohup bash -c '{cmd}' > /tmp/bakeoff.log 2>&1 &", timeout=10) def _exec_tool_self_play(job, params, gpus): """Run tool-focused self-play on the dev server via the assigned GPU's Ollama.""" gpu = GPU_MAP.get(gpus[0]) if gpus else None if not gpu: job["error"] = "no GPU assigned" job["status"] = "failed" return host_ip = gpu["host"].split("@")[-1] if "@" in gpu["host"] else gpu["host"] # For pct-based GPUs, use the CT's external IP if "pct_id" in gpu: host_ip = "192.168.0.179" # CT 105 external IP port = gpu["ollama_port"] model = params.get("model", "mortdecai:0.4.0") rounds = int(params.get("rounds", 10)) categories = params.get("categories", "all") rcon_host = params.get("rcon_host", "192.168.0.112") rcon_port = int(params.get("rcon_port", 25578)) rcon_pass = params.get("rcon_pass", "REDACTED_RCON") script_path = "/home/seth/mc-ai-training/Minecraft-AI-model/training/scripts/tool_self_play.py" log_file = f"/tmp/tool_selfplay_{gpu['id']}.log" cmd = (f"cd /home/seth/mc-ai-training/Minecraft-AI-model && " f"python3 {script_path} " f"--ollama-url http://{host_ip}:{port} --model {model} " f"--rcon-host {rcon_host} --rcon-port {rcon_port} --rcon-pass {rcon_pass} " f"--rounds {rounds} --categories {categories}") _ssh_cmd(TRAINING_HOST, f"nohup bash -c '{cmd}' > {log_file} 2>&1 &", timeout=10) print(f"[tool-self-play] launched on {gpu['name']}, logging to {log_file}") # Monitor until done for _ in range(rounds * len(PIPELINE_TYPES) * 3): time.sleep(15) log = _ssh_cmd(TRAINING_HOST, f"tail -5 {log_file} 2>/dev/null") if log and ("Complete" in log or "Traceback" in log): if "Traceback" in log: job["error"] = log[-300:] job["status"] = "failed" break if job["status"] != "running": break # ── Trigger Engine ───────────────────────────────────────────────────────── def _check_triggers(): """Evaluate all scheduled triggers.""" now = datetime.now() for sched in _schedule: if sched.get("status") != "pending": continue trigger = sched["trigger"] fired = False if trigger["type"] == "time": target_str = trigger.get("at") if target_str: try: target = datetime.fromisoformat(target_str) if now >= target: fired = True except ValueError: pass duration_s = trigger.get("duration_seconds") created_str = sched.get("created_at") if duration_s and created_str: try: created = datetime.fromisoformat(created_str) if now >= created + timedelta(seconds=int(duration_s)): fired = True except ValueError: pass elif trigger["type"] == "finish_training": training = _state.get("training") if training: total = training.get("total_steps", 0) current = training.get("current_step", 0) if total > 0 and current >= total and not training.get("active"): fired = True elif trigger["type"] == "cost": threshold = float(trigger.get("threshold_usd", 999)) if _cost_tracker["total_cost"] >= threshold: fired = True if fired: sched["status"] = "fired" sched["fired_at"] = now.isoformat() _save_schedule() print(f"[scheduler] trigger fired: {sched['id']} → launching preset {sched['preset_id']}") _launch_preset(sched["preset_id"]) def _launch_preset(preset_id): """Create and start a job from a preset.""" preset = _presets.get(preset_id) if not preset: print(f"[scheduler] preset {preset_id} not found") return None job = { "id": str(uuid.uuid4())[:8], "preset_id": preset_id, "preset_name": preset.get("name", "?"), "pipeline": preset["pipeline"], "params": preset.get("params", {}), "gpus": preset.get("gpus", []), "status": "queued", "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"), } _jobs.append(job) _save_jobs() _run_job_async(job) return job # ── HTML Rendering ───────────────────────────────────────────────────────── def _render_page(): with _lock: state = dict(_state) gpu_cards = "" for gpu in GPUS: data = state["gpus"].get(gpu["id"], {"online": False, "id": gpu["id"], "name": gpu["name"]}) gpu_cards += _gpu_card_html(data) training_html = _training_card_html(state.get("training")) presets_list_html, presets_form_html = _presets_panel_html() schedule_html = _schedule_panel_html() jobs_html = _jobs_panel_html() last_refresh = state.get("last_refresh", "never") online_count = len([g for g in state["gpus"].values() if g.get("online")]) return f""" Mortdecai GPU Scheduler

Mortdecai GPU Scheduler

{online_count}/{len(GPUS)} GPUs online — refreshed {last_refresh}
{training_html}

GPUs

{gpu_cards}
{schedule_html}
{jobs_html}

Presets

{presets_list_html}
{presets_form_html}
""" def _gpu_card_html(d): if not d.get("online"): return f"""
{d.get('name','?')}OFFLINE
""" util = d.get("utilization", 0) temp = d.get("temperature", 0) vram_pct = d.get("vram_pct", 0) vram_used = d.get("vram_used_mb", 0) vram_total = d.get("vram_total_mb", 0) power = d.get("power_watts", 0) tc = "bad" if temp > 80 else "warn" if temp > 70 else "ok" uc = "ok" if util > 50 else "warn" if util > 10 else "dim" vc = "bad" if vram_pct > 90 else "warn" if vram_pct > 70 else "ok" ollama = d.get("ollama", {}) running = ollama.get("running", []) avail = ollama.get("available", []) model_tags = " ".join(f'{m["name"]}' for m in running) if running else 'idle' avail_options = "".join(f'' for m in avail if m) model_select = f""" """ if avail else "" active_job = d.get("active_job") job_badge = f'job {active_job}' if active_job else "" caps = " ".join(f'{c}' for c in d.get("capabilities", [])) return f"""
{d['name']}{'ACTIVE' if util>10 else 'IDLE'}
{d.get('location','')} {job_badge}
GPU
{util}%
VRAM
{vram_used}/{vram_total}MB
{temp}C {power:.0f}W
{model_tags}
{model_select}
{caps}
""" def _training_card_html(t): if not t: return '
Trainingno log
' pct = t.get("pct", 0) step = t.get("current_step", 0) total = t.get("total_steps", 0) error = t.get("error") active = t.get("active", False) loss = t.get("latest_loss") lr = t.get("learning_rate", "?") eta = t.get("eta", "?") elapsed = t.get("elapsed", "?") if error: status = f'CRASHED ({error})' elif active: status = 'TRAINING' else: status = 'STOPPED' # Sparkline lh = t.get("loss_history", []) spark = "" if lh: recent = lh[-40:] mx, mn = max(recent), min(recent) rng = mx - mn if mx != mn else 1 w, h = 400, 70 pts = " ".join(f"{i/(max(len(recent)-1,1))*w:.0f},{h-((v-mn)/rng*h):.0f}" for i, v in enumerate(recent)) spark = f""" {mx:.4f}{mn:.4f}""" return f"""
Training{status}
{step}/{total} ({pct}%)
Elapsed: {elapsed} ETA: {eta} Loss: {f'{loss:.4f}' if loss else '?'} LR: {lr}
{spark}
""" def _presets_list_html(): """Just the preset rows — refreshable without touching the form.""" rows = "" for pid, p in sorted(_presets.items(), key=lambda x: x[1].get("name", "")): gpus = ", ".join(p.get("gpus", [])) rows += f"""
{p['name']}
{p['pipeline']} — {gpus}
""" if not rows: rows = '
No presets yet. Create one below.
' return rows def _presets_panel_html(): """Returns (list_html, form_html) — list refreshes live, form stays static.""" list_html = _presets_list_html() pipe_opts = "".join(f'' for k, v in PIPELINE_TYPES.items()) gpu_checks = "".join(f'' for g in GPUS) form_html = f"""

New Preset

{gpu_checks}
""" return list_html, form_html def _schedule_panel_html(): rows = "" for s in sorted(_schedule, key=lambda x: x.get("created_at", ""), reverse=True)[:10]: preset_name = _presets.get(s.get("preset_id", ""), {}).get("name", s.get("preset_id", "?")) trigger = s.get("trigger", {}) ttype = trigger.get("type", "?") if ttype == "time": if trigger.get("at"): trigger_desc = f"at {trigger['at']}" else: secs = int(trigger.get("duration_seconds", 0)) trigger_desc = f"after {secs//3600}h{(secs%3600)//60}m" elif ttype == "finish_training": trigger_desc = "when training completes" elif ttype == "cost": trigger_desc = f"at ${trigger.get('threshold_usd', '?')}" else: trigger_desc = ttype st = s.get("status", "?") st_class = "ok" if st == "fired" else "warn" if st == "pending" else "dim" sid = s["id"] cancel_btn = f"""""" if st == 'pending' else '' rows += f"""
{st} {preset_name} {trigger_desc} {cancel_btn}
""" if not rows: rows = '
No scheduled triggers.
' return f"""
Scheduled Triggers{len([s for s in _schedule if s.get('status')=='pending'])} pending
{rows}
""" def _jobs_panel_html(): recent = sorted(_jobs, key=lambda j: j.get("created_at", ""), reverse=True)[:15] rows = "" for j in recent: st = j.get("status", "?") st_class = "ok" if st == "completed" else "bad" if st == "failed" else "warn" if st == "running" else "dim" gpus = ", ".join(j.get("gpus", [])) name = j.get("preset_name", j.get("pipeline", "?")) err = f' ({j["error"]})' if j.get("error") else "" jid = j["id"] stop_btn = f"""""" if st == 'running' else '' created = j.get('created_at', '')[:16] rows += f"""
{st} {name} {gpus} {created} {err} {stop_btn}
""" if not rows: rows = '
No jobs yet.
' return f"""
Jobs{len([j for j in _jobs if j.get('status')=='running'])} running
{rows}
""" # ── CSS ──────────────────────────────────────────────────────────────────── CSS = """ *{box-sizing:border-box;margin:0;padding:0} body{font-family:'Courier New',monospace;background:#111;color:#e0e0e0;padding:1rem 1.5rem} header{margin-bottom:1.2rem} h1{color:#D35400;font-size:1.5rem;margin-bottom:0.2rem} h2{color:#D35400;font-size:1.1rem;margin:1rem 0 0.6rem} h3{color:#D35400;font-size:1rem;margin-bottom:0.6rem} .subtitle{color:#666;font-size:0.8rem} .accent{color:#D35400} .ok{color:#4caf50} .warn{color:#ff9800} .bad{color:#f44336} .dim{color:#555} .layout{display:grid;grid-template-columns:1fr 340px;gap:1.2rem} @media(max-width:900px){.layout{grid-template-columns:1fr}} .main-col{min-width:0} .side-col{display:flex;flex-direction:column;gap:1rem} .grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:0.8rem} .card{background:#1a1a1a;border:1px solid #2a2a2a;border-radius:6px;padding:0.8rem;margin-bottom:0.8rem;transition:border-color 0.2s} .card:hover{border-color:#D35400} .card.offline{opacity:0.4;border-color:#f44336} .card-header{display:flex;justify-content:space-between;align-items:center;margin-bottom:0.3rem;font-size:0.95rem} .card-sub{color:#555;font-size:0.75rem;margin-bottom:0.5rem} .bar-row{display:flex;align-items:center;margin:0.25rem 0;gap:0.4rem} .bar-label{width:36px;color:#777;font-size:0.75rem} .bar{flex:1;background:#222;border-radius:3px;height:20px;overflow:hidden} .bar-fill{height:100%;border-radius:3px;background:#D35400;display:flex;align-items:center;padding-left:5px;font-size:0.7rem;color:#fff;min-width:fit-content;transition:width 0.5s} .bar-fill.ok{background:#4caf50} .bar-fill.warn{background:#ff9800} .bar-fill.bad{background:#f44336} .stats{display:flex;gap:0.8rem;font-size:0.8rem;margin:0.4rem 0;flex-wrap:wrap;color:#999} .models{margin-top:0.4rem} .model-ctrl{margin-top:0.3rem;display:flex;gap:0.3rem;align-items:center} .model-select{background:#222;color:#ccc;border:1px solid #333;border-radius:3px;padding:2px 4px;font-size:0.75rem;font-family:monospace} .tag{display:inline-block;background:#222;border:1px solid #333;border-radius:3px;padding:1px 6px;font-size:0.7rem;margin:1px} .tag.accent{border-color:#D35400;color:#D35400} .tag.dim{color:#444;border-color:#222} .caps{margin-top:0.3rem;display:flex;gap:3px;flex-wrap:wrap} .cap{font-size:0.65rem;color:#555;background:#1e1e1e;border-radius:2px;padding:1px 4px} .train-card .progress{background:#222;border-radius:3px;height:26px;margin:0.4rem 0;overflow:hidden} .train-card .progress-fill{height:100%;background:linear-gradient(90deg,#D35400,#e67e22);border-radius:3px;transition:width 1s;display:flex;align-items:center;justify-content:center;font-size:0.8rem;font-weight:bold;color:#fff} .spark{display:block;margin-top:0.5rem;background:#1a1a1a;border:1px solid #222;border-radius:3px} .panel{background:#1a1a1a;border:1px solid #2a2a2a;border-radius:6px;padding:0.8rem} .preset-row{display:flex;flex-wrap:wrap;align-items:center;gap:0.4rem;padding:0.4rem 0;border-bottom:1px solid #222;font-size:0.8rem} .preset-name{font-weight:bold;color:#e0e0e0;flex:1} .preset-info{color:#777;font-size:0.75rem;width:100%} .preset-actions{display:flex;gap:0.3rem} .sched-row{display:flex;align-items:center;gap:0.5rem;padding:0.3rem 0;border-bottom:1px solid #222;font-size:0.8rem} .job-row{display:flex;align-items:center;gap:0.5rem;padding:0.3rem 0;border-bottom:1px solid #1e1e1e;font-size:0.8rem} .btn{background:#222;border:1px solid #D35400;color:#D35400;padding:5px 12px;border-radius:3px;cursor:pointer;font-family:monospace;font-size:0.8rem;transition:background 0.15s} .btn:hover{background:#D35400;color:#fff} .btn-sm{padding:3px 8px;font-size:0.75rem} .btn-xs{padding:2px 6px;font-size:0.7rem} .btn-danger{border-color:#f44336;color:#f44336} .btn-danger:hover{background:#f44336;color:#fff} .create-form{margin-top:0.5rem} .create-form form{display:flex;flex-direction:column;gap:0.5rem;margin-top:0.6rem} .create-form label{display:flex;flex-direction:column;font-size:0.8rem;color:#999;gap:0.2rem} .create-form input,.create-form select{background:#222;color:#e0e0e0;border:1px solid #333;border-radius:3px;padding:4px 6px;font-family:monospace;font-size:0.8rem} .gpu-select{display:flex;flex-wrap:wrap;gap:0.3rem;font-size:0.8rem;color:#999} .gpu-check{display:flex;align-items:center;gap:0.2rem;font-size:0.75rem} .gpu-check input{accent-color:#D35400} .modal-overlay{display:none;position:fixed;top:0;left:0;width:100%;height:100%;background:rgba(0,0,0,0.7);z-index:100;justify-content:center;align-items:center} .modal-overlay.active{display:flex} .modal{background:#1a1a1a;border:1px solid #D35400;border-radius:8px;padding:1.2rem;width:400px;max-width:90vw} .modal h3{margin-bottom:0.8rem} .modal label{display:flex;flex-direction:column;font-size:0.8rem;color:#999;gap:0.2rem;margin-bottom:0.4rem} .modal input,.modal select{background:#222;color:#e0e0e0;border:1px solid #333;border-radius:3px;padding:4px 6px;font-family:monospace;font-size:0.8rem} .modal .btn-row{display:flex;gap:0.5rem;margin-top:0.8rem;justify-content:flex-end} """ # ── JS ───────────────────────────────────────────────────────────────────── PIPELINE_TYPES_JSON = json.dumps({k: {"params": v["params"], "defaults": v["defaults"], "label": v["label"]} for k, v in PIPELINE_TYPES.items()}) JS = f""" const PIPELINES = {PIPELINE_TYPES_JSON}; function api(action, data) {{ return fetch('/api/action', {{ method:'POST', headers:{{'Content-Type':'application/json'}}, body: JSON.stringify({{action, ...data}}) }}).then(r=>r.json()); }} function updateParamFields(pipeline) {{ const p = PIPELINES[pipeline]; if (!p) return; const container = document.getElementById('param-fields'); container.innerHTML = ''; for (const key of p.params) {{ const val = p.defaults[key] ?? ''; const label = document.createElement('label'); label.textContent = key; const input = document.createElement('input'); input.name = 'param_' + key; input.value = val; label.appendChild(input); container.appendChild(label); }} }} function createPreset(e) {{ e.preventDefault(); const form = e.target; const fd = new FormData(form); const gpus = fd.getAll('gpus'); const params = {{}}; for (const [k,v] of fd.entries()) {{ if (k.startsWith('param_')) params[k.slice(6)] = v; }} api('create_preset', {{ name: fd.get('name'), pipeline: fd.get('pipeline'), gpus, params }}).then(() => liveRefresh()); return false; }} function launchPreset(id) {{ if (confirm('Launch this preset now?')) api('launch_preset', {{preset_id: id}}).then(() => setTimeout(()=>location.reload(), 1000)); }} function deletePreset(id) {{ if (confirm('Delete this preset?')) api('delete_preset', {{preset_id: id}}).then(() => location.reload()); }} function loadModel(gpuId) {{ const sel = document.getElementById('ms-' + gpuId); if (!sel) return; api('load_model', {{gpu_id: gpuId, model: sel.value}}).then(() => setTimeout(()=>location.reload(), 3000)); }} function cancelJob(id) {{ api('cancel_job', {{job_id: id}}).then(() => location.reload()); }} function cancelSchedule(id) {{ api('cancel_schedule', {{schedule_id: id}}).then(() => location.reload()); }} // Schedule modal let _schedPresetId = null; function schedulePreset(id) {{ _schedPresetId = id; document.getElementById('sched-modal').classList.add('active'); }} function closeModal() {{ document.getElementById('sched-modal').classList.remove('active'); }} function submitSchedule(e) {{ e.preventDefault(); const fd = new FormData(e.target); const ttype = fd.get('trigger_type'); const trigger = {{type: ttype}}; if (ttype === 'time') {{ const mode = fd.get('time_mode'); if (mode === 'at') trigger.at = fd.get('time_at'); else trigger.duration_seconds = parseInt(fd.get('duration_hours')||0)*3600 + parseInt(fd.get('duration_mins')||0)*60; }} else if (ttype === 'cost') {{ trigger.threshold_usd = parseFloat(fd.get('cost_threshold')); }} api('create_schedule', {{preset_id: _schedPresetId, trigger}}).then(() => liveRefresh()); return false; }} // Init param fields for first pipeline document.addEventListener('DOMContentLoaded', () => {{ const sel = document.querySelector('[name=pipeline]'); if (sel) updateParamFields(sel.value); }}); // Live refresh — update dynamic sections without reloading the page function liveRefresh() {{ fetch('/api/fragments').then(r => r.json()).then(f => {{ const ids = {{'gpu-grid':'gpus', 'train-section':'training', 'schedule-section':'schedule', 'jobs-section':'jobs', 'presets-list':'presets', 'refresh-time':'refresh_time'}}; for (const [elId, key] of Object.entries(ids)) {{ const el = document.getElementById(elId); if (el && f[key] != null) el.innerHTML = f[key]; }} }}).catch(() => {{}}); }} setInterval(liveRefresh, 10000); """ # ── HTTP Handler ─────────────────────────────────────────────────────────── class SchedulerHandler(BaseHTTPRequestHandler): def log_message(self, format, *args): pass def do_GET(self): path = urlparse(self.path).path if path in ("/", "/dashboard"): html = _render_page() # Inject schedule modal at end of body modal = """""" html = html.replace("", modal + "") self._respond(200, html, "text/html") elif path == "/api/state": with _lock: data = {"gpus": _state["gpus"], "training": _state["training"], "presets": _presets, "jobs": _jobs[-20:], "schedule": _schedule[-20:]} self._respond(200, json.dumps(data, default=str, indent=2), "application/json") elif path == "/api/training": self._respond(200, json.dumps(_fetch_training_status(), default=str), "application/json") elif path == "/api/presets": self._respond(200, json.dumps(_presets, indent=2), "application/json") elif path == "/api/pipelines": info = {k: {"label": v["label"], "description": v["description"], "params": v["params"], "defaults": v["defaults"]} for k, v in PIPELINE_TYPES.items()} self._respond(200, json.dumps(info, indent=2), "application/json") elif path == "/api/fragments": # Return HTML fragments for live refresh (no full page reload) with _lock: state = dict(_state) gpu_cards = "" for gpu in GPUS: data = state["gpus"].get(gpu["id"], {"online": False, "id": gpu["id"], "name": gpu["name"]}) gpu_cards += _gpu_card_html(data) online_count = len([g for g in state["gpus"].values() if g.get("online")]) last_refresh = state.get("last_refresh", "never") fragments = { "gpus": gpu_cards, "training": _training_card_html(state.get("training")), "schedule": _schedule_panel_html(), "jobs": _jobs_panel_html(), "presets": _presets_list_html(), "refresh_time": f"{online_count}/{len(GPUS)} GPUs online — refreshed {last_refresh}", } self._respond(200, json.dumps(fragments), "application/json") else: self._respond(404, "Not found", "text/plain") def do_POST(self): path = urlparse(self.path).path if path != "/api/action": self._respond(404, "Not found", "text/plain") return length = int(self.headers.get("Content-Length", 0)) body = json.loads(self.rfile.read(length)) if length else {} action = body.get("action", "") try: result = self._handle_action(action, body) self._respond(200, json.dumps(result, default=str), "application/json") except Exception as e: self._respond(500, json.dumps({"ok": False, "error": str(e)}), "application/json") def _handle_action(self, action, body): if action == "refresh": threading.Thread(target=refresh_state, daemon=True).start() return {"ok": True} elif action == "create_preset": pid = str(uuid.uuid4())[:8] _presets[pid] = { "id": pid, "name": body.get("name", "unnamed"), "pipeline": body.get("pipeline", "self_play"), "gpus": body.get("gpus", []), "params": body.get("params", {}), "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"), } _save_presets() return {"ok": True, "preset_id": pid} elif action == "delete_preset": pid = body.get("preset_id") if pid in _presets: del _presets[pid] _save_presets() return {"ok": True} elif action == "launch_preset": job = _launch_preset(body.get("preset_id")) return {"ok": True, "job": job} elif action == "create_schedule": sid = str(uuid.uuid4())[:8] sched = { "id": sid, "preset_id": body.get("preset_id"), "trigger": body.get("trigger", {}), "status": "pending", "created_at": datetime.now().isoformat(), } _schedule.append(sched) _save_schedule() return {"ok": True, "schedule_id": sid} elif action == "cancel_schedule": sid = body.get("schedule_id") for s in _schedule: if s["id"] == sid: s["status"] = "cancelled" _save_schedule() return {"ok": True} elif action == "cancel_job": jid = body.get("job_id") for j in _jobs: if j["id"] == jid and j["status"] == "running": j["status"] = "cancelled" _save_jobs() return {"ok": True} elif action == "load_model": gpu_id = body.get("gpu_id") model = body.get("model") gpu = GPU_MAP.get(gpu_id) if not gpu: return {"ok": False, "error": "unknown GPU"} result = _ollama_api(gpu, "/api/generate", method="POST", data={ "model": model, "prompt": "test", "stream": False, "options": {"num_predict": 1}, }) return {"ok": True, "result": result} elif action == "stop_ollama": gpu_id = body.get("gpu_id", "3090ti") gpu = GPU_MAP.get(gpu_id) if gpu: svc = gpu.get("ollama_service", "ollama.service") _ssh_cmd(gpu, f"sudo systemctl stop {svc} 2>&1", timeout=10) return {"ok": True} elif action == "start_ollama": gpu_id = body.get("gpu_id", "3090ti") gpu = GPU_MAP.get(gpu_id) if gpu: svc = gpu.get("ollama_service", "ollama.service") _ssh_cmd(gpu, f"sudo systemctl start {svc} 2>&1", timeout=10) return {"ok": True} return {"ok": False, "error": f"unknown action: {action}"} def _respond(self, code, body, content_type): self.send_response(code) self.send_header("Content-Type", content_type) self.send_header("Access-Control-Allow-Origin", "*") self.end_headers() self.wfile.write(body.encode() if isinstance(body, str) else body) # ── Main ─────────────────────────────────────────────────────────────────── def main(): parser = argparse.ArgumentParser(description="Mortdecai GPU Scheduler") parser.add_argument("--port", type=int, default=PORT) parser.add_argument("--refresh-interval", type=int, default=10) args = parser.parse_args() _load_persisted() print(f"Loaded {len(_presets)} presets, {len(_jobs)} jobs, {len(_schedule)} schedules") t = threading.Thread(target=_bg_refresh_loop, args=(args.refresh_interval,), daemon=True) t.start() print("Initial GPU scan...") refresh_state() server = HTTPServer(("0.0.0.0", args.port), SchedulerHandler) print(f"GPU Scheduler on http://0.0.0.0:{args.port}") print(f" {len(GPUS)} GPUs, refresh {args.refresh_interval}s") try: server.serve_forever() except KeyboardInterrupt: print("\nShutting down.") if __name__ == "__main__": main()