da8f557219
GPU Scheduler (gpu.sethpc.xyz): - Live dashboard with 4 GPUs, training monitor, loss sparklines - Preset-based job scheduler with 3 triggers (time, finish_training, cost) - Model selection per GPU, pipeline configuration - Tool self-play and training pipeline types - Behind Google OAuth, live-refresh without page reload Tool Architecture (14 tools): - 3 new tools: world.nearby_entities, memory.read, memory.write - 7 script.* tools: write, validate, execute, read, list, delete, schedule - ScriptManager: full mcfunction datapack CRUD with RCON validation - Training data: 1,430 tool examples (up from 1,159) Plugin Deployment (paper-ai-25567): - WorldGuard 7.0.12, CoreProtect CE 23.1, EssentialsX 2.21.2, Vault 1.7.3 - Fresh greenfield world reset - 104 RCON-validated plugin training examples Event Dispatcher: - Watches server log for deaths, joins, advancements, PvP kills - Configurable trigger probability and cooldowns per event type - Deployed to dev server, fires god_system prompts on events - 21 event-response training examples Training Infrastructure: - train_lora.py: --save-steps 50, --resume from checkpoint - run_training.sh: stops Ollama, activates conda, restarts after - Passwordless sudo for ollama services on steel141 - Dev server added to MCSManager with autoStart Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
81 lines
2.3 KiB
Bash
Executable File
81 lines
2.3 KiB
Bash
Executable File
#!/bin/bash
# Training launcher — stops competing Ollama, trains, restarts Ollama.
# Usage: ./run_training.sh [--resume]
#
# Prevents the OOM crash caused by ollama.service holding 6GB on the 3090 Ti.
#
# Flow:
#   1. Stop ollama.service and print the GPU memory status.
#   2. cd two directories above this script and run train_lora.py,
#      mirroring its combined stdout/stderr into ${LOG} via tee.
#   3. Restart ollama.service (also on any early abort, via an EXIT trap).
#   4. Print follow-up steps if training succeeded.
#
# Exit status: the exit status of train_lora.py itself (not of tee).

set -e

VERSION="0.5.0"
MODEL="Qwen/Qwen3.5-9B"
OUTPUT="training/checkpoints/mortdecai-${VERSION}"
LOG="training/train_run_${VERSION}.log"

# Extra arguments for train_lora.py. An array expands to zero words when
# empty, so no unquoted-variable trick is needed for the optional flag.
RESUME_ARGS=()

# Set to 1 once we have attempted to stop Ollama; cleared after the restart
# attempt so the EXIT trap does not start the service a second time.
RESTART_PENDING=0

#######################################
# Start ollama.service again if a restart is still owed.
# Globals:   RESTART_PENDING (read, written)
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0 always (a failed start is reported, not propagated)
#######################################
restart_ollama() {
  [[ "${RESTART_PENDING}" == "1" ]] || return 0
  RESTART_PENDING=0
  echo ">> Restarting ollama.service..."
  if sudo systemctl start ollama.service 2>/dev/null; then
    echo " Started."
  else
    echo " Failed to start."
  fi
  return 0
}

# With `set -e`, a failure between "stop" and "restart" (nvidia-smi, cd, ...)
# would otherwise leave Ollama stopped. The trap covers every exit path.
trap restart_ollama EXIT

if [[ "${1:-}" == "--resume" ]]; then
  RESUME_ARGS=(--resume)
  echo ">> Resume mode: will pick up from latest checkpoint"
fi

# Disable torch compile (causes "Argument list too long" on this system)
export TORCH_COMPILE_DISABLE=1
export TORCHDYNAMO_DISABLE=1

# Use the 3090 Ti (CUDA device ordering: GPU 1 in nvidia-smi = device 0 in CUDA when isolated)
export CUDA_VISIBLE_DEVICES=0

echo "============================================"
echo " Mortdecai ${VERSION} Training"
echo "============================================"
echo "Model: ${MODEL}"
echo "Output: ${OUTPUT}"
echo "Log: ${LOG}"
echo ""

# Stop Ollama on 3090 Ti to free VRAM
echo ">> Stopping ollama.service (3090 Ti)..."
RESTART_PENDING=1
if sudo systemctl stop ollama.service 2>/dev/null; then
  echo " Stopped."
else
  echo " Already stopped or not found."
fi
sleep 2

# Verify VRAM is free
echo ">> GPU status:"
nvidia-smi --id=1 --query-gpu=name,memory.used,memory.free --format=csv,noheader
echo ""

# Run training
echo ">> Starting training at $(date)"
# OUTPUT and LOG are relative paths, so run from two levels above this script.
cd "$(dirname "$0")/../.."

# `set -e` is suspended around the pipeline so that a failing stage cannot
# abort the script before the exit status has been recorded.
set +e
python3 training/scripts/train_lora.py \
  --model "${MODEL}" \
  --output "${OUTPUT}" \
  --lr 1e-4 \
  --epochs 1 \
  --batch-size 2 \
  --grad-accum 4 \
  --max-seq-len 2048 \
  --save-steps 50 \
  "${RESUME_ARGS[@]}" \
  2>&1 | tee "${LOG}"
# $? after a pipeline is the status of the LAST stage (tee), which would hide
# a training failure. PIPESTATUS[0] is the status of python3 itself and must
# be read immediately, before any other command overwrites it.
TRAIN_EXIT=${PIPESTATUS[0]}
set -e

echo ""
echo ">> Training finished at $(date) (exit code: ${TRAIN_EXIT})"

# Restart Ollama
restart_ollama

if [[ "${TRAIN_EXIT}" -eq 0 ]]; then
  echo ""
  echo "============================================"
  echo " Training complete! Next steps:"
  echo " 1. Export GGUF: python3 -m unsloth.save --model ${OUTPUT} --output_type gguf"
  echo " 2. Create Ollama model: ollama create mortdecai:${VERSION} -f Modelfile"
  echo " 3. Run bake-off: python3 training/scripts/bakeoff.py"
  echo "============================================"
fi

exit "${TRAIN_EXIT}"