#!/bin/bash
# Training launcher — stops competing Ollama, trains, restarts Ollama.
# Usage: ./run_training.sh [--resume]
#
# Prevents the OOM crash caused by ollama.service holding 6GB on the 3090 Ti.
#
# Exit status: the exit status of train_lora.py itself (read from PIPESTATUS,
# not from `tee`), so a failed run is never reported as a success.
# ollama.service is restarted on every exit path once this script has tried
# to stop it, including early aborts and Ctrl-C.

set -e

VERSION="0.5.0"
MODEL="Qwen/Qwen3.5-9B"
OUTPUT="training/checkpoints/mortdecai-${VERSION}"
LOG="training/train_run_${VERSION}.log"

# Optional trainer arguments. An array expands to zero words when empty, so
# no unquoted expansion is needed on the python3 command line.
RESUME_ARGS=()
if [[ "${1:-}" == "--resume" ]]; then
  RESUME_ARGS+=(--resume)
  echo ">> Resume mode: will pick up from latest checkpoint"
fi

# Disable torch compile (causes "Argument list too long" on this system)
export TORCH_COMPILE_DISABLE=1
export TORCHDYNAMO_DISABLE=1

# Use the 3090 Ti (CUDA device ordering: GPU 1 in nvidia-smi = device 0 in CUDA when isolated)
export CUDA_VISIBLE_DEVICES=0

# Start ollama.service again and report the outcome. Used both on the normal
# path and from the EXIT trap, so it must never abort the script itself.
restart_ollama() {
  echo ">> Restarting ollama.service..."
  if sudo systemctl start ollama.service 2>/dev/null; then
    echo " Started."
  else
    echo " Failed to start."
  fi
}

# Resolve the working directory BEFORE touching the service: the relative
# OUTPUT/LOG/script paths below depend on it, and failing here must not
# leave Ollama stopped.
cd "$(dirname "$0")/../.." || {
  echo "ERROR: cannot cd to $(dirname "$0")/../.." >&2
  exit 1
}

echo "============================================"
echo " Mortdecai ${VERSION} Training"
echo "============================================"
echo "Model: ${MODEL}"
echo "Output: ${OUTPUT}"
echo "Log: ${LOG}"
echo ""

# From here on, any exit (set -e abort, signal, normal end) restarts Ollama.
# The INT/TERM traps turn the signal into a regular exit so the EXIT trap
# runs; bash defers them until the foreground command has finished.
trap restart_ollama EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Stop Ollama on 3090 Ti to free VRAM
echo ">> Stopping ollama.service (3090 Ti)..."
if sudo systemctl stop ollama.service 2>/dev/null; then
  echo " Stopped."
else
  echo " Already stopped or not found."
fi
sleep 2

# Verify VRAM is free (informational print; a failing nvidia-smi aborts via
# set -e and the EXIT trap restarts Ollama)
echo ">> GPU status:"
nvidia-smi --id=1 --query-gpu=name,memory.used,memory.free --format=csv,noheader
echo ""

# Run training
echo ">> Starting training at $(date)"

# errexit is suspended so a failed run still reaches the reporting below.
# PIPESTATUS must be copied on the very next line, before any other command
# overwrites it.
set +e
python3 training/scripts/train_lora.py \
  --model "${MODEL}" \
  --output "${OUTPUT}" \
  --lr 1e-4 \
  --epochs 1 \
  --batch-size 2 \
  --grad-accum 4 \
  --max-seq-len 2048 \
  --save-steps 50 \
  "${RESUME_ARGS[@]}" \
  2>&1 | tee "${LOG}"
PIPE_CODES=("${PIPESTATUS[@]}")
set -e

TRAIN_EXIT=${PIPE_CODES[0]}
if (( PIPE_CODES[1] != 0 )); then
  echo "WARNING: tee exited with ${PIPE_CODES[1]}; ${LOG} may be incomplete" >&2
fi

echo ""
echo ">> Training finished at $(date) (exit code: ${TRAIN_EXIT})"

# Restart Ollama explicitly to keep the message order, then drop the traps
# so it is not restarted a second time on exit.
restart_ollama
trap - EXIT INT TERM

if (( TRAIN_EXIT == 0 )); then
  echo ""
  echo "============================================"
  echo " Training complete! Next steps:"
  echo " 1. Export GGUF: python3 -m unsloth.save --model ${OUTPUT} --output_type gguf"
  echo " 2. Create Ollama model: ollama create mortdecai:${VERSION} -f Modelfile"
  echo " 3. Run bake-off: python3 training/scripts/bakeoff.py"
  echo "============================================"
fi

exit "${TRAIN_EXIT}"