Mortdecai/training/scripts/run_training.sh

#!/bin/bash
# Training launcher — stops competing Ollama, trains, restarts Ollama.
# Usage: ./run_training.sh [--resume]
#
# Prevents the OOM crash caused by ollama.service holding 6GB on the 3090 Ti.

# Abort as soon as any command outside a conditional context fails.
set -o errexit

# Run configuration. OUTPUT and LOG are relative paths derived from VERSION.
VERSION="0.5.0"
MODEL="Qwen/Qwen3.5-9B"
OUTPUT="training/checkpoints/mortdecai-${VERSION}"
LOG="training/train_run_${VERSION}.log"

# Only the first positional argument is inspected; anything other than
# --resume leaves RESUME_FLAG empty, so the trainer gets no extra flag.
RESUME_FLAG=""
case "$1" in
    --resume)
        RESUME_FLAG="--resume"
        echo ">> Resume mode: will pick up from latest checkpoint"
        ;;
esac

# torch.compile is switched off for the trainer because it fails on this
# system with "Argument list too long"; TorchDynamo is disabled alongside it.
export TORCH_COMPILE_DISABLE=1 TORCHDYNAMO_DISABLE=1

# Expose only CUDA device 0 to the trainer. Per the original note this is the
# 3090 Ti: it is listed as GPU 1 by nvidia-smi but enumerates as device 0 in
# CUDA when isolated. NOTE(review): device ordering is host-specific — confirm
# before running on another machine.
export CUDA_VISIBLE_DEVICES=0

# Print the run banner (version, base model, checkpoint directory, log file)
# followed by one blank line. Each argument becomes one output line.
printf '%s\n' \
    "============================================" \
    "  Mortdecai ${VERSION} Training" \
    "============================================" \
    "Model:   ${MODEL}" \
    "Output:  ${OUTPUT}" \
    "Log:     ${LOG}" \
    ""

# Stop Ollama on 3090 Ti to free VRAM
#
# Once the service has been stopped, any early exit (a command failing under
# errexit, or the script being interrupted) would otherwise leave it down.
# The EXIT trap below brings it back in those cases. It only acts when the
# unit is not active, so it is a no-op after a successful explicit restart.
restore_ollama() {
    if ! systemctl is-active --quiet ollama.service; then
        echo ">> ollama.service is not running; starting it before exit..." >&2
        sudo systemctl start ollama.service 2>/dev/null \
            || echo "   Failed to start ollama.service." >&2
    fi
}
trap restore_ollama EXIT

echo ">> Stopping ollama.service (3090 Ti)..."
# stderr is discarded on purpose: a failure here is reported as
# "already stopped or not found" and is not fatal.
if sudo systemctl stop ollama.service 2>/dev/null; then
    echo "   Stopped."
else
    echo "   Already stopped or not found."
fi
# Brief pause before querying the GPU, presumably so the stopped service has
# released its memory.
sleep 2

# Verify VRAM is free
# This only prints name/used/free memory for nvidia-smi GPU index 1; nothing
# checks the numbers, so the operator has to read them.
echo ">> GPU status:"
nvidia-smi --id=1 --query-gpu=name,memory.used,memory.free --format=csv,noheader
echo ""

# Run training
echo ">> Starting training at $(date)"
# Every path used from here on (trainer script, OUTPUT, LOG) is relative, so
# switch to the directory two levels above this script first.
cd "$(dirname "$0")/../.." || {
    echo "!! Cannot change to the directory two levels above $(dirname "$0")" >&2
    exit 1
}

# The trainer's output is piped through tee so it reaches both the terminal
# and LOG. Without pipefail the pipeline's status is tee's, so a crashed
# training run would be recorded as exit code 0. With pipefail the pipeline
# fails when the trainer fails (or when tee itself cannot write the log).
# The "|| TRAIN_EXIT=$?" keeps errexit from aborting the script here, so the
# steps that follow (restarting Ollama, reporting) still run after a failure.
set -o pipefail
TRAIN_EXIT=0
python3 training/scripts/train_lora.py \
    --model "${MODEL}" \
    --output "${OUTPUT}" \
    --lr 1e-4 \
    --epochs 1 \
    --batch-size 2 \
    --grad-accum 4 \
    --max-seq-len 2048 \
    --save-steps 50 \
    ${RESUME_FLAG:+"${RESUME_FLAG}"} \
    2>&1 | tee "${LOG}" || TRAIN_EXIT=$?

echo ""
echo ">> Training finished at $(date) (exit code: ${TRAIN_EXIT})"

# Restart Ollama
# This runs whether or not training succeeded. systemctl's own error output is
# left visible so that a failed restart comes with a reason.
echo ">> Restarting ollama.service..."
if sudo systemctl start ollama.service; then
    echo "   Started."
else
    echo "   Failed to start."
fi

# The follow-up instructions are only shown after a successful run.
if [ "${TRAIN_EXIT}" -eq 0 ]; then
    echo ""
    echo "============================================"
    echo "  Training complete! Next steps:"
    echo "  1. Export GGUF: python3 -m unsloth.save --model ${OUTPUT} --output_type gguf"
    echo "  2. Create Ollama model: ollama create mortdecai:${VERSION} -f Modelfile"
    echo "  3. Run bake-off: python3 training/scripts/bakeoff.py"
    echo "============================================"
fi

# Propagate the recorded training status to the caller.
exit "${TRAIN_EXIT}"