#!/bin/bash
# ──────────────────────────────────────────────────────────────────────────────
# Mortdecai Batch Training Pipeline
#
# Trains both 9B and 14B models sequentially on a rented GPU, then exports
# GGUFs at all required quant levels for the fleet.
#
# Usage (on rented H100):
#
#   # 1. Upload this repo + dataset
#   rsync -avz --exclude='.git' . gpu-host:~/mortdecai/
#
#   # 2. SSH in and run
#   cd ~/mortdecai
#   bash training/scripts/batch_train.sh
#
#   # 3. Monitor from another machine (pick one):
#   ssh gpu-host "tail -f ~/mortdecai/training_progress.jsonl"
#
#   # OR export Discord bot credentials for push notifications:
#   export DISCORD_TOKEN="<bot token>"
#   export DISCORD_CHANNEL="<channel id>"   # optional, a default is provided
#   bash training/scripts/batch_train.sh
#
#   # 4. Download checkpoints when done
#   rsync -avz gpu-host:~/mortdecai/training/checkpoints/mortdecai-0.6.0-* ./training/checkpoints/
#
# Optional environment:
#   DISCORD_TOKEN    Discord bot token; notifications are disabled when empty
#   DISCORD_CHANNEL  Discord channel id that receives the notifications
#   LLAMA_CONVERT    path to convert_hf_to_gguf.py (searched for when unset)
#   LLAMA_QUANTIZE   path to the llama-quantize binary (searched for when unset)
#
# Prerequisites on the rented machine:
#   pip install unsloth torch transformers datasets peft trl
# ──────────────────────────────────────────────────────────────────────────────
# -E makes the ERR trap below fire inside functions as well.
set -Eeuo pipefail

readonly VERSION="0.6.0"
readonly DATASET="data/processed/merged_training_v06.jsonl"
readonly CHECKPOINT_DIR="training/checkpoints"
readonly PROGRESS_LOG="training_progress.jsonl"

# Discord bot token + channel for progress notifications.
# The token is a secret and must come from the environment; it is never
# hard-coded here. An empty token turns Discord notifications off.
DISCORD_TOKEN="${DISCORD_TOKEN:-}"
DISCORD_CHANNEL="${DISCORD_CHANNEL:-1485160229573361664}"

# llama.cpp tool locations. Taken from the environment when provided,
# otherwise filled in lazily (once) by resolve_tools.
LLAMA_CONVERT="${LLAMA_CONVERT:-}"
LLAMA_QUANTIZE="${LLAMA_QUANTIZE:-}"
TOOLS_RESOLVED=0

# Inner width (in columns) of the banner boxes.
readonly BOX_WIDTH=58

# Models to train
MODELS=(
  "Qwen/Qwen3.5-9B"
  "Qwen/Qwen3.5-14B"
)

# Quant levels per model (mapped to target GPUs)
#   9B:  Q4=RTX4000(8GB), Q6=2080Ti(11GB), Q8=3090Ti(24GB)
#   14B: Q3=RTX4000(8GB), Q4=2080Ti(11GB), Q6=3090Ti(24GB), F16=StrixHalo(64GB)
# The F16 GGUF is always produced by the conversion step, so it is not listed.
# NOTE(review): both lists currently carry all four levels, which is more than
# the mapping above calls for (Q3 for 9B, Q8 for 14B) — confirm this is intended.
declare -A QUANTS
QUANTS["Qwen/Qwen3.5-9B"]="Q3_K_M Q4_K_M Q6_K Q8_0"
QUANTS["Qwen/Qwen3.5-14B"]="Q3_K_M Q4_K_M Q6_K Q8_0"

# ── Helpers ───────────────────────────────────────────────────────────────────

#######################################
# Escape a string for embedding inside a JSON double-quoted value.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
json_escape() {
  local s=$1
  s=${s//\\/\\\\}      # backslash first, so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

#######################################
# Report progress: append a JSON line to PROGRESS_LOG, print to stdout and,
# when DISCORD_TOKEN is set, post to the Discord channel (best effort).
# Globals:   PROGRESS_LOG, DISCORD_TOKEN, DISCORD_CHANNEL (read)
# Arguments: $1 - stage tag (e.g. TRAIN, MERGE, DONE)
#            $2 - human-readable message
#######################################
notify() {
  local stage=$1
  local message=$2
  local ts stage_json message_json

  # Declared separately from assignment so a failing command is not masked.
  ts=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  stage_json=$(json_escape "$stage")
  message_json=$(json_escape "$message")

  # Log to file
  printf '{"ts":"%s","stage":"%s","message":"%s"}\n' \
    "$ts" "$stage_json" "$message_json" >> "$PROGRESS_LOG"

  # Print locally
  echo "  [$ts] $stage: $message"

  # Discord bot API. Deliberately best-effort: a notification failure or a
  # network stall (bounded by --max-time) must never abort a training run.
  if [[ -n "$DISCORD_TOKEN" ]]; then
    curl -s --max-time 10 -X POST \
      "https://discord.com/api/v10/channels/${DISCORD_CHANNEL}/messages" \
      -H "Authorization: Bot ${DISCORD_TOKEN}" \
      -H "Content-Type: application/json" \
      -d "{\"content\":\"**Mortdecai Training** [${stage_json}] ${message_json}\"}" \
      > /dev/null 2>&1 || true
  fi
}

#######################################
# ERR trap handler: announce the failure before `set -e` ends the script.
# Arguments: $1 - line number where the failing command ran
#######################################
on_error() {
  local status=$?
  local line=$1
  notify "FAIL" "Pipeline aborted (exit ${status}) near line ${line}"
}
trap 'on_error "$LINENO"' ERR

#######################################
# Print a horizontal box border, BOX_WIDTH columns wide.
# Arguments: $1 - left corner glyph, $2 - right corner glyph
#######################################
box_rule() {
  local bar
  printf -v bar '%*s' "$BOX_WIDTH" ''
  printf '%s%s%s\n' "$1" "${bar// /═}" "$2"
}

#######################################
# Print one box row with the text padded to the box width.
# Arguments: $1 - ASCII text for the row
#######################################
box_line() {
  printf '║  %-*s║\n' "$((BOX_WIDTH - 2))" "$1"
}

#######################################
# Print the first filesystem entry matching the given find(1) expression.
# Arguments: find primaries, e.g. -type f -name foo
# Outputs:   one path on stdout, or nothing when there is no match
#######################################
find_first() {
  # find exits non-zero on unreadable directories and is killed by SIGPIPE
  # once head has its line; neither means "not found", and under pipefail
  # either would abort the script, so the status is intentionally dropped.
  {
    find / \( -path /proc -o -path /sys \) -prune -o \( "$@" \) -print \
      2>/dev/null || true
  } | head -n 1
}

#######################################
# Locate the llama.cpp converter and quantizer once and cache the result.
# Globals: LLAMA_CONVERT, LLAMA_QUANTIZE, TOOLS_RESOLVED (written)
#######################################
resolve_tools() {
  if (( TOOLS_RESOLVED )); then
    return 0
  fi
  if [[ -z "$LLAMA_CONVERT" ]]; then
    LLAMA_CONVERT=$(find_first -type f -name 'convert_hf_to_gguf.py')
  fi
  if [[ -z "$LLAMA_QUANTIZE" ]]; then
    # Prefer PATH; fall back to a scan limited to executable non-directories
    # so a package directory or data file named "quantize" is not picked up.
    LLAMA_QUANTIZE=$(command -v llama-quantize \
      || find_first \( -name 'llama-quantize' -o -name 'quantize' \) \
           ! -type d -perm -u+x)
  fi
  TOOLS_RESOLVED=1
}

#######################################
# Run the full pipeline for one base model:
# LoRA fine-tune → merge → F16 GGUF → quantized GGUFs.
# Steps whose outputs already exist are skipped so the script can be re-run.
# Globals:   VERSION, DATASET, CHECKPOINT_DIR, EXAMPLE_COUNT, QUANTS (read)
# Arguments: $1 - base model id, e.g. Qwen/Qwen3.5-9B
#######################################
process_model() {
  local base_model=$1

  # "Qwen/Qwen3.5-9B" → "qwen3-5-9b"
  local model_short=${base_model#Qwen/}
  model_short=${model_short//./-}
  model_short=${model_short,,}

  local ckpt_name="mortdecai-${VERSION}-${model_short}"
  local ckpt_path="${CHECKPOINT_DIR}/${ckpt_name}"
  local merged_path="${ckpt_path}-merged"
  local gguf_dir="${ckpt_path}-gguf"
  local f16_gguf="${gguf_dir}/${model_short}.F16.gguf"

  echo ""
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo "  Training: ${base_model} → ${ckpt_name}"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  local started finished elapsed
  started=$(date +%s)

  # ── Step 1: LoRA fine-tune ──
  if [[ -f "$ckpt_path/adapter_model.safetensors" ]]; then
    notify "SKIP" "${ckpt_name} LoRA checkpoint exists"
  else
    notify "TRAIN" "Starting ${base_model} LoRA fine-tune (${EXAMPLE_COUNT} examples)"
    # pipefail ensures a trainer failure is not hidden by tee's success.
    python3 training/scripts/train_lora.py \
      --model "$base_model" \
      --dataset "$DATASET" \
      --output "$ckpt_path" \
      --epochs 3 \
      --batch-size 4 \
      --lr 2e-4 \
      --rank 64 \
      --alpha 128 \
      --save-steps 100 \
      2>&1 | tee "${ckpt_path}.train.log"
    notify "TRAIN" "${ckpt_name} LoRA training complete"
  fi

  # ── Step 2: Merge LoRA into base ──
  if [[ -f "$merged_path/model.safetensors.index.json" ]]; then
    notify "SKIP" "${ckpt_name} merged weights exist"
  else
    notify "MERGE" "Merging ${ckpt_name} LoRA into base model..."
    # Paths travel via argv rather than being spliced into Python source,
    # so quotes or backslashes in a path cannot corrupt the program.
    python3 - "$ckpt_path" "$merged_path" <<'PY'
import sys

from unsloth import FastLanguageModel

ckpt_path, merged_path = sys.argv[1], sys.argv[2]
model, tokenizer = FastLanguageModel.from_pretrained(ckpt_path)
model.save_pretrained_merged(merged_path, tokenizer, save_method='merged_16bit')
print(f'Merge complete: {merged_path}')
PY
  fi

  # ── Step 3: Convert to GGUF (F16) ──
  mkdir -p "$gguf_dir"
  resolve_tools
  if [[ -f "$f16_gguf" ]]; then
    notify "SKIP" "${ckpt_name} F16 GGUF exists"
  else
    if [[ -z "$LLAMA_CONVERT" ]]; then
      echo "  WARNING: convert_hf_to_gguf.py not found, skipping GGUF export" >&2
      echo "  Run manually: python3 convert_hf_to_gguf.py $merged_path --outfile $f16_gguf --outtype f16" >&2
      # Nothing to quantize without the F16 file; move on to the next model.
      return 0
    fi
    notify "GGUF" "Converting ${ckpt_name} to F16 GGUF..."
    python3 "$LLAMA_CONVERT" "$merged_path" --outfile "$f16_gguf" --outtype f16
  fi

  # ── Step 4: Quantize ──
  if [[ -z "$LLAMA_QUANTIZE" ]]; then
    echo "  WARNING: llama-quantize not found, skipping quantization" >&2
    echo "  Run manually on steel141 after downloading F16 GGUF" >&2
  else
    echo "  [4/4] Quantizing..."
    local -a quant_levels=()
    read -r -a quant_levels <<< "${QUANTS[$base_model]:-}"
    local quant qfile
    for quant in "${quant_levels[@]}"; do
      qfile="${gguf_dir}/${model_short}.${quant}.gguf"
      if [[ -f "$qfile" ]]; then
        echo "    [SKIP] $quant exists"
      else
        echo "    Quantizing $quant..."
        "$LLAMA_QUANTIZE" "$f16_gguf" "$qfile" "$quant"
      fi
    done
  fi

  finished=$(date +%s)
  elapsed=$(( (finished - started) / 60 ))
  notify "DONE" "${ckpt_name} complete in ${elapsed}m"

  echo ""
  echo "  ✓ ${ckpt_name} complete in ${elapsed}m"
  echo "    LoRA:   $ckpt_path"
  echo "    Merged: $merged_path"
  echo "    GGUFs:  $gguf_dir/"
  # Glob instead of parsing ls, so a directory without GGUFs cannot trip
  # pipefail and odd filenames are handled.
  local gguf size
  for gguf in "$gguf_dir"/*.gguf; do
    [[ -e "$gguf" ]] || continue
    size=$(du -h -- "$gguf" | cut -f1)
    echo "      $gguf ($size)"
  done
}

#######################################
# Entry point: preflight checks, environment setup, per-model pipeline,
# final summary.
#######################################
main() {
  # ── Preflight ───────────────────────────────────────────────────────────────
  if [[ ! -f "$DATASET" ]]; then
    echo "ERROR: Dataset not found at $DATASET" >&2
    echo "Run: python3 training/scripts/merge_datasets.py" >&2
    exit 1
  fi
  if [[ ! -s "$DATASET" ]]; then
    echo "ERROR: Dataset at $DATASET is empty" >&2
    echo "Run: python3 training/scripts/merge_datasets.py" >&2
    exit 1
  fi

  EXAMPLE_COUNT=$(wc -l < "$DATASET")
  EXAMPLE_COUNT=${EXAMPLE_COUNT//[[:space:]]/}   # some wc builds pad the count

  local model_names="${MODELS[*]#Qwen/}"
  box_rule "╔" "╗"
  box_line "Mortdecai Batch Training Pipeline v${VERSION}"
  box_rule "╠" "╣"
  box_line "Dataset: ${EXAMPLE_COUNT} examples"
  box_line "Models: ${#MODELS[@]} (${model_names})"
  box_rule "╚" "╝"
  echo ""

  nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null \
    || echo "WARNING: No GPU detected"
  echo ""

  # ── Conda/venv setup ────────────────────────────────────────────────────────
  if command -v conda &>/dev/null; then
    # conda's shell hooks may reference unset variables, which is fatal under
    # `set -u` in a non-interactive shell; relax it just for this section.
    set +u
    # shellcheck source=/dev/null
    source "$(conda info --base)/etc/profile.d/conda.sh"
    conda activate mc-train 2>/dev/null || echo "No mc-train env, using current"
    set -u
  fi

  mkdir -p "$CHECKPOINT_DIR"

  # ── Training loop ───────────────────────────────────────────────────────────
  local base_model
  for base_model in "${MODELS[@]}"; do
    process_model "$base_model"
  done

  # ── Summary ─────────────────────────────────────────────────────────────────
  echo ""
  box_rule "╔" "╗"
  box_line "All training complete!"
  box_rule "╠" "╣"
  box_line "Next steps:"
  box_line "  1. Download checkpoints to steel141"
  box_line "  2. Register in Ollama:"
  box_line "     ollama create mortdecai:${VERSION}-9b -f Modelfile.9b"
  box_line "     ollama create mortdecai:${VERSION}-14b -f Modelfile.14b"
  box_line "  3. Run bake-off against 0.5.0"
  box_line "  4. Deploy winner to prod"
  box_rule "╚" "╝"
}

main "$@"