#!/bin/bash
# ──────────────────────────────────────────────────────────────────────────────
# Mortdecai Batch Training Pipeline
#
# Trains both 9B and 14B models sequentially on a rented GPU, then exports
# GGUFs at all required quant levels for the fleet.
#
# Usage (on rented H100):
#   # 1. Upload this repo + dataset
#   rsync -avz --exclude='.git' . gpu-host:~/mortdecai/
#
#   # 2. SSH in and run
#   cd ~/mortdecai
#   bash training/scripts/batch_train.sh
#
#   # 3. Monitor from another machine (pick one):
#   ssh gpu-host "tail -f ~/mortdecai/training_progress.jsonl"
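#   # OR get push notifications through the Discord bot API, which this
#   # script configures from DISCORD_TOKEN and (optionally) DISCORD_CHANNEL:
#   export DISCORD_TOKEN="<your bot token>"
#   export DISCORD_CHANNEL="<target channel id>"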
#   bash training/scripts/batch_train.sh
#
#   # 4. Download checkpoints when done
#   rsync -avz gpu-host:~/mortdecai/training/checkpoints/mortdecai-0.6.0-* ./training/checkpoints/
#
# Prerequisites on the rented machine:
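#   pip install unsloth torch transformers datasets peft trl
#   llama.cpp somewhere on disk (convert_hf_to_gguf.py and llama-quantize);
#   without it the GGUF export / quantization steps are skipped with a warning.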
# ──────────────────────────────────────────────────────────────────────────────

# Strict mode: abort on command failure, unset variables and failed pipelines.
set -euo pipefail

# Release tag embedded in every checkpoint name.
VERSION="0.6.0"
# Training set, relative to the repository root.
DATASET="data/processed/merged_training_v06.jsonl"
# Directory that receives LoRA adapters, merged weights and GGUF exports.
CHECKPOINT_DIR="training/checkpoints"
# Append-only JSON-lines progress log written by notify().
PROGRESS_LOG="training_progress.jsonl"

# Discord bot token + channel for progress notifications.
# The token is a secret and must come from the environment; it is never
# hard-coded here. When DISCORD_TOKEN is unset or empty, Discord notifications
# are disabled and progress is only logged locally.
DISCORD_TOKEN="${DISCORD_TOKEN:-}"
DISCORD_CHANNEL="${DISCORD_CHANNEL:-1485160229573361664}"

# ── Progress reporting ────────────────────────────────────────────────────────

#######################################
# Escape a string so it can be embedded inside a double-quoted JSON string.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
_notify_json_escape() {
    local s="$1"
    s=${s//\\/\\\\}       # backslash first, so later escapes are not doubled
    s=${s//\"/\\\"}
    s=${s//$'\n'/\\n}
    s=${s//$'\r'/\\r}
    s=${s//$'\t'/\\t}
    printf '%s' "$s"
}

#######################################
# Report a progress event to the log file, the terminal and (optionally) Discord.
# Globals:   PROGRESS_LOG (appended to), DISCORD_TOKEN, DISCORD_CHANNEL (read)
# Arguments: $1 - stage label (e.g. TRAIN, MERGE, DONE)
#            $2 - human-readable message
# Outputs:   one JSON line appended to PROGRESS_LOG; one status line on stdout
# Returns:   0 even when the Discord request fails (notifications are
#            best-effort and must never abort a training run)
#######################################
notify() {
    local stage="$1"
    local message="$2"
    local ts
    ts=$(date -u +%Y-%m-%dT%H:%M:%SZ)

    # Quotes, backslashes or newlines in the arguments would otherwise produce
    # invalid JSON in both the log line and the Discord payload.
    local stage_json message_json
    stage_json=$(_notify_json_escape "$stage")
    message_json=$(_notify_json_escape "$message")

    # Log to file
    printf '{"ts":"%s","stage":"%s","message":"%s"}\n' \
        "$ts" "$stage_json" "$message_json" >> "$PROGRESS_LOG"

    # Print locally
    echo "  [$ts] $stage: $message"

    # Discord bot API (skipped when no token is configured)
    if [ -n "$DISCORD_TOKEN" ]; then
        # --max-time keeps an unreachable Discord API from stalling the run;
        # failures are deliberately ignored.
        curl -s --max-time 10 -X POST \
            "https://discord.com/api/v10/channels/${DISCORD_CHANNEL}/messages" \
            -H "Authorization: Bot ${DISCORD_TOKEN}" \
            -H "Content-Type: application/json" \
            -d "{\"content\":\"**Mortdecai Training** [${stage_json}] ${message_json}\"}" \
            > /dev/null 2>&1 || true
    fi
}

# Base models to fine-tune, in the order they are processed.
MODELS=("Qwen/Qwen3.5-9B" "Qwen/Qwen3.5-14B")

# GGUF quantization types to export, keyed by base model (space-separated).
# Intended fleet mapping:
#   9B:  Q4=RTX4000(8GB), Q6=2080Ti(11GB), Q8=3090Ti(24GB)
#   14B: Q3=RTX4000(8GB), Q4=2080Ti(11GB), Q6=3090Ti(24GB), F16=StrixHalo(64GB)
# NOTE(review): both lists below are currently identical, so they contain
# entries the mapping above does not mention (Q3 for 9B, Q8 for 14B) — confirm
# whether that is intentional. The F16 file comes from the GGUF conversion
# step, not from this list.
declare -A QUANTS=(
    ["Qwen/Qwen3.5-9B"]="Q3_K_M Q4_K_M Q6_K Q8_0"
    ["Qwen/Qwen3.5-14B"]="Q3_K_M Q4_K_M Q6_K Q8_0"
)

# ── Preflight ─────────────────────────────────────────────────────────────────

# Refuse to start without the training set. Diagnostics go to stderr so they
# stay visible when stdout is redirected or piped.
if [ ! -f "$DATASET" ]; then
    echo "ERROR: Dataset not found at $DATASET" >&2
    echo "Run: python3 training/scripts/merge_datasets.py" >&2
    exit 1
fi

# The dataset's line count is what gets reported as the number of examples.
EXAMPLE_COUNT=$(wc -l < "$DATASET")
echo "╔══════════════════════════════════════════════════════════╗"
echo "║  Mortdecai Batch Training Pipeline v${VERSION}              ║"
echo "╠══════════════════════════════════════════════════════════╣"
echo "║  Dataset: ${EXAMPLE_COUNT} examples                              ║"
# Model names are shown without the "Qwen/" organisation prefix.
echo "║  Models:  ${#MODELS[@]} ($(printf '%s ' "${MODELS[@]//Qwen\//}"))║"
echo "╚══════════════════════════════════════════════════════════╝"
echo ""

# Show the visible GPUs; a missing or failing nvidia-smi is only a warning so
# the script can still be exercised on a machine without a GPU.
nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null \
    || echo "WARNING: No GPU detected" >&2
echo ""

# ── Conda/venv setup ──────────────────────────────────────────────────────────

# Activate the "mc-train" conda environment when conda is installed; otherwise
# (or when that environment does not exist) keep using the current Python.
if command -v conda &>/dev/null; then
    # conda's shell hook and package activate scripts have historically read
    # unset variables. Under `set -u` that aborts the whole script, and the
    # 2>/dev/null below would hide the reason, so nounset is relaxed here.
    set +u
    source "$(conda info --base)/etc/profile.d/conda.sh"
    conda activate mc-train 2>/dev/null || echo "No mc-train env, using current"
    set -u
fi

# Make sure the output directory exists before any step writes into it.
mkdir -p "$CHECKPOINT_DIR"

# ── Training loop ─────────────────────────────────────────────────────────────

#######################################
# Print the path of the first regular file under / that satisfies the given
# find(1) tests; print nothing when there is no match.
# `find /` routinely exits non-zero (unreadable or vanishing paths) and can be
# killed by SIGPIPE once `head` closes the pipe. Under `set -eo pipefail`
# either would abort the script from inside a `VAR=$(...)` assignment, so the
# pipeline status is deliberately discarded.
# Arguments: find(1) test expressions, e.g. -name "foo"
# Outputs:   at most one path on stdout
# Returns:   always 0
#######################################
_find_first_file() {
    find / -type f "$@" 2>/dev/null | head -n 1 || true
}

# llama.cpp tool paths, looked up on first use and then reused for later
# models so the filesystem is not rescanned once a tool has been found.
LLAMA_CONVERT=""
LLAMA_QUANTIZE=""

# For every base model: LoRA fine-tune -> merge into base weights -> F16 GGUF
# -> quantized GGUFs. Each step is skipped when its output already exists, so
# an interrupted run can be restarted.
for BASE_MODEL in "${MODELS[@]}"; do
    # "Qwen/Qwen3.5-9B" -> "qwen3-5-9b": drop the org prefix, dots to dashes,
    # lower-case.
    MODEL_SHORT=$(echo "$BASE_MODEL" | sed 's|Qwen/||; s|\.|-|g' | tr '[:upper:]' '[:lower:]')
    CKPT_NAME="mortdecai-${VERSION}-${MODEL_SHORT}"
    CKPT_PATH="${CHECKPOINT_DIR}/${CKPT_NAME}"
    MERGED_PATH="${CKPT_PATH}-merged"
    GGUF_DIR="${CKPT_PATH}-gguf"

    echo ""
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo "  Training: ${BASE_MODEL} → ${CKPT_NAME}"
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    TRAIN_START=$(date +%s)

    # ── Step 1: LoRA fine-tune ──
    if [ -d "$CKPT_PATH" ] && [ -f "$CKPT_PATH/adapter_model.safetensors" ]; then
        notify "SKIP" "${CKPT_NAME} LoRA checkpoint exists"
    else
        notify "TRAIN" "Starting ${BASE_MODEL} LoRA fine-tune (${EXAMPLE_COUNT} examples)"
        # pipefail makes a trainer failure abort the script even though the
        # output is piped through tee.
        python3 training/scripts/train_lora.py \
            --model "$BASE_MODEL" \
            --dataset "$DATASET" \
            --output "$CKPT_PATH" \
            --epochs 3 \
            --batch-size 4 \
            --lr 2e-4 \
            --rank 64 \
            --alpha 128 \
            --save-steps 100 \
            2>&1 | tee "${CKPT_PATH}.train.log"
        notify "TRAIN" "${CKPT_NAME} LoRA training complete"
    fi

    # ── Step 2: Merge LoRA into base ──
    if [ -d "$MERGED_PATH" ] && [ -f "$MERGED_PATH/model.safetensors.index.json" ]; then
        notify "SKIP" "${CKPT_NAME} merged weights exist"
    else
        notify "MERGE" "Merging ${CKPT_NAME} LoRA into base model..."
        # Paths are passed as arguments rather than spliced into the Python
        # source, so unusual characters in a path cannot break the snippet.
        python3 -c '
import sys
from unsloth import FastLanguageModel
ckpt_path, merged_path = sys.argv[1], sys.argv[2]
model, tokenizer = FastLanguageModel.from_pretrained(ckpt_path)
model.save_pretrained_merged(merged_path, tokenizer, save_method="merged_16bit")
print("Merge complete: " + merged_path)
' "$CKPT_PATH" "$MERGED_PATH"
    fi

    # ── Step 3: Convert to GGUF (F16) ──
    mkdir -p "$GGUF_DIR"
    F16_GGUF="${GGUF_DIR}/${MODEL_SHORT}.F16.gguf"

    if [ -f "$F16_GGUF" ]; then
        notify "SKIP" "${CKPT_NAME} F16 GGUF exists"
    else
        notify "GGUF" "Converting ${CKPT_NAME} to F16 GGUF..."
        if [ -z "$LLAMA_CONVERT" ]; then
            LLAMA_CONVERT=$(_find_first_file -name "convert_hf_to_gguf.py")
        fi
        if [ -z "$LLAMA_CONVERT" ]; then
            echo "  WARNING: convert_hf_to_gguf.py not found, skipping GGUF export"
            echo "  Run manually: python3 convert_hf_to_gguf.py $MERGED_PATH --outfile $F16_GGUF --outtype f16"
            # Without an F16 file there is nothing to quantize; move on to the
            # next model.
            continue
        fi
        python3 "$LLAMA_CONVERT" "$MERGED_PATH" --outfile "$F16_GGUF" --outtype f16
    fi

    # ── Step 4: Quantize ──
    if [ -z "$LLAMA_QUANTIZE" ]; then
        # Only executable regular files qualify; a bare name match could
        # return a directory called "quantize".
        LLAMA_QUANTIZE=$(_find_first_file -perm -u+x \( -name "llama-quantize" -o -name "quantize" \))
    fi
    if [ -z "$LLAMA_QUANTIZE" ]; then
        echo "  WARNING: llama-quantize not found, skipping quantization"
        echo "  Run manually on steel141 after downloading F16 GGUF"
    else
        echo "  [4/4] Quantizing..."
        # Intentionally unquoted: the value is a space-separated list of
        # quantization types.
        for QUANT in ${QUANTS[$BASE_MODEL]}; do
            QFILE="${GGUF_DIR}/${MODEL_SHORT}.${QUANT}.gguf"
            if [ -f "$QFILE" ]; then
                echo "    [SKIP] $QUANT exists"
            else
                echo "    Quantizing $QUANT..."
                "$LLAMA_QUANTIZE" "$F16_GGUF" "$QFILE" "$QUANT"
            fi
        done
    fi

    TRAIN_END=$(date +%s)
    ELAPSED=$(( (TRAIN_END - TRAIN_START) / 60 ))
    notify "DONE" "${CKPT_NAME} complete in ${ELAPSED}m"
    echo ""
    echo "  ✓ ${CKPT_NAME} complete in ${ELAPSED}m"
    echo "    LoRA:   $CKPT_PATH"
    echo "    Merged: $MERGED_PATH"
    echo "    GGUFs:  $GGUF_DIR/"
    # List each exported GGUF with its human-readable size.
    ls -lh "$GGUF_DIR/"*.gguf 2>/dev/null | awk '{print "      " $NF " (" $5 ")"}'
done

# ── Summary ───────────────────────────────────────────────────────────────────

# Closing banner: a blank line followed by the post-training checklist.
# The here-document delimiter is quoted so the text is printed verbatim.
cat <<'BANNER'

╔══════════════════════════════════════════════════════════╗
║  All training complete!                                  ║
╠══════════════════════════════════════════════════════════╣
║  Next steps:                                             ║
║  1. Download checkpoints to steel141                     ║
║  2. Register in Ollama:                                  ║
║     ollama create mortdecai:0.6.0-9b -f Modelfile.9b    ║
║     ollama create mortdecai:0.6.0-14b -f Modelfile.14b  ║
║  3. Run bake-off against 0.5.0                          ║
║  4. Deploy winner to prod                                ║
╚══════════════════════════════════════════════════════════╝
BANNER