Add model bake-off harness and base model research

Bake-off tested 7 models on 31 seed examples via GPU-accelerated Ollama on node-197 RTX 4000. gemma3n:e4b leads for serving (80.6% cmd match, 100% safety, 5.9s). qwen3:8b recommended as fine-tuning base (Apache 2.0, best syntax quality, strong ecosystem). Full research in MODEL_RESEARCH.md. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 08:54:11 -04:00
parent e00d454b19
commit 7da28c8800
2 changed files with 583 additions and 0 deletions
@@ -0,0 +1,320 @@
 #!/usr/bin/env python3
 """
 Model Bake-Off: Compare models on seed dataset without RCON dependency.
 Tests pure LLM command generation quality by sending each seed example
 through multiple models on the same Ollama instance and scoring results.
 Usage:
    python3 eval/bakeoff.py
    python3 eval/bakeoff.py --ollama-url http://192.168.0.179:11434
    python3 eval/bakeoff.py --models qwen3-coder:30b gemma3n:e4b
 """
 import argparse
 import json
 import re
 import sys
 import time
 from pathlib import Path
 import requests
 # Two levels above this file; the usage examples run it as eval/bakeoff.py,
 # which makes this the repository root.
 ROOT = Path(__file__).resolve().parent.parent
 # Must happen before the `agent` imports below so they resolve when the
 # script is launched directly rather than as an installed package.
 sys.path.insert(0, str(ROOT))
 from agent.prompts.system_prompts import get_prompt
 from agent.guardrails.command_filter import validate_command
 # Seed examples, read by run_bakeoff as one JSON object per line.
 DATASET = ROOT / "data" / "processed" / "seed_dataset.jsonl"
 # Directory where run_bakeoff writes its bakeoff_<timestamp>.json files.
 RESULTS_DIR = ROOT / "eval" / "results"
 def ollama_chat(model: str, messages: list, ollama_url: str,
                 temperature: float = 0.2, max_tokens: int = 400) -> dict:
    """Send one non-streaming chat request to Ollama and time it.

    Args:
        model: Ollama model tag, e.g. ``gemma3n:e4b``.
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        ollama_url: Base URL of the Ollama server; a trailing slash is
            tolerated.
        temperature: Sampling temperature, sent in ``options``.
        max_tokens: Generation cap, sent as ``options.num_predict``.

    Returns:
        A dict with ``content`` (the assistant message text), ``duration_ms``
        (client-side wall time of the HTTP round trip), and ``eval_count`` /
        ``prompt_eval_count`` (0 when the server omits them).

    Raises:
        requests.RequestException: On transport failure or a non-2xx status.
        KeyError: If the response body has no ``message.content``.
    """
    payload = {
        "model": model,
        "messages": messages,
        "stream": False,
        "format": "json",
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens,
        },
    }
    # Strip trailing slashes so "http://host:11434/" does not turn into
    # ".../​/api/chat" with a doubled separator.
    endpoint = f"{ollama_url.rstrip('/')}/api/chat"
    # perf_counter is monotonic; time.time() can jump when the system clock
    # is adjusted and would corrupt the latency numbers being compared.
    start = time.perf_counter()
    r = requests.post(endpoint, json=payload, timeout=180)
    r.raise_for_status()
    duration_ms = int((time.perf_counter() - start) * 1000)
    data = r.json()
    return {
        "content": data["message"]["content"],
        "duration_ms": duration_ms,
        "eval_count": data.get("eval_count", 0),
        "prompt_eval_count": data.get("prompt_eval_count", 0),
    }
 def parse_response(content: str) -> dict:
    """Parse a model reply into a dict, always returning a dict.

    Tries, in order:
      1. the whole reply as JSON;
      2. the outermost ``{...}`` span, for replies that wrap the object in
         prose or a Markdown code fence;
      3. a regex fallback that collects every double-quoted string starting
         with an optional ``/`` and a word character.

    A JSON value that is not an object (list, string, number) is treated as
    unparseable, because callers use ``.get`` on the result.

    Args:
        content: Raw text returned by the model.

    Returns:
        The parsed JSON object, or ``{"commands": [...], "message": "",
        "reasoning": "parse_fallback"}`` when no object could be parsed.
    """
    text = content if isinstance(content, str) else ""
    candidates = [text]
    first, last = text.find("{"), text.rfind("}")
    if 0 <= first < last:
        candidates.append(text[first:last + 1])
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    cmds = re.findall(r'"(/?\w[^"]*)"', text)
    return {"commands": cmds, "message": "", "reasoning": "parse_fallback"}
 def build_user_message(example: dict) -> str:
    """Render a dataset example as the user turn sent to the model.

    The request line is always present, followed by a context section with
    the server type and version (defaulting to ``paper`` / ``1.21.x``).
    The online-player list and the player position are appended only when
    the example's ``server_context`` provides them.
    """
    payload = example["input"]
    request_text = payload["user_message"]
    context = payload.get("server_context", {})

    server_type = context.get("server_type", "paper")
    version = context.get("version", "1.21.x")
    lines = [
        f"Request from slingshooter08: {request_text}",
        "\nContext:",
        f"Server: {server_type} {version}",
    ]

    players = context.get("online_players")
    if players:
        lines.append(f"Online: {', '.join(players)}")

    position = context.get("player_position")
    if position:
        x, y, z = position["x"], position["y"], position["z"]
        lines.append(f"Player position: ({x}, {y}, {z})")

    return "\n".join(lines)
 def score_result(example: dict, actual_cmds: list, parsed: dict) -> dict:
    """Score one model response against the expected output of an example.

    Args:
        example: Dataset record with ``output`` (``commands`` and optional
            ``safety_flags``), optional ``category``, and
            ``input.user_message`` (read unless the category is ``safety``).
        actual_cmds: Commands produced by the model. This is untrusted
            output: ``None``, a bare string, and blank or non-string entries
            are tolerated and normalised away.
        parsed: The parsed model response; only ``message`` is read.

    Returns:
        A dict with:
          * ``cmd_match``: loose match; both sides empty, or at least one
            base command name in common (leading ``/`` ignored).
          * ``exact_match``: the stripped command sets are equal.
          * ``syntax_ok`` / ``syntax_issues``: legacy-syntax heuristics plus
            any ``warnings`` reported by ``validate_command``.
          * ``safety_ok``: False only when a refusal was expected (no
            commands and a ``destructive`` flag) but commands were produced.
          * ``has_gratuitous_tp``: a teleport was issued although the request
            mentions none of the teleport-related words.
          * ``has_message``: the response carries a non-empty ``message``.
    """
    def clean(cmds):
        """Coerce a command field to a list of non-blank strings."""
        if isinstance(cmds, str):
            cmds = [cmds]
        elif not isinstance(cmds, (list, tuple)):
            return []
        return [c for c in cmds if isinstance(c, str) and c.strip()]

    def base_command(cmd):
        """Return the command name without its leading slash."""
        tokens = cmd.strip().lstrip("/").split()
        return tokens[0] if tokens else ""

    expected = example["output"]
    expected_cmds = clean(expected.get("commands"))
    expected_safety = expected.get("safety_flags") or []
    category = example.get("category", "?")
    actual_cmds = clean(actual_cmds)

    # Command type match (loose: same base command used)
    cmd_match = False
    if not expected_cmds and not actual_cmds:
        cmd_match = True  # both empty = correct refusal/info
    elif expected_cmds and actual_cmds:
        expected_types = {base_command(c) for c in expected_cmds} - {""}
        actual_types = {base_command(c) for c in actual_cmds} - {""}
        cmd_match = bool(expected_types & actual_types)

    # Exact command match (strict)
    exact_match = ({c.strip() for c in expected_cmds}
                   == {c.strip() for c in actual_cmds})

    # Syntax quality: check for common 1.21 errors
    syntax_issues = []
    for cmd in actual_cmds:
        # Slash-less form, so "/effect ..." is checked like "effect ...".
        bare = cmd.strip().lstrip("/")
        # Old NBT enchantment format
        if "{Enchantments:[" in cmd or "{enchantments:[" in cmd:
            syntax_issues.append("old_nbt_enchant")
        # Missing minecraft: namespace on items/effects. The target is \S+
        # rather than \w+ so selectors such as @p are covered as well.
        if re.search(r"(give|effect give) \S+ (?!minecraft:)\w+", bare):
            syntax_issues.append("missing_namespace")
        # Bare 'effect' without a 'give'/'clear' subcommand. The lookahead
        # sits directly after "effect " so that a valid
        # "effect give <target> ..." is not flagged.
        if re.match(r"effect (?!give\b|clear\b)", bare):
            syntax_issues.append("bare_effect")
        # weather storm
        if "weather storm" in cmd:
            syntax_issues.append("weather_storm")
        # Gamemode abbreviations
        if re.search(r"gamemode [csa0-3](\s|$)", cmd):
            syntax_issues.append("gamemode_abbrev")
        r = validate_command(cmd)
        if r.get("warnings"):
            syntax_issues.extend(r["warnings"])
    syntax_ok = not syntax_issues

    # Safety: if expected is refusal (no commands + destructive flag),
    # did the model refuse?
    safety_ok = True
    if "destructive" in expected_safety and not expected_cmds:
        safety_ok = not actual_cmds

    # Scope: did the model avoid gratuitous teleports?
    has_gratuitous_tp = False
    if category != "safety":
        query_lower = example["input"]["user_message"].lower()
        tp_words = ["tp", "teleport", "surface", "spawn"]
        if not any(w in query_lower for w in tp_words):
            for cmd in actual_cmds:
                bare = cmd.strip().lstrip("/")
                if (bare.startswith(("tp ", "teleport "))
                        or "run tp " in cmd or "run teleport " in cmd):
                    has_gratuitous_tp = True
                    break

    # Message quality (for prayer/god mode)
    has_message = isinstance(parsed, dict) and bool(parsed.get("message"))

    return {
        "cmd_match": cmd_match,
        "exact_match": exact_match,
        "syntax_ok": syntax_ok,
        "syntax_issues": syntax_issues,
        "safety_ok": safety_ok,
        "has_gratuitous_tp": has_gratuitous_tp,
        "has_message": has_message,
    }
 def run_bakeoff(models: list, ollama_url: str) -> list:
    """Run every model over the seed dataset, print a comparison, save results.

    For each model: send a tiny warm-up request (a model whose warm-up fails
    is skipped entirely), then run every example through ``ollama_chat``,
    parse and score the reply, and print a one-line status. An example whose
    request raises is recorded with an ``error`` field and excluded from that
    model's percentages.

    Args:
        models: Ollama model tags to compare.
        ollama_url: Base URL of the Ollama server.

    Returns:
        One summary dict per model that produced at least one scored result
        (match/syntax/safety percentages, average latency and tokens). The
        summary and the per-example results are also written to
        ``RESULTS_DIR / bakeoff_<unix-timestamp>.json``.
    """
    # Load dataset (JSONL; blank lines skipped). The encoding is explicit so
    # parsing does not depend on the platform default.
    with open(DATASET, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]
    total = len(examples)
    print(f"Bake-off: {total} examples × {len(models)} models")
    print(f"Ollama: {ollama_url}")
    print(f"Models: {', '.join(models)}")
    print("=" * 70)
    all_results = {}
    for model in models:
        print(f"\n--- {model} ---")
        results = []
        # Warm up: load the model so the first scored example does not pay
        # the load time.
        print(f"Loading {model}...")
        try:
            warmup = ollama_chat(model, [
                {"role": "user", "content": "Say OK"},
            ], ollama_url, max_tokens=5)
            print(f"  Loaded in {warmup['duration_ms']}ms")
        except Exception as e:
            print(f"  ERROR loading {model}: {e}")
            continue
        for i, ex in enumerate(examples):
            eid = ex.get("id", f"ex-{i}")
            category = ex.get("category", "?")
            query = ex["input"]["user_message"]
            # Determine mode: "pray ..." requests use the god prompt. The
            # message sent to the model is built from the example itself and
            # keeps the "pray " prefix.
            mode = "god" if query.lower().startswith("pray ") else "sudo"
            # Build prompt
            system_prompt = get_prompt(mode)
            user_msg = build_user_message(ex)
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_msg},
            ]
            # Call LLM
            try:
                resp = ollama_chat(model, messages, ollama_url)
            except Exception as e:
                print(f"  [{i+1}/{total}] ERROR: {e}")
                results.append({"id": eid, "error": str(e)})
                continue
            parsed = parse_response(resp["content"])
            if not isinstance(parsed, dict):
                # Valid JSON that is not an object carries no usable fields.
                parsed = {}
            # Model output is untrusted: "commands" may be null or a string.
            actual_cmds = parsed.get("commands") or []
            if isinstance(actual_cmds, str):
                actual_cmds = [actual_cmds]
            elif not isinstance(actual_cmds, list):
                actual_cmds = []
            # Score
            scores = score_result(ex, actual_cmds, parsed)
            status = "OK" if scores["cmd_match"] else "MISS"
            syntax_flag = "" if scores["syntax_ok"] else " [SYNTAX]"
            tp_flag = " [GRATUITOUS-TP]" if scores["has_gratuitous_tp"] else ""
            safety_flag = "" if scores["safety_ok"] else " [SAFETY-FAIL]"
            print(f"  [{i+1}/{total}] [{status}]{syntax_flag}{tp_flag}{safety_flag} "
                  f"({category}) {query[:50]}  [{resp['duration_ms']}ms]")
            expected_cmds = ex["output"].get("commands", [])
            if not scores["cmd_match"]:
                print(f"    Expected: {expected_cmds[:2]}")
                print(f"    Got:      {actual_cmds[:2]}")
            results.append({
                "id": eid,
                "category": category,
                "query": query,
                "expected": expected_cmds,
                "actual": actual_cmds,
                "message": parsed.get("message", ""),
                "reasoning": parsed.get("reasoning", ""),
                "duration_ms": resp["duration_ms"],
                "eval_tokens": resp["eval_count"],
                **scores,
            })
        all_results[model] = results
    # Summary
    print("\n" + "=" * 70)
    print("BAKE-OFF SUMMARY")
    print("=" * 70)
    summary_rows = []
    for model, results in all_results.items():
        # Percentages are over examples that got a response; request errors
        # are excluded rather than counted as misses.
        valid = [r for r in results if "error" not in r]
        n = len(valid)
        if n == 0:
            continue
        cmd_match = sum(1 for r in valid if r["cmd_match"]) / n * 100
        exact_match = sum(1 for r in valid if r["exact_match"]) / n * 100
        syntax_ok = sum(1 for r in valid if r["syntax_ok"]) / n * 100
        safety_ok = sum(1 for r in valid if r["safety_ok"]) / n * 100
        no_grat_tp = sum(1 for r in valid if not r["has_gratuitous_tp"]) / n * 100
        avg_ms = sum(r["duration_ms"] for r in valid) / n
        avg_tokens = sum(r.get("eval_tokens", 0) for r in valid) / n
        row = {
            "model": model,
            "n": n,
            "cmd_match_%": round(cmd_match, 1),
            "exact_match_%": round(exact_match, 1),
            "syntax_ok_%": round(syntax_ok, 1),
            "safety_%": round(safety_ok, 1),
            "no_gratuitous_tp_%": round(no_grat_tp, 1),
            "avg_latency_ms": int(avg_ms),
            "avg_tokens": int(avg_tokens),
        }
        summary_rows.append(row)
        print(f"\n  {model}:")
        print(f"    Command match:      {cmd_match:5.1f}%")
        print(f"    Exact match:        {exact_match:5.1f}%")
        print(f"    Syntax correct:     {syntax_ok:5.1f}%")
        print(f"    Safety compliance:  {safety_ok:5.1f}%")
        print(f"    No gratuitous tp:   {no_grat_tp:5.1f}%")
        print(f"    Avg latency:        {int(avg_ms)}ms")
        print(f"    Avg tokens/resp:    {int(avg_tokens)}")
    # Save full results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = int(time.time())
    out_path = RESULTS_DIR / f"bakeoff_{ts}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({
            "timestamp": ts,
            "ollama_url": ollama_url,
            "summary": summary_rows,
            "results": all_results,
        }, f, indent=2)
    print(f"\nFull results saved to {out_path}")
    return summary_rows
 def main():
    """CLI entry point: read --ollama-url and --models, then run the bake-off."""
    cli = argparse.ArgumentParser(description="Model Bake-Off")
    # Flag name -> add_argument keyword arguments, registered in order.
    option_specs = (
        ("--ollama-url", {"default": "http://192.168.0.179:11434"}),
        ("--models", {"nargs": "+",
                      "default": ["qwen3-coder:30b", "gemma3n:e4b"]}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()
    run_bakeoff(opts.models, opts.ollama_url)
 # Run the CLI only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
@@ -0,0 +1,263 @@
 # Model Research: Small LMs for LoRA/QLoRA Fine-Tuning
 > **Date:** 2026-03-18
 > **Purpose:** Evaluate small language models (4-14B) as base models for the Minecraft server ops assistant.
 > **Constraints:**
 > - 8GB VRAM for inference (Q4 quantized via Ollama)
 > - 24GB VRAM for training (QLoRA)
 > - Permissive license (Apache 2.0, MIT -- NOT community/restricted licenses)
 > - Available on both Ollama (serving) and HuggingFace in safetensors/PyTorch (training)
 > - Good instruction following and structured JSON output
 > - Active fine-tuning ecosystem (Unsloth, Axolotl, PEFT, LlamaFactory)
 ---
 ## Ranked Recommendations
 ### 1. Qwen3-8B (RECOMMENDED)
 | Attribute | Detail |
 |-----------|--------|
 | **Parameters** | 8B dense |
 | **Release** | April 2025 |
 | **License** | Apache 2.0 |
 | **HuggingFace** | `Qwen/Qwen3-8B` -- safetensors, BF16 |
 | **Ollama** | `ollama pull qwen3:8b` |
 | **Q4 VRAM** | ~5.5 GB (fits 8GB comfortably) |
 | **QLoRA VRAM** | ~14-16 GB (fits 24GB easily) |
 | **Context** | 128K native |
 **Why #1:**
 - Outperforms Qwen2.5-14B on benchmarks despite being smaller. MMLU-Redux ~87, MATH-500 ~98.
 - Apache 2.0 with no usage restrictions -- the cleanest license in this list.
 - First-class Unsloth support with dedicated notebooks and 2x training speedup.
 - Supported by Axolotl, LlamaFactory, PEFT, and TRL out of the box.
 - Native thinking/non-thinking mode toggle -- useful for complex command generation vs. quick lookups.
 - Strong structured output support; JSON format instructions work reliably.
 - Massive community: most fine-tuned derivatives on HuggingFace of any model this size.
 **Caveats:**
 - Newer than some alternatives, so fewer battle-tested fine-tunes in production.
 ---
 ### 2. Qwen3.5-4B
 | Attribute | Detail |
 |-----------|--------|
 | **Parameters** | 4B dense |
 | **Release** | February 2026 |
 | **License** | Apache 2.0 |
 | **HuggingFace** | `Qwen/Qwen3.5-4B` -- safetensors, BF16/F32 |
 | **Ollama** | `ollama pull qwen3.5:4b` (~3.4 GB) |
 | **Q4 VRAM** | ~2.5-3 GB |
 | **QLoRA VRAM** | ~8-10 GB |
 | **Context** | 256K native |
 **Why #2:**
 - The newest model on this list (Feb 2026) with latest training techniques.
 - Extremely lightweight -- leaves massive headroom for context on 8GB cards.
 - 256K context window is best-in-class for this parameter range.
 - Full Unsloth + LlamaFactory support confirmed.
 - Apache 2.0 license, no restrictions.
 - Ideal if your training data is small (<1000 examples) -- smaller models fine-tune faster and can still match larger models on narrow domains.
 **Caveats:**
 - 4B may struggle with complex multi-step reasoning compared to 8B.
 - Fewer community fine-tunes available yet (very new release).
 ---
 ### 3. Qwen3-4B
 | Attribute | Detail |
 |-----------|--------|
 | **Parameters** | 4B dense (36-layer transformer) |
 | **Release** | April 2025 |
 | **License** | Apache 2.0 |
 | **HuggingFace** | `Qwen/Qwen3-4B` -- safetensors |
 | **Ollama** | `ollama pull qwen3:4b` |
 | **Q4 VRAM** | ~2.5 GB |
 | **QLoRA VRAM** | ~8-10 GB |
 | **Context** | 128K native |
 **Why #3:**
 - Benchmarks rival Qwen2.5-72B-Instruct (!!) according to Qwen team claims.
 - MMLU-Redux 83.7, MATH-500 97.0 -- exceptional for 4B.
 - Well-established Unsloth support with notebooks and GGUF export pipeline.
 - Best fine-tuning benchmark results per distillabs.ai evaluation: "Qwen3-4B-Instruct-2507 delivers the best overall fine-tuned performance, matching a 120B+ teacher."
 - Apache 2.0.
 **Caveats:**
 - Slightly older than Qwen3.5-4B; same parameter count but older architecture.
 ---
 ### 4. Phi-4-mini-instruct (3.8B)
 | Attribute | Detail |
 |-----------|--------|
 | **Parameters** | 3.8B |
 | **Release** | February 2025 |
 | **License** | MIT |
 | **HuggingFace** | `microsoft/Phi-4-mini-instruct` -- safetensors |
 | **Ollama** | `ollama pull phi4-mini:3.8b` |
 | **Q4 VRAM** | ~2.5 GB |
 | **QLoRA VRAM** | ~8-10 GB |
 | **Context** | 128K |
 **Why #4:**
 - MIT license -- the most permissive option available.
 - Microsoft provides an official LoRA fine-tuning script in the HuggingFace repo.
 - Performance comparable to 7-9B models (Llama-3.1-8B level) despite being 3.8B.
 - 200K vocabulary, grouped-query attention -- modern architecture.
 - JSON tool-calling format built into the chat template.
 - Unsloth support confirmed with dedicated notebooks.
 **Caveats:**
 - Smaller community of fine-tuners compared to Qwen.
 - 3.8B is the smallest viable option; may need more training data to match larger models on nuanced tasks.
 - Microsoft's Phi models have historically had some quirks with non-English content and repetition.
 ---
 ### 5. Gemma 3 4B-IT
 | Attribute | Detail |
 |-----------|--------|
 | **Parameters** | 4B (multimodal -- text + image) |
 | **Release** | March 2025 |
 | **License** | Gemma Terms of Use (NOT Apache 2.0 -- see caveats) |
 | **HuggingFace** | `google/gemma-3-4b-it` -- safetensors |
 | **Ollama** | `ollama pull gemma3:4b` (~3.3 GB) |
 | **Q4 VRAM** | ~2.5 GB |
 | **QLoRA VRAM** | ~8-10 GB |
 | **Context** | 128K |
 **Why #5:**
 - Outperforms Gemma 2 27B on benchmarks -- a 7x smaller model beating its predecessor's flagship.
 - Google provides official LoRA fine-tuning docs with Keras and HuggingFace PEFT.
 - QAT (Quantization-Aware Training) variants available for better quantized performance.
 - Native function calling and structured output support.
 - Multimodal capability (text + images) could be useful for screenshot-based troubleshooting.
 - Unsloth, Axolotl, and LlamaFactory all support Gemma 3.
 **Caveats:**
 - **License is NOT Apache 2.0.** Gemma Terms of Use allow commercial use but include a Prohibited Use Policy covering sensitive domains. Google retains the right to "restrict (remotely or otherwise) usage." This is more restrictive than Apache 2.0/MIT.
 - For a personal Minecraft server project this is likely fine, but it fails the strict "permissive license" requirement.
 ---
 ### 6. Gemma 3 12B-IT
 | Attribute | Detail |
 |-----------|--------|
 | **Parameters** | 12B (multimodal) |
 | **Release** | March 2025 |
 | **License** | Gemma Terms of Use (same caveats as 4B) |
 | **HuggingFace** | `google/gemma-3-12b-it` -- safetensors |
 | **Ollama** | `ollama pull gemma3:12b` |
 | **Q4 VRAM** | ~6.6 GB (Google claims RTX 4060 8GB works) |
 | **QLoRA VRAM** | ~18-20 GB (fits 24GB) |
 | **Context** | 128K |
 **Why #6:**
 - The largest model that can fit in 8GB VRAM at Q4.
 - Best raw capability of any model on this list.
 - QAT Q4 variants from Google specifically optimized for consumer GPUs.
 - Full Unsloth support.
 **Caveats:**
 - Tight fit on 8GB -- leaves little headroom for KV cache with long prompts.
 - Same license concerns as Gemma 3 4B.
 - QLoRA training at 12B needs more VRAM; will use ~18-20 GB of your 24GB budget.
 ---
 ### 7. Mistral NeMo 12B
 | Attribute | Detail |
 |-----------|--------|
 | **Parameters** | 12B |
 | **Release** | July 2024 |
 | **License** | Apache 2.0 |
 | **HuggingFace** | `mistralai/Mistral-Nemo-Instruct-2407` -- safetensors |
 | **Ollama** | `ollama pull mistral-nemo:12b` |
 | **Q4 VRAM** | ~7 GB |
 | **QLoRA VRAM** | ~18-22 GB (higher due to large vocabulary) |
 | **Context** | 128K |
 **Why #7:**
 - Apache 2.0 license, built with NVIDIA collaboration.
 - 128K context, strong multilingual support.
 - Established fine-tuning ecosystem with mistral-finetune tool.
 **Caveats:**
 - Oldest model on this list (July 2024) -- outperformed by newer 4-8B models on many benchmarks.
 - Large vocabulary (~131K tokens, Tekken tokenizer) increases memory requirements for fine-tuning beyond what the parameter count suggests.
 - Tight fit on 8GB VRAM at Q4 with limited context headroom.
 - Not recommended over Qwen3-8B which is newer, smaller, and benchmarks better.
 ---
 ## Models Considered and Rejected
 | Model | Reason for Rejection |
 |-------|---------------------|
 | **Llama 3.2 (1B/3B)** | Llama Community License prohibits using outputs to train non-Llama models. Distillation restrictions. Not truly permissive. |
 | **Llama 3.1-8B / 3.3-70B** | Same license restrictions as above. The 700M MAU clause and output training restrictions disqualify it. |
 | **Qwen3-Coder (30B-A3B, 480B)** | All variants are massive MoE models. Even the smallest (30B-A3B with 3B active) has 30B total parameters -- too large for 8GB inference and questionable for 24GB QLoRA. |
 | **Mistral Small 3 (24B)** | 24B parameters -- requires ~14 GB VRAM at Q4. Does not fit 8GB. |
 | **Phi-4 (14B)** | Needs ~8-9 GB at Q4, so it does not reliably fit in 8GB. QLoRA at 14B needs ~22-24 GB, cutting it very close. The 3.8B Phi-4-mini is a better fit for this project. |
 | **Gemma 2 (9B/27B)** | Superseded by Gemma 3. No reason to use older generation. |
 | **Qwen2.5 (7B/14B)** | Superseded by Qwen3 and Qwen3.5 with significantly better benchmarks. |
 ---
 ## Fine-Tuning Ecosystem Comparison (as of March 2026)
 | Framework | Qwen3/3.5 | Phi-4-mini | Gemma 3 | Mistral NeMo |
 |-----------|-----------|------------|---------|--------------|
 | **Unsloth** | Full support, dedicated notebooks, 2x speedup | Supported, notebooks available | Supported, Gemma 3n confirmed | Supported |
 | **Axolotl** | Supported | Supported | Supported | Supported |
 | **LlamaFactory** | Supported, Ollama export | Supported | Supported | Supported |
 | **HF PEFT/TRL** | Supported | Supported, official script | Supported, Google official docs | Supported |
 | **Community notebooks** | Abundant | Moderate | Abundant | Moderate |
 ---
 ## Recommendation for This Project
 **Primary: Qwen3-8B** -- Best balance of capability, VRAM fit, license cleanliness, and fine-tuning ecosystem. It significantly outperforms older 14B models while fitting comfortably in 8GB at Q4. Apache 2.0 means zero legal concerns.
 **Secondary: Qwen3-4B or Qwen3.5-4B** -- If training data is limited (<500 examples) or you want faster iteration cycles, a 4B model will fine-tune faster and still perform well on the narrow domain of Minecraft server operations. Qwen3.5-4B is newer with a 256K context window; Qwen3-4B has more proven fine-tuning results.
 **Note on qwen3-coder:** The current PLAN.md references `qwen3-coder` as the base model. All Qwen3-Coder variants are large MoE models (30B+ total parameters) that do not fit the 8GB inference constraint. The recommendation is to use **Qwen3-8B** (or Qwen3-4B) as the base model instead. The coding/command-generation capability can be developed through fine-tuning on domain-specific data rather than requiring a code-specialized base model.
 ---
 ## Sources
 - [Qwen3 announcement and benchmarks](https://qwenlm.github.io/blog/qwen3/)
 - [Qwen3.5 on HuggingFace](https://huggingface.co/Qwen/Qwen3.5-4B)
 - [Qwen3.5 on Ollama](https://ollama.com/library/qwen3.5)
 - [Phi-4-mini-instruct on HuggingFace](https://huggingface.co/microsoft/Phi-4-mini-instruct)
 - [Phi-4-mini on Ollama](https://ollama.com/library/phi4-mini:3.8b)
 - [Gemma 3 on Ollama](https://ollama.com/library/gemma3)
 - [Gemma 3 QAT models for consumer GPUs](https://developers.googleblog.com/en/gemma-3-quantized-aware-trained-state-of-the-art-ai-to-consumer-gpus/)
 - [Gemma Terms of Use](https://ai.google.dev/gemma/terms)
 - [Gemma license risk analysis](https://wcr.legal/google-gemma-license-risks/)
 - [Mistral NeMo on HuggingFace](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407)
 - [Mistral NeMo on Ollama](https://ollama.com/library/mistral-nemo)
 - [Unsloth model catalog](https://unsloth.ai/docs/get-started/unsloth-model-catalog)
 - [Unsloth Qwen3 fine-tuning guide](https://unsloth.ai/docs/models/qwen3-how-to-run-and-fine-tune)
 - [Unsloth Qwen3.5 fine-tuning guide](https://unsloth.ai/docs/models/qwen3.5/fine-tune)
 - [Unsloth Phi-4 fine-tuning](https://unsloth.ai/blog/phi4)
 - [Unsloth Gemma 3 fine-tuning](https://unsloth.ai/blog/gemma3)
 - [Fine-tuning framework comparison 2026](https://dev.to/ultraduneai/eval-003-fine-tuning-in-2026-axolotl-vs-unsloth-vs-trl-vs-llama-factory-2ohg)
 - [Distillabs SLM fine-tuning benchmark](https://www.distillabs.ai/blog/we-benchmarked-12-small-language-models-across-8-tasks-to-find-the-best-base-model-for-fine-tuning)
 - [JSONSchemaBench structured output benchmark](https://arxiv.org/abs/2501.10868)
 - [Llama license restrictions analysis](https://wcr.legal/llama-3-license-700m-mau-limit/)
 - [Qwen3-Coder on HuggingFace](https://huggingface.co/collections/Qwen/qwen3-coder)
 - [Top SLMs 2026 overview (DataCamp)](https://www.datacamp.com/blog/top-small-language-models)
 - [Best open-source SLMs 2026 (BentoML)](https://www.bentoml.com/blog/the-best-open-source-small-language-models)