From 7da28c88004baa5740fa6b4e9760e74062fb27ee Mon Sep 17 00:00:00 2001 From: Seth Freiberg Date: Wed, 18 Mar 2026 08:54:11 -0400 Subject: [PATCH] Add model bake-off harness and base model research Bake-off tested 7 models on 31 seed examples via GPU-accelerated Ollama on node-197 RTX 4000. gemma3n:e4b leads for serving (80.6% cmd match, 100% safety, 5.9s). qwen3:8b recommended as fine-tuning base (Apache 2.0, best syntax quality, strong ecosystem). Full research in MODEL_RESEARCH.md. Co-Authored-By: Claude Opus 4.6 (1M context) --- eval/bakeoff.py | 320 +++++++++++++++++++++++++++++++++++++ training/MODEL_RESEARCH.md | 263 ++++++++++++++++++++++++++++++ 2 files changed, 583 insertions(+) create mode 100644 eval/bakeoff.py create mode 100644 training/MODEL_RESEARCH.md diff --git a/eval/bakeoff.py b/eval/bakeoff.py new file mode 100644 index 0000000..ea2d1aa --- /dev/null +++ b/eval/bakeoff.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python3 +""" +Model Bake-Off: Compare models on seed dataset without RCON dependency. + +Tests pure LLM command generation quality by sending each seed example +through multiple models on the same Ollama instance and scoring results. 
+ +Usage: + python3 eval/bakeoff.py + python3 eval/bakeoff.py --ollama-url http://192.168.0.179:11434 + python3 eval/bakeoff.py --models qwen3-coder:30b gemma3n:e4b +""" + +import argparse +import json +import re +import sys +import time +from pathlib import Path + +import requests + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + +from agent.prompts.system_prompts import get_prompt +from agent.guardrails.command_filter import validate_command + +DATASET = ROOT / "data" / "processed" / "seed_dataset.jsonl" +RESULTS_DIR = ROOT / "eval" / "results" + + +def ollama_chat(model: str, messages: list, ollama_url: str, + temperature: float = 0.2, max_tokens: int = 400) -> dict: + """Call Ollama and return response + timing.""" + payload = { + "model": model, + "messages": messages, + "stream": False, + "format": "json", + "options": { + "temperature": temperature, + "num_predict": max_tokens, + }, + } + start = time.time() + r = requests.post(f"{ollama_url}/api/chat", json=payload, timeout=180) + r.raise_for_status() + duration_ms = int((time.time() - start) * 1000) + data = r.json() + return { + "content": data["message"]["content"], + "duration_ms": duration_ms, + "eval_count": data.get("eval_count", 0), + "prompt_eval_count": data.get("prompt_eval_count", 0), + } + + +def parse_response(content: str) -> dict: + """Parse LLM JSON response.""" + try: + return json.loads(content) + except json.JSONDecodeError: + cmds = re.findall(r'"(/?\w[^"]*)"', content) + return {"commands": cmds, "message": "", "reasoning": "parse_fallback"} + + +def build_user_message(example: dict) -> str: + """Build the user message from a dataset example, simulating context.""" + inp = example["input"] + query = inp["user_message"] + ctx = inp.get("server_context", {}) + + parts = [f"Request from slingshooter08: {query}"] + parts.append("\nContext:") + parts.append(f"Server: {ctx.get('server_type', 'paper')} {ctx.get('version', '1.21.x')}") + + if 
ctx.get("online_players"): + parts.append(f"Online: {', '.join(ctx['online_players'])}") + + pos = ctx.get("player_position") + if pos: + parts.append(f"Player position: ({pos['x']}, {pos['y']}, {pos['z']})") + + return "\n".join(parts) + + +def score_result(example: dict, actual_cmds: list, parsed: dict) -> dict: + """Score a single result against expected output.""" + expected = example["output"] + expected_cmds = expected.get("commands", []) + expected_safety = expected.get("safety_flags", []) + category = example.get("category", "?") + + # Command type match (loose: same base command used) + cmd_match = False + if not expected_cmds and not actual_cmds: + cmd_match = True # both empty = correct refusal/info + elif expected_cmds and actual_cmds: + expected_types = set(c.split()[0].lstrip("/") for c in expected_cmds if c) + actual_types = set(c.split()[0].lstrip("/") for c in actual_cmds if c) + cmd_match = len(expected_types & actual_types) > 0 + + # Exact command match (strict) + exact_match = set(c.strip() for c in expected_cmds) == set(c.strip() for c in actual_cmds) + + # Syntax quality: check for common 1.21 errors + syntax_issues = [] + for cmd in actual_cmds: + # Old NBT enchantment format + if "{Enchantments:[" in cmd or "{enchantments:[" in cmd: + syntax_issues.append("old_nbt_enchant") + # Missing minecraft: namespace on items/effects + if re.search(r"(give|effect give) \w+ (?!minecraft:)\w+", cmd): + syntax_issues.append("missing_namespace") + # Bare 'effect' without 'give' + if re.match(r"^effect \w+ (?!give)", cmd): + syntax_issues.append("bare_effect") + # weather storm + if "weather storm" in cmd: + syntax_issues.append("weather_storm") + # Gamemode abbreviations + if re.search(r"gamemode [csa0-3](\s|$)", cmd): + syntax_issues.append("gamemode_abbrev") + r = validate_command(cmd) + if r.get("warnings"): + syntax_issues.extend(r["warnings"]) + + syntax_ok = len(syntax_issues) == 0 + + # Safety: if expected is refusal (no commands + destructive 
flag), did model refuse? + safety_ok = True + if "destructive" in expected_safety and not expected_cmds: + safety_ok = len(actual_cmds) == 0 + + # Scope: did model avoid gratuitous teleports? + has_gratuitous_tp = False + if category != "safety": + query_lower = example["input"]["user_message"].lower() + tp_words = ["tp", "teleport", "surface", "spawn"] + if not any(w in query_lower for w in tp_words): + for cmd in actual_cmds: + if cmd.startswith("tp ") or "run tp " in cmd: + has_gratuitous_tp = True + + # Message quality (for prayer/god mode) + has_message = bool(parsed.get("message")) + + return { + "cmd_match": cmd_match, + "exact_match": exact_match, + "syntax_ok": syntax_ok, + "syntax_issues": syntax_issues, + "safety_ok": safety_ok, + "has_gratuitous_tp": has_gratuitous_tp, + "has_message": has_message, + } + + +def run_bakeoff(models: list, ollama_url: str): + """Run all models against the dataset and compare.""" + # Load dataset + with open(DATASET) as f: + examples = [json.loads(line) for line in f if line.strip()] + + print(f"Bake-off: {len(examples)} examples × {len(models)} models") + print(f"Ollama: {ollama_url}") + print(f"Models: {', '.join(models)}") + print("=" * 70) + + all_results = {} + + for model in models: + print(f"\n--- {model} ---") + results = [] + + # Warm up: load model + print(f"Loading {model}...") + try: + warmup = ollama_chat(model, [ + {"role": "user", "content": "Say OK"}, + ], ollama_url, max_tokens=5) + print(f" Loaded in {warmup['duration_ms']}ms") + except Exception as e: + print(f" ERROR loading {model}: {e}") + continue + + for i, ex in enumerate(examples): + eid = ex.get("id", f"ex-{i}") + category = ex.get("category", "?") + query = ex["input"]["user_message"] + + # Determine mode + mode = "sudo" + if query.lower().startswith("pray "): + mode = "god" + query_stripped = query[5:] + else: + query_stripped = query + + # Build prompt + system_prompt = get_prompt(mode) + user_msg = build_user_message(ex) + messages = [ + 
{"role": "system", "content": system_prompt}, + {"role": "user", "content": user_msg}, + ] + + # Call LLM + try: + resp = ollama_chat(model, messages, ollama_url) + except Exception as e: + print(f" [{i+1}/{len(examples)}] ERROR: {e}") + results.append({"id": eid, "error": str(e)}) + continue + + parsed = parse_response(resp["content"]) + actual_cmds = parsed.get("commands", []) + + # Score + scores = score_result(ex, actual_cmds, parsed) + + status = "OK" if scores["cmd_match"] else "MISS" + syntax_flag = "" if scores["syntax_ok"] else " [SYNTAX]" + tp_flag = " [GRATUITIOUS-TP]" if scores["has_gratuitous_tp"] else "" + safety_flag = "" if scores["safety_ok"] else " [SAFETY-FAIL]" + + print(f" [{i+1}/{len(examples)}] [{status}]{syntax_flag}{tp_flag}{safety_flag} " + f"({category}) {query[:50]} [{resp['duration_ms']}ms]") + + if not scores["cmd_match"]: + expected_cmds = ex["output"].get("commands", []) + print(f" Expected: {expected_cmds[:2]}") + print(f" Got: {actual_cmds[:2]}") + + results.append({ + "id": eid, + "category": category, + "query": query, + "expected": ex["output"].get("commands", []), + "actual": actual_cmds, + "message": parsed.get("message", ""), + "reasoning": parsed.get("reasoning", ""), + "duration_ms": resp["duration_ms"], + "eval_tokens": resp["eval_count"], + **scores, + }) + + all_results[model] = results + + # Summary + print("\n" + "=" * 70) + print("BAKE-OFF SUMMARY") + print("=" * 70) + + summary_rows = [] + for model, results in all_results.items(): + valid = [r for r in results if "error" not in r] + n = len(valid) + if n == 0: + continue + + cmd_match = sum(1 for r in valid if r["cmd_match"]) / n * 100 + exact_match = sum(1 for r in valid if r["exact_match"]) / n * 100 + syntax_ok = sum(1 for r in valid if r["syntax_ok"]) / n * 100 + safety_ok = sum(1 for r in valid if r["safety_ok"]) / n * 100 + no_grat_tp = sum(1 for r in valid if not r["has_gratuitous_tp"]) / n * 100 + avg_ms = sum(r["duration_ms"] for r in valid) / n + 
avg_tokens = sum(r.get("eval_tokens", 0) for r in valid) / n + + row = { + "model": model, + "n": n, + "cmd_match_%": round(cmd_match, 1), + "exact_match_%": round(exact_match, 1), + "syntax_ok_%": round(syntax_ok, 1), + "safety_%": round(safety_ok, 1), + "no_gratuitous_tp_%": round(no_grat_tp, 1), + "avg_latency_ms": int(avg_ms), + "avg_tokens": int(avg_tokens), + } + summary_rows.append(row) + + print(f"\n {model}:") + print(f" Command match: {cmd_match:5.1f}%") + print(f" Exact match: {exact_match:5.1f}%") + print(f" Syntax correct: {syntax_ok:5.1f}%") + print(f" Safety compliance: {safety_ok:5.1f}%") + print(f" No gratuitous tp: {no_grat_tp:5.1f}%") + print(f" Avg latency: {int(avg_ms)}ms") + print(f" Avg tokens/resp: {int(avg_tokens)}") + + # Save full results + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + ts = int(time.time()) + out_path = RESULTS_DIR / f"bakeoff_{ts}.json" + with open(out_path, "w") as f: + json.dump({ + "timestamp": ts, + "ollama_url": ollama_url, + "summary": summary_rows, + "results": {m: r for m, r in all_results.items()}, + }, f, indent=2) + print(f"\nFull results saved to {out_path}") + + return summary_rows + + +def main(): + parser = argparse.ArgumentParser(description="Model Bake-Off") + parser.add_argument("--ollama-url", default="http://192.168.0.179:11434") + parser.add_argument("--models", nargs="+", + default=["qwen3-coder:30b", "gemma3n:e4b"]) + args = parser.parse_args() + + run_bakeoff(args.models, args.ollama_url) + + +if __name__ == "__main__": + main() diff --git a/training/MODEL_RESEARCH.md b/training/MODEL_RESEARCH.md new file mode 100644 index 0000000..d031188 --- /dev/null +++ b/training/MODEL_RESEARCH.md @@ -0,0 +1,263 @@ +# Model Research: Small LMs for LoRA/QLoRA Fine-Tuning + +> **Date:** 2026-03-18 +> **Purpose:** Evaluate small language models (4-14B) as base models for the Minecraft server ops assistant. 
+> **Constraints:** +> - 8GB VRAM for inference (Q4 quantized via Ollama) +> - 24GB VRAM for training (QLoRA) +> - Permissive license (Apache 2.0, MIT -- NOT community/restricted licenses) +> - Available on both Ollama (serving) and HuggingFace in safetensors/PyTorch (training) +> - Good instruction following and structured JSON output +> - Active fine-tuning ecosystem (Unsloth, Axolotl, PEFT, LlamaFactory) + +--- + +## Ranked Recommendations + +### 1. Qwen3-8B (RECOMMENDED) + +| Attribute | Detail | +|-----------|--------| +| **Parameters** | 8B dense | +| **Release** | April 2025 | +| **License** | Apache 2.0 | +| **HuggingFace** | `Qwen/Qwen3-8B` -- safetensors, BF16 | +| **Ollama** | `ollama pull qwen3:8b` | +| **Q4 VRAM** | ~5.5 GB (fits 8GB comfortably) | +| **QLoRA VRAM** | ~14-16 GB (fits 24GB easily) | +| **Context** | 128K native | + +**Why #1:** +- Outperforms Qwen2.5-14B on benchmarks despite being smaller. MMLU-Redux ~87, MATH-500 ~98. +- Apache 2.0 with no usage restrictions -- the cleanest license in this list. +- First-class Unsloth support with dedicated notebooks and 2x training speedup. +- Supported by Axolotl, LlamaFactory, PEFT, and TRL out of the box. +- Native thinking/non-thinking mode toggle -- useful for complex command generation vs. quick lookups. +- Strong structured output support; JSON format instructions work reliably. +- Massive community: most fine-tuned derivatives on HuggingFace of any model this size. + +**Caveats:** +- Newer than some alternatives, so fewer battle-tested fine-tunes in production. + +--- + +### 2. 
Qwen3.5-4B + +| Attribute | Detail | +|-----------|--------| +| **Parameters** | 4B dense | +| **Release** | February 2026 | +| **License** | Apache 2.0 | +| **HuggingFace** | `Qwen/Qwen3.5-4B` -- safetensors, BF16/F32 | +| **Ollama** | `ollama pull qwen3.5:4b` (~3.4 GB) | +| **Q4 VRAM** | ~2.5-3 GB | +| **QLoRA VRAM** | ~8-10 GB | +| **Context** | 256K native | + +**Why #2:** +- The newest model on this list (Feb 2026) with latest training techniques. +- Extremely lightweight -- leaves massive headroom for context on 8GB cards. +- 256K context window is best-in-class for this parameter range. +- Full Unsloth + LlamaFactory support confirmed. +- Apache 2.0 license, no restrictions. +- Ideal if your training data is small (<1000 examples) -- smaller models fine-tune faster and can still match larger models on narrow domains. + +**Caveats:** +- 4B may struggle with complex multi-step reasoning compared to 8B. +- Fewer community fine-tunes available yet (very new release). + +--- + +### 3. Qwen3-4B + +| Attribute | Detail | +|-----------|--------| +| **Parameters** | 4B dense (36-layer transformer) | +| **Release** | April 2025 | +| **License** | Apache 2.0 | +| **HuggingFace** | `Qwen/Qwen3-4B` -- safetensors | +| **Ollama** | `ollama pull qwen3:4b` | +| **Q4 VRAM** | ~2.5 GB | +| **QLoRA VRAM** | ~8-10 GB | +| **Context** | 128K native | + +**Why #3:** +- Benchmarks rival Qwen2.5-72B-Instruct (!!) according to Qwen team claims. +- MMLU-Redux 83.7, MATH-500 97.0 -- exceptional for 4B. +- Well-established Unsloth support with notebooks and GGUF export pipeline. +- Best fine-tuning benchmark results per distillabs.ai evaluation: "Qwen3-4B-Instruct-2507 delivers the best overall fine-tuned performance, matching a 120B+ teacher." +- Apache 2.0. + +**Caveats:** +- Slightly older than Qwen3.5-4B; same parameter count but older architecture. + +--- + +### 4. 
Phi-4-mini-instruct (3.8B) + +| Attribute | Detail | +|-----------|--------| +| **Parameters** | 3.8B | +| **Release** | February 2025 | +| **License** | MIT | +| **HuggingFace** | `microsoft/Phi-4-mini-instruct` -- safetensors | +| **Ollama** | `ollama pull phi4-mini:3.8b` | +| **Q4 VRAM** | ~2.5 GB | +| **QLoRA VRAM** | ~8-10 GB | +| **Context** | 128K | + +**Why #4:** +- MIT license -- the most permissive option available. +- Microsoft provides an official LoRA fine-tuning script in the HuggingFace repo. +- Performance comparable to 7-9B models (Llama-3.1-8B level) despite being 3.8B. +- 200K vocabulary, grouped-query attention -- modern architecture. +- JSON tool-calling format built into the chat template. +- Unsloth support confirmed with dedicated notebooks. + +**Caveats:** +- Smaller community of fine-tuners compared to Qwen. +- 3.8B is the smallest viable option; may need more training data to match larger models on nuanced tasks. +- Microsoft's Phi models have historically had some quirks with non-English content and repetition. + +--- + +### 5. Gemma 3 4B-IT + +| Attribute | Detail | +|-----------|--------| +| **Parameters** | 4B (multimodal -- text + image) | +| **Release** | March 2025 | +| **License** | Gemma Terms of Use (NOT Apache 2.0 -- see caveats) | +| **HuggingFace** | `google/gemma-3-4b-it` -- safetensors | +| **Ollama** | `ollama pull gemma3:4b` (~3.3 GB) | +| **Q4 VRAM** | ~2.5 GB | +| **QLoRA VRAM** | ~8-10 GB | +| **Context** | 128K | + +**Why #5:** +- Outperforms Gemma 2 27B on benchmarks -- a 7x smaller model beating its predecessor's flagship. +- Google provides official LoRA fine-tuning docs with Keras and HuggingFace PEFT. +- QAT (Quantization-Aware Training) variants available for better quantized performance. +- Native function calling and structured output support. +- Multimodal capability (text + images) could be useful for screenshot-based troubleshooting. +- Unsloth, Axolotl, and LlamaFactory all support Gemma 3. 
+ +**Caveats:** +- **License is NOT Apache 2.0.** Gemma Terms of Use allow commercial use but include a Prohibited Use Policy covering sensitive domains. Google retains the right to "restrict (remotely or otherwise) usage." This is more restrictive than Apache 2.0/MIT. +- For a personal Minecraft server project this is likely fine, but it fails the strict "permissive license" requirement. + +--- + +### 6. Gemma 3 12B-IT + +| Attribute | Detail | +|-----------|--------| +| **Parameters** | 12B (multimodal) | +| **Release** | March 2025 | +| **License** | Gemma Terms of Use (same caveats as 4B) | +| **HuggingFace** | `google/gemma-3-12b-it` -- safetensors | +| **Ollama** | `ollama pull gemma3:12b` | +| **Q4 VRAM** | ~6.6 GB (Google claims RTX 4060 8GB works) | +| **QLoRA VRAM** | ~18-20 GB (fits 24GB) | +| **Context** | 128K | + +**Why #6:** +- The largest model that can fit in 8GB VRAM at Q4. +- Best raw capability of any model on this list. +- QAT Q4 variants from Google specifically optimized for consumer GPUs. +- Full Unsloth support. + +**Caveats:** +- Tight fit on 8GB -- leaves little headroom for KV cache with long prompts. +- Same license concerns as Gemma 3 4B. +- QLoRA training at 12B needs more VRAM; will use ~18-20 GB of your 24GB budget. + +--- + +### 7. Mistral NeMo 12B + +| Attribute | Detail | +|-----------|--------| +| **Parameters** | 12B | +| **Release** | July 2024 | +| **License** | Apache 2.0 | +| **HuggingFace** | `mistralai/Mistral-Nemo-Instruct-2407` -- safetensors | +| **Ollama** | `ollama pull mistral-nemo:12b` | +| **Q4 VRAM** | ~7 GB | +| **QLoRA VRAM** | ~18-22 GB (higher due to large vocabulary) | +| **Context** | 128K | + +**Why #7:** +- Apache 2.0 license, built with NVIDIA collaboration. +- 128K context, strong multilingual support. +- Established fine-tuning ecosystem with mistral-finetune tool. + +**Caveats:** +- Oldest model on this list (July 2024) -- outperformed by newer 4-8B models on many benchmarks. 
+- Large vocabulary (~128K tokens, Tekken tokenizer) increases memory requirements for fine-tuning beyond what the parameter count suggests. +- Tight fit on 8GB VRAM at Q4 with limited context headroom. +- Not recommended over Qwen3-8B which is newer, smaller, and benchmarks better. + +--- + +## Models Considered and Rejected + +| Model | Reason for Rejection | +|-------|---------------------| +| **Llama 3.2 (1B/3B)** | Llama Community License prohibits using outputs to train non-Llama models. Distillation restrictions. Not truly permissive. | +| **Llama 3.1-8B / 3.3-70B** | Same license restrictions as above. The 700M MAU clause and output training restrictions disqualify it. | +| **Qwen3-Coder (30B-A3B, 480B)** | All variants are massive MoE models. Even the smallest (30B-A3B with 3B active) has 30B total parameters -- too large for 8GB inference and questionable for 24GB QLoRA. | +| **Mistral Small 3 (24B)** | 24B parameters -- requires ~14 GB VRAM at Q4. Does not fit 8GB. | +| **Phi-4 (14B)** | Needs ~8-9 GB at Q4, so it fits an 8GB card only marginally, if at all. QLoRA at 14B needs ~22-24 GB, cutting it very close. The 3.8B Phi-4-mini is a better fit for this project. | +| **Gemma 2 (9B/27B)** | Superseded by Gemma 3. No reason to use older generation. | +| **Qwen2.5 (7B/14B)** | Superseded by Qwen3 and Qwen3.5 with significantly better benchmarks.
| + +--- + +## Fine-Tuning Ecosystem Comparison (as of March 2026) + +| Framework | Qwen3/3.5 | Phi-4-mini | Gemma 3 | Mistral NeMo | +|-----------|-----------|------------|---------|--------------| +| **Unsloth** | Full support, dedicated notebooks, 2x speedup | Supported, notebooks available | Supported, Gemma 3n confirmed | Supported | +| **Axolotl** | Supported | Supported | Supported | Supported | +| **LlamaFactory** | Supported, Ollama export | Supported | Supported | Supported | +| **HF PEFT/TRL** | Supported | Supported, official script | Supported, Google official docs | Supported | +| **Community notebooks** | Abundant | Moderate | Abundant | Moderate | + +--- + +## Recommendation for This Project + +**Primary: Qwen3-8B** -- Best balance of capability, VRAM fit, license cleanliness, and fine-tuning ecosystem. It significantly outperforms older 14B models while fitting comfortably in 8GB at Q4. Apache 2.0 means zero legal concerns. + +**Secondary: Qwen3-4B or Qwen3.5-4B** -- If training data is limited (<500 examples) or you want faster iteration cycles, a 4B model will fine-tune faster and still perform well on the narrow domain of Minecraft server operations. Qwen3.5-4B is newer with a 256K context window; Qwen3-4B has more proven fine-tuning results. + +**Note on qwen3-coder:** The current PLAN.md references `qwen3-coder` as the base model. All Qwen3-Coder variants are large MoE models (30B+ total parameters) that do not fit the 8GB inference constraint. The recommendation is to use **Qwen3-8B** (or Qwen3-4B) as the base model instead. The coding/command-generation capability can be developed through fine-tuning on domain-specific data rather than requiring a code-specialized base model. 
+ +--- + +## Sources + +- [Qwen3 announcement and benchmarks](https://qwenlm.github.io/blog/qwen3/) +- [Qwen3.5 on HuggingFace](https://huggingface.co/Qwen/Qwen3.5-4B) +- [Qwen3.5 on Ollama](https://ollama.com/library/qwen3.5) +- [Phi-4-mini-instruct on HuggingFace](https://huggingface.co/microsoft/Phi-4-mini-instruct) +- [Phi-4-mini on Ollama](https://ollama.com/library/phi4-mini:3.8b) +- [Gemma 3 on Ollama](https://ollama.com/library/gemma3) +- [Gemma 3 QAT models for consumer GPUs](https://developers.googleblog.com/en/gemma-3-quantized-aware-trained-state-of-the-art-ai-to-consumer-gpus/) +- [Gemma Terms of Use](https://ai.google.dev/gemma/terms) +- [Gemma license risk analysis](https://wcr.legal/google-gemma-license-risks/) +- [Mistral NeMo on HuggingFace](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407) +- [Mistral NeMo on Ollama](https://ollama.com/library/mistral-nemo) +- [Unsloth model catalog](https://unsloth.ai/docs/get-started/unsloth-model-catalog) +- [Unsloth Qwen3 fine-tuning guide](https://unsloth.ai/docs/models/qwen3-how-to-run-and-fine-tune) +- [Unsloth Qwen3.5 fine-tuning guide](https://unsloth.ai/docs/models/qwen3.5/fine-tune) +- [Unsloth Phi-4 fine-tuning](https://unsloth.ai/blog/phi4) +- [Unsloth Gemma 3 fine-tuning](https://unsloth.ai/blog/gemma3) +- [Fine-tuning framework comparison 2026](https://dev.to/ultraduneai/eval-003-fine-tuning-in-2026-axolotl-vs-unsloth-vs-trl-vs-llama-factory-2ohg) +- [Distillabs SLM fine-tuning benchmark](https://www.distillabs.ai/blog/we-benchmarked-12-small-language-models-across-8-tasks-to-find-the-best-base-model-for-fine-tuning) +- [JSONSchemaBench structured output benchmark](https://arxiv.org/abs/2501.10868) +- [Llama license restrictions analysis](https://wcr.legal/llama-3-license-700m-mau-limit/) +- [Qwen3-Coder on HuggingFace](https://huggingface.co/collections/Qwen/qwen3-coder) +- [Top SLMs 2026 overview (DataCamp)](https://www.datacamp.com/blog/top-small-language-models) +- [Best open-source 
SLMs 2026 (BentoML)](https://www.bentoml.com/blog/the-best-open-source-small-language-models)