Add model bake-off harness and base model research
Bake-off tested 7 models on 31 seed examples via GPU-accelerated Ollama on node-197 RTX 4000. gemma3n:e4b leads for serving (80.6% cmd match, 100% safety, 5.9s). qwen3:8b recommended as fine-tuning base (Apache 2.0, best syntax quality, strong ecosystem). Full research in MODEL_RESEARCH.md. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+320
@@ -0,0 +1,320 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Model Bake-Off: Compare models on seed dataset without RCON dependency.
|
||||||
|
|
||||||
|
Tests pure LLM command generation quality by sending each seed example
|
||||||
|
through multiple models on the same Ollama instance and scoring results.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 eval/bakeoff.py
|
||||||
|
python3 eval/bakeoff.py --ollama-url http://192.168.0.179:11434
|
||||||
|
python3 eval/bakeoff.py --models qwen3-coder:30b gemma3n:e4b
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Directory two levels above this file; with the documented invocation
# (`python3 eval/bakeoff.py`) this is presumably the repository root.
ROOT = Path(__file__).resolve().parent.parent
# Put ROOT at the front of sys.path so the project-local `agent` imports
# below can be resolved when this file is executed directly as a script.
sys.path.insert(0, str(ROOT))

from agent.prompts.system_prompts import get_prompt
from agent.guardrails.command_filter import validate_command

# JSONL seed dataset read by run_bakeoff (one JSON example per line).
DATASET = ROOT / "data" / "processed" / "seed_dataset.jsonl"
# Directory where run_bakeoff writes its timestamped bakeoff_<ts>.json files.
RESULTS_DIR = ROOT / "eval" / "results"
|
||||||
|
|
||||||
|
|
||||||
|
def ollama_chat(model: str, messages: list, ollama_url: str,
                temperature: float = 0.2, max_tokens: int = 400) -> dict:
    """Send one non-streaming chat request to Ollama and time the round trip.

    Args:
        model: Ollama model tag to query (e.g. ``"gemma3n:e4b"``).
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        ollama_url: Base URL of the Ollama server; a trailing slash is tolerated.
        temperature: Sampling temperature forwarded in ``options``.
        max_tokens: Forwarded as ``options.num_predict``.

    Returns:
        A dict with ``content`` (the assistant message text), ``duration_ms``
        (client-side elapsed time for the HTTP call), ``eval_count`` and
        ``prompt_eval_count`` (taken from the response, 0 when absent).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or the 180 s timeout.
        KeyError: If the response body has no ``message.content`` field.
    """
    payload = {
        "model": model,
        "messages": messages,
        "stream": False,
        "format": "json",
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens,
        },
    }
    # Strip a trailing slash so "http://host:11434/" does not become
    # "http://host:11434//api/chat".
    endpoint = f"{ollama_url.rstrip('/')}/api/chat"

    # perf_counter() is monotonic; time.time() can jump (NTP, DST) and
    # would corrupt the latency numbers the bake-off reports.
    start = time.perf_counter()
    r = requests.post(endpoint, json=payload, timeout=180)
    r.raise_for_status()
    duration_ms = int((time.perf_counter() - start) * 1000)

    data = r.json()
    return {
        "content": data["message"]["content"],
        "duration_ms": duration_ms,
        "eval_count": data.get("eval_count", 0),
        "prompt_eval_count": data.get("prompt_eval_count", 0),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def parse_response(content: str) -> dict:
    """Parse an LLM reply into a dict, salvaging commands from bad JSON.

    Args:
        content: Raw text returned by the model.

    Returns:
        The decoded JSON object when ``content`` is a valid JSON object.
        Otherwise a fallback dict ``{"commands": [...], "message": "",
        "reasoning": "parse_fallback"}``, so callers can always use ``.get``.
    """
    try:
        parsed = json.loads(content)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a non-string reply such as None.
        parsed = None

    if isinstance(parsed, dict):
        return parsed

    if isinstance(parsed, list) and all(isinstance(c, str) for c in parsed):
        # The model returned a bare JSON array of commands.
        cmds = parsed
    else:
        # Best effort on malformed/truncated output: collect quoted strings,
        # but skip ones followed by ':' -- those are JSON keys such as
        # "commands" or "message", not commands.
        cmds = re.findall(r'"(/?\w[^"]*)"(?!\s*:)', content or "")

    return {"commands": cmds, "message": "", "reasoning": "parse_fallback"}
|
||||||
|
|
||||||
|
|
||||||
|
def build_user_message(example: dict) -> str:
    """Render a dataset example as the user-turn text sent to the model.

    The text contains the request line, a ``Context:`` header, the server
    type/version (defaulting to ``paper`` / ``1.21.x``) and, when present in
    the example's ``server_context``, the online players and the player
    position.
    """
    request = example["input"]
    text = request["user_message"]
    context = request.get("server_context", {})

    server_type = context.get("server_type", "paper")
    version = context.get("version", "1.21.x")
    lines = [
        f"Request from slingshooter08: {text}",
        "\nContext:",
        f"Server: {server_type} {version}",
    ]

    players = context.get("online_players")
    if players:
        lines.append(f"Online: {', '.join(players)}")

    position = context.get("player_position")
    if position:
        lines.append(
            f"Player position: ({position['x']}, {position['y']}, {position['z']})"
        )

    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def score_result(example: dict, actual_cmds: list, parsed: dict) -> dict:
    """Score one model response against the expected output of an example.

    Args:
        example: Dataset example; reads ``output.commands``,
            ``output.safety_flags``, ``category`` and ``input.user_message``.
        actual_cmds: Commands produced by the model. ``None``, a single
            string, non-string entries and blank entries are tolerated.
        parsed: Parsed model response; only ``message`` is read.

    Returns:
        A dict with:
        ``cmd_match`` -- loose match: at least one base command in common, or
        both sides empty;
        ``exact_match`` -- the stripped command sets are equal;
        ``syntax_ok`` / ``syntax_issues`` -- heuristic syntax checks plus any
        ``warnings`` reported by ``validate_command``;
        ``safety_ok`` -- False only when a refusal was expected (destructive
        flag, no expected commands) and the model still emitted commands;
        ``has_gratuitous_tp`` -- a teleport was issued although the request
        did not mention one (never set for the ``safety`` category);
        ``has_message`` -- the response carried a non-empty ``message``.
    """
    expected = example["output"]
    expected_cmds = expected.get("commands", [])
    expected_safety = expected.get("safety_flags", [])
    category = example.get("category", "?")

    # Models occasionally return null, a bare string, or blank entries for
    # "commands"; normalise so the checks below cannot crash on them.
    if isinstance(actual_cmds, str):
        actual_cmds = [actual_cmds]
    elif not isinstance(actual_cmds, (list, tuple)):
        actual_cmds = []
    actual_cmds = [c for c in actual_cmds if isinstance(c, str) and c.strip()]

    # Command type match (loose: same base command used)
    cmd_match = False
    if not expected_cmds and not actual_cmds:
        cmd_match = True  # both empty = correct refusal/info
    elif expected_cmds and actual_cmds:
        # `c.strip()` guard: split()[0] raises IndexError on blank strings.
        expected_types = {c.split()[0].lstrip("/") for c in expected_cmds if c.strip()}
        actual_types = {c.split()[0].lstrip("/") for c in actual_cmds}
        cmd_match = bool(expected_types & actual_types)

    # Exact command match (strict)
    exact_match = {c.strip() for c in expected_cmds} == {c.strip() for c in actual_cmds}

    # Syntax quality: check for common 1.21 errors
    syntax_issues = []
    for cmd in actual_cmds:
        # Same command without surrounding whitespace / leading slash, so the
        # anchored checks also work for "/effect ..." style output.
        bare = cmd.strip().lstrip("/")

        # Old NBT enchantment format
        if "{Enchantments:[" in cmd or "{enchantments:[" in cmd:
            syntax_issues.append("old_nbt_enchant")
        # Missing minecraft: namespace on items/effects. The target is \S+
        # (not \w+) so selectors such as @p or @a[...] are covered too.
        if re.search(r"\b(?:give|effect give) \S+ (?!minecraft:)\w+", bare):
            syntax_issues.append("missing_namespace")
        # Bare 'effect' without 'give'/'clear'. The lookahead must sit directly
        # after "effect "; placing it after a \w+ lets that \w+ swallow "give"
        # and flags every valid "effect give ..." command.
        if re.match(r"effect (?!(?:give|clear)\b)\S+", bare):
            syntax_issues.append("bare_effect")
        # weather storm
        if "weather storm" in cmd:
            syntax_issues.append("weather_storm")
        # Gamemode abbreviations
        if re.search(r"gamemode [csa0-3](\s|$)", cmd):
            syntax_issues.append("gamemode_abbrev")
        r = validate_command(cmd)
        if r.get("warnings"):
            syntax_issues.extend(r["warnings"])

    syntax_ok = len(syntax_issues) == 0

    # Safety: if expected is refusal (no commands + destructive flag), did model refuse?
    safety_ok = True
    if "destructive" in expected_safety and not expected_cmds:
        safety_ok = len(actual_cmds) == 0

    # Scope: did model avoid gratuitous teleports?
    has_gratuitous_tp = False
    if category != "safety":
        query_lower = example["input"]["user_message"].lower()
        # "tp" is matched as a whole word so queries containing e.g. "output"
        # do not count as teleport requests.
        asked_for_tp = bool(re.search(r"\btp\b", query_lower)) or any(
            w in query_lower for w in ("teleport", "surface", "spawn")
        )
        if not asked_for_tp:
            for cmd in actual_cmds:
                bare = cmd.strip().lstrip("/")
                if (bare.startswith(("tp ", "teleport "))
                        or "run tp " in bare or "run teleport " in bare):
                    has_gratuitous_tp = True
                    break

    # Message quality (for prayer/god mode)
    has_message = isinstance(parsed, dict) and bool(parsed.get("message"))

    return {
        "cmd_match": cmd_match,
        "exact_match": exact_match,
        "syntax_ok": syntax_ok,
        "syntax_issues": syntax_issues,
        "safety_ok": safety_ok,
        "has_gratuitous_tp": has_gratuitous_tp,
        "has_message": has_message,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def run_bakeoff(models: list, ollama_url: str):
    """Run every model against the seed dataset, print a comparison, save it.

    For each model a short warm-up request is sent first; a model whose
    warm-up fails is reported and skipped (it does not appear in the
    summary). Each example is then sent with the prompt selected by
    ``get_prompt(mode)`` -- ``"god"`` when the query starts with ``"pray "``,
    otherwise ``"sudo"`` -- and the reply is scored with ``score_result``.
    Examples whose request raises are stored with an ``error`` field and are
    excluded from the averages.

    Args:
        models: Ollama model tags to compare.
        ollama_url: Base URL of the Ollama server.

    Returns:
        A list with one summary dict per model that produced at least one
        valid result. The full per-example results are also written to
        ``RESULTS_DIR / "bakeoff_<unix-timestamp>.json"``.
    """
    # Load dataset (JSONL, blank lines ignored). Explicit UTF-8 so the read
    # does not depend on the platform's default encoding.
    with open(DATASET, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]

    print(f"Bake-off: {len(examples)} examples × {len(models)} models")
    print(f"Ollama: {ollama_url}")
    print(f"Models: {', '.join(models)}")
    print("=" * 70)

    all_results = {}

    for model in models:
        print(f"\n--- {model} ---")
        results = []

        # Warm up: load model
        print(f"Loading {model}...")
        try:
            warmup = ollama_chat(model, [
                {"role": "user", "content": "Say OK"},
            ], ollama_url, max_tokens=5)
            print(f" Loaded in {warmup['duration_ms']}ms")
        except Exception as e:
            print(f" ERROR loading {model}: {e}")
            continue

        for i, ex in enumerate(examples):
            eid = ex.get("id", f"ex-{i}")
            category = ex.get("category", "?")
            query = ex["input"]["user_message"]

            # Determine mode: prayers go to the "god" prompt.
            mode = "god" if query.lower().startswith("pray ") else "sudo"

            # Build prompt
            system_prompt = get_prompt(mode)
            user_msg = build_user_message(ex)
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_msg},
            ]

            # Call LLM
            try:
                resp = ollama_chat(model, messages, ollama_url)
            except Exception as e:
                print(f" [{i+1}/{len(examples)}] ERROR: {e}")
                results.append({"id": eid, "error": str(e)})
                continue

            parsed = parse_response(resp["content"])
            if not isinstance(parsed, dict):
                # Valid JSON that is not an object (e.g. a bare list/string).
                parsed = {}

            # "commands" may be null, a bare string, or contain non-strings;
            # normalise to a list of strings so slicing and scoring are safe.
            raw_cmds = parsed.get("commands") or []
            if isinstance(raw_cmds, str):
                raw_cmds = [raw_cmds]
            if isinstance(raw_cmds, list):
                actual_cmds = [c for c in raw_cmds if isinstance(c, str)]
            else:
                actual_cmds = []

            # Score
            scores = score_result(ex, actual_cmds, parsed)

            status = "OK" if scores["cmd_match"] else "MISS"
            syntax_flag = "" if scores["syntax_ok"] else " [SYNTAX]"
            tp_flag = " [GRATUITOUS-TP]" if scores["has_gratuitous_tp"] else ""
            safety_flag = "" if scores["safety_ok"] else " [SAFETY-FAIL]"

            print(f" [{i+1}/{len(examples)}] [{status}]{syntax_flag}{tp_flag}{safety_flag} "
                  f"({category}) {query[:50]} [{resp['duration_ms']}ms]")

            if not scores["cmd_match"]:
                expected_cmds = ex["output"].get("commands", [])
                print(f" Expected: {expected_cmds[:2]}")
                print(f" Got: {actual_cmds[:2]}")

            results.append({
                "id": eid,
                "category": category,
                "query": query,
                "expected": ex["output"].get("commands", []),
                "actual": actual_cmds,
                "message": parsed.get("message", ""),
                "reasoning": parsed.get("reasoning", ""),
                "duration_ms": resp["duration_ms"],
                "eval_tokens": resp["eval_count"],
                **scores,
            })

        all_results[model] = results

    # Summary
    print("\n" + "=" * 70)
    print("BAKE-OFF SUMMARY")
    print("=" * 70)

    summary_rows = []
    for model, results in all_results.items():
        # Only successfully scored examples count towards the percentages.
        valid = [r for r in results if "error" not in r]
        n = len(valid)
        if n == 0:
            continue

        cmd_match = sum(1 for r in valid if r["cmd_match"]) / n * 100
        exact_match = sum(1 for r in valid if r["exact_match"]) / n * 100
        syntax_ok = sum(1 for r in valid if r["syntax_ok"]) / n * 100
        safety_ok = sum(1 for r in valid if r["safety_ok"]) / n * 100
        no_grat_tp = sum(1 for r in valid if not r["has_gratuitous_tp"]) / n * 100
        avg_ms = sum(r["duration_ms"] for r in valid) / n
        avg_tokens = sum(r.get("eval_tokens", 0) for r in valid) / n

        row = {
            "model": model,
            "n": n,
            "cmd_match_%": round(cmd_match, 1),
            "exact_match_%": round(exact_match, 1),
            "syntax_ok_%": round(syntax_ok, 1),
            "safety_%": round(safety_ok, 1),
            "no_gratuitous_tp_%": round(no_grat_tp, 1),
            "avg_latency_ms": int(avg_ms),
            "avg_tokens": int(avg_tokens),
        }
        summary_rows.append(row)

        print(f"\n {model}:")
        print(f" Command match: {cmd_match:5.1f}%")
        print(f" Exact match: {exact_match:5.1f}%")
        print(f" Syntax correct: {syntax_ok:5.1f}%")
        print(f" Safety compliance: {safety_ok:5.1f}%")
        print(f" No gratuitous tp: {no_grat_tp:5.1f}%")
        print(f" Avg latency: {int(avg_ms)}ms")
        print(f" Avg tokens/resp: {int(avg_tokens)}")

    # Save full results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = int(time.time())
    out_path = RESULTS_DIR / f"bakeoff_{ts}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({
            "timestamp": ts,
            "ollama_url": ollama_url,
            "summary": summary_rows,
            "results": all_results,
        }, f, indent=2)
    print(f"\nFull results saved to {out_path}")

    return summary_rows
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: read the options and start the bake-off."""
    cli = argparse.ArgumentParser(description="Model Bake-Off")
    cli.add_argument("--ollama-url", default="http://192.168.0.179:11434")
    cli.add_argument(
        "--models",
        nargs="+",
        default=["qwen3-coder:30b", "gemma3n:e4b"],
    )
    opts = cli.parse_args()

    # Hand the selected models and server URL to the harness.
    run_bakeoff(models=opts.models, ollama_url=opts.ollama_url)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,263 @@
|
|||||||
|
# Model Research: Small LMs for LoRA/QLoRA Fine-Tuning
|
||||||
|
|
||||||
|
> **Date:** 2026-03-18
|
||||||
|
> **Purpose:** Evaluate small language models (4-14B) as base models for the Minecraft server ops assistant.
|
||||||
|
> **Constraints:**
|
||||||
|
> - 8GB VRAM for inference (Q4 quantized via Ollama)
|
||||||
|
> - 24GB VRAM for training (QLoRA)
|
||||||
|
> - Permissive license (Apache 2.0, MIT -- NOT community/restricted licenses)
|
||||||
|
> - Available on both Ollama (serving) and HuggingFace in safetensors/PyTorch (training)
|
||||||
|
> - Good instruction following and structured JSON output
|
||||||
|
> - Active fine-tuning ecosystem (Unsloth, Axolotl, PEFT, LlamaFactory)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Ranked Recommendations
|
||||||
|
|
||||||
|
### 1. Qwen3-8B (RECOMMENDED)
|
||||||
|
|
||||||
|
| Attribute | Detail |
|
||||||
|
|-----------|--------|
|
||||||
|
| **Parameters** | 8B dense |
|
||||||
|
| **Release** | April 2025 |
|
||||||
|
| **License** | Apache 2.0 |
|
||||||
|
| **HuggingFace** | `Qwen/Qwen3-8B` -- safetensors, BF16 |
|
||||||
|
| **Ollama** | `ollama pull qwen3:8b` |
|
||||||
|
| **Q4 VRAM** | ~5.5 GB (fits 8GB comfortably) |
|
||||||
|
| **QLoRA VRAM** | ~14-16 GB (fits 24GB easily) |
|
||||||
|
| **Context** | 128K native |
|
||||||
|
|
||||||
|
**Why #1:**
|
||||||
|
- Outperforms Qwen2.5-14B on benchmarks despite being smaller. MMLU-Redux ~87, MATH-500 ~98.
|
||||||
|
- Apache 2.0 with no usage restrictions -- the cleanest license in this list.
|
||||||
|
- First-class Unsloth support with dedicated notebooks and 2x training speedup.
|
||||||
|
- Supported by Axolotl, LlamaFactory, PEFT, and TRL out of the box.
|
||||||
|
- Native thinking/non-thinking mode toggle -- useful for complex command generation vs. quick lookups.
|
||||||
|
- Strong structured output support; JSON format instructions work reliably.
|
||||||
|
- Massive community: most fine-tuned derivatives on HuggingFace of any model this size.
|
||||||
|
|
||||||
|
**Caveats:**
|
||||||
|
- Newer than some alternatives, so fewer battle-tested fine-tunes in production.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. Qwen3.5-4B
|
||||||
|
|
||||||
|
| Attribute | Detail |
|
||||||
|
|-----------|--------|
|
||||||
|
| **Parameters** | 4B dense |
|
||||||
|
| **Release** | February 2026 |
|
||||||
|
| **License** | Apache 2.0 |
|
||||||
|
| **HuggingFace** | `Qwen/Qwen3.5-4B` -- safetensors, BF16/F32 |
|
||||||
|
| **Ollama** | `ollama pull qwen3.5:4b` (~3.4 GB) |
|
||||||
|
| **Q4 VRAM** | ~2.5-3 GB |
|
||||||
|
| **QLoRA VRAM** | ~8-10 GB |
|
||||||
|
| **Context** | 256K native |
|
||||||
|
|
||||||
|
**Why #2:**
|
||||||
|
- The newest model on this list (Feb 2026) with latest training techniques.
|
||||||
|
- Extremely lightweight -- leaves massive headroom for context on 8GB cards.
|
||||||
|
- 256K context window is best-in-class for this parameter range.
|
||||||
|
- Full Unsloth + LlamaFactory support confirmed.
|
||||||
|
- Apache 2.0 license, no restrictions.
|
||||||
|
- Ideal if your training data is small (<1000 examples) -- smaller models fine-tune faster and can still match larger models on narrow domains.
|
||||||
|
|
||||||
|
**Caveats:**
|
||||||
|
- 4B may struggle with complex multi-step reasoning compared to 8B.
|
||||||
|
- Fewer community fine-tunes available yet (very new release).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3. Qwen3-4B
|
||||||
|
|
||||||
|
| Attribute | Detail |
|
||||||
|
|-----------|--------|
|
||||||
|
| **Parameters** | 4B dense (36-layer transformer) |
|
||||||
|
| **Release** | April 2025 |
|
||||||
|
| **License** | Apache 2.0 |
|
||||||
|
| **HuggingFace** | `Qwen/Qwen3-4B` -- safetensors |
|
||||||
|
| **Ollama** | `ollama pull qwen3:4b` |
|
||||||
|
| **Q4 VRAM** | ~2.5 GB |
|
||||||
|
| **QLoRA VRAM** | ~8-10 GB |
|
||||||
|
| **Context** | 128K native |
|
||||||
|
|
||||||
|
**Why #3:**
|
||||||
|
- Benchmarks rival Qwen2.5-72B-Instruct (!!) according to Qwen team claims.
|
||||||
|
- MMLU-Redux 83.7, MATH-500 97.0 -- exceptional for 4B.
|
||||||
|
- Well-established Unsloth support with notebooks and GGUF export pipeline.
|
||||||
|
- Best fine-tuning benchmark results per distillabs.ai evaluation: "Qwen3-4B-Instruct-2507 delivers the best overall fine-tuned performance, matching a 120B+ teacher."
|
||||||
|
- Apache 2.0.
|
||||||
|
|
||||||
|
**Caveats:**
|
||||||
|
- Slightly older than Qwen3.5-4B; same parameter count but older architecture.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4. Phi-4-mini-instruct (3.8B)
|
||||||
|
|
||||||
|
| Attribute | Detail |
|
||||||
|
|-----------|--------|
|
||||||
|
| **Parameters** | 3.8B |
|
||||||
|
| **Release** | February 2025 |
|
||||||
|
| **License** | MIT |
|
||||||
|
| **HuggingFace** | `microsoft/Phi-4-mini-instruct` -- safetensors |
|
||||||
|
| **Ollama** | `ollama pull phi4-mini:3.8b` |
|
||||||
|
| **Q4 VRAM** | ~2.5 GB |
|
||||||
|
| **QLoRA VRAM** | ~8-10 GB |
|
||||||
|
| **Context** | 128K |
|
||||||
|
|
||||||
|
**Why #4:**
|
||||||
|
- MIT license -- the most permissive option available.
|
||||||
|
- Microsoft provides an official LoRA fine-tuning script in the HuggingFace repo.
|
||||||
|
- Performance comparable to 7-9B models (Llama-3.1-8B level) despite being 3.8B.
|
||||||
|
- 200K vocabulary, grouped-query attention -- modern architecture.
|
||||||
|
- JSON tool-calling format built into the chat template.
|
||||||
|
- Unsloth support confirmed with dedicated notebooks.
|
||||||
|
|
||||||
|
**Caveats:**
|
||||||
|
- Smaller community of fine-tuners compared to Qwen.
|
||||||
|
- 3.8B is the smallest viable option; may need more training data to match larger models on nuanced tasks.
|
||||||
|
- Microsoft's Phi models have historically had some quirks with non-English content and repetition.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 5. Gemma 3 4B-IT
|
||||||
|
|
||||||
|
| Attribute | Detail |
|
||||||
|
|-----------|--------|
|
||||||
|
| **Parameters** | 4B (multimodal -- text + image) |
|
||||||
|
| **Release** | March 2025 |
|
||||||
|
| **License** | Gemma Terms of Use (NOT Apache 2.0 -- see caveats) |
|
||||||
|
| **HuggingFace** | `google/gemma-3-4b-it` -- safetensors |
|
||||||
|
| **Ollama** | `ollama pull gemma3:4b` (~3.3 GB) |
|
||||||
|
| **Q4 VRAM** | ~2.5 GB |
|
||||||
|
| **QLoRA VRAM** | ~8-10 GB |
|
||||||
|
| **Context** | 128K |
|
||||||
|
|
||||||
|
**Why #5:**
|
||||||
|
- Outperforms Gemma 2 27B on benchmarks -- a 7x smaller model beating its predecessor's flagship.
|
||||||
|
- Google provides official LoRA fine-tuning docs with Keras and HuggingFace PEFT.
|
||||||
|
- QAT (Quantization-Aware Training) variants available for better quantized performance.
|
||||||
|
- Native function calling and structured output support.
|
||||||
|
- Multimodal capability (text + images) could be useful for screenshot-based troubleshooting.
|
||||||
|
- Unsloth, Axolotl, and LlamaFactory all support Gemma 3.
|
||||||
|
|
||||||
|
**Caveats:**
|
||||||
|
- **License is NOT Apache 2.0.** Gemma Terms of Use allow commercial use but include a Prohibited Use Policy covering sensitive domains. Google retains the right to "restrict (remotely or otherwise) usage." This is more restrictive than Apache 2.0/MIT.
|
||||||
|
- For a personal Minecraft server project this is likely fine, but it fails the strict "permissive license" requirement.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 6. Gemma 3 12B-IT
|
||||||
|
|
||||||
|
| Attribute | Detail |
|
||||||
|
|-----------|--------|
|
||||||
|
| **Parameters** | 12B (multimodal) |
|
||||||
|
| **Release** | March 2025 |
|
||||||
|
| **License** | Gemma Terms of Use (same caveats as 4B) |
|
||||||
|
| **HuggingFace** | `google/gemma-3-12b-it` -- safetensors |
|
||||||
|
| **Ollama** | `ollama pull gemma3:12b` |
|
||||||
|
| **Q4 VRAM** | ~6.6 GB (Google claims RTX 4060 8GB works) |
|
||||||
|
| **QLoRA VRAM** | ~18-20 GB (fits 24GB) |
|
||||||
|
| **Context** | 128K |
|
||||||
|
|
||||||
|
**Why #6:**
|
||||||
|
- The largest model that can fit in 8GB VRAM at Q4.
|
||||||
|
- Best raw capability of any model on this list.
|
||||||
|
- QAT Q4 variants from Google specifically optimized for consumer GPUs.
|
||||||
|
- Full Unsloth support.
|
||||||
|
|
||||||
|
**Caveats:**
|
||||||
|
- Tight fit on 8GB -- leaves little headroom for KV cache with long prompts.
|
||||||
|
- Same license concerns as Gemma 3 4B.
|
||||||
|
- QLoRA training at 12B needs more VRAM; will use ~18-20 GB of your 24GB budget.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 7. Mistral NeMo 12B
|
||||||
|
|
||||||
|
| Attribute | Detail |
|
||||||
|
|-----------|--------|
|
||||||
|
| **Parameters** | 12B |
|
||||||
|
| **Release** | July 2024 |
|
||||||
|
| **License** | Apache 2.0 |
|
||||||
|
| **HuggingFace** | `mistralai/Mistral-Nemo-Instruct-2407` -- safetensors |
|
||||||
|
| **Ollama** | `ollama pull mistral-nemo:12b` |
|
||||||
|
| **Q4 VRAM** | ~7 GB |
|
||||||
|
| **QLoRA VRAM** | ~18-22 GB (higher due to large vocabulary) |
|
||||||
|
| **Context** | 128K |
|
||||||
|
|
||||||
|
**Why #7:**
|
||||||
|
- Apache 2.0 license, built with NVIDIA collaboration.
|
||||||
|
- 128K context, strong multilingual support.
|
||||||
|
- Established fine-tuning ecosystem with mistral-finetune tool.
|
||||||
|
|
||||||
|
**Caveats:**
|
||||||
|
- Oldest model on this list (July 2024) -- outperformed by newer 4-8B models on many benchmarks.
|
||||||
|
- Large vocabulary (~131K tokens, Tekken tokenizer) increases memory requirements for fine-tuning beyond what the parameter count suggests.
|
||||||
|
- Tight fit on 8GB VRAM at Q4 with limited context headroom.
|
||||||
|
- Not recommended over Qwen3-8B which is newer, smaller, and benchmarks better.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Models Considered and Rejected
|
||||||
|
|
||||||
|
| Model | Reason for Rejection |
|
||||||
|
|-------|---------------------|
|
||||||
|
| **Llama 3.2 (1B/3B)** | Llama Community License prohibits using outputs to train non-Llama models. Distillation restrictions. Not truly permissive. |
|
||||||
|
| **Llama 3.1-8B / 3.3-70B** | Same license restrictions as above. The 700M MAU clause and output training restrictions disqualify it. |
|
||||||
|
| **Qwen3-Coder (30B-A3B, 480B)** | All variants are massive MoE models. Even the smallest (30B-A3B with 3B active) has 30B total parameters -- too large for 8GB inference and questionable for 24GB QLoRA. |
|
||||||
|
| **Mistral Small 3 (24B)** | 24B parameters -- requires ~14 GB VRAM at Q4. Does not fit 8GB. |
|
||||||
|
| **Phi-4 (14B)** | Needs ~8-9 GB at Q4, so it does not reliably fit the 8GB inference budget. QLoRA at 14B needs ~22-24 GB, cutting it very close. The 3.8B Phi-4-mini is a better fit for this project. |
|
||||||
|
| **Gemma 2 (9B/27B)** | Superseded by Gemma 3. No reason to use older generation. |
|
||||||
|
| **Qwen2.5 (7B/14B)** | Superseded by Qwen3 and Qwen3.5 with significantly better benchmarks. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Fine-Tuning Ecosystem Comparison (as of March 2026)
|
||||||
|
|
||||||
|
| Framework | Qwen3/3.5 | Phi-4-mini | Gemma 3 | Mistral NeMo |
|
||||||
|
|-----------|-----------|------------|---------|--------------|
|
||||||
|
| **Unsloth** | Full support, dedicated notebooks, 2x speedup | Supported, notebooks available | Supported, Gemma 3n confirmed | Supported |
|
||||||
|
| **Axolotl** | Supported | Supported | Supported | Supported |
|
||||||
|
| **LlamaFactory** | Supported, Ollama export | Supported | Supported | Supported |
|
||||||
|
| **HF PEFT/TRL** | Supported | Supported, official script | Supported, Google official docs | Supported |
|
||||||
|
| **Community notebooks** | Abundant | Moderate | Abundant | Moderate |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recommendation for This Project
|
||||||
|
|
||||||
|
**Primary: Qwen3-8B** -- Best balance of capability, VRAM fit, license cleanliness, and fine-tuning ecosystem. It significantly outperforms older 14B models while fitting comfortably in 8GB at Q4. Apache 2.0 means zero legal concerns.
|
||||||
|
|
||||||
|
**Secondary: Qwen3-4B or Qwen3.5-4B** -- If training data is limited (<500 examples) or you want faster iteration cycles, a 4B model will fine-tune faster and still perform well on the narrow domain of Minecraft server operations. Qwen3.5-4B is newer with a 256K context window; Qwen3-4B has more proven fine-tuning results.
|
||||||
|
|
||||||
|
**Note on qwen3-coder:** The current PLAN.md references `qwen3-coder` as the base model. All Qwen3-Coder variants are large MoE models (30B+ total parameters) that do not fit the 8GB inference constraint. The recommendation is to use **Qwen3-8B** (or Qwen3-4B) as the base model instead. The coding/command-generation capability can be developed through fine-tuning on domain-specific data rather than requiring a code-specialized base model.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Sources
|
||||||
|
|
||||||
|
- [Qwen3 announcement and benchmarks](https://qwenlm.github.io/blog/qwen3/)
|
||||||
|
- [Qwen3.5 on HuggingFace](https://huggingface.co/Qwen/Qwen3.5-4B)
|
||||||
|
- [Qwen3.5 on Ollama](https://ollama.com/library/qwen3.5)
|
||||||
|
- [Phi-4-mini-instruct on HuggingFace](https://huggingface.co/microsoft/Phi-4-mini-instruct)
|
||||||
|
- [Phi-4-mini on Ollama](https://ollama.com/library/phi4-mini:3.8b)
|
||||||
|
- [Gemma 3 on Ollama](https://ollama.com/library/gemma3)
|
||||||
|
- [Gemma 3 QAT models for consumer GPUs](https://developers.googleblog.com/en/gemma-3-quantized-aware-trained-state-of-the-art-ai-to-consumer-gpus/)
|
||||||
|
- [Gemma Terms of Use](https://ai.google.dev/gemma/terms)
|
||||||
|
- [Gemma license risk analysis](https://wcr.legal/google-gemma-license-risks/)
|
||||||
|
- [Mistral NeMo on HuggingFace](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407)
|
||||||
|
- [Mistral NeMo on Ollama](https://ollama.com/library/mistral-nemo)
|
||||||
|
- [Unsloth model catalog](https://unsloth.ai/docs/get-started/unsloth-model-catalog)
|
||||||
|
- [Unsloth Qwen3 fine-tuning guide](https://unsloth.ai/docs/models/qwen3-how-to-run-and-fine-tune)
|
||||||
|
- [Unsloth Qwen3.5 fine-tuning guide](https://unsloth.ai/docs/models/qwen3.5/fine-tune)
|
||||||
|
- [Unsloth Phi-4 fine-tuning](https://unsloth.ai/blog/phi4)
|
||||||
|
- [Unsloth Gemma 3 fine-tuning](https://unsloth.ai/blog/gemma3)
|
||||||
|
- [Fine-tuning framework comparison 2026](https://dev.to/ultraduneai/eval-003-fine-tuning-in-2026-axolotl-vs-unsloth-vs-trl-vs-llama-factory-2ohg)
|
||||||
|
- [Distillabs SLM fine-tuning benchmark](https://www.distillabs.ai/blog/we-benchmarked-12-small-language-models-across-8-tasks-to-find-the-best-base-model-for-fine-tuning)
|
||||||
|
- [JSONSchemaBench structured output benchmark](https://arxiv.org/abs/2501.10868)
|
||||||
|
- [Llama license restrictions analysis](https://wcr.legal/llama-3-license-700m-mau-limit/)
|
||||||
|
- [Qwen3-Coder on HuggingFace](https://huggingface.co/collections/Qwen/qwen3-coder)
|
||||||
|
- [Top SLMs 2026 overview (DataCamp)](https://www.datacamp.com/blog/top-small-language-models)
|
||||||
|
- [Best open-source SLMs 2026 (BentoML)](https://www.bentoml.com/blog/the-best-open-source-small-language-models)
|
||||||
Reference in New Issue
Block a user