Add LoRA training scripts and fix bake-off token budget

- training/scripts/train_lora.py: Unsloth QLoRA trainer for qwen3:8b
- training/scripts/train_lora.sh: Launch script for steel141 RTX 3090 Ti
- eval/bakeoff.py: Raised the default max_tokens budget from 400 to 1500
  (at 400, qwen3 models exhausted their tokens on thinking); added a
  --no-think flag
- agent/serve.py: Default model changed to gemma3n:e4b

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-18 10:40:18 -04:00
parent 6fbab8045c
commit 48b627d498
4 changed files with 240 additions and 5 deletions
+15 -4
View File
@@ -31,7 +31,8 @@ RESULTS_DIR = ROOT / "eval" / "results"
def ollama_chat(model: str, messages: list, ollama_url: str,
temperature: float = 0.2, max_tokens: int = 400) -> dict:
temperature: float = 0.2, max_tokens: int = 1500,
no_think: bool = False) -> dict:
"""Call Ollama and return response + timing."""
payload = {
"model": model,
@@ -43,6 +44,12 @@ def ollama_chat(model: str, messages: list, ollama_url: str,
"num_predict": max_tokens,
},
}
if no_think:
# Prepend /no_think to the last user message to disable thinking tokens
for msg in reversed(payload["messages"]):
if msg["role"] == "user":
msg["content"] = "/no_think\n" + msg["content"]
break
start = time.time()
r = requests.post(f"{ollama_url}/api/chat", json=payload, timeout=180)
r.raise_for_status()
@@ -157,7 +164,7 @@ def score_result(example: dict, actual_cmds: list, parsed: dict) -> dict:
}
def run_bakeoff(models: list, ollama_url: str):
def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
"""Run all models against the dataset and compare."""
# Load dataset
with open(DATASET) as f:
@@ -166,6 +173,8 @@ def run_bakeoff(models: list, ollama_url: str):
print(f"Bake-off: {len(examples)} examples × {len(models)} models")
print(f"Ollama: {ollama_url}")
print(f"Models: {', '.join(models)}")
if no_think:
print("Mode: /no_think (thinking tokens disabled)")
print("=" * 70)
all_results = {}
@@ -208,7 +217,7 @@ def run_bakeoff(models: list, ollama_url: str):
# Call LLM
try:
resp = ollama_chat(model, messages, ollama_url)
resp = ollama_chat(model, messages, ollama_url, no_think=no_think)
except Exception as e:
print(f" [{i+1}/{len(examples)}] ERROR: {e}")
results.append({"id": eid, "error": str(e)})
@@ -311,9 +320,11 @@ def main():
parser.add_argument("--ollama-url", default="http://192.168.0.179:11434")
parser.add_argument("--models", nargs="+",
default=["qwen3-coder:30b", "gemma3n:e4b"])
parser.add_argument("--no-think", action="store_true",
help="Prepend /no_think to disable thinking tokens (helps Qwen models)")
args = parser.parse_args()
run_bakeoff(args.models, args.ollama_url)
run_bakeoff(args.models, args.ollama_url, no_think=args.no_think)
if __name__ == "__main__":