Add LoRA training scripts and fix bake-off token budget

- training/scripts/train_lora.py: Unsloth QLoRA trainer for qwen3:8b
- training/scripts/train_lora.sh: Launch script for steel141 RTX 3090 Ti
- eval/bakeoff.py: Raised the token budget (400->1500); the old limit caused qwen3 models to exhaust their tokens on thinking. Added --no-think flag
- agent/serve.py: Default model changed to gemma3n:e4b

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 10:40:18 -04:00
parent 6fbab8045c
commit 48b627d498
4 changed files with 240 additions and 5 deletions
@@ -31,7 +31,8 @@ RESULTS_DIR = ROOT / "eval" / "results"


 def ollama_chat(model: str, messages: list, ollama_url: str,
-                temperature: float = 0.2, max_tokens: int = 400) -> dict:
+                temperature: float = 0.2, max_tokens: int = 1500,
+                no_think: bool = False) -> dict:
    """Call Ollama and return response + timing."""
    payload = {
        "model": model,
@@ -43,6 +44,12 @@ def ollama_chat(model: str, messages: list, ollama_url: str,
            "num_predict": max_tokens,
        },
    }
+    if no_think:
+        # Prepend /no_think to the last user message to disable thinking tokens
+        for msg in reversed(payload["messages"]):
+            if msg["role"] == "user":
+                msg["content"] = "/no_think\n" + msg["content"]
+                break
    start = time.time()
    r = requests.post(f"{ollama_url}/api/chat", json=payload, timeout=180)
    r.raise_for_status()
@@ -157,7 +164,7 @@ def score_result(example: dict, actual_cmds: list, parsed: dict) -> dict:
    }


-def run_bakeoff(models: list, ollama_url: str):
+def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
    """Run all models against the dataset and compare."""
    # Load dataset
    with open(DATASET) as f:
@@ -166,6 +173,8 @@ def run_bakeoff(models: list, ollama_url: str):
    print(f"Bake-off: {len(examples)} examples × {len(models)} models")
    print(f"Ollama: {ollama_url}")
    print(f"Models: {', '.join(models)}")
+    if no_think:
+        print("Mode: /no_think (thinking tokens disabled)")
    print("=" * 70)

    all_results = {}
@@ -208,7 +217,7 @@ def run_bakeoff(models: list, ollama_url: str):

            # Call LLM
            try:
-                resp = ollama_chat(model, messages, ollama_url)
+                resp = ollama_chat(model, messages, ollama_url, no_think=no_think)
            except Exception as e:
                print(f"  [{i+1}/{len(examples)}] ERROR: {e}")
                results.append({"id": eid, "error": str(e)})
@@ -311,9 +320,11 @@ def main():
    parser.add_argument("--ollama-url", default="http://192.168.0.179:11434")
    parser.add_argument("--models", nargs="+",
                        default=["qwen3-coder:30b", "gemma3n:e4b"])
+    parser.add_argument("--no-think", action="store_true",
+                        help="Prepend /no_think to disable thinking tokens (helps Qwen models)")
    args = parser.parse_args()

-    run_bakeoff(args.models, args.ollama_url)
+    run_bakeoff(args.models, args.ollama_url, no_think=args.no_think)


 if __name__ == "__main__":