feat: native-bakeoff scaffold — Ollama JSON vs native-token tool-calling
Three-arm harness under scripts/native-bakeoff/:

- arm A: /api/chat with JSON tools (current default)
- arm B: /api/generate raw:true with canonical HF jinja template rendered directly
- arm C: google-deepmind/gemma JAX ToolSampler (env-gated, JAX required)

Interim finding from A+B sweep on matt-strix gemma4:26b Q4: Ollama's bidirectional JSON↔native tool-call translator is faithful. The "long" multi-tool task produces identical behavior (7 steps / 6 tools) on both arms.

Earlier arm-B parser bug that looked like a divergence was a harness issue: preserving the model's <|channel>thought\n<channel|> prefix as assistant content tripped the jinja template's tool_response-following conditional, appending a spurious <turn|>\n that corrupted the next step's prompt. Fixed by dropping the channel prefix on the assistant message.

Arm C left as scaffolded-but-not-run — the JAX/bf16 reference path would answer "does the GGUF runtime diverge from DeepMind's implementation" but requires a separate env with the `gemma` PyPI package. Parked pending SDXL eviction or vast-h100 session.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,82 @@
|
||||
{
|
||||
"arm": "ollama-json",
|
||||
"model": "gemma4:26b",
|
||||
"num_ctx": 8192,
|
||||
"num_predict": 2048,
|
||||
"started_at": 1776600292.960776,
|
||||
"turns": [
|
||||
{
|
||||
"step": 1,
|
||||
"elapsed_s": 0.84,
|
||||
"prompt_eval_count": 1393,
|
||||
"eval_count": 24,
|
||||
"content_len": 0,
|
||||
"tool_call_count": 1,
|
||||
"history_chars_before_append": 3009
|
||||
},
|
||||
{
|
||||
"step": 2,
|
||||
"elapsed_s": 0.96,
|
||||
"prompt_eval_count": 1507,
|
||||
"eval_count": 29,
|
||||
"content_len": 0,
|
||||
"tool_call_count": 1,
|
||||
"history_chars_before_append": 3281
|
||||
},
|
||||
{
|
||||
"step": 3,
|
||||
"elapsed_s": 0.81,
|
||||
"prompt_eval_count": 1645,
|
||||
"eval_count": 24,
|
||||
"content_len": 0,
|
||||
"tool_call_count": 1,
|
||||
"history_chars_before_append": 3572
|
||||
},
|
||||
{
|
||||
"step": 4,
|
||||
"elapsed_s": 0.88,
|
||||
"prompt_eval_count": 1808,
|
||||
"eval_count": 25,
|
||||
"content_len": 0,
|
||||
"tool_call_count": 1,
|
||||
"history_chars_before_append": 4188
|
||||
},
|
||||
{
|
||||
"step": 5,
|
||||
"elapsed_s": 0.87,
|
||||
"prompt_eval_count": 1923,
|
||||
"eval_count": 24,
|
||||
"content_len": 0,
|
||||
"tool_call_count": 1,
|
||||
"history_chars_before_append": 4616
|
||||
},
|
||||
{
|
||||
"step": 6,
|
||||
"elapsed_s": 4.01,
|
||||
"prompt_eval_count": 2053,
|
||||
"eval_count": 177,
|
||||
"content_len": 0,
|
||||
"tool_call_count": 1,
|
||||
"history_chars_before_append": 4843
|
||||
},
|
||||
{
|
||||
"step": 7,
|
||||
"elapsed_s": 5.56,
|
||||
"prompt_eval_count": 2277,
|
||||
"eval_count": 247,
|
||||
"content_len": 950,
|
||||
"tool_call_count": 0,
|
||||
"history_chars_before_append": 4958
|
||||
}
|
||||
],
|
||||
"final": {
|
||||
"halt_reason": "no_tool_calls",
|
||||
"steps_used": 7,
|
||||
"tool_calls_total": 6,
|
||||
"wall_clock_s": 13.93,
|
||||
"final_message_count": 26,
|
||||
"final_history_chars": 5908
|
||||
},
|
||||
"task": "long",
|
||||
"task_prompt": "Research question with multiple steps: (1) check memory for what I have on home_automation, (2) search sethflix for any home-automation documentaries, (3) web_search for current news about Home Assistant version releases, (4) fetch the top search result for details, (5) check chat_search for prior mentions, (6) summarize all findings and write a new memory entry with the summary. Do each step in order and report back at the end."
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"arm": "ollama-json",
|
||||
"model": "gemma4:26b",
|
||||
"num_ctx": 8192,
|
||||
"num_predict": 2048,
|
||||
"started_at": 1776600290.2718768,
|
||||
"turns": [
|
||||
{
|
||||
"step": 1,
|
||||
"elapsed_s": 0.72,
|
||||
"prompt_eval_count": 1310,
|
||||
"eval_count": 23,
|
||||
"content_len": 0,
|
||||
"tool_call_count": 1,
|
||||
"history_chars_before_append": 2656
|
||||
},
|
||||
{
|
||||
"step": 2,
|
||||
"elapsed_s": 1.74,
|
||||
"prompt_eval_count": 1422,
|
||||
"eval_count": 67,
|
||||
"content_len": 209,
|
||||
"tool_call_count": 0,
|
||||
"history_chars_before_append": 2928
|
||||
}
|
||||
],
|
||||
"final": {
|
||||
"halt_reason": "no_tool_calls",
|
||||
"steps_used": 2,
|
||||
"tool_calls_total": 1,
|
||||
"wall_clock_s": 2.46,
|
||||
"final_message_count": 16,
|
||||
"final_history_chars": 3137
|
||||
},
|
||||
"task": "memory",
|
||||
"task_prompt": "What do I have stored about home automation? If anything, summarize it briefly."
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
{
|
||||
"arm": "ollama-json",
|
||||
"model": "gemma4:26b",
|
||||
"num_ctx": 8192,
|
||||
"num_predict": 2048,
|
||||
"started_at": 1776600280.7781782,
|
||||
"turns": [
|
||||
{
|
||||
"step": 1,
|
||||
"elapsed_s": 0.87,
|
||||
"prompt_eval_count": 1322,
|
||||
"eval_count": 30,
|
||||
"content_len": 0,
|
||||
"tool_call_count": 1,
|
||||
"history_chars_before_append": 2697
|
||||
},
|
||||
{
|
||||
"step": 2,
|
||||
"elapsed_s": 0.92,
|
||||
"prompt_eval_count": 1501,
|
||||
"eval_count": 27,
|
||||
"content_len": 0,
|
||||
"tool_call_count": 1,
|
||||
"history_chars_before_append": 3306
|
||||
},
|
||||
{
|
||||
"step": 3,
|
||||
"elapsed_s": 1.41,
|
||||
"prompt_eval_count": 1670,
|
||||
"eval_count": 51,
|
||||
"content_len": 0,
|
||||
"tool_call_count": 1,
|
||||
"history_chars_before_append": 3914
|
||||
},
|
||||
{
|
||||
"step": 4,
|
||||
"elapsed_s": 2.11,
|
||||
"prompt_eval_count": 1806,
|
||||
"eval_count": 86,
|
||||
"content_len": 311,
|
||||
"tool_call_count": 0,
|
||||
"history_chars_before_append": 4188
|
||||
}
|
||||
],
|
||||
"final": {
|
||||
"halt_reason": "no_tool_calls",
|
||||
"steps_used": 4,
|
||||
"tool_calls_total": 3,
|
||||
"wall_clock_s": 5.31,
|
||||
"final_message_count": 20,
|
||||
"final_history_chars": 4499
|
||||
},
|
||||
"task": "movies",
|
||||
"task_prompt": "Recommend 3 sci-fi movies NOT already in my sethflix library. Check your picks against check_sethflix before finalizing."
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"arm": "ollama-json",
|
||||
"model": "gemma4:26b",
|
||||
"num_ctx": 8192,
|
||||
"num_predict": 2048,
|
||||
"started_at": 1776600286.2860382,
|
||||
"turns": [
|
||||
{
|
||||
"step": 1,
|
||||
"elapsed_s": 0.93,
|
||||
"prompt_eval_count": 1315,
|
||||
"eval_count": 33,
|
||||
"content_len": 0,
|
||||
"tool_call_count": 2,
|
||||
"history_chars_before_append": 2677
|
||||
},
|
||||
{
|
||||
"step": 2,
|
||||
"elapsed_s": 2.81,
|
||||
"prompt_eval_count": 1600,
|
||||
"eval_count": 115,
|
||||
"content_len": 499,
|
||||
"tool_call_count": 0,
|
||||
"history_chars_before_append": 3499
|
||||
}
|
||||
],
|
||||
"final": {
|
||||
"halt_reason": "no_tool_calls",
|
||||
"steps_used": 2,
|
||||
"tool_calls_total": 2,
|
||||
"wall_clock_s": 3.74,
|
||||
"final_message_count": 17,
|
||||
"final_history_chars": 3998
|
||||
},
|
||||
"task": "research",
|
||||
"task_prompt": "Look up what Home Assistant is, then check chat history for any prior mentions of it in this server."
|
||||
}
|
||||
Reference in New Issue
Block a user