5a2a02e483
This repo opens with the design-discovery work completed before any product code is written. Two model bakeoffs against gemma4:8b/26b/31b on a local Ollama established that:
- Whole-puzzle generation in the Connections shape is unreliable on Gemma 4 (gemma4:31b ~50% structural-pass, gemma4:26b ~20-30%); 31b is intentionally out of project scope, so the generation route is harder still.
- Atomic semantic-judging skills are reliable: 87.5%/93.75%/100% (8b/26b/31b) on JUDGE; *all three models* scored 10/10 on CREATIVE_ACCEPT — fair judging of player-INVENTED categories. That is the structural unlock vs static hand-curated word games.
The README contains the full writeup, the test bench, and a brainstormed bank of 10 distinct game-mechanics ideas across the fast/medium/slow tempo range, plus a primitives table for recombination.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
237 lines
9.3 KiB
Python
237 lines
9.3 KiB
Python
#!/usr/bin/env python3
|
|
"""Bakeoff: can Gemma 4 generate Connections-style structured puzzles?
|
|
|
|
Stress-tests unaided one-shot generation on gemma4:26b and gemma4:31b on a
|
|
local Ollama (point OLLAMA_HOST at your instance; default localhost:11434).
|
|
Output is graded by hand afterward against a rubric in the README:
|
|
overlap-traps, tight category labels, purple wordplay, blind anchor vs a
|
|
real human-curated puzzle.
|
|
"""
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
import urllib.request
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
def _generate_endpoint(host: str) -> str:
    """Build the Ollama ``/api/generate`` URL from an ``OLLAMA_HOST``-style value.

    Accepts a full base URL (``http://host:11434``, with or without a trailing
    slash) as well as a bare ``host:port`` value, which gets an ``http://``
    scheme prepended so that ``urllib`` can open it.
    """
    base = host.strip().rstrip("/")
    if "://" not in base:
        # NOTE(review): Ollama's own tooling appears to accept scheme-less
        # OLLAMA_HOST values; urllib rejects them, so default to plain http.
        base = f"http://{base}"
    return f"{base}/api/generate"


# Generate endpoint of the Ollama instance under test. An unset *or empty*
# OLLAMA_HOST falls back to the local default.
OLLAMA = _generate_endpoint(os.environ.get("OLLAMA_HOST") or "http://localhost:11434")
# Model tags to benchmark, in run order.
MODELS = ["gemma4:26b", "gemma4:31b-it-q4_K_M"]
# Number of puzzles requested from each model.
N_PER_MODEL = 5
# Base sampling temperature; each retry of a puzzle adds 0.1 on top of this.
TEMPERATURE = 0.8
# Parent of the directory that contains this script; reports are written below it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
|
|
|
PROMPT = """You are designing a single puzzle in the style of NYT Connections.
|
|
|
|
A Connections puzzle has:
|
|
- Exactly 16 distinct words or short phrases
|
|
- Sorted into 4 hidden groups of 4
|
|
- Each group has a tight, specific category label
|
|
- Difficulty bands: yellow (easiest, most direct), green (medium), blue (harder, often more abstract), purple (trickiest -- wordplay, double meanings, hidden patterns; e.g. "___ HOUSE": GREEN, ICE, COURT, FIRE)
|
|
- The CRITICAL feature: at least 2-3 words must plausibly fit a different group than where they actually go. These red herrings are what make the puzzle hard. Without them, the puzzle is trivial.
|
|
|
|
Generate ONE puzzle on a theme of your choice. Output strict JSON in this shape:
|
|
|
|
{
|
|
"theme_seed": "<one-line description of what inspired the puzzle>",
|
|
"groups": [
|
|
{"difficulty": "yellow", "category": "<tight category label>", "words": ["W1","W2","W3","W4"]},
|
|
{"difficulty": "green", "category": "<...>", "words": [...]},
|
|
{"difficulty": "blue", "category": "<...>", "words": [...]},
|
|
{"difficulty": "purple", "category": "<...>", "words": [...]}
|
|
],
|
|
"intended_traps": [
|
|
{"word": "<a word from the puzzle>", "actual_group": "yellow|green|blue|purple", "trap_group": "yellow|green|blue|purple", "reason": "<why it plausibly fits the trap group>"}
|
|
]
|
|
}
|
|
|
|
Rules:
|
|
- All 16 words must be distinct
|
|
- Categories must be tight enough that the right answer feels obviously right after the reveal
|
|
- intended_traps must list at least 2 genuine red-herring words
|
|
- Output ONLY the JSON object. No preamble, no markdown fences, no commentary.
|
|
"""
|
|
|
|
|
|
def call(model: str, prompt: str, temperature: float, timeout: int = 600):
    """Send one non-streaming generate request to Ollama and time it.

    Args:
        model: Ollama model tag to run.
        prompt: Full prompt text.
        temperature: Sampling temperature forwarded in ``options``.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        A ``(elapsed_seconds, data)`` tuple, where ``data`` is the decoded
        JSON body of the response.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` (connection or
        HTTP failure, timeout) or by ``json.loads`` on a non-JSON body.
        Nothing is caught here; retry policy belongs to the caller.
    """
    # NOTE on Gemma 4 settings (see ~/bin/gemma4-research/GOTCHAS.md):
    # - No format=json (infinite loop on gemma4:26b Q4)
    # - think=false for single-turn JSON pipelines (else thinking tokens eat budget)
    # - num_ctx >> 2048 default, num_predict >> 128 default
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "think": False,
        "options": {
            "temperature": temperature,
            "num_ctx": 8192,
            "num_predict": 4096,
        },
    }
    req = urllib.request.Request(
        OLLAMA,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or go negative) if the system clock is adjusted mid-request.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as r:
        data = json.loads(r.read())
    return time.perf_counter() - t0, data
|
|
|
|
|
|
def extract_json(body: str):
    """Parse the JSON object embedded in a model response.

    Everything from the first ``{`` through the last ``}`` is handed to
    ``json.loads``; surrounding chatter is ignored.

    Raises:
        ValueError: if the text is empty or lacks either brace. Malformed
            JSON between the braces surfaces as ``json.JSONDecodeError``
            (itself a ``ValueError``).
    """
    has_delimiters = bool(body) and "{" in body and "}" in body
    if not has_delimiters:
        raise ValueError("no JSON object delimiters in response")
    start = body.index("{")
    stop = body.rindex("}") + 1
    return json.loads(body[start:stop])
|
|
|
|
|
|
def warm(model: str) -> None:
    """Issue one throwaway request to *model* before the timed runs.

    Progress is logged to stderr; the reply is discarded. Any exception from
    the request propagates to the caller.
    """
    warmup_prompt = "Reply with just the word OK."
    sys.stderr.write(f"[warm] {model}\n")
    sys.stderr.flush()
    call(model, warmup_prompt, temperature=0.1, timeout=300)
|
|
|
|
|
|
def run_model(model: str, n: int):
    """Generate *n* puzzles with *model* and return one result record each.

    Every puzzle gets up to three attempts, with the temperature raised by
    0.1 per retry. An attempt fails when the API call raises or when no JSON
    object can be parsed out of the reply.

    Each record holds ``model``, ``i`` (1-based puzzle number), ``dt``,
    ``ok``, ``attempts``, ``puzzle``, ``eval_count`` and
    ``prompt_eval_count``. For failures ``puzzle`` carries ``_parse_error``
    and the first 3000 characters of the last reply as ``_raw``; timing and
    token counts come from the last attempt that received a response.
    """
    records = []
    for index in range(1, n + 1):
        elapsed = 0.0
        response = {}
        raw_text = ""
        error = None
        parsed = None
        tries = 0
        # Retry with temp-bump pattern from AI_Visualizer
        for tries in range(1, 4):
            temp = TEMPERATURE + (tries - 1) * 0.1
            print(
                f"[{model}] puzzle {index}/{n} attempt {tries} (temp={temp:.1f})",
                file=sys.stderr,
                flush=True,
            )
            try:
                elapsed, response = call(model, PROMPT, temperature=temp)
            except Exception as exc:
                # API-level failure: keep the previous reply (if any) and retry.
                error = repr(exc)
                continue
            raw_text = response.get("response", "") or ""
            try:
                parsed = extract_json(raw_text)
            except Exception as exc:
                error = repr(exc)
            else:
                succeeded = True
                break
        else:
            # Loop ran out of attempts without a parseable puzzle.
            succeeded = False

        if succeeded:
            payload = parsed
        else:
            payload = {"_parse_error": error, "_raw": raw_text[:3000]}
        records.append({
            "model": model,
            "i": index,
            "dt": elapsed,
            "ok": succeeded,
            "attempts": tries,
            "puzzle": payload,
            "eval_count": response.get("eval_count", 0),
            "prompt_eval_count": response.get("prompt_eval_count", 0),
        })
    return records
|
|
|
|
|
|
def _as_list(value) -> list:
    """Coerce a model-supplied field to a list: empty/None -> [], scalar -> [scalar]."""
    if not value:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _md_cell(value) -> str:
    """Format any value as single-line markdown table cell text with pipes escaped."""
    return str(value).replace("|", "\\|").replace("\n", " ")


def render(results, stamp: str) -> str:
    """Render the bakeoff result records as a markdown report.

    Args:
        results: Result records as built by ``run_model`` (dicts with
            ``model``, ``i``, ``dt``, ``ok``, ``puzzle``, ``eval_count``).
            Records carrying an ``error`` key are shown as API errors.
        stamp: Run timestamp used in the report title.

    Returns:
        The full markdown document as one string: setup notes, a timing
        table, per-model JSON parse rates, then one section per puzzle with
        a ``TODO`` grade placeholder.

    The puzzle content is model output and only guaranteed to be *some* JSON
    object, so every field is coerced defensively: a wrongly-typed group,
    word or trap is rendered as text instead of aborting the whole report.
    """
    lines = [
        f"# Gemma 4 Generation Bakeoff -- {stamp}",
        "",
        "## Setup",
        f"- Ollama endpoint: `{OLLAMA}` (RTX 3090 Ti on the test host)",
        "- Other GPU workloads paused for the duration of the run",
        f"- Models: {', '.join(f'`{m}`' for m in MODELS)}",
        f"- {N_PER_MODEL} puzzles per model, base temperature {TEMPERATURE}",
        "- Gemma 4 settings (per `~/bin/gemma4-research/GOTCHAS.md`): `think=false`, "
        "`num_ctx=8192`, `num_predict=4096`. No `format=json` (infinite-loop bug). "
        "JSON extracted client-side via `body[body.find('{'):body.rfind('}')+1]`.",
        "- Up to 3 attempts per puzzle with temperature bumped +0.1 each retry "
        "(AI_Visualizer pattern). Reported metrics use the *successful* attempt.",
        "- One-shot, unaided generation. No critique pass, no example puzzle in prompt.",
        "",
        "## Timing",
        "",
        "| Model | n | avg s | avg tokens | tok/s |",
        "|---|---|---|---|---|",
    ]
    for m in MODELS:
        # A puzzle whose every API call failed is recorded with dt == 0.0 and
        # zero tokens; it carries no measurement, so keeping it would only
        # drag the averages toward zero.
        rs = [
            r for r in results
            if r["model"] == m and "error" not in r and r.get("dt")
        ]
        if not rs:
            lines.append(f"| `{m}` | 0 | -- | -- | -- |")
            continue
        avg_s = sum(r["dt"] for r in rs) / len(rs)
        avg_tok = sum(r.get("eval_count", 0) for r in rs) / len(rs)
        toks = avg_tok / avg_s if avg_s else 0
        lines.append(f"| `{m}` | {len(rs)} | {avg_s:.1f} | {avg_tok:.0f} | {toks:.1f} |")

    lines += ["", "## JSON parse rate", ""]
    for m in MODELS:
        rs = [r for r in results if r["model"] == m]
        ok = sum(1 for r in rs if r.get("ok"))
        lines.append(f"- `{m}`: {ok}/{len(rs)} parsed cleanly")
    lines += [""]

    for r in results:
        head = f"## {r['model']} -- puzzle {r['i']}"
        lines += [head, ""]
        if "error" in r:
            lines += [f"_API error:_ `{r['error']}`", ""]
            continue
        if not r.get("ok"):
            err = r["puzzle"].get("_parse_error", "")
            raw = r["puzzle"].get("_raw", "")[:1500]
            lines += [f"_JSON parse failed:_ `{err}`", "```", raw, "```", ""]
            continue
        p = r["puzzle"]
        lines += [f"**Theme seed:** {p.get('theme_seed', '--')}", ""]
        lines += ["| Diff | Category | Words |", "|---|---|---|"]
        for g in _as_list(p.get("groups")):
            if not isinstance(g, dict):
                # Wrongly-shaped group: show what the model sent instead of crashing.
                lines.append(f"| ? | ? | {_md_cell(g)} |")
                continue
            words = ", ".join(_md_cell(w) for w in _as_list(g.get("words")))
            cat = _md_cell(g.get("category") or "?")
            lines.append(f"| {_md_cell(g.get('difficulty', '?'))} | {cat} | {words} |")
        traps = _as_list(p.get("intended_traps"))
        lines += ["", f"**Claimed traps ({len(traps)}):**"]
        if not traps:
            lines.append("- _none claimed_")
        for t in traps:
            if not isinstance(t, dict):
                lines.append(f"- `{t}` (malformed trap entry)")
                continue
            lines.append(
                f"- `{t.get('word')}` (actually {t.get('actual_group')}, traps {t.get('trap_group')}): "
                f"{t.get('reason')}"
            )
        lines += ["", "_Grade:_ TODO", "", f"_dt={r['dt']:.1f}s, tokens={r['eval_count']}_", ""]
    return "\n".join(lines)
|
|
|
|
|
|
def main() -> None:
    """Run the bakeoff for every model and write the raw JSON and markdown report.

    Both files go to ``<PROJECT_ROOT>/docs/reference`` under a timestamped
    name. Progress goes to stderr; the only stdout output is the markdown
    path, so callers can pipe it.
    """
    out_dir = PROJECT_ROOT / "docs" / "reference"
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
    raw_path = out_dir / f"gemma-generation-bakeoff-{stamp}-raw.json"
    md_path = out_dir / f"gemma-generation-bakeoff-{stamp}.md"

    all_results = []
    for m in MODELS:
        warm(m)
        all_results.extend(run_model(m, N_PER_MODEL))
        # Checkpoint after each model: a failure while warming or running a
        # later model must not discard the results already collected.
        raw_path.write_text(json.dumps(all_results, indent=2), encoding="utf-8")

    # Final write also covers the case of an empty MODELS list.
    raw_path.write_text(json.dumps(all_results, indent=2), encoding="utf-8")
    print(f"raw -> {raw_path}", file=sys.stderr)
    # Explicit UTF-8: the report embeds model text, which may contain
    # characters the platform's default encoding cannot represent.
    md_path.write_text(render(all_results, stamp), encoding="utf-8")
    print(f"md -> {md_path}", file=sys.stderr)
    # Final stdout: just the markdown path so callers can pipe.
    print(md_path)
|
|
|
|
|
|
# Run the bakeoff only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|