Files
seth_semantic_game/docs/reference/gemma-semantic-bakeoff-2026-04-27-224800-raw.json
T
Mortdecai 5a2a02e483 docs: bootstrap repo with bakeoff results and game-mechanics idea bank
This repo opens with the design-discovery work completed before any product
code is written. Two model bakeoffs against gemma4:8b/26b/31b on a local
Ollama established that:

- Whole-puzzle generation in the Connections shape is unreliable on Gemma 4
  (gemma4:31b ~50% structural-pass rate, gemma4:26b ~20-30%). Because 31b,
  the best-scoring model here, is intentionally out of project scope, the
  generation route is harder still.
- Atomic semantic-judging skills are reliable: 87.5%/93.75%/100% (8b/26b/31b)
  on JUDGE; *all three models* scored 10/10 on CREATIVE_ACCEPT — fair judging
  of player-INVENTED categories. That is the structural unlock vs static
  hand-curated word games.

The README contains the full writeup, the test bench, and a brainstormed
bank of 10 distinct game-mechanics ideas across the fast/medium/slow tempo
range, plus a primitives table for recombination.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 23:09:46 -04:00

2077 lines
44 KiB
JSON

[
{
"case_id": "judge-y1",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.6691536903381348,
"eval_count": 31,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are common names of tree species.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y2",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.6203711032867432,
"eval_count": 30,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are names of Greek letters.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y3",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.7630858421325684,
"eval_count": 48,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "The words are not all representative of the days of the week.",
"misfit_words": [
"MONDAY",
"FRIDAY",
"SUNDAY",
"WEDNESDAY"
]
},
"raw": null,
"error": null,
"score": "FAIL"
},
{
"case_id": "judge-y4",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.6369147300720215,
"eval_count": 34,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are direct and common synonyms for 'happy'.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y5",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.6616277694702148,
"eval_count": 35,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words can precede 'HOUSE' to form common compound words.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y6",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.7502412796020508,
"eval_count": 45,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "Only SHOULDER, FRONT, and SNAP fit common phrases with COLD, while TURKEY does not.",
"misfit_words": [
"TURKEY"
]
},
"raw": null,
"error": null,
"score": "FAIL"
},
{
"case_id": "judge-y7",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.619704008102417,
"eval_count": 31,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are strong synonyms for moving rapidly.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y8",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.6541116237640381,
"eval_count": 34,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are common synonyms for a mental concept or idea.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n1",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.6654887199401855,
"eval_count": 36,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "Carrot is a root vegetable, not a type of tree.",
"misfit_words": [
"CARROT"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n2",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.6903350353240967,
"eval_count": 38,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "Cyrillic is a script, not a Greek letter.",
"misfit_words": [
"CYRILLIC"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n3",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.7092163562774658,
"eval_count": 42,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "The word 'SAD' is an antonym, not a synonym for 'happy'.",
"misfit_words": [
"SAD"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n4",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.6665847301483154,
"eval_count": 35,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "January is a month, not a day of the week.",
"misfit_words": [
"JANUARY"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n5",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.6246583461761475,
"eval_count": 31,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "NIGHT is not a body part.",
"misfit_words": [
"NIGHT"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n6",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.6524357795715332,
"eval_count": 36,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "Bats and mosquitoes are not types of birds.",
"misfit_words": [
"BAT",
"MOSQUITO"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n7",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.6560320854187012,
"eval_count": 35,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "Blue is a color, not necessarily a thing that is red.",
"misfit_words": [
"BLUE"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n8",
"type": "JUDGE",
"model": "gemma4:latest",
"dt": 0.7655882835388184,
"eval_count": 48,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "Only SHOULDER, FRONT, and PIZZA fit common phrases with COLD, but MOUNTAIN does not.",
"misfit_words": [
"MOUNTAIN"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "create-e1",
"type": "CREATE",
"model": "gemma4:latest",
"dt": 0.5947470664978027,
"eval_count": 30,
"ok": true,
"parsed": {
"words": [
"Oak",
"Maple",
"Pine",
"Birch"
],
"reason": "These are all common types of trees."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-e2",
"type": "CREATE",
"model": "gemma4:latest",
"dt": 0.6220695972442627,
"eval_count": 32,
"ok": true,
"parsed": {
"words": [
"Alpha",
"Beta",
"Gamma",
"Delta"
],
"reason": "These are the first four letters of the Greek alphabet."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-e3",
"type": "CREATE",
"model": "gemma4:latest",
"dt": 0.6435914039611816,
"eval_count": 35,
"ok": true,
"parsed": {
"words": [
"furious",
"irate",
"enraged",
"mad"
],
"reason": "All four words are direct synonyms for feeling intense anger."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-e4",
"type": "CREATE",
"model": "gemma4:latest",
"dt": 0.6068227291107178,
"eval_count": 30,
"ok": true,
"parsed": {
"words": [
"Monday",
"Tuesday",
"Wednesday",
"Thursday"
],
"reason": "These are four consecutive days of the week."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-m1",
"type": "CREATE",
"model": "gemma4:latest",
"dt": 0.7005162239074707,
"eval_count": 41,
"ok": true,
"parsed": {
"words": [
"BLIZZARD",
"HEAT",
"STORM",
"SNOW"
],
"reason": "These words can all precede 'STORM' to form a type of storm."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-m2",
"type": "CREATE",
"model": "gemma4:latest",
"dt": 0.6505529880523682,
"eval_count": 37,
"ok": true,
"parsed": {
"words": [
"tiny",
"petite",
"minuscule",
"diminutive"
],
"reason": "All four words are direct synonyms for 'small' in size."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-m3",
"type": "CREATE",
"model": "gemma4:latest",
"dt": 0.6751258373260498,
"eval_count": 37,
"ok": true,
"parsed": {
"words": [
"JEANS",
"BIRD",
"MOON",
"BLOOD"
],
"reason": "All words can follow 'BLUE' to form common phrases."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-m4",
"type": "CREATE",
"model": "gemma4:latest",
"dt": 0.6219143867492676,
"eval_count": 32,
"ok": true,
"parsed": {
"words": [
"heart",
"promise",
"record",
"bone"
],
"reason": "All four items can be metaphorically or literally broken."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-h1",
"type": "CREATE",
"model": "gemma4:latest",
"dt": 0.7682175636291504,
"eval_count": 49,
"ok": true,
"parsed": {
"words": [
"SEA",
"SEE",
"HEAR",
"HERE"
],
"reason": "All four words are homophones of common body parts (sea/see for 'C' sound, hear/here for 'hear')."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-h2",
"type": "CREATE",
"model": "gemma4:latest",
"dt": 0.7266902923583984,
"eval_count": 42,
"ok": true,
"parsed": {
"words": [
"KNEEHIGH",
"EARDRUM",
"STOMACHACHE",
"EYELASH"
],
"reason": "Each word contains a body part name as a substring."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "ca-y1",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:latest",
"dt": 0.6249268054962158,
"eval_count": 31,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four words can be used as verbs meaning to go up.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-y2",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:latest",
"dt": 0.5928308963775635,
"eval_count": 27,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four listed items are types of fruit.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-y3",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:latest",
"dt": 0.6987485885620117,
"eval_count": 40,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "You can crack a nut, a code, a smile, or a whip (like a whip crack).",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-y4",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:latest",
"dt": 0.6028926372528076,
"eval_count": 27,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four items are commonly associated with baseball.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-y5",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:latest",
"dt": 0.6008837223052979,
"eval_count": 29,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four are names of Roman deities associated with planets.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n1",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:latest",
"dt": 0.728452205657959,
"eval_count": 43,
"ok": true,
"parsed": {
"valid": "no",
"reason": "These words are types of trees, not furniture brands.",
"weak_words": [
"OAK",
"MAPLE",
"BIRCH",
"PINE"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n2",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:latest",
"dt": 0.621466875076294,
"eval_count": 32,
"ok": true,
"parsed": {
"valid": "no",
"reason": "These words are Greek letters, not inherently meaning 'small'.",
"weak_words": [
"ALPHA"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n3",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:latest",
"dt": 0.6523487567901611,
"eval_count": 33,
"ok": true,
"parsed": {
"valid": "no",
"reason": "BAT and BALL are not things worn on the hand.",
"weak_words": [
"BAT",
"BALL"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n4",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:latest",
"dt": 0.7682509422302246,
"eval_count": 48,
"ok": true,
"parsed": {
"valid": "no",
"reason": "All the words listed are days of the week, not months of the year.",
"weak_words": [
"MONDAY",
"FRIDAY",
"SUNDAY",
"WEDNESDAY"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n5",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:latest",
"dt": 0.6511921882629395,
"eval_count": 34,
"ok": true,
"parsed": {
"valid": "no",
"reason": "Nut and smile are not typically considered kitchen items.",
"weak_words": [
"NUT",
"SMILE"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-b1",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:latest",
"dt": 0.6162059307098389,
"eval_count": 30,
"ok": true,
"parsed": {
"valid": "no",
"reason": "None of these words are recognized tech or phone brands.",
"weak_words": [
"APPLE"
]
},
"raw": null,
"error": null,
"score": "BORDERLINE"
},
{
"case_id": "ca-b2",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:latest",
"dt": 0.8338274955749512,
"eval_count": 56,
"ok": true,
"parsed": {
"valid": "no",
"reason": "Only SHARP and FLAT are common musical terms, while KEY and NATURAL are also musical terms, making 'Real estate listing terms' a poor fit for all four.",
"weak_words": [
"KEY",
"NATURAL"
]
},
"raw": null,
"error": null,
"score": "BORDERLINE"
},
{
"case_id": "judge-y1",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.7991511821746826,
"eval_count": 38,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are specific, well-known types of trees.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y2",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.7396526336669922,
"eval_count": 38,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are standard names of letters in the Greek alphabet.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y3",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.7360649108886719,
"eval_count": 37,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are specific names of days in the week.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y4",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.7296037673950195,
"eval_count": 37,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are direct synonyms representing different degrees of happiness.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y5",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.9206283092498779,
"eval_count": 60,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "While Green, Ice, and Fire House are common, 'Court House' is typically written as two words (Courthouse) or refers to a different semantic structure.",
"misfit_words": [
"COURT"
]
},
"raw": null,
"error": null,
"score": "FAIL"
},
{
"case_id": "judge-y6",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.9253664016723633,
"eval_count": 50,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words form common phrases when preceded by 'cold': cold shoulder, cold front, cold snap, and cold turkey.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y7",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.7268784046173096,
"eval_count": 36,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are synonyms for moving at high speed.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y8",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.7770085334777832,
"eval_count": 42,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are direct synonyms for a mental concept, notion, or vague understanding.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n1",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.7453572750091553,
"eval_count": 40,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "Carrot is a root vegetable, not a type of tree.",
"misfit_words": [
"CARROT"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n2",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.7958042621612549,
"eval_count": 44,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "CYRILLIC refers to a script, not a specific Greek letter.",
"misfit_words": [
"CYRILLIC"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n3",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.7643091678619385,
"eval_count": 42,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "The word 'SAD' is an antonym rather than a synonym.",
"misfit_words": [
"SAD"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n4",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.7373373508453369,
"eval_count": 40,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "JANUARY is a month, not a day of the week.",
"misfit_words": [
"JANUARY"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n5",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.7540380954742432,
"eval_count": 39,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "The word 'NIGHT' is not a body part.",
"misfit_words": [
"NIGHT"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n6",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.7899594306945801,
"eval_count": 46,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "BAT and MOSQUITO are mammals and insects, respectively, not birds.",
"misfit_words": [
"BAT",
"MOSQUITO"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n7",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.7485904693603516,
"eval_count": 39,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "BLUE and GRASS are not typically red.",
"misfit_words": [
"BLUE",
"GRASS"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n8",
"type": "JUDGE",
"model": "gemma4:26b",
"dt": 0.9349522590637207,
"eval_count": 63,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "While shoulder, front, and mountain can follow 'cold', 'cold pizza' is a description of temperature rather than a common idiomatic phrase or compound noun like the others.",
"misfit_words": [
"PIZZA"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "create-e1",
"type": "CREATE",
"model": "gemma4:26b",
"dt": 0.6970863342285156,
"eval_count": 34,
"ok": true,
"parsed": {
"words": [
"Oak",
"Maple",
"Birch",
"Cedar"
],
"reason": "These are all common species of trees."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-e2",
"type": "CREATE",
"model": "gemma4:26b",
"dt": 0.7034692764282227,
"eval_count": 37,
"ok": true,
"parsed": {
"words": [
"ALPHA",
"BETA",
"GAMMA",
"DELTA"
],
"reason": "These are the first four letters of the Greek alphabet."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-e3",
"type": "CREATE",
"model": "gemma4:26b",
"dt": 0.723792314529419,
"eval_count": 40,
"ok": true,
"parsed": {
"words": [
"Irate",
"Livid",
"Furious",
"Incensed"
],
"reason": "All four words are direct synonyms for being extremely angry."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-e4",
"type": "CREATE",
"model": "gemma4:26b",
"dt": 0.6912398338317871,
"eval_count": 34,
"ok": true,
"parsed": {
"words": [
"Monday",
"Tuesday",
"Wednesday",
"Thursday"
],
"reason": "These are all consecutive days of the week."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-m1",
"type": "CREATE",
"model": "gemma4:26b",
"dt": 0.7368042469024658,
"eval_count": 42,
"ok": true,
"parsed": {
"words": [
"BRAIN",
"DUST",
"FIRE",
"SNOW"
],
"reason": "Each word forms a common compound word when followed by 'STORM'."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-m2",
"type": "CREATE",
"model": "gemma4:26b",
"dt": 0.7377326488494873,
"eval_count": 42,
"ok": true,
"parsed": {
"words": [
"tiny",
"microscopic",
"diminutive",
"minuscule"
],
"reason": "All four words are direct synonyms for something of very small size."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-m3",
"type": "CREATE",
"model": "gemma4:26b",
"dt": 0.7627155780792236,
"eval_count": 43,
"ok": true,
"parsed": {
"words": [
"BIRD",
"JAY",
"PRINT",
"WHALE"
],
"reason": "Each word can be preceded by 'blue' to form a common term."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-m4",
"type": "CREATE",
"model": "gemma4:26b",
"dt": 0.7079043388366699,
"eval_count": 37,
"ok": true,
"parsed": {
"words": [
"Promise",
"Record",
"Heart",
"Law"
],
"reason": "All four are common objects or concepts that can be broken."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-h1",
"type": "CREATE",
"model": "gemma4:26b",
"dt": 0.8588354587554932,
"eval_count": 53,
"ok": true,
"parsed": {
"words": [
"MUSSEL",
"HARE",
"EYE",
"KNEAD"
],
"reason": "Each word is a homophone of a body part (muscle, hair, eye, knee) but spelled differently."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-h2",
"type": "CREATE",
"model": "gemma4:26b",
"dt": 0.8054819107055664,
"eval_count": 48,
"ok": true,
"parsed": {
"words": [
"HANDSOME",
"FOOTPRINT",
"EARNEST",
"ARMORY"
],
"reason": "Each word contains a body part (hand, foot, ear, arm) as a substring."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "ca-y1",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:26b",
"dt": 0.7489900588989258,
"eval_count": 39,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four words are synonyms or closely related verbs describing the act of rising or ascending.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-y2",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:26b",
"dt": 0.7018086910247803,
"eval_count": 31,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four words are common types of fruit.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-y3",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:26b",
"dt": 0.7803692817687988,
"eval_count": 41,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "You can crack a whip, crack a nut, crack a code, and crack a smile.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-y4",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:26b",
"dt": 0.7275149822235107,
"eval_count": 37,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four words are standard pieces of equipment used in the game of baseball.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-y5",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:26b",
"dt": 0.7079963684082031,
"eval_count": 34,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four words are names of prominent deities in Roman mythology.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n1",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:26b",
"dt": 0.8165183067321777,
"eval_count": 47,
"ok": true,
"parsed": {
"valid": "no",
"reason": "These words represent types of trees rather than furniture brands.",
"weak_words": [
"OAK",
"MAPLE",
"BIRCH",
"PINE"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n2",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:26b",
"dt": 0.9430453777313232,
"eval_count": 60,
"ok": true,
"parsed": {
"valid": "no",
"reason": "Alpha, beta, gamma, and delta are Greek letters used to denote various things, but none of them inherently mean 'small'.",
"weak_words": [
"ALPHA",
"BETA",
"GAMMA",
"DELTA"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n3",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:26b",
"dt": 0.7568156719207764,
"eval_count": 37,
"ok": true,
"parsed": {
"valid": "no",
"reason": "Bat and ball are not items worn on the hand.",
"weak_words": [
"BAT",
"BALL"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n4",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:26b",
"dt": 0.882706880569458,
"eval_count": 54,
"ok": true,
"parsed": {
"valid": "no",
"reason": "None of the words are months of the year; they are all days of the week.",
"weak_words": [
"MONDAY",
"FRIDAY",
"SUNDAY",
"WEDNESDAY"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n5",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:26b",
"dt": 0.8334591388702393,
"eval_count": 48,
"ok": true,
"parsed": {
"valid": "no",
"reason": "While whip and nut can relate to food, code and smile have no plausible connection to kitchen items.",
"weak_words": [
"CODE",
"SMILE"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-b1",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:26b",
"dt": 0.7428381443023682,
"eval_count": 38,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four words are names of well-known technology companies or mobile device brands.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "BORDERLINE"
},
{
"case_id": "ca-b2",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:26b",
"dt": 0.9244155883789062,
"eval_count": 58,
"ok": true,
"parsed": {
"valid": "no",
"reason": "While 'flat' and 'key' can relate to real estate, 'sharp' and 'natural' have no plausible connection to real estate listings.",
"weak_words": [
"SHARP",
"NATURAL"
]
},
"raw": null,
"error": null,
"score": "BORDERLINE"
},
{
"case_id": "judge-y1",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.0965914726257324,
"eval_count": 34,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are common types of trees.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y2",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.0503978729248047,
"eval_count": 38,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are the first four letters of the Greek alphabet.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y3",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 1.9456124305725098,
"eval_count": 35,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are standard days of the week.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y4",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.055443048477173,
"eval_count": 38,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are direct synonyms for the state of being happy.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y5",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.074954032897949,
"eval_count": 39,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words form common compound words or phrases with 'house'.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y6",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.1484782695770264,
"eval_count": 41,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words form common compound words or phrases when preceded by 'cold'.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y7",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 1.980708122253418,
"eval_count": 35,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are common synonyms for moving rapidly.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-y8",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.1886539459228516,
"eval_count": 42,
"ok": true,
"parsed": {
"verdict": "yes",
"reason": "All four words are common synonyms for a mental representation or a suggestion of an idea.",
"misfit_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n1",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.060978412628174,
"eval_count": 38,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "Carrot is a root vegetable, not a tree.",
"misfit_words": [
"CARROT"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n2",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.2179007530212402,
"eval_count": 43,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "Cyrillic is a writing system, not a Greek letter.",
"misfit_words": [
"CYRILLIC"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n3",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.1611642837524414,
"eval_count": 40,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "SAD is an antonym of happy, not a synonym.",
"misfit_words": [
"SAD"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n4",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.117088794708252,
"eval_count": 40,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "JANUARY is a month, not a day of the week.",
"misfit_words": [
"JANUARY"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n5",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.1064138412475586,
"eval_count": 40,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "NIGHT is a time of day, not a body part.",
"misfit_words": [
"NIGHT"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n6",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.0780858993530273,
"eval_count": 38,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "Bat and mosquito are not birds.",
"misfit_words": [
"BAT",
"MOSQUITO"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n7",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.004055976867676,
"eval_count": 37,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "Blue and grass are not red.",
"misfit_words": [
"BLUE",
"GRASS"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "judge-n8",
"type": "JUDGE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.7377030849456787,
"eval_count": 57,
"ok": true,
"parsed": {
"verdict": "no",
"reason": "Cold pizza and cold mountain are not established compound words or common idioms like cold shoulder and cold front.",
"misfit_words": [
"PIZZA",
"MOUNTAIN"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "create-e1",
"type": "CREATE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 1.9321458339691162,
"eval_count": 34,
"ok": true,
"parsed": {
"words": [
"Oak",
"Maple",
"Pine",
"Birch"
],
"reason": "All four are common types of trees."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-e2",
"type": "CREATE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 1.9048969745635986,
"eval_count": 36,
"ok": true,
"parsed": {
"words": [
"Alpha",
"Beta",
"Gamma",
"Delta"
],
"reason": "These are the first four letters of the Greek alphabet."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-e3",
"type": "CREATE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.0525240898132324,
"eval_count": 40,
"ok": true,
"parsed": {
"words": [
"Furious",
"Irate",
"Livid",
"Enraged"
],
"reason": "All four words are strong synonyms for being extremely angry."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-e4",
"type": "CREATE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 1.7998781204223633,
"eval_count": 33,
"ok": true,
"parsed": {
"words": [
"Monday",
"Wednesday",
"Friday",
"Sunday"
],
"reason": "They are all days of the week."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-m1",
"type": "CREATE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.073843479156494,
"eval_count": 40,
"ok": true,
"parsed": {
"words": [
"BRAIN",
"FIRE",
"THUNDER",
"SNOW"
],
"reason": "All four words form common compound words when followed by STORM."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-m2",
"type": "CREATE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.000436782836914,
"eval_count": 38,
"ok": true,
"parsed": {
"words": [
"tiny",
"petite",
"slight",
"miniature"
],
"reason": "All four words are direct synonyms for small in size."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-m3",
"type": "CREATE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.1925113201141357,
"eval_count": 43,
"ok": true,
"parsed": {
"words": [
"BERRY",
"JAY",
"MOON",
"PRINT"
],
"reason": "All four words follow 'BLUE' to form common compound words or phrases."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-m4",
"type": "CREATE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.063403606414795,
"eval_count": 38,
"ok": true,
"parsed": {
"words": [
"Record",
"Promise",
"Silence",
"Heart"
],
"reason": "All four are common abstract or physical things that can be broken."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "create-h1",
"type": "CREATE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 6.69955849647522,
"eval_count": 170,
"ok": false,
"parsed": null,
"raw": "{\"words\": [\"HARE\", \"MUSSEL\", \"CEREAL\", \"STEEL\"], \"reason\": \"They are homophones of hair, muscle, serial (not a body part) - wait.\"}\n\n*(Self-correction: Cereal/Serial is not a body part. Let me refine the list: Hare/Hair, Mussel/Muscle, Nose/Knows is not it, Eye/I is not it. Let's use: Hare, Mussel, Hue/Hew is not it. Let's try: Hare, Mussel, Waist/Waste, Sole/Soul.)*\n\n{\"words\": [\"HARE\", \"MUSSEL\", \"WASTE\", \"SOUL\"], \"reason\": \"They are homophones of hair, muscle, waist, and sole.\"}",
"error": "JSONDecodeError('Extra data: line 3 column 1 (char 133)')",
"score": "PARSE_FAIL"
},
{
"case_id": "create-h2",
"type": "CREATE",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.2020654678344727,
"eval_count": 42,
"ok": true,
"parsed": {
"words": [
"HEARTBEAT",
"HANDSOME",
"FOOTPRINT",
"ARMCHAIR"
],
"reason": "Each word contains a body part as a substring."
},
"raw": null,
"error": null,
"score": "MANUAL"
},
{
"case_id": "ca-y1",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.137995719909668,
"eval_count": 34,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four words are synonyms for the action of moving upward.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-y2",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:31b-it-q4_K_M",
"dt": 1.8336284160614014,
"eval_count": 31,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four words are common types of fruit.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-y3",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.1915175914764404,
"eval_count": 40,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four words are common objects or concepts that can be 'cracked'.",
"weak_words": [
"WHIP"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-y4",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:31b-it-q4_K_M",
"dt": 1.9880671501159668,
"eval_count": 34,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four items are standard pieces of equipment used in baseball.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-y5",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:31b-it-q4_K_M",
"dt": 1.996727705001831,
"eval_count": 35,
"ok": true,
"parsed": {
"valid": "yes",
"reason": "All four words are names of Roman deities as well as planets.",
"weak_words": []
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n1",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.716524839401245,
"eval_count": 54,
"ok": true,
"parsed": {
"valid": "no",
"reason": "These are types of trees/woods, not brands of furniture.",
"weak_words": [
"OAK",
"MAPLE",
"BIRCH",
"PINE"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n2",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.5698249340057373,
"eval_count": 48,
"ok": true,
"parsed": {
"valid": "no",
"reason": "These are Greek letters used as designations, not synonyms for 'small'.",
"weak_words": [
"ALPHA",
"BETA",
"GAMMA",
"DELTA"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n3",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.312160015106201,
"eval_count": 43,
"ok": true,
"parsed": {
"valid": "no",
"reason": "A bat and a ball are not worn on the hand.",
"weak_words": [
"BAT",
"BALL"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n4",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.6066768169403076,
"eval_count": 50,
"ok": true,
"parsed": {
"valid": "no",
"reason": "The words are days of the week, not months of the year.",
"weak_words": [
"MONDAY",
"FRIDAY",
"SUNDAY",
"WEDNESDAY"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-n5",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.3842716217041016,
"eval_count": 45,
"ok": true,
"parsed": {
"valid": "no",
"reason": "Code and smile are not items typically found in a kitchen.",
"weak_words": [
"CODE",
"SMILE"
]
},
"raw": null,
"error": null,
"score": "PASS"
},
{
"case_id": "ca-b1",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.347870349884033,
"eval_count": 44,
"ok": true,
"parsed": {
"valid": "no",
"reason": "Orange and Kiwi are not recognized tech or phone brands.",
"weak_words": [
"ORANGE",
"KIWI"
]
},
"raw": null,
"error": null,
"score": "BORDERLINE"
},
{
"case_id": "ca-b2",
"type": "CREATIVE_ACCEPT",
"model": "gemma4:31b-it-q4_K_M",
"dt": 2.40256667137146,
"eval_count": 46,
"ok": true,
"parsed": {
"valid": "no",
"reason": "Natural and Sharp are not standard terms used in real estate listings.",
"weak_words": [
"NATURAL",
"SHARP"
]
},
"raw": null,
"error": null,
"score": "BORDERLINE"
}
]