5a2a02e483
This repo opens with the design-discovery work completed before any product code is written. Two model bakeoffs against gemma4:8b/26b/31b on a local Ollama established that: - Whole-puzzle generation in the Connections shape is unreliable on Gemma 4 (gemma4:31b ~50% structural-pass, gemma4:26b ~20-30%); 31b is intentionally out of project scope, so the generation route is harder still. - Atomic semantic-judging skills are reliable: 87.5%/93.75%/100% (8B/26b/31b) on JUDGE; *all three models* scored 10/10 on CREATIVE_ACCEPT — fair judging of player-INVENTED categories. That is the structural unlock vs static hand-curated word games. The README contains the full writeup, the test bench, and a brainstormed bank of 10 distinct game-mechanics ideas across the fast/medium/slow tempo range, plus a primitives table for recombination. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2077 lines
44 KiB
JSON
2077 lines
44 KiB
JSON
[
|
|
{
|
|
"case_id": "judge-y1",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6691536903381348,
|
|
"eval_count": 31,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are common names of tree species.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y2",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6203711032867432,
|
|
"eval_count": 30,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are names of Greek letters.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y3",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.7630858421325684,
|
|
"eval_count": 48,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "The words are not all representative of the days of the week.",
|
|
"misfit_words": [
|
|
"MONDAY",
|
|
"FRIDAY",
|
|
"SUNDAY",
|
|
"WEDNESDAY"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "FAIL"
|
|
},
|
|
{
|
|
"case_id": "judge-y4",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6369147300720215,
|
|
"eval_count": 34,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are direct and common synonyms for 'happy'.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y5",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6616277694702148,
|
|
"eval_count": 35,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words can precede 'HOUSE' to form common compound words.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y6",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.7502412796020508,
|
|
"eval_count": 45,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "Only SHOULDER, FRONT, and SNAP fit common phrases with COLD, while TURKEY does not.",
|
|
"misfit_words": [
|
|
"TURKEY"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "FAIL"
|
|
},
|
|
{
|
|
"case_id": "judge-y7",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.619704008102417,
|
|
"eval_count": 31,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are strong synonyms for moving rapidly.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y8",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6541116237640381,
|
|
"eval_count": 34,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are common synonyms for a mental concept or idea.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n1",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6654887199401855,
|
|
"eval_count": 36,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "Carrot is a root vegetable, not a type of tree.",
|
|
"misfit_words": [
|
|
"CARROT"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n2",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6903350353240967,
|
|
"eval_count": 38,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "Cyrillic is a script, not a Greek letter.",
|
|
"misfit_words": [
|
|
"CYRILLIC"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n3",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.7092163562774658,
|
|
"eval_count": 42,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "The word 'SAD' is an antonym, not a synonym for 'happy'.",
|
|
"misfit_words": [
|
|
"SAD"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n4",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6665847301483154,
|
|
"eval_count": 35,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "January is a month, not a day of the week.",
|
|
"misfit_words": [
|
|
"JANUARY"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n5",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6246583461761475,
|
|
"eval_count": 31,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "NIGHT is not a body part.",
|
|
"misfit_words": [
|
|
"NIGHT"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n6",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6524357795715332,
|
|
"eval_count": 36,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "Bats and mosquitoes are not types of birds.",
|
|
"misfit_words": [
|
|
"BAT",
|
|
"MOSQUITO"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n7",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6560320854187012,
|
|
"eval_count": 35,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "Blue is a color, not necessarily a thing that is red.",
|
|
"misfit_words": [
|
|
"BLUE"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n8",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.7655882835388184,
|
|
"eval_count": 48,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "Only SHOULDER, FRONT, and PIZZA fit common phrases with COLD, but MOUNTAIN does not.",
|
|
"misfit_words": [
|
|
"MOUNTAIN"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "create-e1",
|
|
"type": "CREATE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.5947470664978027,
|
|
"eval_count": 30,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"Oak",
|
|
"Maple",
|
|
"Pine",
|
|
"Birch"
|
|
],
|
|
"reason": "These are all common types of trees."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-e2",
|
|
"type": "CREATE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6220695972442627,
|
|
"eval_count": 32,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"Alpha",
|
|
"Beta",
|
|
"Gamma",
|
|
"Delta"
|
|
],
|
|
"reason": "These are the first four letters of the Greek alphabet."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-e3",
|
|
"type": "CREATE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6435914039611816,
|
|
"eval_count": 35,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"furious",
|
|
"irate",
|
|
"enraged",
|
|
"mad"
|
|
],
|
|
"reason": "All four words are direct synonyms for feeling intense anger."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-e4",
|
|
"type": "CREATE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6068227291107178,
|
|
"eval_count": 30,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"Monday",
|
|
"Tuesday",
|
|
"Wednesday",
|
|
"Thursday"
|
|
],
|
|
"reason": "These are four consecutive days of the week."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-m1",
|
|
"type": "CREATE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.7005162239074707,
|
|
"eval_count": 41,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"BLIZZARD",
|
|
"HEAT",
|
|
"STORM",
|
|
"SNOW"
|
|
],
|
|
"reason": "These words can all precede 'STORM' to form a type of storm."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-m2",
|
|
"type": "CREATE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6505529880523682,
|
|
"eval_count": 37,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"tiny",
|
|
"petite",
|
|
"minuscule",
|
|
"diminutive"
|
|
],
|
|
"reason": "All four words are direct synonyms for 'small' in size."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-m3",
|
|
"type": "CREATE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6751258373260498,
|
|
"eval_count": 37,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"JEANS",
|
|
"BIRD",
|
|
"MOON",
|
|
"BLOOD"
|
|
],
|
|
"reason": "All words can follow 'BLUE' to form common phrases."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-m4",
|
|
"type": "CREATE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6219143867492676,
|
|
"eval_count": 32,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"heart",
|
|
"promise",
|
|
"record",
|
|
"bone"
|
|
],
|
|
"reason": "All four items can be metaphorically or literally broken."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-h1",
|
|
"type": "CREATE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.7682175636291504,
|
|
"eval_count": 49,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"SEA",
|
|
"SEE",
|
|
"HEAR",
|
|
"HERE"
|
|
],
|
|
"reason": "All four words are homophones of common body parts (sea/see for 'C' sound, hear/here for 'hear')."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-h2",
|
|
"type": "CREATE",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.7266902923583984,
|
|
"eval_count": 42,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"KNEEHIGH",
|
|
"EARDRUM",
|
|
"STOMACHACHE",
|
|
"EYELASH"
|
|
],
|
|
"reason": "Each word contains a body part name as a substring."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "ca-y1",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6249268054962158,
|
|
"eval_count": 31,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four words can be used as verbs meaning to go up.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-y2",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.5928308963775635,
|
|
"eval_count": 27,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four listed items are types of fruit.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-y3",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6987485885620117,
|
|
"eval_count": 40,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "You can crack a nut, a code, a smile, or a whip (like a whip crack).",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-y4",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6028926372528076,
|
|
"eval_count": 27,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four items are commonly associated with baseball.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-y5",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6008837223052979,
|
|
"eval_count": 29,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four are names of Roman deities associated with planets.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n1",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.728452205657959,
|
|
"eval_count": 43,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "These words are types of trees, not furniture brands.",
|
|
"weak_words": [
|
|
"OAK",
|
|
"MAPLE",
|
|
"BIRCH",
|
|
"PINE"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n2",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.621466875076294,
|
|
"eval_count": 32,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "These words are Greek letters, not inherently meaning 'small'.",
|
|
"weak_words": [
|
|
"ALPHA"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n3",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6523487567901611,
|
|
"eval_count": 33,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "BAT and BALL are not things worn on the hand.",
|
|
"weak_words": [
|
|
"BAT",
|
|
"BALL"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n4",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.7682509422302246,
|
|
"eval_count": 48,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "All the words listed are days of the week, not months of the year.",
|
|
"weak_words": [
|
|
"MONDAY",
|
|
"FRIDAY",
|
|
"SUNDAY",
|
|
"WEDNESDAY"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n5",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6511921882629395,
|
|
"eval_count": 34,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "Nut and smile are not typically considered kitchen items.",
|
|
"weak_words": [
|
|
"NUT",
|
|
"SMILE"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-b1",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.6162059307098389,
|
|
"eval_count": 30,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "None of these words are recognized tech or phone brands.",
|
|
"weak_words": [
|
|
"APPLE"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "BORDERLINE"
|
|
},
|
|
{
|
|
"case_id": "ca-b2",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:latest",
|
|
"dt": 0.8338274955749512,
|
|
"eval_count": 56,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "Only SHARP and FLAT are common musical terms, while KEY and NATURAL are also musical terms, making 'Real estate listing terms' a poor fit for all four.",
|
|
"weak_words": [
|
|
"KEY",
|
|
"NATURAL"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "BORDERLINE"
|
|
},
|
|
{
|
|
"case_id": "judge-y1",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7991511821746826,
|
|
"eval_count": 38,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are specific, well-known types of trees.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y2",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7396526336669922,
|
|
"eval_count": 38,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are standard names of letters in the Greek alphabet.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y3",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7360649108886719,
|
|
"eval_count": 37,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are specific names of days in the week.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y4",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7296037673950195,
|
|
"eval_count": 37,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are direct synonyms representing different degrees of happiness.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y5",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.9206283092498779,
|
|
"eval_count": 60,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "While Green, Ice, and Fire House are common, 'Court House' is typically written as two words (Courthouse) or refers to a different semantic structure.",
|
|
"misfit_words": [
|
|
"COURT"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "FAIL"
|
|
},
|
|
{
|
|
"case_id": "judge-y6",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.9253664016723633,
|
|
"eval_count": 50,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words form common phrases when preceded by 'cold': cold shoulder, cold front, cold snap, and cold turkey.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y7",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7268784046173096,
|
|
"eval_count": 36,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are synonyms for moving at high speed.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y8",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7770085334777832,
|
|
"eval_count": 42,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are direct synonyms for a mental concept, notion, or vague understanding.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n1",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7453572750091553,
|
|
"eval_count": 40,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "Carrot is a root vegetable, not a type of tree.",
|
|
"misfit_words": [
|
|
"CARROT"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n2",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7958042621612549,
|
|
"eval_count": 44,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "CYRILLIC refers to a script, not a specific Greek letter.",
|
|
"misfit_words": [
|
|
"CYRILLIC"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n3",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7643091678619385,
|
|
"eval_count": 42,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "The word 'SAD' is an antonym rather than a synonym.",
|
|
"misfit_words": [
|
|
"SAD"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n4",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7373373508453369,
|
|
"eval_count": 40,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "JANUARY is a month, not a day of the week.",
|
|
"misfit_words": [
|
|
"JANUARY"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n5",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7540380954742432,
|
|
"eval_count": 39,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "The word 'NIGHT' is not a body part.",
|
|
"misfit_words": [
|
|
"NIGHT"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n6",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7899594306945801,
|
|
"eval_count": 46,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "BAT and MOSQUITO are mammals and insects, respectively, not birds.",
|
|
"misfit_words": [
|
|
"BAT",
|
|
"MOSQUITO"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n7",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7485904693603516,
|
|
"eval_count": 39,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "BLUE and GRASS are not typically red.",
|
|
"misfit_words": [
|
|
"BLUE",
|
|
"GRASS"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n8",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.9349522590637207,
|
|
"eval_count": 63,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "While shoulder, front, and mountain can follow 'cold', 'cold pizza' is a description of temperature rather than a common idiomatic phrase or compound noun like the others.",
|
|
"misfit_words": [
|
|
"PIZZA"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "create-e1",
|
|
"type": "CREATE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.6970863342285156,
|
|
"eval_count": 34,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"Oak",
|
|
"Maple",
|
|
"Birch",
|
|
"Cedar"
|
|
],
|
|
"reason": "These are all common species of trees."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-e2",
|
|
"type": "CREATE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7034692764282227,
|
|
"eval_count": 37,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"ALPHA",
|
|
"BETA",
|
|
"GAMMA",
|
|
"DELTA"
|
|
],
|
|
"reason": "These are the first four letters of the Greek alphabet."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-e3",
|
|
"type": "CREATE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.723792314529419,
|
|
"eval_count": 40,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"Irate",
|
|
"Livid",
|
|
"Furious",
|
|
"Incensed"
|
|
],
|
|
"reason": "All four words are direct synonyms for being extremely angry."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-e4",
|
|
"type": "CREATE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.6912398338317871,
|
|
"eval_count": 34,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"Monday",
|
|
"Tuesday",
|
|
"Wednesday",
|
|
"Thursday"
|
|
],
|
|
"reason": "These are all consecutive days of the week."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-m1",
|
|
"type": "CREATE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7368042469024658,
|
|
"eval_count": 42,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"BRAIN",
|
|
"DUST",
|
|
"FIRE",
|
|
"SNOW"
|
|
],
|
|
"reason": "Each word forms a common compound word when followed by 'STORM'."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-m2",
|
|
"type": "CREATE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7377326488494873,
|
|
"eval_count": 42,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"tiny",
|
|
"microscopic",
|
|
"diminutive",
|
|
"minuscule"
|
|
],
|
|
"reason": "All four words are direct synonyms for something of very small size."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-m3",
|
|
"type": "CREATE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7627155780792236,
|
|
"eval_count": 43,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"BIRD",
|
|
"JAY",
|
|
"PRINT",
|
|
"WHALE"
|
|
],
|
|
"reason": "Each word can be preceded by 'blue' to form a common term."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-m4",
|
|
"type": "CREATE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7079043388366699,
|
|
"eval_count": 37,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"Promise",
|
|
"Record",
|
|
"Heart",
|
|
"Law"
|
|
],
|
|
"reason": "All four are common objects or concepts that can be broken."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-h1",
|
|
"type": "CREATE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.8588354587554932,
|
|
"eval_count": 53,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"MUSSEL",
|
|
"HARE",
|
|
"EYE",
|
|
"KNEAD"
|
|
],
|
|
"reason": "Each word is a homophone of a body part (muscle, hair, eye, knee) but spelled differently."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-h2",
|
|
"type": "CREATE",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.8054819107055664,
|
|
"eval_count": 48,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"HANDSOME",
|
|
"FOOTPRINT",
|
|
"EARNEST",
|
|
"ARMORY"
|
|
],
|
|
"reason": "Each word contains a body part (hand, foot, ear, arm) as a substring."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "ca-y1",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7489900588989258,
|
|
"eval_count": 39,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four words are synonyms or closely related verbs describing the act of rising or ascending.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-y2",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7018086910247803,
|
|
"eval_count": 31,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four words are common types of fruit.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-y3",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7803692817687988,
|
|
"eval_count": 41,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "You can crack a whip, crack a nut, crack a code, and crack a smile.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-y4",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7275149822235107,
|
|
"eval_count": 37,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four words are standard pieces of equipment used in the game of baseball.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-y5",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7079963684082031,
|
|
"eval_count": 34,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four words are names of prominent deities in Roman mythology.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n1",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.8165183067321777,
|
|
"eval_count": 47,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "These words represent types of trees rather than furniture brands.",
|
|
"weak_words": [
|
|
"OAK",
|
|
"MAPLE",
|
|
"BIRCH",
|
|
"PINE"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n2",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.9430453777313232,
|
|
"eval_count": 60,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "Alpha, beta, gamma, and delta are Greek letters used to denote various things, but none of them inherently mean 'small'.",
|
|
"weak_words": [
|
|
"ALPHA",
|
|
"BETA",
|
|
"GAMMA",
|
|
"DELTA"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n3",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7568156719207764,
|
|
"eval_count": 37,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "Bat and ball are not items worn on the hand.",
|
|
"weak_words": [
|
|
"BAT",
|
|
"BALL"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n4",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.882706880569458,
|
|
"eval_count": 54,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "None of the words are months of the year; they are all days of the week.",
|
|
"weak_words": [
|
|
"MONDAY",
|
|
"FRIDAY",
|
|
"SUNDAY",
|
|
"WEDNESDAY"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n5",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.8334591388702393,
|
|
"eval_count": 48,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "While whip and nut can relate to food, code and smile have no plausible connection to kitchen items.",
|
|
"weak_words": [
|
|
"CODE",
|
|
"SMILE"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-b1",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.7428381443023682,
|
|
"eval_count": 38,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four words are names of well-known technology companies or mobile device brands.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "BORDERLINE"
|
|
},
|
|
{
|
|
"case_id": "ca-b2",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:26b",
|
|
"dt": 0.9244155883789062,
|
|
"eval_count": 58,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "While 'flat' and 'key' can relate to real estate, 'sharp' and 'natural' have no plausible connection to real estate listings.",
|
|
"weak_words": [
|
|
"SHARP",
|
|
"NATURAL"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "BORDERLINE"
|
|
},
|
|
{
|
|
"case_id": "judge-y1",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.0965914726257324,
|
|
"eval_count": 34,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are common types of trees.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y2",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.0503978729248047,
|
|
"eval_count": 38,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are the first four letters of the Greek alphabet.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y3",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 1.9456124305725098,
|
|
"eval_count": 35,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are standard days of the week.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y4",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.055443048477173,
|
|
"eval_count": 38,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are direct synonyms for the state of being happy.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y5",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.074954032897949,
|
|
"eval_count": 39,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words form common compound words or phrases with 'house'.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y6",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.1484782695770264,
|
|
"eval_count": 41,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words form common compound words or phrases when preceded by 'cold'.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y7",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 1.980708122253418,
|
|
"eval_count": 35,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are common synonyms for moving rapidly.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-y8",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.1886539459228516,
|
|
"eval_count": 42,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "yes",
|
|
"reason": "All four words are common synonyms for a mental representation or a suggestion of an idea.",
|
|
"misfit_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n1",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.060978412628174,
|
|
"eval_count": 38,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "Carrot is a root vegetable, not a tree.",
|
|
"misfit_words": [
|
|
"CARROT"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n2",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.2179007530212402,
|
|
"eval_count": 43,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "Cyrillic is a writing system, not a Greek letter.",
|
|
"misfit_words": [
|
|
"CYRILLIC"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n3",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.1611642837524414,
|
|
"eval_count": 40,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "SAD is an antonym of happy, not a synonym.",
|
|
"misfit_words": [
|
|
"SAD"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n4",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.117088794708252,
|
|
"eval_count": 40,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "JANUARY is a month, not a day of the week.",
|
|
"misfit_words": [
|
|
"JANUARY"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n5",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.1064138412475586,
|
|
"eval_count": 40,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "NIGHT is a time of day, not a body part.",
|
|
"misfit_words": [
|
|
"NIGHT"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n6",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.0780858993530273,
|
|
"eval_count": 38,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "Bat and mosquito are not birds.",
|
|
"misfit_words": [
|
|
"BAT",
|
|
"MOSQUITO"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n7",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.004055976867676,
|
|
"eval_count": 37,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "Blue and grass are not red.",
|
|
"misfit_words": [
|
|
"BLUE",
|
|
"GRASS"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "judge-n8",
|
|
"type": "JUDGE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.7377030849456787,
|
|
"eval_count": 57,
|
|
"ok": true,
|
|
"parsed": {
|
|
"verdict": "no",
|
|
"reason": "Cold pizza and cold mountain are not established compound words or common idioms like cold shoulder and cold front.",
|
|
"misfit_words": [
|
|
"PIZZA",
|
|
"MOUNTAIN"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "create-e1",
|
|
"type": "CREATE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 1.9321458339691162,
|
|
"eval_count": 34,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"Oak",
|
|
"Maple",
|
|
"Pine",
|
|
"Birch"
|
|
],
|
|
"reason": "All four are common types of trees."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-e2",
|
|
"type": "CREATE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 1.9048969745635986,
|
|
"eval_count": 36,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"Alpha",
|
|
"Beta",
|
|
"Gamma",
|
|
"Delta"
|
|
],
|
|
"reason": "These are the first four letters of the Greek alphabet."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-e3",
|
|
"type": "CREATE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.0525240898132324,
|
|
"eval_count": 40,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"Furious",
|
|
"Irate",
|
|
"Livid",
|
|
"Enraged"
|
|
],
|
|
"reason": "All four words are strong synonyms for being extremely angry."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-e4",
|
|
"type": "CREATE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 1.7998781204223633,
|
|
"eval_count": 33,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"Monday",
|
|
"Wednesday",
|
|
"Friday",
|
|
"Sunday"
|
|
],
|
|
"reason": "They are all days of the week."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-m1",
|
|
"type": "CREATE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.073843479156494,
|
|
"eval_count": 40,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"BRAIN",
|
|
"FIRE",
|
|
"THUNDER",
|
|
"SNOW"
|
|
],
|
|
"reason": "All four words form common compound words when followed by STORM."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-m2",
|
|
"type": "CREATE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.000436782836914,
|
|
"eval_count": 38,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"tiny",
|
|
"petite",
|
|
"slight",
|
|
"miniature"
|
|
],
|
|
"reason": "All four words are direct synonyms for small in size."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-m3",
|
|
"type": "CREATE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.1925113201141357,
|
|
"eval_count": 43,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"BERRY",
|
|
"JAY",
|
|
"MOON",
|
|
"PRINT"
|
|
],
|
|
"reason": "All four words follow 'BLUE' to form common compound words or phrases."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-m4",
|
|
"type": "CREATE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.063403606414795,
|
|
"eval_count": 38,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"Record",
|
|
"Promise",
|
|
"Silence",
|
|
"Heart"
|
|
],
|
|
"reason": "All four are common abstract or physical things that can be broken."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "create-h1",
|
|
"type": "CREATE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 6.69955849647522,
|
|
"eval_count": 170,
|
|
"ok": false,
|
|
"parsed": null,
|
|
"raw": "{\"words\": [\"HARE\", \"MUSSEL\", \"CEREAL\", \"STEEL\"], \"reason\": \"They are homophones of hair, muscle, serial (not a body part) - wait.\"}\n\n*(Self-correction: Cereal/Serial is not a body part. Let me refine the list: Hare/Hair, Mussel/Muscle, Nose/Knows is not it, Eye/I is not it. Let's use: Hare, Mussel, Hue/Hew is not it. Let's try: Hare, Mussel, Waist/Waste, Sole/Soul.)*\n\n{\"words\": [\"HARE\", \"MUSSEL\", \"WASTE\", \"SOUL\"], \"reason\": \"They are homophones of hair, muscle, waist, and sole.\"}",
|
|
"error": "JSONDecodeError('Extra data: line 3 column 1 (char 133)')",
|
|
"score": "PARSE_FAIL"
|
|
},
|
|
{
|
|
"case_id": "create-h2",
|
|
"type": "CREATE",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.2020654678344727,
|
|
"eval_count": 42,
|
|
"ok": true,
|
|
"parsed": {
|
|
"words": [
|
|
"HEARTBEAT",
|
|
"HANDSOME",
|
|
"FOOTPRINT",
|
|
"ARMCHAIR"
|
|
],
|
|
"reason": "Each word contains a body part as a substring."
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "MANUAL"
|
|
},
|
|
{
|
|
"case_id": "ca-y1",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.137995719909668,
|
|
"eval_count": 34,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four words are synonyms for the action of moving upward.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-y2",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 1.8336284160614014,
|
|
"eval_count": 31,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four words are common types of fruit.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-y3",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.1915175914764404,
|
|
"eval_count": 40,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four words are common objects or concepts that can be 'cracked'.",
|
|
"weak_words": [
|
|
"WHIP"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-y4",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 1.9880671501159668,
|
|
"eval_count": 34,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four items are standard pieces of equipment used in baseball.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-y5",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 1.996727705001831,
|
|
"eval_count": 35,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "yes",
|
|
"reason": "All four words are names of Roman deities as well as planets.",
|
|
"weak_words": []
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n1",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.716524839401245,
|
|
"eval_count": 54,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "These are types of trees/woods, not brands of furniture.",
|
|
"weak_words": [
|
|
"OAK",
|
|
"MAPLE",
|
|
"BIRCH",
|
|
"PINE"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n2",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.5698249340057373,
|
|
"eval_count": 48,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "These are Greek letters used as designations, not synonyms for 'small'.",
|
|
"weak_words": [
|
|
"ALPHA",
|
|
"BETA",
|
|
"GAMMA",
|
|
"DELTA"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n3",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.312160015106201,
|
|
"eval_count": 43,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "A bat and a ball are not worn on the hand.",
|
|
"weak_words": [
|
|
"BAT",
|
|
"BALL"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n4",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.6066768169403076,
|
|
"eval_count": 50,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "The words are days of the week, not months of the year.",
|
|
"weak_words": [
|
|
"MONDAY",
|
|
"FRIDAY",
|
|
"SUNDAY",
|
|
"WEDNESDAY"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-n5",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.3842716217041016,
|
|
"eval_count": 45,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "Code and smile are not items typically found in a kitchen.",
|
|
"weak_words": [
|
|
"CODE",
|
|
"SMILE"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "PASS"
|
|
},
|
|
{
|
|
"case_id": "ca-b1",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.347870349884033,
|
|
"eval_count": 44,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "Orange and Kiwi are not recognized tech or phone brands.",
|
|
"weak_words": [
|
|
"ORANGE",
|
|
"KIWI"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "BORDERLINE"
|
|
},
|
|
{
|
|
"case_id": "ca-b2",
|
|
"type": "CREATIVE_ACCEPT",
|
|
"model": "gemma4:31b-it-q4_K_M",
|
|
"dt": 2.40256667137146,
|
|
"eval_count": 46,
|
|
"ok": true,
|
|
"parsed": {
|
|
"valid": "no",
|
|
"reason": "Natural and Sharp are not standard terms used in real estate listings.",
|
|
"weak_words": [
|
|
"NATURAL",
|
|
"SHARP"
|
|
]
|
|
},
|
|
"raw": null,
|
|
"error": null,
|
|
"score": "BORDERLINE"
|
|
}
|
|
] |