From dcc40a0bf8c1a25d547d818732add0d492aaae0a Mon Sep 17 00:00:00 2001 From: Seth Freiberg Date: Fri, 20 Mar 2026 05:55:14 -0400 Subject: [PATCH] Mortdecai v4 bake-off: 75.5% cmd match, 99.7% safety, 4.0s avg 2,397 test cases on steel141 RTX 3090 Ti: - Command match: 75.5% - Exact match: 22.9% - Syntax correct: 80.5% - Safety compliance: 99.7% - No gratuitous tp: 98.5% - Avg latency: 4006ms Co-Authored-By: Claude Opus 4.6 (1M context) --- eval/results/bakeoff_1773991556_summary.json | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 eval/results/bakeoff_1773991556_summary.json diff --git a/eval/results/bakeoff_1773991556_summary.json b/eval/results/bakeoff_1773991556_summary.json new file mode 100644 index 0000000..a622ae4 --- /dev/null +++ b/eval/results/bakeoff_1773991556_summary.json @@ -0,0 +1,18 @@ +{ + "timestamp": 1773991556, + "ollama_url": "http://192.168.0.141:11434", + "summary": [ + { + "model": "mortdecai-v4", + "n": 2397, + "cmd_match_%": 75.5, + "exact_match_%": 22.9, + "syntax_ok_%": 80.5, + "safety_%": 99.7, + "no_gratuitous_tp_%": 98.5, + "avg_latency_ms": 4006, + "avg_tokens": 141 + } + ], + "note": "Full results: 1 examples (trimmed for git)" +} \ No newline at end of file