# mortdecai-model-analysis/scripts/base_model_interview.py

#!/usr/bin/env python3
"""Interview base models for comparison against fine-tuned mortdecai."""
import json, requests, sys, time

# Base URL of the Ollama server; query_model() appends "/api/chat" to it.
# NOTE(review): hard-coded private-network address — edit to target another host.
OLLAMA_URL = "http://192.168.0.141:11437"

def query_model(model, system_prompt, user_prompt, temperature=0.1):
    """Send one non-streaming chat request and summarise the reply.

    Posts a system + user message pair for ``model`` to the ``/api/chat``
    endpoint under ``OLLAMA_URL``, with generation capped at 512 predicted
    tokens and a 180 second request timeout.

    Returns a ``(content, seconds, eval_count)`` tuple:
      * ``content``    - the reply's ``message.content`` field, or the
                         literal ``"NO CONTENT"`` when that field is absent.
      * ``seconds``    - the reply's ``total_duration`` divided by 1e9
                         (nanoseconds to seconds); 0 when absent.
      * ``eval_count`` - the reply's ``eval_count`` field; 0 when absent.

    Never raises: any exception is reported as ``("ERROR: <message>", 0, 0)``.
    """
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    request_body = {
        "model": model,
        "messages": chat_messages,
        "stream": False,
        "options": {"temperature": temperature, "num_predict": 512},
    }

    try:
        reply = requests.post(
            f"{OLLAMA_URL}/api/chat",
            json=request_body,
            timeout=180,
        )
        reply.raise_for_status()
        body = reply.json()

        text = body.get("message", {}).get("content", "NO CONTENT")
        seconds = body.get("total_duration", 0) / 1e9  # nanoseconds to seconds
        generated = body.get("eval_count", 0)
    except Exception as exc:
        # Best-effort: a failed request becomes a result row instead of aborting the run.
        return f"ERROR: {exc}", 0, 0

    return text, seconds, generated

# System prompt sent unchanged with every test prompt in this script. It asks
# the model to act as a Minecraft 1.21 RCON command translator and to reply
# with a single JSON object of the form {"commands": [...], "reasoning": "..."};
# the response checks later in the script look for that "commands" key.
# NOTE(review): the name suggests this mirrors the prompt used when fine-tuning
# mortdecai — confirm before editing, since any change alters the comparison.
TRAINING_SYSTEM = """You are a Minecraft 1.21 command translator for a server admin. You receive natural language requests and return valid RCON commands.

PERMISSION LEVEL: 4 (generous). You are serving an admin. Do what they ask. Only refuse level 0-1 actions.

Return ONLY JSON: {"commands": ["cmd1", "cmd2"], "reasoning": "why"}
No prose, no markdown, no labels, no leading slash on commands.

SYNTAX RULES (1.21+):
- Items always need minecraft: prefix: minecraft:diamond_sword, not diamond_sword
- Effects: effect give <target> minecraft:<effect> <seconds> <amplifier>
- Weather: weather clear | weather rain | weather thunder
- Gamemode: gamemode survival|creative|adventure|spectator <target>"""

# Natural-language admin requests; each one is sent to every model in
# base_models as a separate chat call.
test_prompts = [
    "give me a diamond sword",
    "set the time to day and make it stop raining and give me full diamond armor",
    "build me a 5x5 house out of oak planks at my location",
    "give me op",
    "teleport all players to me",
]

# Models to test (base models on Matt's machine), in the order they are run.
base_models = [
    # 9B base (should be same arch as mortdecai:0.6.0-9b)
    "qwen3.5:latest",
    # 27B base (same arch as mortdecai:latest)
    "qwen3.5:27b",
    # Current Hand candidate
    "gemma3:12b",
    # Another candidate
    "phi4:14b",
    # Large gemma
    "gemma3:27b",
    # Qwen3 (not 3.5)
    "qwen3:14b",
]

def extract_json_text(response):
    """Return the portion of a model reply that should contain the JSON payload.

    Surrounding whitespace is removed, then:
      * if the reply contains a ``<think>`` tag and a closing ``</think>``,
        everything up to and including the closing tag is dropped;
      * if what remains starts with a markdown fence, the opening fence line
        and everything from the last closing fence onward are dropped.
    """
    text = response.strip()

    # Strip think tags if present
    if "<think>" in text:
        think_end = text.find("</think>")
        if think_end > -1:
            text = text[think_end + len("</think>"):].strip()

    # Strip markdown fences
    if text.startswith("```"):
        text = "\n".join(text.split("\n")[1:])
        if "```" in text:
            text = text[:text.rfind("```")]
        text = text.strip()

    return text


def evaluate_response(response):
    """Classify a raw model reply.

    Returns ``(json_valid, has_commands, commands_correct)``:
      * ``json_valid``       - the cleaned reply parses as JSON.
      * ``has_commands``     - the parsed value is an object with a
                               ``"commands"`` key.
      * ``commands_correct`` - ``"commands"`` is a list whose items are all
                               strings. This is only a shape check; command
                               syntax is not validated.
    """
    try:
        parsed = json.loads(extract_json_text(response))
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; anything else
        # (KeyboardInterrupt, programming errors) is allowed to propagate.
        return False, False, False

    # Valid JSON that is not an object (string, number, list) cannot carry a
    # "commands" key; a substring/element test on it would be misleading.
    if not isinstance(parsed, dict) or "commands" not in parsed:
        return True, False, False

    cmds = parsed["commands"]
    commands_correct = isinstance(cmds, list) and all(
        isinstance(c, str) for c in cmds
    )
    return True, True, commands_correct


def status_label(json_valid, has_commands):
    """Return the bracketed status tag printed for one response."""
    if not json_valid:
        return "JSON_INVALID"
    return "JSON_VALID+COMMANDS" if has_commands else "JSON_VALID+NO_CMDS"


def format_summary_row(model, res):
    """Format one summary-table line for ``model`` from its result dicts.

    Each "n/total" cell is right-aligned as a whole so it lines up under the
    10-character column headers. An empty result list yields 0/0 counts and
    a 0.0s average instead of dividing by zero.
    """
    total = len(res)
    valid = sum(1 for r in res if r["json_valid"])
    cmds = sum(1 for r in res if r["has_commands"])
    avg_time = sum(r["duration"] for r in res) / total if total else 0.0

    valid_col = f"{valid}/{total}"
    cmds_col = f"{cmds}/{total}"
    return f"{model:<25} {valid_col:>10} {cmds_col:>10} {avg_time:>9.1f}s"


def main():
    """Run every test prompt against every base model and print a report.

    For each model in ``base_models`` and each prompt in ``test_prompts``,
    calls ``query_model`` with ``TRAINING_SYSTEM``, prints the status, timing
    and the first 300 characters of the reply, then prints a summary table.
    Returns the ``{model: [result dict, ...]}`` mapping that was collected.
    """
    results = {}

    for model in base_models:
        print(f"\n{'='*80}")
        print(f"MODEL: {model}")
        print(f"{'='*80}")
        model_results = []

        for prompt in test_prompts:
            print(f"\n  User: {prompt}")
            response, duration, tokens = query_model(model, TRAINING_SYSTEM, prompt)

            json_valid, has_commands, commands_correct = evaluate_response(response)
            status = status_label(json_valid, has_commands)

            print(f"  [{status}] {duration:.1f}s, {tokens} tokens")
            print(f"  Response: {response[:300]}")

            model_results.append({
                "prompt": prompt,
                "json_valid": json_valid,
                "has_commands": has_commands,
                "commands_correct": commands_correct,
                "duration": duration,
                "tokens": tokens,
            })

        results[model] = model_results

    # Summary table
    print(f"\n\n{'='*80}")
    print("SUMMARY TABLE")
    print(f"{'='*80}")
    print(f"{'Model':<25} {'JSON Valid':>10} {'Has Cmds':>10} {'Avg Time':>10}")
    print("-" * 60)
    for model, res in results.items():
        print(format_summary_row(model, res))

    return results


if __name__ == "__main__":
    main()