Add knowledge corpus: 14 command references, server context, and TF-IDF search index (Phase 1.3)

- knowledge/mc-commands/commands.json: 14 MC commands with JE syntax, args, examples, common errors, 1.21 version notes
- knowledge/server-context/servers.json: all 4 servers (mc1, shrink, paper-ai, paper-dev) with full config
- knowledge/build_index.py: TF-IDF indexer + search function (19 docs, 725 terms)
- All command syntax validated live on dev server via RCON (12/13 passed)
- PLAN.md: mark Phase 1.3 complete
2026-03-18 02:01:12 -04:00
parent 827850b8d7
commit 77efac0283
5 changed files with 2825 additions and 10 deletions
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+"""
+Build a simple TF-IDF-based search index over the knowledge corpus.
+
+Indexes:
+  - knowledge/mc-commands/commands.json  (command reference)
+  - knowledge/server-context/servers.json (server configs)
+  - knowledge/wiki-chunks/*.json          (wiki content, if present)
+
+Outputs: knowledge/index.json
+
+Usage: python3 knowledge/build_index.py
+"""
+
+import json
+import os
+import re
+from collections import Counter
+from pathlib import Path
+import math
+
+# Directory containing this script; the corpus inputs and index.json are
+# resolved relative to it.
+ROOT = Path(__file__).resolve().parent
+
+
+def tokenize(text: str) -> list:
+    """Split *text* into lowercase search tokens.
+
+    A token is a run of two or more characters from ``[a-z0-9_:/.]``, so
+    identifiers such as ``minecraft:sharpness`` or ``1.21`` stay intact.
+    Separator punctuation on either edge of a run is then stripped, which
+    makes ``/give``, ``give.`` and ``give:`` all yield ``give``. Without
+    that step a plain-word query never matches a slash-prefixed command
+    name, a ``name:`` label, or a word that ends a sentence.
+
+    Documents and queries must be tokenized by the same version of this
+    function for their tokens to be comparable.
+
+    Args:
+        text: Text to tokenize. ``None`` or an empty string yields no tokens.
+
+    Returns:
+        Tokens in order of appearance; duplicates are kept.
+    """
+    tokens = []
+    for raw in re.findall(r'[a-z0-9_:/.]{2,}', (text or '').lower()):
+        token = raw.strip(':/.')
+        # Keep the two-character minimum after trimming edge punctuation.
+        if len(token) >= 2:
+            tokens.append(token)
+    return tokens
+
+
+def build_command_docs(commands_path: Path) -> list:
+    """Build searchable documents from commands.json.
+
+    Args:
+        commands_path: Path to a JSON file holding a list of command
+            objects. Each object may provide ``command``, ``description``,
+            ``je_syntax``, ``arguments``, ``examples``, ``common_errors``
+            and ``version_notes``; missing or ``null`` fields are skipped.
+
+    Returns:
+        One document dict per command with the keys ``id``, ``type``,
+        ``title``, ``text``, ``snippet`` and ``source``. An empty list is
+        returned when the file does not exist.
+    """
+    if not commands_path.exists():
+        return []
+    # JSON is UTF-8 by specification; do not depend on the locale encoding.
+    commands = json.loads(commands_path.read_text(encoding='utf-8'))
+
+    docs = []
+    for cmd in commands:
+        name = cmd.get('command') or ''
+        description = cmd.get('description') or ''
+        syntax = cmd.get('je_syntax') or []
+        if isinstance(syntax, str):
+            # A bare string would otherwise be joined character by character.
+            syntax = [syntax]
+
+        # Build a text blob from all fields
+        parts = [f"/{name} command", description, ' '.join(syntax)]
+        # Arguments
+        for arg_name, arg_info in (cmd.get('arguments') or {}).items():
+            if isinstance(arg_info, dict):
+                parts.append(f"{arg_name}: {arg_info.get('description', '')}")
+            else:
+                parts.append(f"{arg_name}: {arg_info}")
+        # Examples
+        for ex_name, ex_val in (cmd.get('examples') or {}).items():
+            parts.append(f"example {ex_name}: {ex_val}")
+        # Common errors
+        for err in cmd.get('common_errors') or []:
+            parts.append(f"common error: {err}")
+        # Version notes
+        parts.append(cmd.get('version_notes') or '')
+
+        # Empty fields are dropped; str() tolerates non-string field values.
+        text = '\n'.join(str(p) for p in parts if p)
+        snippet = f"/{name}: {description}. Syntax: {'; '.join(syntax[:2])}"
+
+        docs.append({
+            'id': f'cmd_{name}',
+            'type': 'command',
+            'title': f'/{name}',
+            'text': text,
+            'snippet': snippet[:300],
+            'source': 'mc-commands/commands.json',
+        })
+    return docs
+
+
+def build_server_docs(servers_path: Path) -> list:
+    """Build searchable documents from servers.json.
+
+    Args:
+        servers_path: Path to a JSON object with an optional ``servers``
+            list (one object per server) and an optional ``version_notes``
+            mapping. Missing or ``null`` sections are skipped.
+
+    Returns:
+        One ``server`` document per entry in ``servers`` followed, when
+        ``version_notes`` is non-empty, by a single ``reference`` document.
+        An empty list is returned when the file does not exist.
+    """
+    if not servers_path.exists():
+        return []
+    # JSON is UTF-8 by specification; do not depend on the locale encoding.
+    data = json.loads(servers_path.read_text(encoding='utf-8'))
+
+    docs = []
+    for srv in data.get('servers') or []:
+        name = srv.get('name', '')
+        # The raw JSON is indexed so every key and value is searchable.
+        # Non-ASCII text is kept literal: \uXXXX escapes would otherwise be
+        # tokenized into meaningless terms such as "u00e9".
+        text = json.dumps(srv, indent=2, ensure_ascii=False)
+        snippet = f"Server '{name}': {srv.get('type', '')} {srv.get('version', '')} on port {srv.get('game_port', '')}. {srv.get('notes', '')}"
+        docs.append({
+            'id': f'srv_{name}',
+            'type': 'server',
+            'title': f'Server: {name}',
+            'text': text,
+            'snippet': snippet[:300],
+            'source': 'server-context/servers.json',
+        })
+    # Version notes as a separate doc
+    vn = data.get('version_notes') or {}
+    if vn:
+        text = '\n'.join(f"{k}: {v}" for k, v in vn.items())
+        docs.append({
+            'id': 'version_notes',
+            'type': 'reference',
+            'title': 'Minecraft 1.21 Version Notes',
+            'text': text,
+            'snippet': text[:300],
+            'source': 'server-context/servers.json',
+        })
+    return docs
+
+
+def build_wiki_docs(wiki_dir: Path) -> list:
+    """Build searchable documents from wiki chunk files.
+
+    Every ``*.json`` file in *wiki_dir* that contains a JSON list produces
+    one document per list element. An element may be an object (its
+    ``text`` and ``title`` fields are used, the title defaulting to the
+    file stem) or any other value, which is indexed as its string form.
+
+    Files that cannot be read or parsed are skipped with a warning on
+    stderr, so one bad file does not abort the build. Files whose top-level
+    value is not a list are ignored.
+
+    Args:
+        wiki_dir: Directory holding the chunk files. It may be absent.
+
+    Returns:
+        Document dicts with the keys ``id``, ``type``, ``title``, ``text``,
+        ``snippet`` and ``source``, ordered by file name and then by
+        position within the file.
+    """
+    import sys
+
+    docs = []
+    if not wiki_dir.exists():
+        return docs
+    # Sorted so the document order (and thus the index) is reproducible;
+    # glob() yields entries in arbitrary filesystem order.
+    for p in sorted(wiki_dir.glob('*.json')):
+        try:
+            chunks = json.loads(p.read_text(encoding='utf-8'))
+        except (OSError, ValueError) as exc:
+            # ValueError covers both malformed JSON and undecodable bytes.
+            print(f"warning: skipping {p}: {exc}", file=sys.stderr)
+            continue
+        if not isinstance(chunks, list):
+            continue
+        for i, chunk in enumerate(chunks):
+            if isinstance(chunk, dict):
+                text = chunk.get('text') or ''
+                title = chunk.get('title') or p.stem
+            else:
+                text, title = chunk, p.stem
+            # Coerce so that slicing and tokenizing never fail on odd values.
+            if not isinstance(text, str):
+                text = str(text)
+            docs.append({
+                'id': f'wiki_{p.stem}_{i}',
+                'type': 'wiki',
+                'title': title,
+                'text': text,
+                'snippet': text[:300],
+                'source': f'wiki-chunks/{p.name}',
+            })
+    return docs
+
+
+def build_index():
+    """Build the search index and write it to ``index.json`` next to this file.
+
+    Documents are collected from the command reference, the server context
+    and the wiki chunks. Each document is reduced to its set of distinct
+    tokens, and every token gets an inverse-document-frequency weight.
+
+    The weight is the smoothed form ``log((1 + N) / (1 + df)) + 1``. It is
+    positive for every term and decreases as the term appears in more
+    documents, so common terms never weigh more than rare ones. Every term
+    is stored, so a term that occurs in an indexed document always has an
+    explicit weight in the table.
+
+    Returns:
+        The index dict that was written: ``generated_at`` (Unix time),
+        ``doc_count``, ``docs`` (id, type, title, snippet, source, tokens)
+        and ``idf`` (term -> weight rounded to 4 decimals).
+    """
+    import time
+
+    docs = []
+    docs.extend(build_command_docs(ROOT / 'mc-commands' / 'commands.json'))
+    docs.extend(build_server_docs(ROOT / 'server-context' / 'servers.json'))
+    docs.extend(build_wiki_docs(ROOT / 'wiki-chunks'))
+
+    # Document frequency: in how many documents each term occurs.
+    doc_freq = Counter()
+    for doc in docs:
+        tokens = set(tokenize(doc['text']))
+        # Sorted so rebuilding an unchanged corpus yields the same file;
+        # set iteration order differs between interpreter runs.
+        doc['_tokens'] = sorted(tokens)
+        doc_freq.update(tokens)
+
+    N = len(docs)
+    idf = {t: math.log((1 + N) / (1 + df)) + 1 for t, df in doc_freq.items()}
+
+    # Store index
+    index = {
+        'generated_at': time.time(),
+        'doc_count': N,
+        'docs': [{
+            'id': d['id'],
+            'type': d['type'],
+            'title': d['title'],
+            'snippet': d['snippet'],
+            'source': d['source'],
+            'tokens': d['_tokens'],
+        } for d in docs],
+        'idf': {t: round(v, 4) for t, v in sorted(idf.items())},
+    }
+
+    out_path = ROOT / 'index.json'
+    out_path.write_text(json.dumps(index, ensure_ascii=True, indent=2), encoding='utf-8')
+    print(f"Index built: {N} documents, {len(idf)} unique terms -> {out_path}")
+    return index
+
+
+def search(query: str, index: dict = None, limit: int = 5) -> list:
+    """Search the index and return the best-matching documents.
+
+    A document matches when it shares at least one token with the query.
+    Its score is the sum of the IDF weights of the shared tokens plus 2.0
+    for every query token that also occurs in the document title.
+
+    Args:
+        query: Free-text query; it is tokenized with ``tokenize``.
+        index: Index dict as returned by ``build_index``. When ``None`` the
+            index is loaded from ``index.json`` next to this file.
+        limit: Maximum number of results. Zero or a negative value yields
+            no results; ``None`` returns every match.
+
+    Returns:
+        Up to *limit* dicts, best first, each holding the stored document
+        fields plus ``score`` (rounded to 2 decimals). Documents with equal
+        scores keep their index order.
+    """
+    if index is None:
+        idx_path = ROOT / 'index.json'
+        index = json.loads(idx_path.read_text(encoding='utf-8'))
+
+    q_tokens = set(tokenize(query))
+    if not q_tokens or (limit is not None and limit <= 0):
+        return []
+
+    # A term can be missing from the table when the index was pruned of
+    # near-ubiquitous terms. Such a term carries little information, so it
+    # gets a minimal weight rather than one that could outscore rarer terms.
+    missing_term_weight = 0.1
+
+    idf = index.get('idf', {})
+    results = []
+    for doc in index.get('docs', []):
+        overlap = q_tokens.intersection(doc.get('tokens', []))
+        if not overlap:
+            continue
+        score = sum(idf.get(t, missing_term_weight) for t in overlap)
+        # Boost title matches
+        title_tokens = set(tokenize(doc.get('title', '')))
+        score += len(q_tokens & title_tokens) * 2.0
+        results.append((score, doc))
+
+    # list.sort is stable, so ties stay in index order.
+    results.sort(key=lambda x: x[0], reverse=True)
+    return [{'score': round(s, 2), **d} for s, d in results[:limit]]
+
+
+if __name__ == '__main__':
+    # Usage:
+    #   build_index.py                   rebuild index.json, then run sample queries
+    #   build_index.py search <words...> query the existing index.json
+    import sys
+
+    cli_args = sys.argv[1:]
+    if cli_args and cli_args[0] == 'search':
+        # Everything after the sub-command is the query text.
+        for hit in search(' '.join(cli_args[1:])):
+            print(f"[{hit['score']:.1f}] {hit['title']}: {hit['snippet'][:100]}")
+    else:
+        build_index()
+        # Exercise the fresh index with a few representative queries.
+        sample_queries = (
+            "how to give enchanted sword",
+            "effect speed player",
+            "weather thunder storm",
+            "execute as vs at position",
+            "paper server port rcon",
+            "1.21 enchantment syntax",
+        )
+        print()
+        for q in sample_queries:
+            hits = search(q)
+            print(f"Query: '{q}'")
+            # Only the top three hits are shown per query.
+            for hit in hits[:3]:
+                print(f"  [{hit['score']:.1f}] {hit['title']}")
+            print()