Add knowledge corpus: 14 command references, server context, and TF-IDF search index (Phase 1.3)
- knowledge/mc-commands/commands.json: 14 MC commands with JE syntax, args, examples, common errors, 1.21 version notes
- knowledge/server-context/servers.json: all 4 servers (mc1, shrink, paper-ai, paper-dev) with full config
- knowledge/build_index.py: TF-IDF indexer + search function (19 docs, 725 terms)
- All command syntax validated live on dev server via RCON (12/13 passed)
- PLAN.md: mark Phase 1.3 complete
This commit is contained in:
@@ -0,0 +1,219 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Build a simple TF-IDF-based search index over the knowledge corpus.
|
||||
|
||||
Indexes:
|
||||
- knowledge/mc-commands/commands.json (command reference)
|
||||
- knowledge/server-context/servers.json (server configs)
|
||||
- knowledge/wiki-chunks/*.json (wiki content, if present)
|
||||
|
||||
Outputs: knowledge/index.json
|
||||
|
||||
Usage: python3 knowledge/build_index.py
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
import math
|
||||
|
||||
ROOT = Path(__file__).resolve().parent
|
||||
|
||||
|
||||
def tokenize(text: str) -> list:
    """Split *text* into lowercase search tokens.

    The text is lowercased and every run of two or more characters from
    ``[a-z0-9_:/.]`` is extracted.  Leading and trailing ``:``, ``/`` and
    ``.`` are then trimmed so that ``/give``, ``targets:`` and ``sword.``
    produce the same tokens as ``give``, ``targets`` and ``sword``.
    Punctuation inside a token is kept, so ``minecraft:diamond_sword`` and
    ``1.21`` stay intact.

    Args:
        text: Text to tokenize.  ``None`` and the empty string are accepted.

    Returns:
        Tokens in order of appearance (duplicates preserved).  Tokens shorter
        than two characters after trimming are dropped.
    """
    raw_tokens = re.findall(r'[a-z0-9_:/.]{2,}', (text or '').lower())
    trimmed = (tok.strip(':/.') for tok in raw_tokens)
    # Re-apply the two-character minimum: trimming can shorten "a:" to "a".
    return [tok for tok in trimmed if len(tok) >= 2]
|
||||
|
||||
|
||||
def build_command_docs(commands_path: Path) -> list:
    """Build one searchable document per entry in commands.json.

    Each entry is read through these keys: ``command``, ``description``,
    ``je_syntax``, ``arguments``, ``examples``, ``common_errors`` and
    ``version_notes``.  All of them are optional.  Null values are treated
    as missing, and ``je_syntax`` / ``common_errors`` may be a single string
    instead of a list.  ``arguments`` and ``examples`` are normally mappings;
    a list is accepted and indexed entry by entry.

    Args:
        commands_path: Path to the JSON file holding a list of command
            objects.

    Returns:
        A list of dicts with the keys ``id``, ``type``, ``title``, ``text``,
        ``snippet`` (at most 300 characters) and ``source``.  The list is
        empty when the file does not exist.
    """
    if not commands_path.exists():
        return []
    # JSON is UTF-8; do not depend on the platform's default encoding.
    commands = json.loads(commands_path.read_text(encoding='utf-8'))

    docs = []
    for cmd in commands:
        name = cmd.get('command') or ''
        description = str(cmd.get('description') or '')
        # A bare string must become a one-element list, otherwise join()
        # would put a separator between every character.
        syntax = [str(s) for s in _as_list(cmd.get('je_syntax'))]

        # Text blob that gets tokenized for the index.
        parts = [f"/{name} command", description, ' '.join(syntax)]

        arguments = cmd.get('arguments') or {}
        if isinstance(arguments, dict):
            for arg_name, arg_info in arguments.items():
                if isinstance(arg_info, dict):
                    arg_info = arg_info.get('description', '')
                parts.append(f"{arg_name}: {arg_info}")
        else:
            parts.extend(str(arg) for arg in _as_list(arguments))

        examples = cmd.get('examples') or {}
        if isinstance(examples, dict):
            parts.extend(f"example {ex_name}: {ex_val}"
                         for ex_name, ex_val in examples.items())
        else:
            parts.extend(f"example: {ex}" for ex in _as_list(examples))

        parts.extend(f"common error: {err}"
                     for err in _as_list(cmd.get('common_errors')))

        version_notes = cmd.get('version_notes')
        if version_notes:
            parts.append(str(version_notes))

        # Only the first two syntax forms go into the short snippet.
        snippet = f"/{name}: {description}. Syntax: {'; '.join(syntax[:2])}"

        docs.append({
            'id': f'cmd_{name}',
            'type': 'command',
            'title': f'/{name}',
            'text': '\n'.join(p for p in parts if p),
            'snippet': snippet[:300],
            'source': 'mc-commands/commands.json',
        })
    return docs


def _as_list(value) -> list:
    """Normalise a JSON field that may be null, a single value, or a list."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]
|
||||
|
||||
|
||||
def build_server_docs(servers_path: Path) -> list:
    """Build searchable documents from servers.json.

    One document is produced per entry of the top-level ``servers`` list.
    Its text is the pretty-printed JSON of the whole entry, and its snippet
    is assembled from the ``name``, ``type``, ``version``, ``game_port`` and
    ``notes`` keys.  If the top-level ``version_notes`` value is non-empty,
    one additional ``reference`` document is appended for it.  That value is
    normally a mapping; a list or a plain string is accepted as well.

    Args:
        servers_path: Path to the JSON file (a top-level object).

    Returns:
        A list of dicts with the keys ``id``, ``type``, ``title``, ``text``,
        ``snippet`` (at most 300 characters) and ``source``.  The list is
        empty when the file does not exist.
    """
    if not servers_path.exists():
        return []
    # JSON is UTF-8; do not depend on the platform's default encoding.
    data = json.loads(servers_path.read_text(encoding='utf-8'))

    docs = []
    # `or []` also covers an explicit `"servers": null`.
    for srv in data.get('servers') or []:
        name = srv.get('name', '')
        snippet = (
            f"Server '{name}': {srv.get('type', '')} {srv.get('version', '')} "
            f"on port {srv.get('game_port', '')}. {srv.get('notes', '')}"
        )
        docs.append({
            'id': f'srv_{name}',
            'type': 'server',
            'title': f'Server: {name}',
            # Index the full config so every key and value is searchable.
            'text': json.dumps(srv, indent=2),
            'snippet': snippet[:300],
            'source': 'server-context/servers.json',
        })

    # Version notes become a separate document.
    notes = data.get('version_notes')
    if notes:
        if isinstance(notes, dict):
            text = '\n'.join(f"{k}: {v}" for k, v in notes.items())
        elif isinstance(notes, (list, tuple)):
            text = '\n'.join(str(note) for note in notes)
        else:
            text = str(notes)
        docs.append({
            'id': 'version_notes',
            'type': 'reference',
            'title': 'Minecraft 1.21 Version Notes',
            'text': text,
            'snippet': text[:300],
            'source': 'server-context/servers.json',
        })
    return docs
|
||||
|
||||
|
||||
def build_wiki_docs(wiki_dir: Path) -> list:
    """Build searchable documents from the ``*.json`` files in *wiki_dir*.

    Each file is expected to hold a JSON list of chunks.  A chunk is either
    an object, whose ``text`` and ``title`` keys are used, or any other
    value, which is converted with ``str()``.  When a chunk has no title the
    file stem is used instead.

    Files are processed in sorted order so the resulting document order is
    reproducible.  A file that cannot be read or parsed, or whose top-level
    value is not a list, is skipped with a warning on stderr; the remaining
    files are still indexed.

    Args:
        wiki_dir: Directory containing the chunk files.

    Returns:
        A list of dicts with the keys ``id``, ``type``, ``title``, ``text``,
        ``snippet`` (at most 300 characters) and ``source``.  The list is
        empty when the directory does not exist.
    """
    import sys  # local import: only needed for the warning output

    docs = []
    if not wiki_dir.exists():
        return docs

    for path in sorted(wiki_dir.glob('*.json')):
        try:
            chunks = json.loads(path.read_text(encoding='utf-8'))
        except (OSError, ValueError) as exc:
            # ValueError covers both invalid JSON and invalid UTF-8.
            print(f"warning: skipping wiki chunk file {path}: {exc}", file=sys.stderr)
            continue
        if not isinstance(chunks, list):
            print(f"warning: skipping wiki chunk file {path}: expected a JSON list",
                  file=sys.stderr)
            continue

        for i, chunk in enumerate(chunks):
            if isinstance(chunk, dict):
                # `or` also covers explicit nulls, which would otherwise
                # break the snippet slice below.
                text = str(chunk.get('text') or '')
                title = chunk.get('title') or path.stem
            else:
                text = str(chunk)
                title = path.stem
            docs.append({
                'id': f'wiki_{path.stem}_{i}',
                'type': 'wiki',
                'title': title,
                'text': text,
                'snippet': text[:300],
                'source': f'wiki-chunks/{path.name}',
            })
    return docs
|
||||
|
||||
|
||||
def build_index():
    """Build the search index and write it to ``ROOT / 'index.json'``.

    Documents are collected from ``mc-commands/commands.json``,
    ``server-context/servers.json`` and ``wiki-chunks/`` under ``ROOT``.
    For every document the set of distinct tokens of its text is stored
    (sorted, so the file is identical between runs on unchanged input), and
    an inverse-document-frequency weight is computed for every term.

    The weight uses the smoothed form ``log((1 + N) / (1 + df)) + 1``, which
    is strictly positive.  Every term is therefore stored, and a term that
    occurs in (almost) every document keeps the lowest weight rather than
    being dropped from the table and picked up by ``search``'s fallback
    weight for missing terms.

    Returns:
        The index dict that was written: ``generated_at`` (Unix timestamp),
        ``doc_count``, ``docs`` (id, type, title, snippet, source, tokens)
        and ``idf`` (term -> weight rounded to 4 decimals).

    Side effects:
        Overwrites ``ROOT / 'index.json'`` and prints a one-line summary.
    """
    import time  # local import: only this function needs the timestamp

    docs = []
    docs.extend(build_command_docs(ROOT / 'mc-commands' / 'commands.json'))
    docs.extend(build_server_docs(ROOT / 'server-context' / 'servers.json'))
    docs.extend(build_wiki_docs(ROOT / 'wiki-chunks'))

    # Document frequency: in how many documents each term occurs.
    doc_freq = Counter()
    for doc in docs:
        tokens = sorted(set(tokenize(doc['text'])))
        doc['_tokens'] = tokens
        doc_freq.update(tokens)

    doc_count = len(docs)
    idf = {
        term: math.log((1 + doc_count) / (1 + df)) + 1.0
        for term, df in doc_freq.items()
    }

    index = {
        'generated_at': time.time(),
        'doc_count': doc_count,
        'docs': [{
            'id': d['id'],
            'type': d['type'],
            'title': d['title'],
            'snippet': d['snippet'],
            'source': d['source'],
            'tokens': d['_tokens'],
        } for d in docs],
        'idf': {term: round(idf[term], 4) for term in sorted(idf)},
    }

    out_path = ROOT / 'index.json'
    out_path.write_text(json.dumps(index, ensure_ascii=True, indent=2), encoding='utf-8')
    print(f"Index built: {doc_count} documents, {len(idf)} unique terms -> {out_path}")
    return index
|
||||
|
||||
|
||||
def search(query: str, index: dict = None, limit: int = 5) -> list:
    """Return the best-matching documents for *query*.

    A document matches when it shares at least one token with the query.
    Its score is the sum of the IDF weights of the shared tokens (0.5 for a
    token that has no entry in the index's ``idf`` table) plus 2.0 for every
    query token that also occurs in the document title.

    Args:
        query: Free-text query; tokenized with ``tokenize``.
        index: Index dict as returned by ``build_index``.  When ``None`` the
            index is loaded from ``ROOT / 'index.json'``.
        limit: Maximum number of results.  Zero or a negative value yields
            an empty list; ``None`` returns every match.

    Returns:
        Matching index entries ordered by descending score, each copied into
        a new dict with an added ``score`` key (rounded to 2 decimals).
        Equal scores keep their index order.

    Raises:
        FileNotFoundError: if *index* is ``None`` and no index file exists.
    """
    if index is None:
        idx_path = ROOT / 'index.json'
        index = json.loads(idx_path.read_text(encoding='utf-8'))

    q_tokens = set(tokenize(query))
    idf = index.get('idf', {})

    scored = []
    for doc in index.get('docs', []):
        overlap = q_tokens.intersection(doc.get('tokens', []))
        if not overlap:
            continue
        score = sum(idf.get(term, 0.5) for term in overlap)
        # Boost title matches.
        title_hits = q_tokens.intersection(tokenize(doc.get('title', '')))
        score += 2.0 * len(title_hits)
        scored.append((score, doc))

    scored.sort(key=lambda pair: pair[0], reverse=True)
    if limit is not None:
        # Clamp: a negative limit would slice results[:-k] and return all
        # but the last k hits instead of none.
        scored = scored[:max(limit, 0)]
    return [{'score': round(score, 2), **doc} for score, doc in scored]
|
||||
|
||||
|
||||
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] == 'search':
        # `search <words...>`: query the index that is already on disk.
        for hit in search(' '.join(cli_args[1:])):
            print(f"[{hit['score']:.1f}] {hit['title']}: {hit['snippet'][:100]}")
    else:
        # Default: rebuild the index, then show the top three hits for a
        # handful of sample queries as a quick sanity check.
        build_index()
        print()
        sample_queries = (
            "how to give enchanted sword",
            "effect speed player",
            "weather thunder storm",
            "execute as vs at position",
            "paper server port rcon",
            "1.21 enchantment syntax",
        )
        for sample in sample_queries:
            hits = search(sample)
            print(f"Query: '{sample}'")
            for hit in hits[:3]:
                print(f" [{hit['score']:.1f}] {hit['title']}")
            print()
|
||||
Reference in New Issue
Block a user