docs: integration tools — cross-reference graph, concept index, research digest
Codex-built tooling: cross-reference graph, concept index with build script, and research integrator that extracted 142 scholars, 175 bibliography items, 4 contradiction topics, and coverage maps for Paper 009 planning. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,470 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build cross-reference artifacts for the VIBECODE-THEORY paper series.
|
||||
|
||||
Outputs:
|
||||
- graph.json
|
||||
- graph.mermaid
|
||||
- dangling_threads.md
|
||||
- concept_flow.md
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
|
||||
# Directory containing this script; main() and write_mermaid() write their
# output files here.
OUT_DIR = Path(__file__).resolve().parent
# Directory the glob patterns below are evaluated against: two levels above
# OUT_DIR (i.e. three levels above this file).
ROOT = OUT_DIR.parents[1]

# Patterns, relative to ROOT, selecting the numbered papers and the allegories.
PAPER_GLOB = "00*-*.md"
ALLEGORY_GLOB = "allegorical/*.md"
|
||||
|
||||
# Filler words that question_keywords() removes before open questions are
# compared against later papers.
STOPWORDS = set(
    """
    about after again also because between could does from have into just
    like might more most over paper question series should than that their
    them then this those through what when which with would
    """.split()
)
|
||||
|
||||
# Rank of every edge type, strongest first (8) down to weakest (1).
# add_edge() uses these ranks to decide whether a new relationship replaces
# the one already stored for a (source, target) pair.
RELATION_PRIORITY = dict(
    zip(
        (
            "supersedes",
            "refutes",
            "challenges",
            "revises",
            "extends",
            "addresses",
            "introduces concept used by",
            "references",
        ),
        range(8, 0, -1),
    )
)
|
||||
|
||||
# Catalog of tracked concepts. Each entry maps a concept name to:
#   "aliases": phrases whose (case-insensitive) presence in a document counts
#              as a mention of the concept;
#   "intro":   id of the paper credited with introducing the concept.
CONCEPT_CATALOG = {
    concept: {"aliases": list(aliases), "intro": intro_paper}
    for concept, intro_paper, aliases in (
        ("vibe coding as social skill", "001", ("vibe coding", "social skill", "meta-skill")),
        ("cognitive surplus", "002", ("cognitive surplus", "surplus")),
        ("dependency trap", "002", ("dependency trap", "systemic dependency")),
        ("cognitive preference shift", "005", ("cognitive preference shift", "preference shift")),
        ("automation spiral", "003", ("automation spiral",)),
        ("feedback loop", "006", ("feedback loop", "uncomfortable middle")),
        ("biological ratchet", "007", ("biological ratchet", "ratchet")),
        ("infrastructure threshold", "007", ("infrastructure threshold", "application phase")),
        ("premature dependency hibernation", "007", ("premature dependencies", "hibernation")),
        ("knowledge unification", "008", ("knowledge unification", "defragmentation")),
        ("ship of theseus identity problem", "008", ("ship of theseus", "species identity")),
        ("cheating frame", "008", ('"cheating"', "cheating frame")),
        ("dependency chain", "007", ("dependency chain",)),
    )
}
|
||||
|
||||
|
||||
@dataclass
class Document:
    """A markdown source file loaded by read_documents()."""

    # Graph node id: the three-digit paper number, or "A:<file stem>" for an
    # allegory.
    doc_id: str
    # Title taken from the file's heading, or derived from the file name.
    title: str
    # Either "paper" or "allegory".
    kind: str
    # Location of the file on disk.
    path: Path
    # Full file contents, decoded as UTF-8.
    text: str
|
||||
|
||||
|
||||
def read_documents() -> list[Document]:
    """Load every paper and allegory file matched under ROOT.

    Papers (PAPER_GLOB) come first, then allegories (ALLEGORY_GLOB), each
    group in sorted path order.

    Returns:
        One Document per file. A paper takes its id and title from a
        "# Paper NNN: Title" heading, falling back to the file-name prefix
        before the first "-" and the file stem. An allegory gets the id
        "A:<stem>" and the text of its first "# " heading as title, falling
        back to the title-cased stem with hyphens turned into spaces.
    """
    paper_heading = re.compile(r"^#\s+Paper\s+(\d{3}):\s*(.+)$", flags=re.M)
    first_heading = re.compile(r"^#\s+(.+)$", flags=re.M)
    loaded: list[Document] = []

    for source in sorted(ROOT.glob(PAPER_GLOB)):
        body = source.read_text(encoding="utf-8")
        heading = paper_heading.search(body)
        if heading is None:
            # No recognisable heading: derive both fields from the file name.
            number = source.name.split("-", 1)[0]
            name = source.stem
        else:
            number = heading.group(1)
            name = heading.group(2).strip()
        loaded.append(Document(doc_id=number, title=name, kind="paper", path=source, text=body))

    for source in sorted(ROOT.glob(ALLEGORY_GLOB)):
        body = source.read_text(encoding="utf-8")
        heading = first_heading.search(body)
        if heading is not None:
            name = heading.group(1).strip()
        else:
            name = source.stem.replace("-", " ").title()
        loaded.append(
            Document(doc_id=f"A:{source.stem}", title=name, kind="allegory", path=source, text=body)
        )

    return loaded
|
||||
|
||||
|
||||
def sentence_chunks(text: str) -> Iterable[str]:
    """Yield the sentence-like pieces of *text* with whitespace normalised.

    The text is split after ".", "!" or "?" followed by whitespace, and at
    runs of two or more newlines. Inside each piece every run of whitespace
    is collapsed to a single space; pieces that end up empty are skipped.
    """
    pieces = re.split(r"(?<=[.!?])\s+|\n{2,}", text)
    normalised = (" ".join(piece.split()) for piece in pieces)
    yield from (piece for piece in normalised if piece)
|
||||
|
||||
|
||||
def classify_relationship(text: str) -> str:
    """Return the relationship label suggested by marker words in *text*.

    Matching is a case-insensitive substring test. Rules are tried from the
    strongest relationship to the weakest and the first rule with a matching
    marker wins; "references" is returned when no marker is present.
    """
    haystack = text.lower()
    rules = (
        ("supersedes", ("supersed",)),
        ("refutes", ("refute", "rebuttal", "against")),
        ("challenges", ("challenge", "critic", "unfalsifiable")),
        ("revises", ("revis",)),
        ("extends", ("extend",)),
        ("addresses", ("respond", "address", "engage")),
    )
    for label, markers in rules:
        for marker in markers:
            if marker in haystack:
                return label
    return "references"
|
||||
|
||||
|
||||
def find_paper_targets(text: str) -> list[str]:
    """Return the sorted, de-duplicated paper numbers 001-008 found in *text*.

    Numbers are only collected when the text also contains the word "paper"
    (case-insensitive); otherwise an empty list is returned.
    """
    if "paper" in text.lower():
        return sorted({hit.group(0) for hit in re.finditer(r"\b00[1-8]\b", text)})
    return []
|
||||
|
||||
|
||||
def add_edge(edges: dict[tuple[str, str], dict], source: str, target: str, edge_type: str, context: str) -> None:
    """Record a source -> target relationship in *edges*, keeping the strongest.

    Self-references are ignored. At most one edge is kept per
    (source, target) pair: a new edge replaces the stored one only when its
    type ranks strictly higher in RELATION_PRIORITY, so on ties the edge seen
    first survives.
    """
    if source != target:
        pair = (source, target)
        current = edges.get(pair)
        if not current or RELATION_PRIORITY[edge_type] > RELATION_PRIORITY[current["type"]]:
            edges[pair] = {"source": source, "target": target, "type": edge_type, "context": context}
|
||||
|
||||
|
||||
def extract_explicit_edges(docs: list[Document]) -> dict[tuple[str, str], dict]:
    """Build edges from the paper numbers each document mentions.

    Three sources of edges are combined through add_edge():

    1. Every sentence of every document that names papers yields an edge to
       each named paper, typed by classify_relationship().
    2. Paper 007 gets an "extends" edge to every allegory whose name (file
       stem with hyphens as spaces) occurs in its lower-cased text.
    3. Every allegory sentence that names papers additionally yields an
       "addresses" edge to each of them.

    The stored context is the sentence truncated to 220 characters, or a
    fixed description for the paper 007 case.

    Returns:
        Mapping of (source id, target id) to the edge dict.
    """
    graph_edges: dict[tuple[str, str], dict] = {}

    allegory_ids: dict[str, str] = {}
    for entry in docs:
        if entry.kind == "allegory":
            allegory_ids[entry.path.stem.replace("-", " ")] = entry.doc_id

    for entry in docs:
        for sentence in sentence_chunks(entry.text):
            mentioned = find_paper_targets(sentence)
            if not mentioned:
                continue
            relation = classify_relationship(sentence)
            for paper_id in mentioned:
                add_edge(graph_edges, entry.doc_id, paper_id, relation, sentence[:220])

        if entry.kind == "paper" and entry.doc_id == "007":
            body = entry.text.lower()
            for allegory_name, allegory_id in allegory_ids.items():
                if allegory_name not in body:
                    continue
                add_edge(
                    graph_edges,
                    entry.doc_id,
                    allegory_id,
                    "extends",
                    f"Paper 007 explicitly maps the {allegory_name.title()} allegory into the ratchet framework.",
                )

        if entry.kind == "allegory":
            # Second pass: an allegory's mention of a paper counts at least as
            # "addresses", which outranks a plain "references" edge.
            for sentence in sentence_chunks(entry.text):
                for paper_id in find_paper_targets(sentence):
                    add_edge(graph_edges, entry.doc_id, paper_id, "addresses", sentence[:220])

    return graph_edges
|
||||
|
||||
|
||||
def collect_concept_presence(
    docs: list[Document], catalog: dict[str, dict] | None = None
) -> tuple[dict[str, str], dict[str, set[str]]]:
    """Find which documents mention each catalogued concept.

    A document mentions a concept when any of the concept's aliases occurs,
    case-insensitively, as a substring of the document text.

    Args:
        docs: Documents to scan; each needs ``kind``, ``doc_id`` and ``text``.
        catalog: Concept catalog shaped like CONCEPT_CATALOG (concept name ->
            {"aliases": [...], "intro": paper id}). Defaults to
            CONCEPT_CATALOG.

    Returns:
        ``(intro, usage)``. ``usage`` maps each mentioned concept to the set
        of ids of the documents mentioning it. ``intro`` maps each mentioned
        concept to the catalog's "intro" paper when that paper is among
        *docs*; otherwise to the first mentioning document, with papers
        visited in id order ahead of allegories.
    """
    if catalog is None:
        catalog = CONCEPT_CATALOG

    intro: dict[str, str] = {}
    usage: dict[str, set[str]] = defaultdict(set)

    papers = [d for d in docs if d.kind == "paper"]
    # Built once up front: it does not change while scanning, so there is no
    # reason to rebuild it for every concept hit.
    paper_ids = {d.doc_id for d in papers}
    ordered = sorted(papers, key=lambda d: d.doc_id) + [d for d in docs if d.kind == "allegory"]

    for doc in ordered:
        lower = doc.text.lower()
        for concept, info in catalog.items():
            if not any(alias.lower() in lower for alias in info["aliases"]):
                continue
            usage[concept].add(doc.doc_id)
            expected_intro = info["intro"]
            # setdefault keeps the first assignment, so later documents never
            # overwrite the recorded introduction.
            intro.setdefault(concept, expected_intro if expected_intro in paper_ids else doc.doc_id)

    return intro, usage
|
||||
|
||||
|
||||
def extract_implicit_edges(
    docs: list[Document], intro: dict[str, str], usage: dict[str, set[str]], edges: dict[tuple[str, str], dict]
) -> None:
    """Add "introduces concept used by" edges between numbered papers.

    For every concept whose introducing document is a paper numbered 001-008,
    an edge is added (via add_edge) from that paper to each higher-numbered
    paper 001-008 that uses the concept. *edges* is updated in place; *docs*
    is accepted but not read.
    """
    numbered_paper = re.compile(r"^00[1-8]$")

    for concept, origin in intro.items():
        if numbered_paper.match(origin) is None:
            continue
        for user in sorted(usage[concept]):
            # Skip allegories, the introducing paper itself and earlier papers.
            if numbered_paper.match(user) is None or user <= origin:
                continue
            add_edge(
                edges,
                origin,
                user,
                "introduces concept used by",
                f"{concept} appears first in {origin} and recurs in {user}.",
            )
|
||||
|
||||
|
||||
def build_nodes(docs: list[Document], intro: dict[str, str]) -> list[dict]:
    """Return one graph node dict per document.

    Nodes are ordered papers first, then the remaining documents, each group
    sorted by id. Every node carries "id", "title", "kind" and
    "concepts_introduced": the sorted names of the concepts that *intro*
    attributes to that document.
    """
    introduced: dict[str, list[str]] = {}
    for concept, owner in intro.items():
        introduced.setdefault(owner, []).append(concept)

    def papers_first(doc: Document) -> tuple[bool, str]:
        return (doc.kind != "paper", doc.doc_id)

    return [
        {
            "id": doc.doc_id,
            "title": doc.title,
            "kind": doc.kind,
            "concepts_introduced": sorted(introduced.get(doc.doc_id, [])),
        }
        for doc in sorted(docs, key=papers_first)
    ]
|
||||
|
||||
|
||||
def write_mermaid(nodes: list[dict], edges: list[dict], out_dir: Path | None = None) -> None:
    """Write the graph as a Mermaid flowchart to ``graph.mermaid``.

    Args:
        nodes: Node dicts with "id" and "title".
        edges: Edge dicts with "source", "target" and "type".
        out_dir: Directory to write into. Defaults to OUT_DIR.
    """

    def mm_id(node_id: str) -> str:
        # Replace every character outside [A-Za-z0-9_] so ids such as "A:name"
        # become plain identifiers.
        return re.sub(r"[^A-Za-z0-9_]", "_", node_id)

    def mm_label(text: str) -> str:
        # Labels are emitted inside ["..."]; a raw double quote would end the
        # label early, so use Mermaid's entity code for it instead.
        return text.replace('"', "#quot;")

    lines = ["graph TD"]
    for node in nodes:
        nid = mm_id(node["id"])
        label = mm_label(f'{node["id"]}: {node["title"]}')
        lines.append(f' {nid}["{label}"]')
    for edge in edges:
        src = mm_id(edge["source"])
        dst = mm_id(edge["target"])
        rel = edge["type"].replace('"', "")
        lines.append(f" {src} -->|{rel}| {dst}")

    target_dir = OUT_DIR if out_dir is None else out_dir
    (target_dir / "graph.mermaid").write_text("\n".join(lines) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
def extract_open_questions(paper: Document) -> list[str]:
    """Return the list items found under the paper's "## Open Questions" heading.

    The section starts at the first line that begins (case-insensitively,
    ignoring surrounding whitespace) with "## open questions" and ends at the
    next level-1 or level-2 heading. Inside it, every line that is a numbered
    item ("1. ...") or a bullet ("- ...", "* ...", "+ ...") contributes its
    text without the marker. Other lines, including continuation lines of a
    multi-line item, are ignored.

    Args:
        paper: Document whose ``text`` is scanned.

    Returns:
        The question texts in document order; empty when the heading is
        absent or the section has no list items.
    """
    item = re.compile(r"^(?:\d+\.|[-*+])\s+(.+)$")

    questions: list[str] = []
    in_section = False
    for line in paper.text.splitlines():
        stripped = line.strip()
        if not in_section:
            in_section = stripped.lower().startswith("## open questions")
            continue
        # Test the stripped line, as the section start does, so an indented
        # heading still closes the section; deeper "###" headings stay inside.
        if stripped.startswith(("# ", "## ")):
            break
        found = item.match(stripped)
        if found:
            questions.append(found.group(1).strip())
    return questions
|
||||
|
||||
|
||||
def question_keywords(text: str) -> set[str]:
    """Return the distinct lower-cased keywords of *text*.

    A keyword is a run of at least four letters/hyphens that starts with a
    letter and is not listed in STOPWORDS.
    """
    tokens = re.findall(r"[A-Za-z][A-Za-z\-]{3,}", text.lower())
    return set(tokens) - STOPWORDS
|
||||
|
||||
|
||||
def build_dangling_threads(papers: list[Document]) -> str:
    """Render the "Dangling Threads" markdown report.

    Each open question of each paper (see extract_open_questions) is listed
    with the later papers that appear to touch on it. A later paper counts
    when at least two of the question's keywords (see question_keywords)
    occur as substrings of its lower-cased text.

    Args:
        papers: The paper documents; they are processed in id order.

    Returns:
        The report as markdown text. When no paper yields any open question,
        the report contains a single notice saying so.
    """
    paper_map = {p.doc_id: p for p in papers}
    ordered_ids = sorted(paper_map)
    # Lower-case every paper once instead of once per question.
    lowered = {doc_id: paper_map[doc_id].text.lower() for doc_id in ordered_ids}

    lines = ["# Dangling Threads", ""]
    found_any = False

    for doc_id in ordered_ids:
        questions = extract_open_questions(paper_map[doc_id])
        if questions:
            # The closing notice is about whether any questions exist, so key
            # the flag on that rather than on keyword hits; otherwise the
            # report could list questions and still claim none were detected.
            found_any = True
        later_ids = [other for other in ordered_ids if other > doc_id]

        for question in questions:
            kws = question_keywords(question)
            hits = [
                other
                for other in later_ids
                if sum(1 for kw in kws if kw in lowered[other]) >= 2
            ]
            addressed = ", ".join(f"Paper {h}" for h in hits) if hits else "none detected"
            lines.append(
                f"- Raised in **Paper {doc_id}**: {question} \n"
                f" Partially addressed in later papers: {addressed}."
            )

    if not found_any:
        lines.append("- No open-question sections were detected in the source files.")
    lines.append("")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def build_concept_flow(
    papers: list[Document], intro: dict[str, str], usage: dict[str, set[str]], explicit_edges: list[dict]
) -> str:
    """Render the "Concept Flow" markdown report.

    For every concept in CONCEPT_CATALOG (in name order) the report lists
    where it was introduced, which papers challenge or revise it, which
    papers reference it, and the latest paper mentioning it. A paper
    challenges or revises a concept when one of its sentences contains both
    an alias of the concept and one of the respective marker words.
    *explicit_edges* is accepted but not read.

    Returns:
        The report as markdown text.
    """
    challenge_markers = ("challenge", "critic", "rebuttal", "against", "unfalsifiable")
    revision_markers = ("revision", "revised", "supersedes", "responds", "extends")

    def paper_listing(label: str, ids: list[str]) -> str:
        if not ids:
            return f"- {label}: none detected"
        return f"- {label}: {', '.join(f'Paper {p}' for p in ids)}"

    texts = {p.doc_id: p.text for p in papers}
    report = ["# Concept Flow", ""]

    for concept in sorted(CONCEPT_CATALOG):
        aliases = [alias.lower() for alias in CONCEPT_CATALOG[concept]["aliases"]]
        # Only papers count here; allegory ids in `usage` are filtered out.
        mentions = sorted(d for d in usage.get(concept, set()) if d in texts)

        challenged: set[str] = set()
        revised: set[str] = set()
        for doc_id in mentions:
            for sentence in sentence_chunks(texts[doc_id]):
                lowered = sentence.lower()
                if not any(alias in lowered for alias in aliases):
                    continue
                if any(marker in lowered for marker in challenge_markers):
                    challenged.add(doc_id)
                if any(marker in lowered for marker in revision_markers):
                    revised.add(doc_id)

        origin = intro.get(concept, "unknown")
        latest = mentions[-1] if mentions else "unknown"

        report.append(f"## {concept.title()}")
        if origin == "unknown":
            report.append("- Introduced in: unknown")
        else:
            report.append(f"- Introduced in: Paper {origin}")
        report.append(paper_listing("Challenged in", sorted(challenged)))
        report.append(paper_listing("Revised in", sorted(revised)))
        report.append(paper_listing("Referenced in", mentions))
        if latest == "unknown":
            report.append("- Current standing: unclear.")
        else:
            report.append(f"- Current standing: active in latest mention (Paper {latest}).")
        report.append("")

    return "\n".join(report)
|
||||
|
||||
|
||||
def main() -> None:
    """Generate all cross-reference artifacts and write them to OUT_DIR.

    Writes graph.json, graph.mermaid, dangling_threads.md and
    concept_flow.md, then prints the path of each file.
    """
    docs = read_documents()
    papers = [doc for doc in docs if doc.kind == "paper"]

    intro, usage = collect_concept_presence(docs)
    edge_map = extract_explicit_edges(docs)
    # Implicit concept edges are merged into the same map, in place.
    extract_implicit_edges(docs, intro, usage, edge_map)

    nodes = build_nodes(docs, intro)
    edges = sorted(edge_map.values(), key=lambda e: (e["source"], e["target"], e["type"]))

    graph_json = json.dumps({"nodes": nodes, "edges": edges}, indent=2) + "\n"
    (OUT_DIR / "graph.json").write_text(graph_json, encoding="utf-8")
    write_mermaid(nodes, edges)

    (OUT_DIR / "dangling_threads.md").write_text(build_dangling_threads(papers), encoding="utf-8")
    (OUT_DIR / "concept_flow.md").write_text(
        build_concept_flow(papers, intro, usage, edges), encoding="utf-8"
    )

    for artifact in ("graph.json", "graph.mermaid", "dangling_threads.md", "concept_flow.md"):
        print(f"Wrote {OUT_DIR / artifact}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user