docs: integration tools — cross-reference graph, concept index, research digest

Codex-built tooling: cross-reference graph, concept index with build script, and research integrator that extracted 142 scholars, 175 bibliography items, 4 contradiction topics, and coverage maps for Paper 009 planning. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 08:31:20 -04:00
parent d34f447e1f
commit f654b30de9
18 changed files with 12535 additions and 0 deletions
@@ -0,0 +1,470 @@
+#!/usr/bin/env python3
+"""Build cross-reference artifacts for the VIBECODE-THEORY paper series.
+
+Outputs:
+- graph.json
+- graph.mermaid
+- dangling_threads.md
+- concept_flow.md
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from collections import defaultdict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable
+
+
# Artifacts are written next to this script; the paper sources live two
# directories above the script's own folder.
OUT_DIR = Path(__file__).resolve().parent
ROOT = OUT_DIR.parents[1]

# Glob patterns, evaluated relative to ROOT, that select the input files.
PAPER_GLOB = "00*-*.md"
ALLEGORY_GLOB = "allegorical/*.md"
+
# Common words ignored when comparing an open question against later papers.
STOPWORDS = set(
    """
    about after again also because between could does from have into just
    like might more most over paper question series should than that their
    them then this those through what when which with would
    """.split()
)
+
# Relation labels ranked from strongest (8) to weakest (1). When two edges
# compete for the same (source, target) pair, the higher rank wins.
RELATION_PRIORITY = dict(
    zip(
        (
            "supersedes",
            "refutes",
            "challenges",
            "revises",
            "extends",
            "addresses",
            "introduces concept used by",
            "references",
        ),
        range(8, 0, -1),
    )
)
+
# Tracked concepts. Each entry maps the canonical concept name to the phrases
# ("aliases") searched for in document text and to the id of the paper that
# is declared to introduce the concept ("intro").
CONCEPT_CATALOG = {
    concept: {"aliases": list(aliases), "intro": intro_paper}
    for concept, intro_paper, aliases in (
        ("vibe coding as social skill", "001", ("vibe coding", "social skill", "meta-skill")),
        ("cognitive surplus", "002", ("cognitive surplus", "surplus")),
        ("dependency trap", "002", ("dependency trap", "systemic dependency")),
        ("cognitive preference shift", "005", ("cognitive preference shift", "preference shift")),
        ("automation spiral", "003", ("automation spiral",)),
        ("feedback loop", "006", ("feedback loop", "uncomfortable middle")),
        ("biological ratchet", "007", ("biological ratchet", "ratchet")),
        ("infrastructure threshold", "007", ("infrastructure threshold", "application phase")),
        ("premature dependency hibernation", "007", ("premature dependencies", "hibernation")),
        ("knowledge unification", "008", ("knowledge unification", "defragmentation")),
        ("ship of theseus identity problem", "008", ("ship of theseus", "species identity")),
        ("cheating frame", "008", ('"cheating"', "cheating frame")),
        ("dependency chain", "007", ("dependency chain",)),
    )
}
+
+
@dataclass
class Document:
    """A single source file of the series, held fully in memory."""

    # Graph node id: the paper number for papers, "A:<file stem>" for allegories.
    doc_id: str
    # Heading text, or a filename-derived fallback when no heading matched.
    title: str
    # Either "paper" or "allegory".
    kind: str
    # Location the file was read from.
    path: Path
    # Complete file contents.
    text: str
+
+
def read_documents() -> list[Document]:
    """Load every paper and allegory under ROOT.

    Papers (files matching PAPER_GLOB) come first, followed by allegories
    (files matching ALLEGORY_GLOB); each group is in sorted path order.

    A paper takes its id and title from a ``# Paper NNN: Title`` heading when
    one is present; otherwise the id is the filename prefix before the first
    ``-`` and the title is the file stem. An allegory is titled by its first
    ``#`` heading, falling back to the title-cased file stem, and gets the id
    ``A:<stem>``.
    """
    paper_heading = re.compile(r"^#\s+Paper\s+(\d{3}):\s*(.+)$", flags=re.M)
    first_heading = re.compile(r"^#\s+(.+)$", flags=re.M)

    loaded: list[Document] = []

    for paper_path in sorted(ROOT.glob(PAPER_GLOB)):
        body = paper_path.read_text(encoding="utf-8")
        heading = paper_heading.search(body)
        if heading is None:
            # No recognisable heading: derive both fields from the filename.
            number = paper_path.name.partition("-")[0]
            name = paper_path.stem
        else:
            number = heading.group(1)
            name = heading.group(2).strip()
        loaded.append(Document(doc_id=number, title=name, kind="paper", path=paper_path, text=body))

    for allegory_path in sorted(ROOT.glob(ALLEGORY_GLOB)):
        body = allegory_path.read_text(encoding="utf-8")
        heading = first_heading.search(body)
        if heading:
            name = heading.group(1).strip()
        else:
            name = allegory_path.stem.replace("-", " ").title()
        loaded.append(
            Document(
                doc_id=f"A:{allegory_path.stem}",
                title=name,
                kind="allegory",
                path=allegory_path,
                text=body,
            )
        )

    return loaded
+
+
def sentence_chunks(text: str) -> Iterable[str]:
    """Yield rough sentence-sized pieces of *text*.

    The text is cut after ``.``, ``!`` or ``?`` followed by whitespace, and at
    blank-line gaps. Each piece has its internal whitespace collapsed to
    single spaces; pieces that end up empty are skipped.
    """
    pieces = re.split(r"(?<=[.!?])\s+|\n{2,}", text)
    # str.split() with no argument already ignores leading/trailing whitespace.
    normalized = (" ".join(piece.split()) for piece in pieces)
    yield from (piece for piece in normalized if piece)
+
+
def classify_relationship(text: str) -> str:
    """Label the relationship a sentence expresses toward the paper it cites.

    Matching is a case-insensitive substring search. Rules are tried from the
    strongest relation to the weakest and the first rule with any matching cue
    wins; ``"references"`` is returned when no cue is present.
    """
    haystack = text.lower()
    # Order matters: earlier rules take precedence over later ones.
    rules = (
        ("supersedes", ("supersed",)),
        ("refutes", ("refute", "rebuttal", "against")),
        ("challenges", ("challenge", "critic", "unfalsifiable")),
        ("revises", ("revis",)),
        ("extends", ("extend",)),
        ("addresses", ("respond", "address", "engage")),
    )
    for label, cues in rules:
        for cue in cues:
            if cue in haystack:
                return label
    return "references"
+
+
def find_paper_targets(text: str) -> list[str]:
    """Return the sorted, de-duplicated paper ids (001-008) cited in *text*.

    Ids are only collected when the text also contains the word "paper"
    (any casing); otherwise an empty list is returned.
    """
    if "paper" in text.lower():
        cited = {hit.group(0) for hit in re.finditer(r"\b00[1-8]\b", text)}
        return sorted(cited)
    return []
+
+
def add_edge(edges: dict[tuple[str, str], dict], source: str, target: str, edge_type: str, context: str) -> None:
    """Record an edge in *edges*, keeping one edge per (source, target) pair.

    Self-loops are ignored. When the pair already has an edge, it is replaced
    only if *edge_type* ranks strictly higher in RELATION_PRIORITY, so on a
    tie the edge stored first is kept.
    """
    if source == target:
        return

    pair = (source, target)
    incumbent = edges.get(pair)
    # Short-circuit: priorities are only looked up when there is a competitor.
    if not incumbent or RELATION_PRIORITY[edge_type] > RELATION_PRIORITY[incumbent["type"]]:
        edges[pair] = {"source": source, "target": target, "type": edge_type, "context": context}
+
+
def extract_explicit_edges(docs: list[Document]) -> dict[tuple[str, str], dict]:
    """Build edges from explicit paper citations found in document text.

    For every document, in this order:

    1. Each sentence that cites papers yields one edge per cited paper, typed
       by ``classify_relationship`` and carrying the first 220 characters of
       the sentence as context.
    2. Paper 007 additionally gets an "extends" edge to every allegory whose
       name (file stem with hyphens turned into spaces) occurs in its text.
    3. Allegories get an "addresses" edge for every cited paper, which lifts
       any weaker relation recorded in step 1.

    Returns the edge map keyed by (source, target).
    """
    collected: dict[tuple[str, str], dict] = {}

    allegory_ids_by_name: dict[str, str] = {}
    for candidate in docs:
        if candidate.kind == "allegory":
            allegory_ids_by_name[candidate.path.stem.replace("-", " ")] = candidate.doc_id

    for doc in docs:
        # Pair each sentence with the papers it cites; reused by step 3.
        citations = [(sentence, find_paper_targets(sentence)) for sentence in sentence_chunks(doc.text)]

        for sentence, cited in citations:
            if not cited:
                continue
            relation = classify_relationship(sentence)
            for paper_id in cited:
                add_edge(collected, doc.doc_id, paper_id, relation, sentence[:220])

        if doc.kind == "paper" and doc.doc_id == "007":
            haystack = doc.text.lower()
            for name, allegory_id in allegory_ids_by_name.items():
                if name not in haystack:
                    continue
                add_edge(
                    collected,
                    doc.doc_id,
                    allegory_id,
                    "extends",
                    f"Paper 007 explicitly maps the {name.title()} allegory into the ratchet framework.",
                )

        if doc.kind == "allegory":
            for sentence, cited in citations:
                for paper_id in cited:
                    add_edge(collected, doc.doc_id, paper_id, "addresses", sentence[:220])

    return collected
+
+
def collect_concept_presence(docs: list[Document]) -> tuple[dict[str, str], dict[str, set[str]]]:
    """Find where each catalogued concept is introduced and where it is used.

    Documents are scanned papers-first (sorted by id), then allegories in the
    order given. A concept counts as present in a document when any of its
    aliases occurs in the text as a case-insensitive substring.

    Returns:
        ``(intro, usage)``. ``usage`` maps each concept to the ids of all
        documents mentioning it. ``intro`` maps each concept that was found at
        least once to the catalog's declared intro paper when that paper is
        among the loaded papers, otherwise to the first document in scan order
        that mentions it.
    """
    intro: dict[str, str] = {}
    usage: dict[str, set[str]] = defaultdict(set)

    papers = sorted((d for d in docs if d.kind == "paper"), key=lambda d: d.doc_id)
    allegories = [d for d in docs if d.kind == "allegory"]

    # Both lookups are loop-invariant, so build them once instead of once per
    # (document, concept) pair.
    known_paper_ids = {d.doc_id for d in papers}
    lowered_aliases = {
        concept: [alias.lower() for alias in info["aliases"]] for concept, info in CONCEPT_CATALOG.items()
    }

    for doc in papers + allegories:
        lower = doc.text.lower()
        for concept, info in CONCEPT_CATALOG.items():
            if not any(alias in lower for alias in lowered_aliases[concept]):
                continue
            usage[concept].add(doc.doc_id)
            declared = info["intro"]
            # setdefault keeps the first assignment, i.e. the earliest document
            # in scan order when the declared paper is not loaded.
            intro.setdefault(concept, declared if declared in known_paper_ids else doc.doc_id)

    return intro, usage
+
+
def extract_implicit_edges(
    docs: list[Document], intro: dict[str, str], usage: dict[str, set[str]], edges: dict[tuple[str, str], dict]
) -> None:
    """Add "introduces concept used by" edges from a concept's origin paper.

    For each concept whose introducing document is one of papers 001-008, an
    edge is added (in place, into *edges*) to every later paper 001-008 that
    also uses the concept. ``docs`` is accepted but not consulted.
    """
    numbered_paper = re.compile(r"^00[1-8]$")

    for concept, origin in intro.items():
        if numbered_paper.match(origin) is None:
            continue
        # Only papers that come strictly after the origin can "reuse" a concept.
        followers = [
            doc_id for doc_id in sorted(usage[concept]) if numbered_paper.match(doc_id) and doc_id > origin
        ]
        for follower in followers:
            add_edge(
                edges,
                origin,
                follower,
                "introduces concept used by",
                f"{concept} appears first in {origin} and recurs in {follower}.",
            )
+
+
def build_nodes(docs: list[Document], intro: dict[str, str]) -> list[dict]:
    """Return one graph node per document, papers first, each group by id.

    Every node carries the document's id, title and kind plus the sorted
    names of the concepts that *intro* attributes to that document.
    """
    introduced: dict[str, list[str]] = {}
    for concept_name, origin in intro.items():
        introduced.setdefault(origin, []).append(concept_name)

    # False sorts before True, so papers precede every other kind.
    papers_first = sorted(docs, key=lambda d: (d.kind != "paper", d.doc_id))
    return [
        {
            "id": doc.doc_id,
            "title": doc.title,
            "kind": doc.kind,
            "concepts_introduced": sorted(introduced.get(doc.doc_id, [])),
        }
        for doc in papers_first
    ]
+
+
def write_mermaid(nodes: list[dict], edges: list[dict]) -> None:
    """Write the graph as a Mermaid flowchart to ``OUT_DIR / "graph.mermaid"``.

    Args:
        nodes: Node dicts; the ``"id"`` and ``"title"`` keys are read.
        edges: Edge dicts; the ``"source"``, ``"target"`` and ``"type"`` keys
            are read.
    """

    def mm_id(node_id: str) -> str:
        # Node identifiers are reduced to letters, digits and underscores
        # (e.g. "A:some-stem" becomes "A_some_stem").
        return re.sub(r"[^A-Za-z0-9_]", "_", node_id)

    lines = ["graph TD"]
    for node in nodes:
        # The label sits inside ["..."], so a raw double quote in a title would
        # end the label early and break the diagram. Mermaid's #quot; entity
        # renders as a quote without terminating the string.
        label = f'{node["id"]}: {node["title"]}'.replace('"', "#quot;")
        lines.append(f'    {mm_id(node["id"])}["{label}"]')
    for edge in edges:
        src = mm_id(edge["source"])
        dst = mm_id(edge["target"])
        rel = edge["type"].replace('"', "")
        lines.append(f"    {src} -->|{rel}| {dst}")
    (OUT_DIR / "graph.mermaid").write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+
def extract_open_questions(paper: Document) -> list[str]:
    """Return the list items found under the paper's "## Open Questions" heading.

    The section starts at the first line that begins (ignoring case and
    surrounding whitespace) with ``## open questions`` and ends at the next
    line starting with ``## ``. Within it, every line that is a Markdown list
    item contributes its text: ordered items (``1.`` or ``1)``) and bullet
    items (``-``, ``*`` or ``+``). Other lines, including wrapped
    continuation lines, are ignored.

    Returns an empty list when the paper has no such section.
    """
    # One pattern both recognises a list item and captures its text, so the
    # marker does not have to be parsed twice.
    list_item = re.compile(r"^(?:\d+[.)]|[-*+])\s+(.*)$")

    remaining = iter(paper.text.splitlines())
    for line in remaining:
        if line.strip().lower().startswith("## open questions"):
            break
    else:
        return []

    questions: list[str] = []
    # The iterator is shared, so this loop resumes on the line after the heading.
    for line in remaining:
        if line.startswith("## "):
            break
        found = list_item.match(line.strip())
        if found is None:
            continue
        body = found.group(1).strip()
        if body:
            questions.append(body)
    return questions
+
+
def question_keywords(text: str) -> set[str]:
    """Return the distinct lower-cased keywords of *text*.

    A keyword starts with a letter, continues with letters or hyphens, is at
    least four characters long, and is not listed in STOPWORDS.
    """
    candidates = set(re.findall(r"[A-Za-z][A-Za-z\-]{3,}", text.lower()))
    return candidates - STOPWORDS
+
+
def build_dangling_threads(papers: list[Document]) -> str:
    """Render the "Dangling Threads" Markdown report.

    Each open question of each paper (papers in ascending id order) becomes
    one bullet. A later paper counts as partially addressing a question when
    at least two of the question's keywords occur in its text as
    case-insensitive substrings.

    Returns the report text; when no paper yields any open question, a single
    placeholder bullet is emitted instead.
    """
    paper_map = {p.doc_id: p for p in papers}
    ordered_ids = sorted(paper_map)
    # Lower-case each paper once rather than once per (question, later paper).
    lowered_text = {doc_id: paper_map[doc_id].text.lower() for doc_id in ordered_ids}

    lines = ["# Dangling Threads", ""]
    found_any = False

    for position, doc_id in enumerate(ordered_ids):
        # Ids are unique and sorted, so "later" is everything after this one.
        later_ids = ordered_ids[position + 1 :]

        for question in extract_open_questions(paper_map[doc_id]):
            found_any = True
            kws = question_keywords(question)
            hits = [
                other_id
                for other_id in later_ids
                if sum(1 for kw in kws if kw in lowered_text[other_id]) >= 2
            ]
            addressed = ", ".join(f"Paper {h}" for h in hits) if hits else "none detected"
            # Two trailing spaces before the newline force a Markdown line break.
            lines.append(
                f"- Raised in **Paper {doc_id}**: {question}  \n"
                f"  Partially addressed in later papers: {addressed}."
            )

    if not found_any:
        lines.append("- No open-question sections were detected in the source files.")
    lines.append("")
    return "\n".join(lines)
+
+
def build_concept_flow(
    papers: list[Document], intro: dict[str, str], usage: dict[str, set[str]], explicit_edges: list[dict]
) -> str:
    """Render the "Concept Flow" Markdown report, one section per concept.

    For every concept in CONCEPT_CATALOG (alphabetical order) the section
    lists the introducing paper, the papers that use it, and the subsets of
    those papers containing a sentence that mentions the concept together
    with a challenge cue or a revision cue. The highest-id paper using the
    concept is reported as its current standing.

    ``explicit_edges`` is accepted for interface compatibility and is not
    consulted.
    """
    challenge_cues = ("challenge", "critic", "rebuttal", "against", "unfalsifiable")
    revision_cues = ("revision", "revised", "supersedes", "responds", "extends")

    paper_ids = {p.doc_id for p in papers}
    # Split and lower-case each paper once instead of once per concept.
    sentences_by_paper = {p.doc_id: [s.lower() for s in sentence_chunks(p.text)] for p in papers}

    def paper_list(ids: list[str]) -> str:
        return ", ".join(f"Paper {p}" for p in ids) if ids else "none detected"

    lines = ["# Concept Flow", ""]

    for concept in sorted(CONCEPT_CATALOG):
        introduced = intro.get(concept, "unknown")
        used_in = sorted(d for d in usage.get(concept, set()) if d in paper_ids)
        aliases = [a.lower() for a in CONCEPT_CATALOG[concept]["aliases"]]

        challenged: set[str] = set()
        revised: set[str] = set()
        for doc_id in used_in:
            for sentence in sentences_by_paper[doc_id]:
                # Cues only count in sentences that actually name the concept.
                if not any(alias in sentence for alias in aliases):
                    continue
                if any(cue in sentence for cue in challenge_cues):
                    challenged.add(doc_id)
                if any(cue in sentence for cue in revision_cues):
                    revised.add(doc_id)

        current = used_in[-1] if used_in else "unknown"

        lines.append(f"## {concept.title()}")
        lines.append(f"- Introduced in: Paper {introduced}" if introduced != "unknown" else "- Introduced in: unknown")
        lines.append(f"- Challenged in: {paper_list(sorted(challenged))}")
        lines.append(f"- Revised in: {paper_list(sorted(revised))}")
        lines.append(f"- Referenced in: {paper_list(used_in)}")
        lines.append(
            f"- Current standing: active in latest mention (Paper {current})."
            if current != "unknown"
            else "- Current standing: unclear."
        )
        lines.append("")

    return "\n".join(lines)
+
+
def main() -> None:
    """Read the sources, build the graph, and write all four artifacts.

    Files are written to OUT_DIR in this order: graph.json, graph.mermaid,
    dangling_threads.md, concept_flow.md. The confirmation lines are printed
    only after every file has been written.
    """

    def emit(filename: str, content: str) -> None:
        (OUT_DIR / filename).write_text(content, encoding="utf-8")

    docs = read_documents()
    papers = [d for d in docs if d.kind == "paper"]

    intro, usage = collect_concept_presence(docs)
    edge_map = extract_explicit_edges(docs)
    # Implicit concept edges are merged into the same map, in place.
    extract_implicit_edges(docs, intro, usage, edge_map)

    nodes = build_nodes(docs, intro)
    edges = sorted(edge_map.values(), key=lambda e: (e["source"], e["target"], e["type"]))

    emit("graph.json", json.dumps({"nodes": nodes, "edges": edges}, indent=2) + "\n")
    write_mermaid(nodes, edges)
    emit("dangling_threads.md", build_dangling_threads(papers))
    emit("concept_flow.md", build_concept_flow(papers, intro, usage, edges))

    for filename in ("graph.json", "graph.mermaid", "dangling_threads.md", "concept_flow.md"):
        print(f"Wrote {OUT_DIR / filename}")


if __name__ == "__main__":
    main()