f654b30de9
Codex-built tooling: cross-reference graph, concept index with build script, and research integrator that extracted 142 scholars, 175 bibliography items, 4 contradiction topics, and coverage maps for Paper 009 planning. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
912 lines
29 KiB
Python
912 lines
29 KiB
Python
#!/usr/bin/env python3
|
|
"""Integrate research markdown files into a unified digest for Paper 009 planning."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
# ATX markdown heading: one to six '#', whitespace, then the heading text.
# MULTILINE lets the pattern be scanned over a whole document at once.
HEADING_RE = re.compile(r"^(#{1,6})\s+(.+?)\s*$", flags=re.MULTILINE)

# A "-" or "*" list item; group 1 is the item text without trailing whitespace.
BULLET_RE = re.compile(r"^\s*[-*]\s+(.+?)\s*$")

# A "<digits>." list item; group 1 is the number, group 2 the item text.
NUMBERED_RE = re.compile(r"^\s*(\d+)\.\s+(.+?)\s*$")

# Sentence boundary: whitespace following '.', '!' or '?', or any run of newlines.
SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")

# One name token: dotted initials ("W.V.O.") or a capitalised word that may
# contain apostrophes and hyphens.
_NAME_TOKEN = r"(?:[A-Z](?:\.[A-Z])+\.?|[A-Z][a-zA-Z'-]+)"

# A candidate personal name: two to four consecutive name tokens.
NAME_RE = re.compile(r"\b" + _NAME_TOKEN + r"(?:\s+" + _NAME_TOKEN + r"){1,3}\b")
|
|
# Lower-cased words that disqualify a capitalised phrase from being treated as
# a person's name (typically heading or topic vocabulary). Kept alphabetical.
BAD_NAME_WORDS = {
    "analysis",
    "executive",
    "history",
    "logs",
    "paper",
    "physics",
    "pricing",
    "quantum",
    "society",
    "sources",
    "summary",
    "task",
    "technology",
}
|
|
|
|
|
|
def _topic_rule(topic_id, label, pro_markers, con_markers):
    """Build one contradiction rule with the keys the detector reads."""
    return {
        "id": topic_id,
        "label": label,
        "pro_markers": list(pro_markers),
        "con_markers": list(con_markers),
    }


# Each rule pairs lower-case phrases that signal support for a position
# ("pro_markers") with phrases that signal a challenge to it ("con_markers").
# Marker order matters: the detector stops at the first marker it finds in a
# document, so earlier phrases take precedence.
TOPIC_RULES = [
    _topic_rule(
        "determinism_vs_agency",
        "Technological determinism vs social agency",
        pro_markers=(
            "autonomous technique",
            "irreversible",
            "lock-in",
            "path dependence",
            "ratchet",
            "structurally fixed",
            "cannot reverse",
        ),
        con_markers=(
            "social construct",
            "interpretive flexibility",
            "democratic rationalization",
            "human agency",
            "selective adoption",
            "tool taming",
            "re-shaped",
            "can change",
        ),
    ),
    _topic_rule(
        "unification_vs_homogenization",
        "Knowledge unification vs statistical homogenization",
        pro_markers=(
            "knowledge unification",
            "integration layer",
            "interconnectedness",
            "consilience",
            "compiled",
            "coherent",
        ),
        con_markers=(
            "stochastic parrot",
            "homogenization",
            "illusion",
            "veneer",
            "lossy",
            "lacks understanding",
            "database lookup",
        ),
    ),
    _topic_rule(
        "cognition_commodity_vs_mimicry",
        "AI cognition commodity vs token mimicry",
        pro_markers=(
            "cognition as a commodity",
            "price of thinking",
            "task-based framework",
            "automation",
            "productivity",
            "cognitive offloading",
        ),
        con_markers=(
            "stochastic parrot",
            "doesn't think",
            "mimicry",
            "predicts tokens",
            "no cognitive model",
        ),
    ),
    _topic_rule(
        "retrocausal_attractor",
        "Teleological attractor vs unfalsifiable retrocausality",
        pro_markers=(
            "teleological attractor",
            "retrocausal",
            "omega point",
            "final cause",
            "participatory universe",
            "transactional interpretation",
        ),
        con_markers=(
            "unfalsifiability",
            "pseudoscience",
            "woo",
            "causality violation",
            "superdeterminism",
        ),
    ),
    _topic_rule(
        "efficiency_vs_jevons",
        "Efficiency frees time vs Jevons expansion",
        pro_markers=(
            "efficiency gains",
            "free up human time",
            "productivity",
            "surplus",
            "cost disease",
        ),
        con_markers=(
            "jevons paradox",
            "increased consumption",
            "reasoning inflation",
            "more complex systems",
            "dependency",
        ),
    ),
]
|
|
|
|
|
|
# Lower-case phrase -> weight added to a counterargument bullet's score when
# the phrase occurs in the bullet text. Larger weights rank the bullet higher.
CHALLENGE_KEYWORDS = {
    "unfalsifiable": 5,
    "dogma": 4,
    "pseudoscience": 5,
    "illusion": 4,
    "mimicry": 4,
    "lacks understanding": 4,
    "circular": 3,
    "causality violation": 4,
    "superdeterminism": 3,
    "lossy": 2,
    "stochastic parrot": 5,
}
|
|
|
|
|
|
# Open-question number -> lower-case keywords whose presence in a sentence
# counts as evidence relevant to that question. Matching is by substring.
QUESTION_KEYWORDS = {
    1: [
        "falsifiable", "falsifiability", "unification", "replacement",
        "fragment", "distort", "evidence", "test", "stochastic",
    ],
    2: [
        "identity", "human", "consciousness", "agency",
        "values", "pragmatic", "continuity", "survival",
    ],
    3: [
        "individual", "workers", "labor", "skills",
        "strategy", "governance", "practical", "action",
    ],
    4: [
        "cheating", "tools", "dependency", "ratchet",
        "adoption", "ethics", "norm",
    ],
    5: [
        "timeline", "threshold", "when", "prediction",
        "curve", "years", "exponential", "phase",
    ],
}
|
|
|
|
|
|
@dataclass
class Doc:
    """One research markdown file held in memory.

    Instances are created by ``load_research_docs``; the extractors read
    ``slug``, ``text`` and ``sections``.
    """

    # Location the file was read from.
    path: Path
    # File name without its extension; used as the identifier in reports.
    slug: str
    # Text of the first "# " heading, or the file stem when none exists.
    title: str
    # Unmodified file contents.
    text: str
    # Lower-cased heading text -> body text that follows that heading.
    sections: dict[str, str]
|
|
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the integrator's command-line options.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing ``parse_args()`` callers are
            unaffected; passing a list allows programmatic use.

    Returns:
        Namespace with ``project_root`` and ``out_dir`` (always ``Path``) and
        ``research_dir`` / ``paper_008`` (``Path`` or ``None`` when omitted).
    """
    script_path = Path(__file__).resolve()

    # The default project root is two directories above the script's folder.
    # Clamp the index so a script stored near the filesystem root falls back
    # to the top-most ancestor instead of raising IndexError before any
    # arguments are even looked at.
    ancestors = script_path.parents
    default_root = ancestors[min(2, len(ancestors) - 1)]

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--project-root", type=Path, default=default_root)
    parser.add_argument("--research-dir", type=Path)
    parser.add_argument("--paper-008", type=Path)
    parser.add_argument("--out-dir", type=Path, default=script_path.parent)
    return parser.parse_args(argv)
|
|
|
|
|
|
def clean_inline_md(text: str) -> str:
    """Return *text* with inline markdown markup removed and whitespace collapsed.

    Code spans, bold, italic and ``[label](target)`` links are replaced by
    their inner text, in that order; runs of whitespace become one space.
    """
    # Applied sequentially: bold must be unwrapped before italic so "**x**"
    # is not half-consumed by the single-asterisk rule.
    unwrap_patterns = (
        r"`([^`]+)`",
        r"\*\*([^*]+)\*\*",
        r"\*([^*]+)\*",
        r"\[(.*?)\]\((.*?)\)",
    )
    cleaned = text.strip()
    for pattern in unwrap_patterns:
        cleaned = re.sub(pattern, r"\1", cleaned)
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip()
|
|
|
|
|
|
def extract_sections(text: str) -> dict[str, str]:
    """Split a markdown document into sections keyed by heading text.

    Every heading matched by ``HEADING_RE`` starts a section that runs to the
    next heading (of any level) or to the end of the document. Keys are the
    heading text with inline markdown removed, lower-cased.

    When the same heading occurs more than once, the bodies are joined with a
    blank line rather than letting the later section overwrite the earlier
    one, so no content is silently dropped.

    Returns:
        Mapping of normalised heading to stripped body text; empty when the
        document has no headings.
    """
    matches = list(HEADING_RE.finditer(text))
    sections: dict[str, str] = {}

    for idx, match in enumerate(matches):
        heading = clean_inline_md(match.group(2)).lower()
        is_last = idx + 1 == len(matches)
        body_end = len(text) if is_last else matches[idx + 1].start()
        body = text[match.end():body_end].strip()

        earlier = sections.get(heading)
        if earlier and body:
            # Repeated heading: keep both bodies.
            sections[heading] = f"{earlier}\n\n{body}"
        elif earlier:
            # Repeated heading with an empty body: nothing new to add.
            continue
        else:
            sections[heading] = body

    return sections
|
|
|
|
|
|
def load_research_docs(research_dir: Path) -> list[Doc]:
    """Read every ``*.md`` file directly inside *research_dir* into a ``Doc``.

    Files are processed in sorted path order. A document's title is the text
    of its first line starting with ``"# "`` (inline markdown removed); when
    no such line exists the file stem is used instead.
    """
    loaded: list[Doc] = []
    for md_path in sorted(research_dir.glob("*.md")):
        content = md_path.read_text(encoding="utf-8")

        # First top-level heading line, if any.
        first_h1 = next(
            (ln for ln in content.splitlines() if ln.startswith("# ")),
            None,
        )
        if first_h1 is None:
            doc_title = md_path.stem
        else:
            doc_title = clean_inline_md(first_h1[2:])

        loaded.append(
            Doc(
                path=md_path,
                slug=md_path.stem,
                title=doc_title,
                text=content,
                sections=extract_sections(content),
            )
        )
    return loaded
|
|
|
|
|
|
def normalize_person_name(name: str) -> str:
    """Reduce a person's name to a lower-case key used for de-duplication.

    Removes "et al.", turns "&" into "and", drops parenthesised text, replaces
    every character other than ASCII letters, space, ".", "'" and "-" with a
    space, then collapses whitespace.
    """
    key = re.sub(r"\bet al\.?", "", name, flags=re.IGNORECASE)
    key = key.replace("&", " and ")

    # (pattern, replacement) pairs applied in order.
    cleanup_steps = (
        (r"\([^)]*\)", ""),
        (r"[^A-Za-z .'-]", " "),
        (r"\s+", " "),
    )
    for pattern, replacement in cleanup_steps:
        key = re.sub(pattern, replacement, key)

    return key.strip().lower()
|
|
|
|
|
|
def split_possible_names(chunk: str) -> list[str]:
    """Extract candidate person names from the lead text of a bullet.

    Only the part before the first ":" is considered, with parenthesised and
    double-quoted spans removed. Every ``NAME_RE`` match of two or more words
    is kept unless one of its words is in ``BAD_NAME_WORDS``. When nothing
    qualifies, the remaining letters of the lead are returned as a single
    name if they start with an upper-case letter, are longer than three
    characters and are not a bad word.
    """
    lead = clean_inline_md(chunk).split(":", 1)[0]
    lead = re.sub(r"\([^)]*\)", "", lead)
    lead = re.sub(r'"[^"]+"', "", lead)
    lead = lead.replace("&", " and ")

    found: list[str] = []
    for raw_match in NAME_RE.findall(lead):
        tidy = re.sub(r"\s+", " ", raw_match).strip(" .,:;")
        tokens = [tok for tok in tidy.split() if tok and tok[0].isalpha()]
        if len(tokens) < 2:
            continue
        if any(tok.lower() in BAD_NAME_WORDS for tok in tokens):
            continue
        found.append(" ".join(tokens))

    if found:
        return found

    # Fallback: treat the whole lead as one single-word name.
    letters_only = re.sub(r"[^A-Za-z'-]", "", lead).strip()
    looks_like_name = (
        bool(letters_only)
        and letters_only[0].isupper()
        and letters_only.lower() not in BAD_NAME_WORDS
        and len(letters_only) > 3
    )
    if looks_like_name:
        found.append(letters_only)
    return found
|
|
|
|
|
|
def extract_scholars(docs: list[Doc]) -> dict[str, dict[str, Any]]:
    """Collect scholars listed in "key scholars" sections across *docs*.

    Candidate names come from bullet lines of every section whose heading
    contains "key scholars": the first bold span of the bullet if there is
    one, otherwise the text before the first ":". Names are merged under the
    key produced by ``normalize_person_name``.

    For each name, sentences of the same document that contain the name's
    last word (case-insensitive substring match) are counted as mentions and
    up to three of them are stored as context snippets.

    Returns:
        Mapping of normalised name to a dict with keys ``name``, ``aliases``
        (sorted list), ``files`` (sorted list of slugs), ``mention_count`` and
        ``contexts`` (list of ``{"file", "snippet"}`` dicts).
    """
    scholars: dict[str, dict[str, Any]] = {}

    for doc in docs:
        candidates: list[str] = []
        for heading, body in doc.sections.items():
            if "key scholars" not in heading:
                continue
            for line in body.splitlines():
                bullet_match = BULLET_RE.match(line)
                if not bullet_match:
                    continue
                raw = bullet_match.group(1)
                bold_match = re.search(r"\*\*([^*]+)\*\*", raw)
                if bold_match:
                    lead = bold_match.group(1)
                else:
                    lead = clean_inline_md(raw).split(":", 1)[0]
                candidates.extend(split_possible_names(lead))

        if not candidates:
            # Nothing to look up; skip the sentence scan for this document.
            continue

        # Split and clean the document once, not once per candidate name:
        # the result does not depend on the name being searched for.
        text_lower = doc.text.lower()
        sentences = [
            clean_inline_md(sentence)
            for sentence in SENTENCE_SPLIT_RE.split(doc.text)
        ]
        sentence_pairs = [(sentence, sentence.lower()) for sentence in sentences]

        for name in sorted(set(candidates)):
            key = normalize_person_name(name)
            if not key:
                continue
            entry = scholars.setdefault(
                key,
                {
                    "name": name,
                    "aliases": set(),
                    "files": set(),
                    "mention_count": 0,
                    "contexts": [],
                },
            )
            entry["aliases"].add(name)
            entry["files"].add(doc.slug)

            # NOTE: substring match on the last name word, so short surnames
            # can also match inside longer words.
            surname = name.split()[-1].lower().strip(".,")
            local_mentions: list[str] = []
            if surname:
                local_mentions = [
                    sentence
                    for sentence, lowered in sentence_pairs
                    if surname in lowered
                ]

            if not local_mentions:
                # Always record at least one mention per listing document.
                if surname and surname in text_lower:
                    local_mentions = [f"Mentioned in {doc.slug}"]
                else:
                    local_mentions = [f"Listed in {doc.slug}"]

            entry["mention_count"] += len(local_mentions)
            for snippet in local_mentions[:3]:
                entry["contexts"].append({"file": doc.slug, "snippet": snippet})

    # Sets are used while accumulating; convert to sorted lists so the result
    # is deterministic and JSON-serialisable.
    for entry in scholars.values():
        entry["aliases"] = sorted(entry["aliases"])
        entry["files"] = sorted(entry["files"])

    return scholars
|
|
|
|
|
|
def extract_title_from_source_line(line: str) -> str:
    """Guess the work's title from one bibliography bullet.

    Preference order: the first double-quoted span, then the first
    ``*italic*`` span, then the text between a ``(YYYY)`` year marker and the
    following full stop. Falls back to the whole cleaned line.
    """
    # Quoted or italic titles are looked up in the raw line so the emphasis
    # markers are still present.
    for wrapped_title in (r'"([^"]+)"', r"\*([^*]+)\*"):
        found = re.search(wrapped_title, line)
        if found:
            return clean_inline_md(found.group(1))

    flattened = clean_inline_md(line)
    year = re.search(r"\(\d{4}\)\.?", flattened)
    if year:
        remainder = flattened[year.end():].strip(" .:-")
        if remainder:
            return remainder.partition(".")[0].strip()

    return flattened
|
|
|
|
|
|
def extract_authors_from_source_line(line: str) -> list[str]:
    """Return the author strings found in one bibliography bullet.

    The author segment is everything before the first ``(YYYY)`` marker, or
    the whole cleaned line when there is none. "&" is treated as "and",
    "et al." is dropped, and the segment is split on the word "and" and on
    ";". Pieces without any letter are discarded.
    """
    cleaned = clean_inline_md(line)
    year = re.search(r"\(\d{4}\)", cleaned)
    author_part = cleaned if year is None else cleaned[: year.start()].strip()
    author_part = author_part.replace("&", " and ")
    author_part = re.sub(r"\bet al\.?", "", author_part, flags=re.IGNORECASE)

    authors: list[str] = []
    for piece in re.split(r"\band\b|;", author_part):
        trimmed = piece.strip(" ,.-")
        if trimmed and re.search(r"[A-Za-z]", trimmed):
            authors.append(trimmed)
    return authors
|
|
|
|
|
|
def normalize_title(title: str) -> str:
    """Return a lower-case key for *title* made of ASCII letters, digits and single spaces."""
    alnum_only = re.sub(r"[^a-z0-9 ]", " ", title.lower())
    # Only plain spaces remain as separators here, so split/join collapses
    # runs of them and trims both ends.
    return " ".join(alnum_only.split())
|
|
|
|
|
|
def extract_bibliography(docs: list[Doc]) -> dict[str, dict[str, Any]]:
    """Build a de-duplicated bibliography from the "sources" sections of *docs*.

    Every bullet in a section whose heading starts with "sources" is one
    citation. Citations are merged under their normalised title. Each entry
    holds ``title`` (first spelling seen), ``authors`` and ``files`` (sorted
    lists), ``raw_mentions`` (cleaned bullet texts) and ``relevance``, which
    is twice the number of citing files plus the number of mentions.
    """
    catalog: dict[str, dict[str, Any]] = {}

    for doc in docs:
        for heading, body in doc.sections.items():
            if not heading.startswith("sources"):
                continue
            for line in body.splitlines():
                bullet_match = BULLET_RE.match(line)
                if bullet_match is None:
                    continue
                citation = bullet_match.group(1)

                title = extract_title_from_source_line(citation)
                key = normalize_title(title) if title else ""
                if not key:
                    continue

                if key not in catalog:
                    catalog[key] = {
                        "title": title,
                        "authors": set(),
                        "files": set(),
                        "raw_mentions": [],
                    }
                record = catalog[key]
                record["files"].add(doc.slug)
                record["raw_mentions"].append(clean_inline_md(citation))
                record["authors"].update(extract_authors_from_source_line(citation))

    # Freeze the accumulator sets into sorted lists and attach the score.
    for record in catalog.values():
        record["authors"] = sorted(record["authors"])
        record["files"] = sorted(record["files"])
        record["relevance"] = 2 * len(record["files"]) + len(record["raw_mentions"])

    return catalog
|
|
|
|
|
|
def first_sentence_with_marker(text: str, marker: str) -> str | None:
    """Return the first sentence of *text* containing *marker*, cleaned of markdown.

    The comparison lower-cases each sentence but not *marker*, so callers are
    expected to pass a lower-case marker. Returns ``None`` when no sentence
    matches.
    """
    matching = (
        clean_inline_md(sentence)
        for sentence in SENTENCE_SPLIT_RE.split(text)
        if marker in sentence.lower()
    )
    return next(matching, None)
|
|
|
|
|
|
def detect_contradictions(docs: list[Doc]) -> list[dict[str, Any]]:
    """Report topics on which different files take opposing positions.

    For every rule in ``TOPIC_RULES`` each document contributes at most one
    supporting and one challenging piece of evidence: the first marker (in
    rule order) that occurs in the document and yields a snippet. A topic is
    reported only when at least one file supports without challenging and at
    least one other file challenges without supporting.
    """

    def first_hit(doc: Doc, markers: list[str]) -> dict[str, str] | None:
        # Evidence for the first marker present in the document, if any.
        lowered = doc.text.lower()
        for marker in markers:
            if marker not in lowered:
                continue
            snippet = first_sentence_with_marker(doc.text, marker)
            if snippet:
                return {"file": doc.slug, "marker": marker, "snippet": snippet}
        return None

    report: list[dict[str, Any]] = []

    for rule in TOPIC_RULES:
        supports: list[dict[str, str]] = []
        challenges: list[dict[str, str]] = []
        for doc in docs:
            pro_hit = first_hit(doc, rule["pro_markers"])
            if pro_hit is not None:
                supports.append(pro_hit)
            con_hit = first_hit(doc, rule["con_markers"])
            if con_hit is not None:
                challenges.append(con_hit)

        supporting_files = {hit["file"] for hit in supports}
        challenging_files = {hit["file"] for hit in challenges}

        only_supporting = supporting_files - challenging_files
        only_challenging = challenging_files - supporting_files
        if not (only_supporting and only_challenging):
            continue

        report.append(
            {
                "topic": rule["label"],
                "topic_id": rule["id"],
                "supports": supports[:4],
                "challenges": challenges[:4],
                "supporting_files": sorted(supporting_files),
                "challenging_files": sorted(challenging_files),
            }
        )

    return report
|
|
|
|
|
|
def extract_open_questions(paper_008: Path) -> list[dict[str, Any]]:
    """Read the numbered open questions from Paper 008.

    The questions are the numbered list items between the heading
    "## Open Questions for Paper 009" and the next "## " heading (or the end
    of the file).

    Returns:
        One ``{"id": int, "text": str}`` dict per numbered item, in file order.

    Raises:
        RuntimeError: If the heading is missing or no numbered item follows it.
    """
    paper_text = paper_008.read_text(encoding="utf-8")
    marker = "## Open Questions for Paper 009"

    _, found_marker, remainder = paper_text.partition(marker)
    if not found_marker:
        raise RuntimeError("Could not find 'Open Questions for Paper 009' in paper 008")

    # Stop at the next second-level heading so later sections are ignored.
    following_heading = re.search(r"\n##\s+", remainder)
    section = remainder if following_heading is None else remainder[: following_heading.start()]

    questions: list[dict[str, Any]] = []
    for line in section.splitlines():
        numbered = NUMBERED_RE.match(line)
        if numbered is None:
            continue
        number, raw_body = numbered.groups()
        # Drop any bold markers that survived inline cleaning.
        body = clean_inline_md(raw_body).replace("**", "")
        questions.append({"id": int(number), "text": body})

    if not questions:
        raise RuntimeError("No numbered open questions found in paper 008")
    return questions
|
|
|
|
|
|
def map_to_open_questions(
    docs: list[Doc], open_questions: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Score how well each research file covers each open question.

    A sentence scores one point for every keyword of the question (from
    ``QUESTION_KEYWORDS``, substring match on the lower-cased sentence) it
    contains. A file's score is the sum over its sentences; files scoring
    zero are omitted. The first three scoring sentences are kept as snippets.

    Args:
        docs: Loaded research documents.
        open_questions: Dicts with ``id`` and ``text`` keys.

    Returns:
        One dict per question, sorted by ``question_id``, with keys
        ``question_id``, ``question``, ``total_score`` and ``supporting_files``
        (per-file dicts sorted by descending score).
    """
    # Sentence splitting and markdown cleaning depend only on the document,
    # so do them once here rather than once per (question, document) pair.
    prepared: list[tuple[str, list[tuple[str, str]]]] = []
    for doc in docs:
        cleaned = [clean_inline_md(raw) for raw in SENTENCE_SPLIT_RE.split(doc.text)]
        prepared.append((doc.slug, [(sentence, sentence.lower()) for sentence in cleaned]))

    coverage: list[dict[str, Any]] = []
    for question in open_questions:
        qid = question["id"]
        # Questions without a keyword list simply receive no coverage.
        keywords = QUESTION_KEYWORDS.get(qid, [])

        file_scores: list[dict[str, Any]] = []
        total = 0
        for slug, sentences in prepared:
            score, snippets = _score_sentences(sentences, keywords)
            if not score:
                continue
            total += score
            file_scores.append(
                {
                    "file": slug,
                    "score": score,
                    "level": _coverage_level(score),
                    "snippets": snippets,
                }
            )

        file_scores.sort(key=lambda x: x["score"], reverse=True)
        coverage.append(
            {
                "question_id": qid,
                "question": question["text"],
                "total_score": total,
                "supporting_files": file_scores,
            }
        )

    coverage.sort(key=lambda x: x["question_id"])
    return coverage


def _score_sentences(
    sentences: list[tuple[str, str]], keywords: list[str]
) -> tuple[int, list[str]]:
    """Return (keyword hit total, first three matching sentences) for one document."""
    score = 0
    snippets: list[str] = []
    for sentence, lowered in sentences:
        hits = sum(1 for keyword in keywords if keyword in lowered)
        if not hits:
            continue
        score += hits
        if len(snippets) < 3:
            snippets.append(sentence)
    return score, snippets


def _coverage_level(score: int) -> str:
    """Bucket a file score: 12 or more is "high", 6 or more "medium", else "low"."""
    if score >= 12:
        return "high"
    if score >= 6:
        return "medium"
    return "low"
|
|
|
|
|
|
def extract_strongest_challenges(docs: list[Doc]) -> list[dict[str, Any]]:
    """Rank counterargument bullets by weighted keyword score.

    Bullets are taken from every section whose heading contains
    "counterarguments" or "critiques". Each bullet starts at score 1 and gains
    the weight of every ``CHALLENGE_KEYWORDS`` phrase it contains. Bullets
    with the same text (ignoring case) are merged: scores add up and the
    citing files are pooled.

    Returns:
        At most ten ``{"text", "score", "files"}`` dicts, highest score first.
    """
    merged: dict[str, dict[str, Any]] = {}

    for doc in docs:
        for heading, body in doc.sections.items():
            if "counterarguments" not in heading and "critiques" not in heading:
                continue
            for line in body.splitlines():
                bullet_match = BULLET_RE.match(line)
                if bullet_match is None:
                    continue
                bullet_text = clean_inline_md(bullet_match.group(1))
                lowered = bullet_text.lower()
                score = 1 + sum(
                    weight
                    for keyword, weight in CHALLENGE_KEYWORDS.items()
                    if keyword in lowered
                )

                bucket = merged.get(lowered)
                if bucket is None:
                    # First spelling seen is the one that gets reported.
                    merged[lowered] = {
                        "text": bullet_text,
                        "score": score,
                        "files": {doc.slug},
                    }
                else:
                    bucket["score"] += score
                    bucket["files"].add(doc.slug)

    strongest = sorted(merged.values(), key=lambda item: item["score"], reverse=True)[:10]
    return [
        {
            "text": item["text"],
            "score": item["score"],
            "files": sorted(item["files"]),
        }
        for item in strongest
    ]
|
|
|
|
|
|
def detect_emergent_themes(docs: list[Doc]) -> list[dict[str, Any]]:
    """Score a fixed set of cross-cutting themes by keyword frequency.

    A theme's score is the number of (substring) occurrences of its keywords
    in the lower-cased text of all documents combined. Themes with a score of
    zero are omitted; the rest are returned highest score first.
    """
    theme_keywords = (
        (
            "Governance and agency design",
            ("agency", "democratic", "community", "policy", "selective adoption", "governance"),
        ),
        (
            "Economic concentration and labor shift",
            ("labor", "capital", "commodity", "automation", "class", "pricing"),
        ),
        (
            "Epistemic reliability and grounding",
            ("understand", "stochastic", "illusion", "lossy", "falsifiable", "evidence"),
        ),
        (
            "Civilizational lock-in and resilience",
            ("lock-in", "path dependence", "retreat", "dependency", "ratchet", "reversal"),
        ),
    )

    haystack = "\n".join(doc.text.lower() for doc in docs)

    ranked: list[dict[str, Any]] = []
    for label, keywords in theme_keywords:
        signal = 0
        for keyword in keywords:
            signal += haystack.count(keyword)
        if signal > 0:
            ranked.append({"theme": label, "score": signal})

    return sorted(ranked, key=lambda entry: entry["score"], reverse=True)
|
|
|
|
|
|
def build_structured_result(
    docs: list[Doc],
    scholars: dict[str, dict[str, Any]],
    bibliography: dict[str, dict[str, Any]],
    contradictions: list[dict[str, Any]],
    open_question_coverage: list[dict[str, Any]],
    strongest_challenges: list[dict[str, Any]],
    emergent_themes: list[dict[str, Any]],
) -> dict[str, Any]:
    """Assemble all extraction results into one dictionary.

    Scholars are ordered by number of files, then mention count (both
    descending); bibliography entries by descending ``relevance``. The other
    inputs are passed through unchanged.
    """

    def scholar_rank(scholar: dict[str, Any]) -> tuple[int, int]:
        return (len(scholar["files"]), scholar["mention_count"])

    def bibliography_rank(item: dict[str, Any]) -> int:
        return item["relevance"]

    scholars_ranked = list(scholars.values())
    scholars_ranked.sort(key=scholar_rank, reverse=True)

    bibliography_ranked = list(bibliography.values())
    bibliography_ranked.sort(key=bibliography_rank, reverse=True)

    slugs = [doc.slug for doc in docs]
    meta = {
        "research_files": slugs,
        "research_file_count": len(docs),
    }

    return {
        "meta": meta,
        "scholars": scholars_ranked,
        "bibliography": bibliography_ranked,
        "contradictions": contradictions,
        "open_question_coverage": open_question_coverage,
        "strongest_challenges": strongest_challenges,
        "emergent_themes": emergent_themes,
    }
|
|
|
|
|
|
def render_digest(result: dict[str, Any]) -> str:
    """Render the structured result as the markdown research digest.

    Sections, in order: scope, top 20 scholars, top 40 bibliography entries,
    contradiction report, per-question coverage map (top 4 files, 2 snippets
    each) and strongest challenges.

    Args:
        result: Dictionary with the keys ``meta``, ``scholars``,
            ``bibliography``, ``contradictions``, ``open_question_coverage``
            and ``strongest_challenges``.

    Returns:
        The digest as a single newline-terminated string.
    """
    # Coverage levels are labels, not sortable strings: comparing them
    # alphabetically would rank "medium" above "low" above "high".
    level_rank = {"low": 0, "medium": 1, "high": 2}

    meta = result["meta"]
    lines: list[str] = [
        "# Integrated Research Digest",
        "",
        "## Scope",
        f"Processed {meta['research_file_count']} research file(s): "
        + ", ".join(meta["research_files"]),
        "",
    ]

    lines.append("## Scholars by Frequency")
    for scholar in result["scholars"][:20]:
        files = ", ".join(scholar["files"])
        lines.append(
            f"- **{scholar['name']}** — files: {len(scholar['files'])}; mentions: {scholar['mention_count']}; in: {files}"
        )
    lines.append("")

    lines.append("## Unified Bibliography")
    for item in result["bibliography"][:40]:
        authors = ", ".join(item["authors"]) if item["authors"] else "Unknown"
        files = ", ".join(item["files"])
        lines.append(
            f"- **{item['title']}** ({authors}) — relevance {item['relevance']}; cited in: {files}"
        )
    lines.append("")

    lines.append("## Contradiction Report")
    if not result["contradictions"]:
        lines.append("- No cross-file contradictions detected by the current heuristic.")
    for item in result["contradictions"]:
        lines.append(f"### {item['topic']}")
        lines.append("- Supporting evidence:")
        for support in item["supports"]:
            # Two-space indent so the entry nests under the bullet above.
            lines.append(
                f"  - `{support['file']}` ({support['marker']}): {support['snippet']}"
            )
        lines.append("- Challenging evidence:")
        for challenge in item["challenges"]:
            lines.append(
                f"  - `{challenge['file']}` ({challenge['marker']}): {challenge['snippet']}"
            )
        lines.append("")
    if lines[-1] != "":
        # Keep exactly one blank line before the next section heading.
        lines.append("")

    lines.append("## Paper 009 Coverage Map")
    for item in result["open_question_coverage"]:
        supporting = item["supporting_files"]
        if supporting:
            max_level = max(
                (fs["level"] for fs in supporting),
                key=lambda level: level_rank.get(level, -1),
            )
        else:
            max_level = "none"
        lines.append(
            f"### Q{item['question_id']} (total score {item['total_score']}, strongest level {max_level})"
        )
        lines.append(f"{item['question']}")
        if supporting:
            for fs in supporting[:4]:
                lines.append(f"- `{fs['file']}`: score {fs['score']} ({fs['level']})")
                for snip in fs["snippets"][:2]:
                    lines.append(f"  - {snip}")
        else:
            lines.append("- No supporting material detected.")
        lines.append("")
    if lines[-1] != "":
        lines.append("")

    lines.append("## Strongest Challenges")
    if not result["strongest_challenges"]:
        lines.append("- No challenge bullets detected.")
    for item in result["strongest_challenges"]:
        lines.append(
            f"- **Score {item['score']}** ({', '.join(item['files'])}): {item['text']}"
        )

    return "\n".join(lines) + "\n"
|
|
|
|
|
|
def render_outline(result: dict[str, Any]) -> str:
    """Render a suggested markdown outline for Paper 009.

    Questions are ordered by descending ``total_score``. The two best
    supported are listed as "most supported"; up to two of the remaining
    questions with the lowest scores are listed as "least supported", so a
    question never appears in both lists. One proposed section follows per
    question, then the top five challenges and top four emergent themes.

    Args:
        result: Dictionary with the keys ``open_question_coverage``,
            ``strongest_challenges`` and ``emergent_themes``.

    Returns:
        The outline as a single newline-terminated string.
    """
    coverage_sorted = sorted(
        result["open_question_coverage"], key=lambda x: x["total_score"], reverse=True
    )
    most_covered = coverage_sorted[:2]
    # Take the tail of what is left after the top two. With four or more
    # questions this is simply the last two; with fewer it avoids repeating a
    # question already listed as most supported.
    least_covered = coverage_sorted[2:][-2:]

    lines: list[str] = [
        "# Suggested Outline for Paper 009",
        "",
        "## Why This Sequence",
        "Order starts with heavily-supported questions, then closes with low-coverage questions that require new argumentation or new research.",
        "",
    ]

    lines.append("## Coverage Priorities")
    lines.append("- Most supported open questions:")
    for item in most_covered:
        # Two-space indent so the entry nests under the bullet above.
        lines.append(
            f"  - Q{item['question_id']} (score {item['total_score']}): {item['question']}"
        )
    lines.append("- Least supported open questions:")
    for item in least_covered:
        lines.append(
            f"  - Q{item['question_id']} (score {item['total_score']}): {item['question']}"
        )
    lines.append("")

    lines.append("## Proposed Sections")
    for item in coverage_sorted:
        qid = item["question_id"]
        lines.append(f"### Section {qid}: Q{qid}")
        lines.append(item["question"])
        supporting = item["supporting_files"]
        if supporting:
            top_files = ", ".join(fs["file"] for fs in supporting[:3])
            lines.append(f"- Primary evidence files: {top_files}")
            # One anchor claim from each of the two best files, when available.
            for fs in supporting[:2]:
                for snip in fs["snippets"][:1]:
                    lines.append(f"- Anchor claim: {snip}")
        else:
            lines.append("- Primary evidence files: none detected; requires fresh synthesis.")
        lines.append("")
    if lines[-1] != "":
        # Keep exactly one blank line before the next section heading.
        lines.append("")

    lines.append("## Cross-Cutting Counterarguments To Address Explicitly")
    for challenge in result["strongest_challenges"][:5]:
        lines.append(f"- {challenge['text']} ({', '.join(challenge['files'])})")
    lines.append("")

    lines.append("## New Themes To Add Beyond Original Open Questions")
    for theme in result["emergent_themes"][:4]:
        lines.append(f"- {theme['theme']} (signal score {theme['score']})")

    return "\n".join(lines) + "\n"
|
|
|
|
|
|
def main() -> int:
    """Run the full integration pipeline and write the three output files.

    Resolves the input and output locations from the command line, loads the
    research documents, runs every extractor, and writes ``integrated.json``,
    ``digest.md`` and ``009_outline_suggestion.md`` into the output directory.
    Progress is reported on stdout.

    Returns:
        ``0`` on completion.
    """
    args = parse_args()

    # Explicit paths win; otherwise fall back to locations under the root.
    project_root = args.project_root.resolve()
    research_arg = args.research_dir if args.research_dir else project_root / "research"
    research_dir = research_arg.resolve()
    paper_arg = args.paper_008 if args.paper_008 else project_root / "008-the-ship-of-theseus.md"
    paper_008 = paper_arg.resolve()
    out_dir = args.out_dir.resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    print(f"[integrator] project root: {project_root}")
    print(f"[integrator] research dir: {research_dir}")
    print(f"[integrator] paper 008: {paper_008}")
    print(f"[integrator] output dir: {out_dir}")

    docs = load_research_docs(research_dir)
    print(f"[integrator] loaded {len(docs)} research file(s)")
    if not docs:
        # Not fatal: the pipeline still runs and produces empty reports.
        print("[integrator] no research files found; writing empty digest/outline")

    open_questions = extract_open_questions(paper_008)
    print(f"[integrator] extracted {len(open_questions)} open question(s) from Paper 008")

    scholars = extract_scholars(docs)
    print(f"[integrator] extracted {len(scholars)} unique scholar name(s)")

    bibliography = extract_bibliography(docs)
    print(f"[integrator] extracted {len(bibliography)} bibliography item(s)")

    contradictions = detect_contradictions(docs)
    print(f"[integrator] detected {len(contradictions)} contradiction topic(s)")

    coverage = map_to_open_questions(docs, open_questions)
    print("[integrator] mapped research evidence to Paper 008 open questions")

    strongest_challenges = extract_strongest_challenges(docs)
    print(f"[integrator] ranked {len(strongest_challenges)} strongest challenge(s)")

    emergent_themes = detect_emergent_themes(docs)
    print(f"[integrator] found {len(emergent_themes)} emergent theme(s)")

    result = build_structured_result(
        docs,
        scholars,
        bibliography,
        contradictions,
        coverage,
        strongest_challenges,
        emergent_themes,
    )

    json_path = out_dir / "integrated.json"
    digest_path = out_dir / "digest.md"
    outline_path = out_dir / "009_outline_suggestion.md"

    # Each artefact is rendered immediately before it is written.
    json_path.write_text(json.dumps(result, indent=2), encoding="utf-8")
    digest_path.write_text(render_digest(result), encoding="utf-8")
    outline_path.write_text(render_outline(result), encoding="utf-8")

    for written in (json_path, digest_path, outline_path):
        print(f"[integrator] wrote {written}")

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s integer result as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|