Files

110 lines
3.7 KiB
Python

#!/usr/bin/env python3
"""
extract_paper.py — Turn a research-paper PDF into structured material the agent
can reason over, and rip out the paper's own figures so they can be reused.
Usage:
python3 extract_paper.py paper.pdf --out workdir/
Outputs into --out:
paper_text.md full text, page-delimited, with detected section headers
figures/figNN.png every embedded raster image of at least 120x120 px (the paper's real figures)
figures.json manifest: {file, page, width, height} for each figure
meta.json title guess, page count, word count, figure count
The agent reads paper_text.md to understand the work, decides which extracted
figures to drop straight into slides, and writes image prompts for the rest.
"""
import argparse
import json
import os
import re
import fitz # PyMuPDF
# Matches a whole (already stripped) line that looks like a section heading.
# Two alternatives:
#   1. a numbered heading ("3 Method", "2.1 Data Sets"): a dotted number,
#      whitespace, then a title that must start with an uppercase letter;
#   2. a well-known section keyword ("Abstract", "RELATED WORK", ...) at the
#      start of the line, in any letter case, followed by anything.
# Case-insensitivity is scoped to the keyword alternative via (?i:...).
# A pattern-wide re.IGNORECASE would also relax [A-Z], so ordinary sentences
# that merely start with a number ("3 of the 5 runs failed") would be
# treated as headings.
HEADING_RE = re.compile(
    r"^\s*((\d+(\.\d+)*)\s+[A-Z][\w\- ]{2,60}"
    r"|(?i:(abstract|introduction|related work|background|method(s|ology)?|"
    r"approach|architecture|experiments?|results?|evaluation|discussion|"
    r"conclusions?|limitations|references)\b.*))$"
)
def extract_text(doc):
    """Render the document as page-delimited Markdown text.

    Every page starts with an HTML comment marker carrying its 1-based
    number. Blank lines are dropped; each remaining line is stripped and
    emitted unchanged, except that lines shorter than 80 characters which
    match HEADING_RE are emitted as ``## `` headings.

    Args:
        doc: iterable of pages, each offering ``get_text("text")``
            (a PyMuPDF document in normal use).

    Returns:
        All emitted chunks joined with newlines, as a single string.
    """
    chunks = []
    for page_number, page in enumerate(doc, start=1):
        chunks.append(f"\n\n<!-- page {page_number} -->\n")
        stripped_lines = (
            raw.strip() for raw in page.get_text("text").splitlines()
        )
        for text in stripped_lines:
            if not text:
                continue
            # Very long lines are body text even if they start like a heading.
            is_heading = len(text) < 80 and HEADING_RE.match(text)
            chunks.append(f"\n## {text}\n" if is_heading else text)
    return "\n".join(chunks)
def extract_figures(doc, outdir):
    """Save the document's embedded raster images as PNG files.

    Images are written to ``<outdir>/figures/figNN.png``, numbered in the
    order they are successfully saved. An image referenced more than once
    (same xref) is saved once and attributed to the first page on which it
    is encountered. Images narrower or shorter than 120 px are skipped as
    likely logos or icons. A failure on one image is reported on stdout and
    does not stop the extraction.

    Args:
        doc: open PyMuPDF document (iterated page by page).
        outdir: output directory; a ``figures`` subdirectory is created
            inside it if missing.

    Returns:
        A list of dicts, one per saved image, with the keys ``file``
        (path relative to ``outdir``), ``page`` (1-based), ``width`` and
        ``height``.
    """
    figdir = os.path.join(outdir, "figures")
    os.makedirs(figdir, exist_ok=True)
    manifest = []
    seen = set()
    for pno, page in enumerate(doc, start=1):
        for img in page.get_images(full=True):
            xref = img[0]
            if xref in seen:
                continue
            seen.add(xref)
            try:
                pix = fitz.Pixmap(doc, xref)
                # Size-filter before the colorspace conversion so small
                # icons are not converted only to be thrown away.
                if pix.width < 120 or pix.height < 120:
                    continue  # skip logos / icons
                if pix.n - pix.alpha >= 4:  # CMYK → RGB
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                # Number by successful saves only: an image that fails to
                # save must not leave a gap in the figNN sequence.
                fn = f"fig{len(manifest) + 1:02d}.png"
                pix.save(os.path.join(figdir, fn))
            except Exception as e:
                # Deliberately best-effort: one unreadable image should not
                # abort extraction of the remaining figures.
                print(f" ! fig on page {pno}: {e}")
                continue
            manifest.append(dict(file=f"figures/{fn}", page=pno,
                                 width=pix.width, height=pix.height))
    return manifest
def main():
    """Command-line entry point: extract text, figures and metadata.

    Reads the positional ``pdf`` argument and the optional ``--out``
    directory (default ``paper_work``), then writes ``paper_text.md``,
    ``figures/``, ``figures.json`` and ``meta.json`` into the output
    directory and prints a short summary to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("pdf")
    ap.add_argument("--out", default="paper_work")
    args = ap.parse_args()
    os.makedirs(args.out, exist_ok=True)

    # The context manager closes the document even if extraction fails.
    with fitz.open(args.pdf) as doc:
        text = extract_text(doc)
        figs = extract_figures(doc, args.out)

        # metadata may be None (PyMuPDF documents this for encrypted files).
        title = ((doc.metadata or {}).get("title") or "").strip()
        if not title:
            # Fall back to the first reasonably long line of page 1; a
            # document without pages has no such line.
            first_page_lines = (
                doc[0].get_text("text").splitlines() if doc.page_count else []
            )
            title = next(
                (s for s in map(str.strip, first_page_lines) if len(s) > 15),
                "Untitled",
            )
        meta = dict(title=title, pages=doc.page_count,
                    words=len(text.split()), figures=len(figs))

    # Explicit UTF-8: extracted paper text routinely contains characters
    # that a platform-default encoding may be unable to represent.
    with open(os.path.join(args.out, "paper_text.md"), "w",
              encoding="utf-8") as fh:
        fh.write(text)
    with open(os.path.join(args.out, "figures.json"), "w",
              encoding="utf-8") as fh:
        json.dump(figs, fh, indent=2)
    with open(os.path.join(args.out, "meta.json"), "w",
              encoding="utf-8") as fh:
        json.dump(meta, fh, indent=2)

    print(f"{args.pdf}")
    print(f" pages={meta['pages']} words={meta['words']} "
          f"figures_extracted={len(figs)}")
    print(f"{args.out}/paper_text.md, figures/, figures.json, meta.json")
# Script entry point: run the CLI only on direct execution, not on import.
if __name__ == "__main__":
    main()