#!/usr/bin/env python3
"""
extract_paper.py — Turn a research-paper PDF into structured material the agent
can reason over, and rip out the paper's own figures so they can be reused.

Usage:
    python3 extract_paper.py paper.pdf --out workdir/

Outputs into --out:
    paper_text.md      full text, page-delimited, with detected section headers
    figures/figNN.png  every embedded raster image that is at least 120 px on
                       both sides (the paper's real figures)
    figures.json       manifest: {file, page, width, height} for each figure
    meta.json          title guess, page count, word count, figure count

The agent reads paper_text.md to understand the work, decides which extracted
figures to drop straight into slides, and writes image prompts for the rest.
"""

import argparse
import json
import os
import re

try:
    import fitz  # PyMuPDF
except ImportError:  # reported with an actionable message in main()
    fitz = None

# Heuristic for section headings: either a numbered title ("3.1 Model Setup")
# or a line that starts with a well-known section keyword.
# NOTE: re.IGNORECASE applies to the whole pattern, so the [A-Z] class also
# accepts lowercase letters.
HEADING_RE = re.compile(
    r"^\s*((\d+(\.\d+)*)\s+[A-Z][\w\- ]{2,60}"
    r"|(abstract|introduction|related work|background|method(s|ology)?|"
    r"approach|architecture|experiments?|results?|evaluation|discussion|"
    r"conclusions?|limitations|references)\b.*)$",
    re.IGNORECASE,
)

# Lines this long or longer are treated as body text even if they look like
# a heading.
MAX_HEADING_LEN = 80

# Images smaller than this on either side are assumed to be logos / icons.
MIN_FIGURE_SIDE = 120

# Matches the page delimiters written by extract_text(), so they can be left
# out of the word count.
_PAGE_MARKER_RE = re.compile(r"<!-- page \d+ -->")


def extract_text(doc):
    """Return the document text as Markdown, one delimited chunk per page.

    Args:
        doc: An iterable of page objects exposing ``get_text("text")``
            (an open PyMuPDF document).

    Returns:
        A single string. Each page starts with an ``<!-- page N -->`` marker
        (1-based), blank lines are dropped, and short lines matching
        HEADING_RE are emitted as ``## `` headings.
    """
    parts = []
    for pno, page in enumerate(doc, start=1):
        # An HTML comment keeps the delimiter invisible in rendered Markdown
        # while still letting a reader map text back to a page number.
        parts.append(f"\n\n<!-- page {pno} -->\n\n")
        for line in page.get_text("text").splitlines():
            s = line.strip()
            if not s:
                continue
            if len(s) < MAX_HEADING_LEN and HEADING_RE.match(s):
                parts.append(f"\n## {s}\n")
            else:
                parts.append(s)
    return "\n".join(parts)


def extract_figures(doc, outdir, min_side=MIN_FIGURE_SIDE):
    """Save the document's embedded raster images as PNG files.

    Images are written to ``<outdir>/figures/figNN.png``. An image referenced
    from several pages (same xref) is saved once, attributed to the first
    page it appears on. Extraction is best-effort: a failure on one image is
    printed and the remaining images are still processed.

    Args:
        doc: An open PyMuPDF document.
        outdir: Output directory; the ``figures`` subdirectory is created.
        min_side: Minimum width and height in pixels; smaller images are
            skipped.

    Returns:
        A list of dicts with keys ``file``, ``page`` (1-based), ``width`` and
        ``height``, one per saved image, in extraction order.
    """
    figdir = os.path.join(outdir, "figures")
    os.makedirs(figdir, exist_ok=True)
    manifest = []
    seen = set()
    for pno, page in enumerate(doc, start=1):
        for img in page.get_images(full=True):
            xref = img[0]
            if xref in seen:
                continue
            seen.add(xref)
            try:
                pix = fitz.Pixmap(doc, xref)
                # Filter on size first so skipped images are never converted.
                if pix.width < min_side or pix.height < min_side:
                    continue  # skip logos / icons
                if pix.n - pix.alpha >= 4:  # CMYK → RGB
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                # Number from the manifest so a failed save leaves no gap.
                fn = f"fig{len(manifest) + 1:02d}.png"
                pix.save(os.path.join(figdir, fn))
            except Exception as e:  # deliberate: one bad image must not abort
                print(f" ! fig on page {pno}: {e}")
                continue
            manifest.append(dict(file=f"figures/{fn}", page=pno,
                                 width=pix.width, height=pix.height))
    return manifest


def guess_title(doc):
    """Return the best available title for the document.

    Uses the PDF metadata title when present; otherwise the first line on the
    first page longer than 15 characters; otherwise ``"Untitled"``.
    """
    title = ((doc.metadata or {}).get("title") or "").strip()
    if title:
        return title
    if doc.page_count == 0:
        return "Untitled"
    lines = (ln.strip() for ln in doc[0].get_text("text").splitlines())
    return next((ln for ln in lines if len(ln) > 15), "Untitled")


def main():
    """Command-line entry point: extract text, figures and metadata."""
    ap = argparse.ArgumentParser()
    ap.add_argument("pdf")
    ap.add_argument("--out", default="paper_work")
    args = ap.parse_args()

    if fitz is None:
        raise SystemExit(
            "PyMuPDF (the 'fitz' module) is required: pip install pymupdf"
        )

    os.makedirs(args.out, exist_ok=True)

    # The context manager guarantees the PDF handle is released on any exit.
    with fitz.open(args.pdf) as doc:
        text = extract_text(doc)
        # Explicit UTF-8: paper text routinely contains non-ASCII characters
        # that the platform default encoding may not be able to represent.
        with open(os.path.join(args.out, "paper_text.md"), "w",
                  encoding="utf-8") as fh:
            fh.write(text)

        figs = extract_figures(doc, args.out)
        with open(os.path.join(args.out, "figures.json"), "w",
                  encoding="utf-8") as fh:
            json.dump(figs, fh, indent=2)

        meta = dict(
            title=guess_title(doc),
            pages=doc.page_count,
            # Page markers are our own markup, not words of the paper.
            words=len(_PAGE_MARKER_RE.sub(" ", text).split()),
            figures=len(figs),
        )

    with open(os.path.join(args.out, "meta.json"), "w",
              encoding="utf-8") as fh:
        json.dump(meta, fh, indent=2)

    print(f"✓ {args.pdf}")
    print(f" pages={meta['pages']} words={meta['words']} "
          f"figures_extracted={len(figs)}")
    print(f" → {args.out}/paper_text.md, figures/, figures.json, meta.json")


if __name__ == "__main__":
    main()