110 lines
3.7 KiB
Python
110 lines
3.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
extract_paper.py — Turn a research-paper PDF into structured material the agent
|
|
can reason over, and rip out the paper's own figures so they can be reused.
|
|
|
|
Usage:
|
|
python3 extract_paper.py paper.pdf --out workdir/
|
|
|
|
Outputs into --out:
|
|
paper_text.md full text, page-delimited, with detected section headers
|
|
figures/figNN.png every embedded raster image (the paper's real figures)
|
|
figures.json manifest: {file, page, width, height} for each figure
|
|
meta.json title guess, page count, word count, figure count
|
|
|
|
The agent reads paper_text.md to understand the work, decides which extracted
|
|
figures to drop straight into slides, and writes image prompts for the rest.
|
|
"""
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
|
|
import fitz # PyMuPDF
|
|
|
|
|
|
# Heuristic matcher for lines that look like section headings. Two shapes are
# accepted:
#   1. a numbered heading such as "3.2 Model Architecture": a dotted section
#      number, whitespace, then a title that must begin with a capital letter
#      and contain only word characters, hyphens and spaces;
#   2. a line that begins with a well-known section keyword ("Abstract",
#      "RESULTS", ...), matched case-insensitively.
# Case-insensitivity is scoped to the keyword branch only. Applying it to the
# whole pattern would let "[A-Z]" match lowercase letters too, turning body
# lines such as "12 of the 30 participants" into headings.
HEADING_RE = re.compile(
    r"^\s*((\d+(\.\d+)*)\s+[A-Z][\w\- ]{2,60}"
    r"|(?i:(abstract|introduction|related work|background|method(s|ology)?|"
    r"approach|architecture|experiments?|results?|evaluation|discussion|"
    r"conclusions?|limitations|references)\b.*))$"
)
|
|
|
|
|
|
def extract_text(doc):
    """Return the document's text as one page-delimited Markdown string.

    Every page contributes an HTML comment marker (``<!-- page N -->``, with N
    starting at 1) followed by its non-blank lines, each stripped of
    surrounding whitespace. A line shorter than 80 characters that matches
    ``HEADING_RE`` is emitted as a ``##`` heading; any other line is emitted
    unchanged. All pieces are joined with newlines.

    ``doc`` must be iterable, yielding pages that provide ``get_text("text")``.
    """
    chunks = []
    for page_number, page in enumerate(doc, start=1):
        chunks.append(f"\n\n<!-- page {page_number} -->\n")
        stripped = (raw.strip() for raw in page.get_text("text").splitlines())
        # filter(None, ...) drops lines that are empty after stripping.
        for text in filter(None, stripped):
            is_heading = len(text) < 80 and HEADING_RE.match(text)
            chunks.append(f"\n## {text}\n" if is_heading else text)
    return "\n".join(chunks)
|
|
|
|
|
|
def extract_figures(doc, outdir):
    """Save the document's embedded images as PNGs and return a manifest.

    Images are written to ``<outdir>/figures/figNN.png`` (the directory is
    created if needed), numbered in the order they are saved. Each image xref
    is processed at most once, even if several pages reference it. Images
    narrower or shorter than 120 pixels are skipped.

    Returns a list of dicts with the keys ``file`` (path relative to
    ``outdir``), ``page`` (1-based page where the image was first seen),
    ``width`` and ``height``.

    Extraction is best-effort: any exception raised while handling one image
    is printed and that image is skipped.
    """
    figures_dir = os.path.join(outdir, "figures")
    os.makedirs(figures_dir, exist_ok=True)

    records = []
    visited_xrefs = set()
    saved = 0
    for page_index, page in enumerate(doc):
        page_number = page_index + 1
        for image_info in page.get_images(full=True):
            xref = image_info[0]
            if xref in visited_xrefs:
                continue
            visited_xrefs.add(xref)
            try:
                pixmap = fitz.Pixmap(doc, xref)
                colour_channels = pixmap.n - pixmap.alpha
                if colour_channels >= 4:
                    # Four or more colour channels (CMYK): convert to RGB
                    # before writing the PNG.
                    pixmap = fitz.Pixmap(fitz.csRGB, pixmap)
                if min(pixmap.width, pixmap.height) < 120:
                    # Too small to be a real figure (logos / icons).
                    continue
                saved += 1
                filename = f"fig{saved:02d}.png"
                pixmap.save(os.path.join(figures_dir, filename))
                records.append({
                    "file": f"figures/{filename}",
                    "page": page_number,
                    "width": pixmap.width,
                    "height": pixmap.height,
                })
            except Exception as exc:
                print(f" ! fig on page {page_number}: {exc}")
    return records
|
|
|
|
|
|
def main():
    """Command-line entry point: extract text, figures and metadata from a PDF.

    Arguments (parsed from ``sys.argv``):
        pdf    path of the PDF to process
        --out  output directory (default ``paper_work``), created if missing

    Writes ``paper_text.md``, ``figures/``, ``figures.json`` and ``meta.json``
    into the output directory, then prints a short summary.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("pdf")
    ap.add_argument("--out", default="paper_work")
    args = ap.parse_args()
    os.makedirs(args.out, exist_ok=True)

    doc = fitz.open(args.pdf)
    try:
        text = extract_text(doc)
        # Paper text routinely contains non-ASCII characters; an explicit
        # encoding keeps the write from failing on non-UTF-8 default locales.
        with open(os.path.join(args.out, "paper_text.md"), "w",
                  encoding="utf-8") as fh:
            fh.write(text)

        figs = extract_figures(doc, args.out)
        with open(os.path.join(args.out, "figures.json"), "w",
                  encoding="utf-8") as fh:
            json.dump(figs, fh, indent=2)

        # Prefer the embedded title; tolerate a missing metadata dict.
        title = ((doc.metadata or {}).get("title") or "").strip()
        if not title:
            # Fall back to the first line of page 1 longer than 15 characters.
            # A zero-page document has no page 1, so it yields "Untitled".
            first_page_lines = (
                doc[0].get_text("text").splitlines() if doc.page_count else []
            )
            title = next(
                (l.strip() for l in first_page_lines if len(l.strip()) > 15),
                "Untitled",
            )
        meta = dict(title=title, pages=doc.page_count,
                    words=len(text.split()), figures=len(figs))
    finally:
        # Release the document even when extraction fails part-way.
        doc.close()

    with open(os.path.join(args.out, "meta.json"), "w",
              encoding="utf-8") as fh:
        json.dump(meta, fh, indent=2)

    print(f"✓ {args.pdf}")
    print(f" pages={meta['pages']} words={meta['words']} "
          f"figures_extracted={len(figs)}")
    print(f" → {args.out}/paper_text.md, figures/, figures.json, meta.json")
|
|
|
|
|
|
# Run the extractor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|