From 661ad32394aba6fa3200a58f1e33049c13feb890 Mon Sep 17 00:00:00 2001 From: ibn-ezzoubayr Date: Fri, 19 Jun 2026 21:47:51 +0100 Subject: [PATCH] First Commit: research-paper-presenter skill for hermes! --- research-paper-presenter/SKILL.md | 164 ++++++++ research-paper-presenter/requirements.txt | 7 + research-paper-presenter/scripts/build_odp.py | 370 ++++++++++++++++++ .../scripts/extract_paper.py | 109 ++++++ .../scripts/generate_images.py | 120 ++++++ 5 files changed, 770 insertions(+) create mode 100644 research-paper-presenter/SKILL.md create mode 100644 research-paper-presenter/requirements.txt create mode 100644 research-paper-presenter/scripts/build_odp.py create mode 100644 research-paper-presenter/scripts/extract_paper.py create mode 100644 research-paper-presenter/scripts/generate_images.py diff --git a/research-paper-presenter/SKILL.md b/research-paper-presenter/SKILL.md new file mode 100644 index 0000000..357352e --- /dev/null +++ b/research-paper-presenter/SKILL.md @@ -0,0 +1,164 @@ +# Research Paper → LibreOffice Impress Explainer + +## Purpose +Turn a research-paper PDF into a **detailed, image-rich `.odp` presentation** that +explains the work clearly enough for a learner to understand everything — the +problem, the core idea, the method, the math intuition, the results, and why it +matters. Output is a **native LibreOffice Impress file (`.odp`)** — never `.pptx`, +never PowerPoint. + +## When to use +The user gives you a paper (a `.pdf`, or an arXiv/URL they want explained) and +wants to *learn it* via slides. Phrases like "make slides from this paper", +"explain this paper", "turn this into a presentation", "deck I can study from". + +## What "good" looks like +- **Teaches, doesn't summarize.** Every claim from the paper is unpacked into + plain language with intuition first, formalism second. Assume the learner is + smart but new to the subfield. +- **Image-rich.** Most content slides carry a visual. Two image sources: + 1. 
**The paper's own figures**, extracted automatically (highest fidelity — + use these for the real architecture diagrams, result plots, tables). + 2. **Generated explainer visuals** from `gpt-image-2` — schematics, analogies, + step-by-step diagrams, intuition pictures that the paper *doesn't* contain. +- **Detailed.** A typical paper becomes **18–32 slides**, not 8. Break the method + into multiple slides. One idea per slide. +- **Coherent look.** Generated images share a style so the deck feels designed. + +--- + +## Pipeline (run in order) + +All scripts live in `scripts/`. Work inside a scratch dir, e.g. `workdir/`. + +### 1 — Extract the paper +```bash +python3 scripts/extract_paper.py PAPER.pdf --out workdir +``` +Produces `workdir/paper_text.md` (page-delimited text with detected section +headers), `workdir/figures/*.png` (the paper's real figures), `figures.json` +(manifest with page + dimensions), and `meta.json` (title, page/word counts). + +### 2 — Read and understand +Read `paper_text.md` end to end. Build a mental model: What problem? What was +broken before? What's the key insight? How does the method work mechanically? +What do the experiments show? What are the limits? **Do not start slides until +you can explain the paper to a beginner without looking.** + +Inspect the extracted figures (`workdir/figures/`). Decide which are worth putting +on slides directly (architecture diagrams, key result plots, tables). + +### 3 — Plan the deck + write image prompts +Draft two files: + +- `workdir/prompts.json` — visuals to generate (see schema below). Write a prompt + for each concept that benefits from a picture the paper lacks: the core + analogy, a simplified mechanism diagram, before/after comparisons, a "how data + flows" schematic, an intuition pump for the math. Aim for **roughly one + generated image per 1–2 content slides**, on top of reused paper figures. +- `workdir/deck.json` — the full deck spec (schema below). 
Reference generated + images as `<id>.png` and reused paper figures by their path + (`figures/fig03.png`). + +### 4 — Generate images +```bash +export OPENAI_API_KEY=... # Codex/Hermes usually has this in env +python3 scripts/generate_images.py workdir/prompts.json --assets workdir/assets +``` +Writes one PNG per prompt as `workdir/assets/<id>.png`. Also copy any reused +paper figures into the assets dir so everything resolves from one place: +```bash +mkdir -p workdir/assets/figures && cp workdir/figures/*.png workdir/assets/figures/ # keeps `figures/figNN.png` paths resolvable under --assets +``` +> If the Hermes runtime has **native gpt-image-2** generation (Codex does), you +> may generate images directly and just save them as `workdir/assets/<id>.png`. +> The script is the portable fallback. + +### 5 — Build the .odp +```bash +python3 scripts/build_odp.py workdir/deck.json workdir/output.odp --assets workdir/assets +``` +That's the deliverable. It opens directly in LibreOffice Impress on Ubuntu. +(Optional sanity check / PDF preview: +`libreoffice --headless --convert-to pdf workdir/output.odp`.) 
+ +--- + +## Deck spec schema (`deck.json`) + +```jsonc +{ + "theme": "midnight", // midnight | paper | forest + "slides": [ /* slide objects, in order */ ] +} +``` + +Slide objects by `type`: + +```jsonc +// Opening slide +{"type":"title","title":"...","subtitle":"...","eyebrow":"RESEARCH WALKTHROUGH", + "meta":"Authors, year • one line", "notes":"speaker notes (optional)"} + +// Section divider between major parts +{"type":"section","number":"02","title":"The Method","subtitle":"optional"} + +// Workhorse slide: bullets, with an optional image on the right +{"type":"content","kicker":"motivation","title":"...", + "bullets":["point","another point",{"text":"sub-point","level":1}], + "image":"diagram_attention.png", // omit for full-width text + "caption":"Fig 2 — ...", "notes":"..."} + +// Full-bleed image with a caption — use for big architecture diagrams / plots +{"type":"bigimage","kicker":"architecture","title":"...", + "image":"figures/fig03.png","caption":"...","notes":"..."} + +// Side-by-side comparison (before/after, baseline/proposed, RNN/Transformer) +{"type":"comparison","title":"...", + "left":{"heading":"Baseline","bullets":["...","..."]}, + "right":{"heading":"Proposed","bullets":["...","..."]}} + +// Pull-quote / key takeaway +{"type":"quote","text":"...","attribution":"paper abstract (paraphrased)"} +``` + +Notes: +- `bullets` items are strings, or `{"text": "...", "level": 1}` for one indent. +- `image` is resolved against `--assets`. A missing image renders a labelled + placeholder (the deck still builds), so a failed generation never blocks you. +- Put the deeper explanation a learner can read later into `notes` (speaker + notes) — keep on-slide bullets tight. + +## Image prompts schema (`prompts.json`) +```jsonc +[ + {"id":"attention_schematic", + "prompt":"Technical diagram: scaled dot-product attention. Show Q, K, V as " + "labelled boxes, a matrix multiply, a softmax, and a weighted sum. 
" + "Clear arrows and labels.", + "shape":"landscape"}, // landscape | portrait | square + {"id":"rnn_bottleneck","prompt":"...","shape":"portrait", + "transparent": false} +] +``` +- `id` becomes the filename (`<id>.png`) → reference it in `deck.json`. +- gpt-image-2 renders **text inside images** well, so labelled diagrams, + flowcharts and infographics are fair game — lean into them. +- A shared style suffix is auto-appended for visual coherence; override per-run + with `--style-suffix`. + +--- + +## Pedagogy checklist (the part that makes it a *learning* deck) +- Open with the **problem and stakes** before any method. +- For every mechanism: **intuition / analogy first**, then the precise version. +- Turn each equation into a sentence ("this just measures how similar two + vectors are, normalized so big dimensions don't blow up the scale"). +- Use `comparison` slides for "old way vs new way". +- Reuse the paper's real result figures; explain *what to look at* in the caption. +- End with: what's genuinely new, what it enables, and stated limitations. +- Prefer more slides over crowded ones. One idea per slide. + +## Theme choice +`midnight` (dark, technical — default), `paper` (warm light, academic), +`forest` (dark green). Pick one that fits the subject; keep it consistent. 
diff --git a/research-paper-presenter/requirements.txt b/research-paper-presenter/requirements.txt new file mode 100644 index 0000000..2d4b928 --- /dev/null +++ b/research-paper-presenter/requirements.txt @@ -0,0 +1,7 @@ +# Python deps for the research-paper-presenter skill +odfpy>=1.4.1 # native .odp generation (LibreOffice Impress) +PyMuPDF>=1.24 # PDF text + figure extraction (imported as: fitz) +openai>=1.40 # gpt-image-2 image generation (API fallback path) + +# System (Ubuntu): LibreOffice is only needed for the optional PDF preview: +# sudo apt-get install -y libreoffice-impress diff --git a/research-paper-presenter/scripts/build_odp.py b/research-paper-presenter/scripts/build_odp.py new file mode 100644 index 0000000..c7d2767 --- /dev/null +++ b/research-paper-presenter/scripts/build_odp.py @@ -0,0 +1,370 @@ +#!/usr/bin/env python3 +""" +build_odp.py — Render a deck spec (JSON) into a native LibreOffice Impress (.odp) file. + +This produces a TRUE OpenDocument Presentation (not a .pptx). No PowerPoint, no +conversion step. Images are embedded directly into the .odp container. + +Usage: + python3 build_odp.py deck.json out.odp [--assets ASSETS_DIR] + +The deck spec schema is documented in SKILL.md. Image fields are filenames that are +resolved relative to --assets (default: directory of deck.json). + +Slide types: title | section | content | bigimage | comparison | quote +""" +import argparse +import json +import os +import sys + +from odf.opendocument import OpenDocumentPresentation +from odf.style import ( + Style, MasterPage, PageLayout, PageLayoutProperties, + GraphicProperties, ParagraphProperties, TextProperties, + DrawingPageProperties, +) +from odf.text import P, Span +from odf.draw import Page, Frame, TextBox, Image, Rect +from odf.presentation import Notes + +# ---- 16:9 geometry (centimetres) ------------------------------------------- +PW, PH = 33.867, 19.05 # page width / height (PowerPoint-widescreen equiv.) 
MARGIN = 1.7  # outer slide margin, in centimetres

# ---- palette ---------------------------------------------------------------
THEMES = {
    "midnight": dict(bg="#0f172a", band="#1e293b", accent="#38bdf8",
                     accent2="#a78bfa", title="#f8fafc", body="#cbd5e1",
                     muted="#64748b", chip="#0ea5e9"),
    "paper": dict(bg="#fbfaf7", band="#efe9dd", accent="#b4541f",
                  accent2="#1f6f6b", title="#1c1917", body="#3f3a35",
                  muted="#8a8178", chip="#b4541f"),
    "forest": dict(bg="#0c1f1a", band="#13302a", accent="#34d399",
                   accent2="#fbbf24", title="#ecfdf5", body="#bbf7d0",
                   muted="#5e8b7e", chip="#10b981"),
}


def cm(v):
    """Format a number as an ODF length string in centimetres (3 decimals)."""
    return f"{v:.3f}cm"


class Deck:
    """Render slide specs onto an odfpy presentation document.

    ``doc`` is the OpenDocumentPresentation being filled; ``theme`` is one of
    the colour dicts in THEMES.  Call ``build(spec)`` to render a whole deck:
    it creates the master page that ``page()`` (and so every ``slide_*``
    renderer) relies on.
    """

    def __init__(self, doc, theme):
        self.doc = doc
        self.t = theme
        self._n = 0  # counter that keeps automatic style names unique
        self._styles = {}

    # -- style helpers -------------------------------------------------------
    def _key(self, prefix):
        """Return a fresh style name such as ``gr7`` for the given prefix."""
        self._n += 1
        return f"{prefix}{self._n}"

    def gstyle(self, fill=None, stroke="none"):
        """Register and return a graphic style (solid ``fill`` colour or none)."""
        st = Style(name=self._key("gr"), family="graphic")
        props = {}
        if fill:
            props.update(fill="solid", fillcolor=fill)
        else:
            props.update(fill="none")
        props["stroke"] = stroke
        st.addElement(GraphicProperties(**props))
        self.doc.automaticstyles.addElement(st)
        return st

    def pstyle(self, color, size, bold=False, align="left", spacing=None,
               font="Inter"):
        """Register and return a paragraph style.

        ``size`` is in points; ``spacing`` (cm) becomes the bottom margin and
        is omitted when falsy.
        """
        st = Style(name=self._key("pp"), family="paragraph")
        para = {"textalign": align}
        if spacing:
            para["marginbottom"] = cm(spacing)
        st.addElement(ParagraphProperties(**para))
        st.addElement(TextProperties(
            color=color, fontsize=f"{size}pt",
            fontweight="bold" if bold else "normal",
            fontfamily=font,
        ))
        self.doc.automaticstyles.addElement(st)
        return st

    def dpstyle(self, bg):
        """Register and return a drawing-page style with a solid background."""
        st = Style(name=self._key("dp"), family="drawing-page")
        st.addElement(DrawingPageProperties(fill="solid", fillcolor=bg,
                                            displayfooter="true",
                                            displaypagenumber="false"))
        self.doc.automaticstyles.addElement(st)
        return st

    def tspan(self, color, size, bold=False):
        """Register and return a text (span) style."""
        st = Style(name=self._key("ts"), family="text")
        st.addElement(TextProperties(color=color, fontsize=f"{size}pt",
                                     fontweight="bold" if bold else "normal",
                                     fontfamily="Inter"))
        self.doc.automaticstyles.addElement(st)
        return st

    # -- primitives ----------------------------------------------------------
    def page(self, bg=None):
        """Append a new slide (theme background unless ``bg`` is given)."""
        dp = self.dpstyle(bg or self.t["bg"])
        pg = Page(stylename=dp, masterpagename=self.master)
        self.doc.presentation.addElement(pg)
        return pg

    def rect(self, pg, x, y, w, h, fill):
        """Draw a filled, stroke-less rectangle; geometry is in centimetres."""
        f = Rect(stylename=self.gstyle(fill=fill),
                 x=cm(x), y=cm(y), width=cm(w), height=cm(h))
        pg.addElement(f)

    def text(self, pg, x, y, w, h, lines, color=None, size=None, bold=False,
             align="left", line_spacing=0.0, font="Inter"):
        """Add a text box with one paragraph per entry of ``lines``.

        lines: list of strings OR list of (text, size, color, bold) tuples.
        Plain strings use the ``color``/``size``/``bold`` arguments (falling
        back to white, 14pt); tuples carry their own formatting.
        """
        fr = Frame(stylename=self.gstyle(), x=cm(x), y=cm(y),
                   width=cm(w), height=cm(h))
        tb = TextBox()
        fr.addElement(tb)
        for ln in lines:
            if isinstance(ln, tuple):
                txt, s, c, b = ln
                ps = self.pstyle(c, s, b, align, line_spacing, font)
            else:
                txt = ln
                ps = self.pstyle(color or "#ffffff", size or 14,
                                 bold, align, line_spacing, font)
            tb.addElement(P(stylename=ps, text=txt))
        pg.addElement(fr)
        return fr

    def bullets(self, pg, x, y, w, h, items, color, size, accent,
                line_spacing=0.55):
        """Add a bulleted text box.

        ``items`` entries are strings, or dicts with ``text`` and an optional
        ``level`` (indent depth).  The bullet glyph is drawn in ``accent``.
        """
        fr = Frame(stylename=self.gstyle(), x=cm(x), y=cm(y),
                   width=cm(w), height=cm(h))
        tb = TextBox()
        fr.addElement(tb)
        for it in items:
            sub = isinstance(it, dict)
            txt = it["text"] if sub else it
            indent = it.get("level", 0) if sub else 0
            ps = Style(name=self._key("pp"), family="paragraph")
            # Hanging indent: the glyph sits in the negative text indent.
            ps.addElement(ParagraphProperties(
                marginbottom=cm(line_spacing),
                marginleft=cm(0.7 + indent * 0.9),
                textindent=cm(-0.7),
            ))
            ps.addElement(TextProperties(color=color, fontsize=f"{size}pt",
                                         fontfamily="Inter"))
            self.doc.automaticstyles.addElement(ps)
            bullet = "▸ " if indent == 0 else "– "
            p = P(stylename=ps)
            p.addElement(Span(text=bullet,
                              stylename=self.tspan(accent, size, bold=True)))
            p.addElement(Span(text=txt))
            tb.addElement(p)
        pg.addElement(fr)
        return fr

    def picture(self, pg, path, x, y, w, h, fit=True):
        """Embed the image at ``path`` inside the box (x, y, w, h).

        With ``fit`` the image keeps its aspect ratio and is centred in the
        box; if its pixel size cannot be read it is stretched to the box.
        A path that is not an existing file renders a labelled placeholder so
        the deck still builds.
        """
        if not os.path.isfile(path):
            # draw a placeholder so the deck still builds
            self.rect(pg, x, y, w, h, self.t["band"])
            self.text(pg, x, y + h / 2 - 0.6, w, 1.2,
                      [f"[missing image: {os.path.basename(path)}]"],
                      self.t["muted"], 12, align="center")
            return
        iw, ih = _image_dims(path)
        if fit and iw and ih:
            scale = min(w / iw, h / ih)
            dw, dh = iw * scale, ih * scale
            dx, dy = x + (w - dw) / 2, y + (h - dh) / 2
        else:
            dw, dh, dx, dy = w, h, x, y
        href = self.doc.addPicture(path)
        fr = Frame(x=cm(dx), y=cm(dy), width=cm(dw), height=cm(dh))
        fr.addElement(Image(href=href, type="simple", show="embed",
                            actuate="onLoad"))
        pg.addElement(fr)

    def notes(self, pg, text):
        """Attach speaker notes (one paragraph per line); no-op when empty."""
        if not text:
            return
        n = Notes()
        fr = Frame(x=cm(2), y=cm(12), width=cm(18), height=cm(6))
        tb = TextBox()
        fr.addElement(tb)
        for para in text.split("\n"):
            tb.addElement(P(text=para))
        n.addElement(fr)
        pg.addElement(n)

    @staticmethod
    def _image_path(s):
        """Return the slide's image path.

        Prefers the ``_imgpath`` that main() resolves against --assets and
        falls back to the raw ``image`` value (or "") so a Deck driven
        without main(), or a slide lacking an image, gets a placeholder
        instead of a KeyError.
        """
        return s.get("_imgpath") or s.get("image") or ""

    # -- slide renderers -----------------------------------------------------
    def slide_title(self, s):
        """Opening slide: eyebrow, title, optional subtitle and meta line."""
        pg = self.page()
        self.rect(pg, 0, PH * 0.42, PW, 0.12, self.t["accent"])
        self.rect(pg, 0, PH * 0.42 + 0.12, PW * 0.33, 0.12, self.t["accent2"])
        self.text(pg, MARGIN, PH * 0.18, PW - 2 * MARGIN, 3,
                  [(s.get("eyebrow", "RESEARCH WALKTHROUGH"), 14,
                    self.t["accent"], True)])
        self.text(pg, MARGIN, PH * 0.24, PW - 2 * MARGIN, 6,
                  [(s["title"], 40, self.t["title"], True)])
        if s.get("subtitle"):
            self.text(pg, MARGIN, PH * 0.55, PW - 2 * MARGIN, 4,
                      [(s["subtitle"], 19, self.t["body"], False)])
        meta = s.get("meta")
        if meta:
            self.text(pg, MARGIN, PH - 2.4, PW - 2 * MARGIN, 1.5,
                      [(meta, 13, self.t["muted"], False)])
        self.notes(pg, s.get("notes"))

    def slide_section(self, s):
        """Section divider: large number, title, optional subtitle."""
        pg = self.page(bg=self.t["band"])
        num = s.get("number", "")
        self.text(pg, MARGIN, PH * 0.28, 6, 4,
                  [(str(num), 80, self.t["accent"], True)])
        self.rect(pg, MARGIN, PH * 0.52, 4.5, 0.1, self.t["accent2"])
        self.text(pg, MARGIN, PH * 0.55, PW - 2 * MARGIN, 4,
                  [(s["title"], 34, self.t["title"], True)])
        if s.get("subtitle"):
            self.text(pg, MARGIN, PH * 0.72, PW - 2 * MARGIN, 3,
                      [(s["subtitle"], 17, self.t["body"], False)])
        self.notes(pg, s.get("notes"))

    def _header(self, pg, title, kicker=None):
        """Shared slide header: accent side bar, optional kicker, title, rule."""
        self.rect(pg, 0, 0, 0.45, PH, self.t["accent"])
        if kicker:
            self.text(pg, MARGIN, 1.0, PW - 2 * MARGIN, 1,
                      [(kicker.upper(), 12, self.t["accent"], True)])
        self.text(pg, MARGIN, 1.5, PW - 2 * MARGIN, 2.2,
                  [(title, 26, self.t["title"], True)])
        self.rect(pg, MARGIN, 3.5, 3.2, 0.08, self.t["accent2"])

    def slide_content(self, s):
        """Bullet slide; bullets narrow to the left when an image is present."""
        pg = self.page()
        self._header(pg, s["title"], s.get("kicker"))
        has_img = bool(s.get("image"))
        bx, bw = MARGIN, (PW * 0.52 if has_img else PW - 2 * MARGIN)
        self.bullets(pg, bx, 4.4, bw, PH - 5.5, s.get("bullets", []),
                     self.t["body"], 16, self.t["accent"])
        if has_img:
            ix = PW * 0.56
            iw = PW - ix - MARGIN
            self.picture(pg, self._image_path(s), ix, 4.4, iw, PH - 5.8)
            if s.get("caption"):
                self.text(pg, ix, PH - 1.4, iw, 1,
                          [(s["caption"], 11, self.t["muted"], False)],
                          self.t["muted"], 11, align="center")
        self.notes(pg, s.get("notes"))

    def slide_bigimage(self, s):
        """Header plus one large image and an optional centred caption."""
        pg = self.page()
        self._header(pg, s["title"], s.get("kicker"))
        self.picture(pg, self._image_path(s), MARGIN, 4.3,
                     PW - 2 * MARGIN, PH - 6.2)
        if s.get("caption"):
            self.text(pg, MARGIN, PH - 1.6, PW - 2 * MARGIN, 1,
                      [(s["caption"], 12, self.t["muted"], False)],
                      self.t["muted"], 12, align="center")
        self.notes(pg, s.get("notes"))

    def slide_comparison(self, s):
        """Two columns (``left`` / ``right``), each a heading bar plus bullets."""
        pg = self.page()
        self._header(pg, s["title"], s.get("kicker"))
        colw = (PW - 2 * MARGIN - 1.2) / 2
        left = s.get("left", {})
        right = s.get("right", {})
        for i, col in enumerate((left, right)):
            x = MARGIN + i * (colw + 1.2)
            tint = self.t["accent"] if i == 0 else self.t["accent2"]
            self.rect(pg, x, 4.3, colw, 1.0, tint)
            self.text(pg, x + 0.3, 4.5, colw - 0.6, 0.9,
                      [(col.get("heading", ""), 15, self.t["bg"], True)])
            self.bullets(pg, x, 5.7, colw, PH - 6.8,
                         col.get("bullets", []), self.t["body"], 15, tint)
        self.notes(pg, s.get("notes"))

    def slide_quote(self, s):
        """Pull-quote slide with an optional attribution line."""
        pg = self.page(bg=self.t["band"])
        self.text(pg, MARGIN, PH * 0.2, 4, 4,
                  [("\u201C", 90, self.t["accent"], True)])
        self.text(pg, MARGIN + 0.3, PH * 0.34, PW - 2 * MARGIN - 1, 6,
                  [(s["text"], 24, self.t["title"], False)])
        if s.get("attribution"):
            self.text(pg, MARGIN + 0.3, PH * 0.74, PW - 2 * MARGIN, 2,
                      [("— " + s["attribution"], 15, self.t["accent"], True)])
        self.notes(pg, s.get("notes"))

    # Maps the spec's "type" field to the plain function that renders it;
    # build() calls these as renderer(self, slide).
    RENDER = {
        "title": slide_title, "section": slide_section,
        "content": slide_content, "bigimage": slide_bigimage,
        "comparison": slide_comparison, "quote": slide_quote,
    }

    def build(self, spec):
        """Create the master page, then render every slide in ``spec``.

        Slides whose ``type`` is missing or unknown are reported on stderr
        and skipped.  Returns the number of slides actually rendered.
        """
        # master page + layout
        pl = PageLayout(name="DeckPL")
        pl.addElement(PageLayoutProperties(
            pagewidth=cm(PW), pageheight=cm(PH),
            printorientation="landscape"))
        self.doc.automaticstyles.addElement(pl)
        self.master = MasterPage(name="DeckMaster", pagelayoutname=pl)
        self.doc.masterstyles.addElement(self.master)

        rendered = 0
        for s in spec["slides"]:
            kind = s.get("type")
            renderer = self.RENDER.get(kind)
            if renderer is None:
                print(f" ! unknown slide type: {kind}", file=sys.stderr)
                continue
            renderer(self, s)
            rendered += 1
        return rendered


def _jpeg_dims(fh):
    """Scan JPEG marker segments for a start-of-frame header.

    ``fh`` is a binary file positioned just after the SOI marker.  Returns
    (width, height), or (None, None) if no frame header is found.
    """
    import struct

    # SOF0..SOF15 carry the frame size, except DHT (C4), JPG (C8), DAC (CC).
    sof = set(range(0xC0, 0xD0)) - {0xC4, 0xC8, 0xCC}
    # Markers with no length field: TEM, RST0-7 and SOI.
    standalone = {0x01, 0xD8} | set(range(0xD0, 0xD8))
    while True:
        byte = fh.read(1)
        if not byte:
            return None, None
        if byte != b"\xff":
            continue
        marker = fh.read(1)
        while marker == b"\xff":  # any number of 0xFF fill bytes is legal
            marker = fh.read(1)
        if not marker:
            return None, None
        code = marker[0]
        if code in sof:
            seg = fh.read(7)  # length(2) precision(1) height(2) width(2)
            if len(seg) < 7:
                return None, None
            h, w = struct.unpack(">HH", seg[3:7])
            return w, h
        if code in (0xD9, 0xDA):  # EOI / start of scan: no frame header seen
            return None, None
        if code == 0x00 or code in standalone:
            continue
        raw = fh.read(2)
        if len(raw) < 2:
            return None, None
        size = struct.unpack(">H", raw)[0]
        if size < 2:  # corrupt length; stop rather than loop in place
            return None, None
        fh.seek(size - 2, os.SEEK_CUR)


def _image_dims(path):
    """Return (width, height) in pixels of a PNG or JPEG file.

    Returns (None, None) when the file is unreadable, truncated, or in any
    other format; callers then stretch the image to its box instead.
    """
    import struct

    try:
        with open(path, "rb") as fh:
            head = fh.read(26)
            if head[:8] == b"\x89PNG\r\n\x1a\n":
                # IHDR width/height sit at fixed offsets 16..24.
                w, h = struct.unpack(">II", head[16:24])
                return w, h
            if head[:2] == b"\xff\xd8":  # jpeg — walk markers
                fh.seek(2)
                return _jpeg_dims(fh)
    except (OSError, struct.error):
        pass
    return None, None


def _resolve_image(assets, image):
    """Resolve a deck ``image`` value to a path under ``assets``.

    Tries ``assets/<image>`` first, then ``assets/<basename>`` so a reference
    such as ``figures/fig03.png`` still resolves after the figures were
    copied flat into the assets dir.  When neither exists the first candidate
    is returned, so the placeholder shows the expected file name.
    """
    primary = os.path.join(assets, image)
    if os.path.exists(primary):
        return primary
    flat = os.path.join(assets, os.path.basename(image))
    if os.path.exists(flat):
        return flat
    return primary


def main():
    """CLI entry point: read the deck spec, render it, save the .odp."""
    ap = argparse.ArgumentParser()
    ap.add_argument("spec")
    ap.add_argument("out")
    ap.add_argument("--assets", default=None)
    args = ap.parse_args()

    # Specs contain non-ASCII text (bullets, dashes); do not depend on the
    # platform's default encoding.
    with open(args.spec, encoding="utf-8") as fh:
        spec = json.load(fh)

    slides = spec.get("slides") if isinstance(spec, dict) else None
    if not isinstance(slides, list):
        sys.exit("deck spec must be an object with a 'slides' list")

    assets = args.assets or os.path.dirname(os.path.abspath(args.spec))
    for s in slides:
        if s.get("image"):
            s["_imgpath"] = _resolve_image(assets, s["image"])

    requested = spec.get("theme", "midnight")
    theme_name = requested if requested in THEMES else "midnight"
    if theme_name != requested:
        print(f" ! unknown theme {requested!r}, using midnight",
              file=sys.stderr)
    doc = OpenDocumentPresentation()
    n = Deck(doc, THEMES[theme_name]).build(spec)
    doc.save(args.out)
    print(f"✓ wrote {args.out} ({n} slides, theme={theme_name})")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# File: research-paper-presenter/scripts/extract_paper.py
"""
extract_paper.py — Turn a research-paper PDF into structured material the agent
can reason over, and rip out the paper's own figures so they can be reused.

Usage:
    python3 extract_paper.py paper.pdf --out workdir/

Outputs into --out:
    paper_text.md        full text, page-delimited, with detected section headers
    figures/figNN.png    every embedded raster image (the paper's real figures)
    figures.json         manifest: {file, page, width, height} for each figure
    meta.json            title guess, page count, word count

The agent reads paper_text.md to understand the work, decides which extracted
figures to drop straight into slides, and writes image prompts for the rest.
"""
import argparse
import json
import os
import re
import sys

try:
    import fitz  # PyMuPDF
except ImportError:  # reported by main() with an install hint
    fitz = None


# A heading is either a numbered title ("3.1 Model Architecture"), which must
# start with a capital letter, or a line opening with a well-known section
# keyword in any letter case.  Case-insensitivity is scoped to the keyword
# branch only; applied to the whole pattern it would let "[A-Z]" match lower
# case and turn body lines such as "12 of the samples ..." into headings.
HEADING_RE = re.compile(
    r"^\s*((\d+(\.\d+)*)\s+[A-Z][\w\- ]{2,60}"
    r"|(?i:(abstract|introduction|related work|background|method(s|ology)?|"
    r"approach|architecture|experiments?|results?|evaluation|discussion|"
    r"conclusions?|limitations|references)\b.*))$"
)


def extract_text(doc):
    """Return the document text as Markdown-ish, page-delimited text.

    ``doc`` is iterated page by page; each page must offer
    ``get_text("text")``.  Every page starts with a ``<!-- page N -->``
    marker (1-based).  Blank lines are dropped and lines that look like
    section headings (HEADING_RE, under 80 chars) become ``## `` headers.
    """
    parts = []
    for page_no, page in enumerate(doc, start=1):
        parts.append(f"\n\n<!-- page {page_no} -->\n")
        for line in page.get_text("text").splitlines():
            s = line.strip()
            if not s:
                continue
            if HEADING_RE.match(s) and len(s) < 80:
                parts.append(f"\n## {s}\n")
            else:
                parts.append(s)
    return "\n".join(parts)


def extract_figures(doc, outdir):
    """Save embedded raster images as ``<outdir>/figures/figNN.png``.

    Each image xref is exported once even if it appears on several pages;
    images under 120px on either side are skipped as logos/icons.  A figure
    that fails to convert or save is reported on stderr and skipped without
    consuming a number, so the files are numbered consecutively.

    Returns the manifest: a list of dicts with file, page (1-based), width
    and height.
    """
    figdir = os.path.join(outdir, "figures")
    os.makedirs(figdir, exist_ok=True)
    manifest, seen = [], set()
    for pno, page in enumerate(doc, start=1):
        for img in page.get_images(full=True):
            xref = img[0]
            if xref in seen:
                continue
            seen.add(xref)
            try:
                pix = fitz.Pixmap(doc, xref)
                if pix.n - pix.alpha >= 4:  # CMYK → RGB
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                if pix.width < 120 or pix.height < 120:
                    continue  # skip logos / icons
                fn = f"fig{len(manifest) + 1:02d}.png"
                pix.save(os.path.join(figdir, fn))
            except Exception as e:  # best effort: one bad image must not abort
                print(f" ! fig on page {pno}: {e}", file=sys.stderr)
                continue
            manifest.append(dict(file=f"figures/{fn}", page=pno,
                                 width=pix.width, height=pix.height))
    return manifest


def _guess_title(doc):
    """Return the PDF metadata title, else the first long line of page 1.

    Falls back to "Untitled" when there is no metadata title and the first
    page has no line longer than 15 characters (or there are no pages).
    """
    title = ((doc.metadata or {}).get("title") or "").strip()
    if title:
        return title
    if doc.page_count == 0:
        return "Untitled"
    candidates = (l.strip() for l in doc[0].get_text("text").splitlines())
    return next((c for c in candidates if len(c) > 15), "Untitled")


def main():
    """CLI entry point: write paper_text.md, figures/, figures.json, meta.json."""
    ap = argparse.ArgumentParser()
    ap.add_argument("pdf")
    ap.add_argument("--out", default="paper_work")
    args = ap.parse_args()
    if fitz is None:
        sys.exit("PyMuPDF not installed — run: pip install PyMuPDF")
    os.makedirs(args.out, exist_ok=True)

    # The context manager closes the document even if extraction fails.
    with fitz.open(args.pdf) as doc:
        text = extract_text(doc)
        # Paper text is full of non-ASCII symbols; write UTF-8 regardless of
        # the platform's default encoding.
        with open(os.path.join(args.out, "paper_text.md"), "w",
                  encoding="utf-8") as fh:
            fh.write(text)

        figs = extract_figures(doc, args.out)
        with open(os.path.join(args.out, "figures.json"), "w",
                  encoding="utf-8") as fh:
            json.dump(figs, fh, indent=2)

        meta = dict(title=_guess_title(doc), pages=doc.page_count,
                    words=len(text.split()), figures=len(figs))
    with open(os.path.join(args.out, "meta.json"), "w",
              encoding="utf-8") as fh:
        json.dump(meta, fh, indent=2)

    print(f"✓ {args.pdf}")
    print(f" pages={meta['pages']} words={meta['words']} "
          f"figures_extracted={len(figs)}")
    print(f" → {args.out}/paper_text.md, figures/, figures.json, meta.json")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# File: research-paper-presenter/scripts/generate_images.py
"""
generate_images.py — Generate slide visuals with OpenAI's gpt-image-2.

Reads a prompts file (JSON list) and writes one PNG per entry into --assets.
gpt-image-2 has reasoning + strong in-image text rendering, so it is well suited
to diagrams, labelled schematics and infographics — not just decorative art.

Prompts file format (JSON):
[
  {"id": "attention_schematic",
   "prompt": "Clean technical diagram of scaled dot-product attention ...",
   "shape": "landscape"},
  {"id": "rnn_vs_transformer", "prompt": "...", "shape": "portrait"}
]

Usage:
    export OPENAI_API_KEY=...
    python3 generate_images.py prompts.json --assets workdir/assets
    # options: --model gpt-image-2 --quality high --style-suffix "..."

Each PNG is saved as <id>.png — reference that exact filename in the deck spec's
"image" field.

NOTE for Codex/Hermes: if your agent runtime already has native gpt-image-2 image
generation, you may instead generate images directly and just save them as
<id>.png into the assets dir — this script is the portable API fallback.
"""
import argparse
import base64
import json
import os
import sys
import time

SIZES = {  # gpt-image-2 supported sizes
    "landscape": "1536x1024",
    "portrait": "1024x1536",
    "square": "1024x1024",
}

# Appended to every prompt so generated visuals share a coherent look that sits
# well on the deck background. Override with --style-suffix.
# NOTE: this default asks for a near-black background, so pass --style-suffix
# when the deck uses a light theme.
DEFAULT_STYLE = (
    " Modern flat editorial illustration / technical diagram style. "
    "Crisp vector-like shapes, generous whitespace, high contrast, "
    "legible labels, restrained palette of deep navy, cyan and violet on a "
    "near-black background. No watermark, no signature, no stock-photo look."
)


def make_client():
    """Return an OpenAI client, exiting with a hint if the SDK or key is missing."""
    try:
        from openai import OpenAI
    except ImportError:
        sys.exit("openai SDK not installed — run: pip install openai")
    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        sys.exit("OPENAI_API_KEY is not set.")
    return OpenAI()


def generate(client, model, prompt, size, quality, transparent):
    """Generate one image and return its decoded bytes.

    ``quality`` is sent only when truthy; ``transparent`` requests a
    transparent background.  Raises RuntimeError when the response carries
    no base64 payload, and propagates whatever the client raises.
    """
    kwargs = dict(model=model, prompt=prompt, size=size, n=1)
    # gpt-image models support quality + optional transparent background
    if quality:
        kwargs["quality"] = quality
    if transparent:
        kwargs["background"] = "transparent"
    resp = client.images.generate(**kwargs)
    payload = resp.data[0].b64_json if resp.data else None
    if not payload:
        raise RuntimeError("response contained no base64 image data")
    return base64.b64decode(payload)


def _valid_entry(item):
    """True if ``item`` has a usable ``id`` and a non-empty ``prompt``.

    The id becomes a file name inside --assets, so it must be a non-empty
    string without directory components.
    """
    if not isinstance(item, dict):
        return False
    iid, prompt = item.get("id"), item.get("prompt")
    if not isinstance(iid, str) or not iid or os.path.basename(iid) != iid:
        return False
    return isinstance(prompt, str) and bool(prompt.strip())


def _generate_with_retry(client, model, prompt, size, quality, transparent,
                         label, attempts=3, sleep=time.sleep):
    """Call generate() up to ``attempts`` times with a growing pause.

    Waits 3s, 6s, ... between tries.  Returns the image bytes, or None after
    the final failure; every failure is reported on stderr.
    """
    for attempt in range(1, attempts + 1):
        try:
            return generate(client, model, prompt, size, quality, transparent)
        except Exception as e:  # SDK/network error types vary by version
            if attempt == attempts:
                print(f" ✗ {label} gave up: {e}", file=sys.stderr)
                return None
            wait = 3 * attempt
            print(f" ! {label} attempt {attempt} failed: {e} "
                  f"(retry in {wait}s)", file=sys.stderr)
            sleep(wait)
    return None


def _write_atomic(path, data):
    """Write ``data`` to ``path`` via a temp file and an atomic rename.

    An interrupted run then never leaves a truncated PNG that a later run
    would skip as already existing.
    """
    tmp = f"{path}.part"
    with open(tmp, "wb") as fh:
        fh.write(data)
    os.replace(tmp, path)


def main():
    """CLI entry point: generate one PNG per prompt entry into --assets.

    Existing files are kept unless --overwrite is given.  Failures are
    reported but do not abort the batch or change the exit status, because
    the deck builder renders a placeholder for any missing image.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("prompts")
    ap.add_argument("--assets", default="assets")
    ap.add_argument("--model", default="gpt-image-2")
    ap.add_argument("--quality", default="high",
                    help="low | medium | high (gpt-image-2)")
    ap.add_argument("--style-suffix", default=DEFAULT_STYLE)
    ap.add_argument("--overwrite", action="store_true")
    args = ap.parse_args()

    os.makedirs(args.assets, exist_ok=True)
    with open(args.prompts, encoding="utf-8") as fh:
        prompts = json.load(fh)

    client = make_client()
    ok = 0
    failed = []
    for idx, item in enumerate(prompts):
        if not _valid_entry(item):
            # Skip the bad entry rather than crash and lose the whole batch.
            print(f" ✗ entry {idx}: needs a plain-filename 'id' and a "
                  f"non-empty 'prompt'", file=sys.stderr)
            failed.append(f"#{idx}")
            continue
        iid = item["id"]
        out = os.path.join(args.assets, f"{iid}.png")
        if os.path.exists(out) and not args.overwrite:
            print(f"· skip {iid} (exists)")
            ok += 1
            continue
        size = SIZES.get(item.get("shape", "landscape"), SIZES["landscape"])
        prompt = item["prompt"].strip() + args.style_suffix
        data = _generate_with_retry(client, args.model, prompt, size,
                                    args.quality,
                                    item.get("transparent", False), iid)
        if data is None:
            failed.append(iid)
            continue
        try:
            # Kept outside the retry loop: a disk error must not trigger
            # another paid generation.
            _write_atomic(out, data)
        except OSError as e:
            print(f" ✗ {iid} could not be written: {e}", file=sys.stderr)
            failed.append(iid)
            continue
        print(f"✓ {iid} ({size})")
        ok += 1
    print(f"\n{ok}/{len(prompts)} images ready in {args.assets}/")
    if failed:
        print(f"failed: {', '.join(failed)}", file=sys.stderr)


if __name__ == "__main__":
    main()