Files
thgtoa/scripts/convert.py
T
nopeitsnothing 343ad7f037 fix(convert): fail fast with helpful message if pdftoppm or qpdf missing
Previously the script crashed with a FileNotFoundError traceback when
system tools were absent. Now _check_dependencies() runs before any
work begins and prints install instructions for Linux/WSL, macOS, and
a pointer to develop.md for Windows.

Signed-off-by: nopeitsnothing <no@anonymousplanet.org>
2026-05-23 22:53:28 -04:00

265 lines
8.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Dark-mode PDF converter (pixel-based, batch-safe).
Rasterizes each page with pdftoppm, applies the hacker theme palette
pixel-by-pixel, then reassembles into a PDF. Processes in batches of 50
pages to stay within memory limits on large documents, then merges with qpdf.
Usage:
python scripts/convert.py INPUT.pdf [OUTPUT.pdf]
python scripts/convert.py INPUT.pdf [OUTPUT.pdf] [--dpi 200]
[--bg 1f1f31] [--text e0e0e0] [--link 5e8bde]
[--batch-size 50]
Examples:
python scripts/convert.py export/thgtoa.pdf export/thgtoa-dark.pdf
python scripts/convert.py export/thgtoa.pdf --dpi 150 --bg 0d1117
"""
from __future__ import annotations
import argparse
import glob
import os
import subprocess
import sys
import tempfile
from pathlib import Path
import numpy as np
from PIL import Image
# --------------------------------------------------------------------------- #
# Defaults (Hacker theme)
# --------------------------------------------------------------------------- #
# Palette entries are (R, G, B) tuples of 0-255 ints, spelled as the same
# six-digit hex strings the CLI options use as their defaults.
DEFAULT_BG = tuple(bytes.fromhex('1f1f31'))    # page background
DEFAULT_TEXT = tuple(bytes.fromhex('e0e0e0'))  # body text / ink
DEFAULT_LINK = tuple(bytes.fromhex('5e8bde'))  # blue-ish (link) content
DEFAULT_DPI = 200    # rasterization resolution handed to pdftoppm
DEFAULT_BATCH = 50   # pages themed and written per batch
def hex_to_rgb(h: str) -> tuple:
    """Parse a hex color string into an ``(r, g, b)`` tuple of 0-255 ints.

    Accepts ``RRGGBB`` or the three-digit shorthand ``RGB`` (each digit is
    doubled, as in CSS), with or without a leading ``#`` and regardless of
    letter case. Surrounding whitespace is ignored.

    Raises:
        ValueError: if the string is not exactly 3 or 6 hexadecimal digits.
    """
    digits = h.strip().lstrip('#')
    if len(digits) == 3:
        # Expand shorthand: 'abc' -> 'aabbcc'.
        digits = ''.join(ch * 2 for ch in digits)
    if len(digits) != 6:
        raise ValueError(
            f"invalid hex color {h!r}: expected 3 or 6 hex digits"
        )
    try:
        # bytes.fromhex is stricter than int(x, 16): it rejects signs,
        # underscores and '0x' prefixes.
        channels = bytes.fromhex(digits)
    except ValueError:
        raise ValueError(
            f"invalid hex color {h!r}: expected 3 or 6 hex digits"
        ) from None
    if len(channels) != 3:
        # fromhex skips embedded spaces, which would leave too few bytes.
        raise ValueError(
            f"invalid hex color {h!r}: expected 3 or 6 hex digits"
        )
    return tuple(channels)
def apply_dark_theme(
    img: Image.Image,
    bg=DEFAULT_BG,
    text=DEFAULT_TEXT,
    link=DEFAULT_LINK,
) -> Image.Image:
    """
    Recolor a light-background page raster with a dark palette.

    Each pixel is classified by its luma (0.299 R + 0.587 G + 0.114 B on a
    0-1 scale):
    - luma >= 0.85 (near white)               → `bg`
    - darker and blue-dominant                → blended from `bg` toward `link`
    - darker otherwise (ink/text)             → blended from `bg` toward `text`
    The blend weight grows as the source pixel gets darker, so anti-aliased
    glyph edges fade smoothly into the background.

    `bg`, `text` and `link` are (R, G, B) sequences of 0-255 values.
    Returns a new RGB image; `img` itself is not modified.
    """
    pixels = np.array(img.convert('RGB'), dtype=np.float32)
    red, green, blue = pixels[:, :, 0], pixels[:, :, 1], pixels[:, :, 2]
    unit = pixels / 255.0

    luma = (
        0.299 * unit[:, :, 0]
        + 0.587 * unit[:, :, 1]
        + 0.114 * unit[:, :, 2]
    )
    is_dark = luma < 0.85

    # "Blue-ish": a reasonably strong blue channel that clearly beats red
    # and is at least comparable to green, on a pixel that is not near-white.
    is_link = (blue > 100) & (blue > red * 1.3) & (blue > green * 0.9) & is_dark
    is_ink = is_dark & ~is_link

    # 0 for near-white source pixels, 1 once luma drops to 0.15 or below.
    strength = ((1.0 - luma) / 0.85).clip(0, 1)

    themed = np.zeros_like(unit)
    for idx, (bg_c, text_c, link_c) in enumerate(zip(bg, text, link)):
        base = bg_c / 255.0
        ink = text_c / 255.0
        accent = link_c / 255.0
        # The two masks never overlap, so the nesting order is immaterial.
        themed[:, :, idx] = np.where(
            is_link,
            base + strength * (accent - base),
            np.where(is_ink, base + strength * (ink - base), base),
        )

    as_bytes = (themed * 255).clip(0, 255).astype('uint8')
    return Image.fromarray(as_bytes)
def _save_images_as_pdf(images: list, output_path: str) -> None:
    """Write a list of PIL images to `output_path` as a lossless PDF.

    qpdf only accepts PDF files as inputs, so it cannot assemble PNG pages
    into a document, and Pillow's own PDF writer stores RGB pages as JPEG
    (lossy, and dependent on a JPEG encoder being available). This helper
    therefore writes a minimal PDF itself: one page per image, each page
    showing a single Flate-compressed (zlib) RGB image that fills it.

    Page size: if an image carries a resolution in ``img.info['dpi']``
    (a number or an ``(x, y)`` pair) the page is sized so the image prints
    at that resolution; otherwise one pixel maps to one PDF point (72 DPI).

    Pages are compressed and written one at a time, so only a single
    page's compressed data is held in memory beyond the images themselves.

    Raises:
        ValueError: if `images` is empty.
    """
    import zlib  # local import keeps this helper self-contained

    if not images:
        raise ValueError("cannot write a PDF without any pages")

    def points_per_pixel(img) -> tuple:
        """Return (x, y) scale factors from pixels to PDF points."""
        declared = (getattr(img, 'info', None) or {}).get('dpi') or 72.0
        try:
            x_dpi, y_dpi = declared
        except TypeError:
            x_dpi = y_dpi = declared
        scales = []
        for value in (x_dpi, y_dpi):
            try:
                value = float(value)
            except (TypeError, ValueError):
                value = 72.0
            # A zero or negative resolution is meaningless; fall back to 72.
            scales.append(72.0 / value if value > 0 else 1.0)
        return tuple(scales)

    page_total = len(images)
    # Object numbering: 1 = catalog, 2 = page tree, then three objects per
    # page (page dictionary, image XObject, content stream).
    object_total = 2 + 3 * page_total
    offsets = [0] * (object_total + 1)  # indexed by object number; 0 unused

    with open(output_path, 'wb') as pdf:

        def write_object(number, dictionary, stream=None):
            """Write one indirect object and record its byte offset."""
            offsets[number] = pdf.tell()
            pdf.write(b'%d 0 obj\n' % number)
            pdf.write(dictionary.encode('ascii'))
            if stream is not None:
                pdf.write(b'\nstream\n')
                pdf.write(stream)
                pdf.write(b'\nendstream')
            pdf.write(b'\nendobj\n')

        # The second line is the conventional "binary file" marker comment.
        pdf.write(b'%PDF-1.4\n%\xe2\xe3\xcf\xd3\n')
        write_object(1, '<< /Type /Catalog /Pages 2 0 R >>')
        kids = ' '.join(f'{3 + 3 * i} 0 R' for i in range(page_total))
        write_object(
            2, f'<< /Type /Pages /Count {page_total} /Kids [{kids}] >>'
        )

        for index, img in enumerate(images):
            page_id = 3 + 3 * index
            image_id = page_id + 1
            content_id = page_id + 2

            rgb = img.convert('RGB')
            width, height = rgb.size
            x_scale, y_scale = points_per_pixel(img)
            page_w = width * x_scale
            page_h = height * y_scale

            write_object(
                page_id,
                f'<< /Type /Page /Parent 2 0 R '
                f'/MediaBox [0 0 {page_w:.2f} {page_h:.2f}] '
                f'/Resources << /XObject << /Im0 {image_id} 0 R >> >> '
                f'/Contents {content_id} 0 R >>',
            )

            # tobytes() yields rows top-to-bottom as R,G,B triples, which is
            # the sample order a DeviceRGB image stream expects.
            packed = zlib.compress(rgb.tobytes())
            write_object(
                image_id,
                f'<< /Type /XObject /Subtype /Image '
                f'/Width {width} /Height {height} '
                f'/ColorSpace /DeviceRGB /BitsPerComponent 8 '
                f'/Filter /FlateDecode /Length {len(packed)} >>',
                packed,
            )
            del packed

            # Scale the unit-square image up to cover the whole page.
            content = (
                f'q {page_w:.2f} 0 0 {page_h:.2f} 0 0 cm /Im0 Do Q'
            ).encode('ascii')
            write_object(content_id, f'<< /Length {len(content)} >>', content)

        # Cross-reference table: fixed-width 20-byte entries, one per object.
        xref_at = pdf.tell()
        pdf.write(b'xref\n0 %d\n' % (object_total + 1))
        pdf.write(b'0000000000 65535 f \n')
        for position in offsets[1:]:
            pdf.write(b'%010d 00000 n \n' % position)
        pdf.write(
            b'trailer\n<< /Size %d /Root 1 0 R >>\nstartxref\n%d\n%%%%EOF\n'
            % (object_total + 1, xref_at)
        )
def _check_qpdf() -> bool:
    """Return True if ``qpdf --version`` can be run and exits successfully.

    Returns False, rather than raising, when the executable is missing or
    cannot be started, so callers can use the result as a plain condition.
    """
    try:
        probe = subprocess.run(['qpdf', '--version'], capture_output=True)
    except OSError:
        # Covers FileNotFoundError (not installed) and PermissionError
        # (present but not executable).
        return False
    return probe.returncode == 0
def _check_dependencies() -> None:
    """Verify required system tools are available before doing any work.

    Looks up ``pdftoppm`` and ``qpdf`` on PATH with ``shutil.which``. This
    avoids spawning the external ``which`` command, which does not exist on
    every platform and would itself fail with FileNotFoundError.

    Raises:
        RuntimeError: listing every missing tool together with per-platform
            install instructions.
    """
    import shutil  # local import: only this check needs it

    missing = [
        tool for tool in ('pdftoppm', 'qpdf') if shutil.which(tool) is None
    ]
    if not missing:
        return

    tools = ', '.join(missing)
    instructions = (
        "Install with:\n"
        " Linux/WSL: sudo apt install poppler-utils qpdf\n"
        " macOS: brew install poppler qpdf\n"
        " Windows: see docs/code/develop.md"
    )
    raise RuntimeError(
        f"Missing required system tool(s): {tools}\n{instructions}"
    )
def convert_pdf_to_dark(
    input_path: str | Path,
    output_path: str | Path,
    dpi: int = DEFAULT_DPI,
    bg=DEFAULT_BG,
    text=DEFAULT_TEXT,
    link=DEFAULT_LINK,
    batch_size: int = DEFAULT_BATCH,
) -> None:
    """
    Full pipeline: rasterize → apply dark theme → reassemble as PDF.

    Every page is first rasterized to PNG with pdftoppm inside a temporary
    directory. Documents with more than `batch_size` pages are then themed
    and written `batch_size` pages at a time, so only one batch of images is
    held in memory at once, and the per-batch PDFs are merged with qpdf.
    Shorter documents are themed and written in a single pass.

    Args:
        input_path: PDF to convert.
        output_path: destination PDF; its parent directory is created if
            it does not exist.
        dpi: rasterization resolution passed to pdftoppm.
        bg, text, link: (R, G, B) palette colors, 0-255 per channel.
        batch_size: maximum number of pages themed per batch (>= 1).

    Raises:
        ValueError: if `batch_size` is smaller than 1.
        RuntimeError: if a required system tool is missing, pdftoppm exits
            with an error, or pdftoppm produces no pages.
        subprocess.CalledProcessError: if the qpdf merge fails.
    """
    if batch_size < 1:
        # A zero step would crash range(); a negative one would silently
        # produce an empty document.
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    input_path = str(input_path)
    output_path = str(output_path)
    _check_dependencies()

    def themed_page(png_path):
        """Load one rasterized page and return its dark-themed copy."""
        # The context manager releases the PNG file handle as soon as the
        # pixels have been converted.
        with Image.open(png_path) as raster:
            themed = apply_dark_theme(raster, bg, text, link)
        # Keep the rasterization resolution with the image, under Pillow's
        # conventional info['dpi'] key, so the physical page size can be
        # recovered from the pixel dimensions.
        themed.info['dpi'] = (dpi, dpi)
        return themed

    with tempfile.TemporaryDirectory() as tmp:
        # 1. Rasterize all pages
        prefix = os.path.join(tmp, 'page')
        result = subprocess.run(
            ['pdftoppm', '-r', str(dpi), '-png', input_path, prefix],
            capture_output=True,
        )
        if result.returncode != 0:
            raise RuntimeError(
                f"pdftoppm failed:\n{result.stderr.decode()}"
            )
        # pdftoppm zero-pads page numbers, so a plain sort is page order.
        pages = sorted(glob.glob(prefix + '-*.png'))
        if not pages:
            raise RuntimeError(
                "pdftoppm produced no output pages — "
                "is the PDF valid and not password-protected?"
            )
        total = len(pages)
        print(f" Converting {total} page(s) at {dpi} DPI…", flush=True)

        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # 2. Process in batches
        use_batches = total > batch_size and _check_qpdf()
        if use_batches:
            batch_dir = os.path.join(tmp, 'batches')
            os.makedirs(batch_dir)
            batch_files = []
            total_batches = (total + batch_size - 1) // batch_size
            for start in range(0, total, batch_size):
                batch = pages[start:start + batch_size]
                batch_num = start // batch_size + 1
                batch_path = os.path.join(
                    batch_dir, f'batch_{batch_num:04d}.pdf'
                )
                print(
                    f" Batch {batch_num}/{total_batches}: "
                    f"pages {start + 1}–{start + len(batch)}",
                    flush=True,
                )
                dark = [themed_page(p) for p in batch]
                _save_images_as_pdf(dark, batch_path)
                batch_files.append(batch_path)
                # Drop this batch's images before the next one is loaded.
                del dark

            # 3. Merge batches with qpdf
            print(" Merging batches…", flush=True)
            subprocess.run(
                ['qpdf', '--empty', '--pages'] + batch_files
                + ['--', output_path],
                check=True,
            )
        else:
            # Single pass: the whole document fits in one batch.
            dark_pages = []
            for i, p in enumerate(pages, 1):
                if i % 50 == 0 or i == 1:
                    print(f" Page {i}/{total}", flush=True)
                dark_pages.append(themed_page(p))
            _save_images_as_pdf(dark_pages, output_path)

    size_mb = os.path.getsize(output_path) / 1024 / 1024
    print(f" Saved → {output_path} ({size_mb:.1f} MB)")
# --------------------------------------------------------------------------- #
# CLI
# --------------------------------------------------------------------------- #
def main() -> int:
    """Command-line entry point.

    Parses the arguments, derives ``<input stem>-dark.pdf`` next to the
    input when no output path is given, and runs the conversion.

    Returns:
        0 on success, 1 if the conversion failed (the reason is printed to
        stderr instead of a traceback). Invalid option values are reported
        through argparse, which exits with status 2.
    """
    parser = argparse.ArgumentParser(description='Convert a PDF to dark mode.')
    parser.add_argument('input', help='Input PDF path')
    parser.add_argument('output', nargs='?', help='Output PDF path (optional)')
    parser.add_argument('--dpi', type=int, default=DEFAULT_DPI, help='Rasterization DPI (default: 200)')
    parser.add_argument('--batch-size', type=int, default=DEFAULT_BATCH, help='Pages per batch (default: 50)')
    parser.add_argument('--bg', default='1f1f31', help='Background hex color (default: 1f1f31)')
    parser.add_argument('--text', default='e0e0e0', help='Body text hex color (default: e0e0e0)')
    parser.add_argument('--link', default='5e8bde', help='Link/blue hex color (default: 5e8bde)')
    args = parser.parse_args()

    if args.batch_size < 1:
        parser.error('--batch-size must be at least 1')

    # Turn the hex options into RGB tuples up front so a typo is reported as
    # a usage error before any rasterization work starts.
    palette = {}
    for option in ('bg', 'text', 'link'):
        raw = getattr(args, option)
        try:
            palette[option] = hex_to_rgb(raw)
        except ValueError:
            parser.error(f"--{option}: invalid hex color {raw!r}")

    if not args.output:
        base = Path(args.input).stem
        args.output = str(Path(args.input).parent / f"{base}-dark.pdf")

    try:
        convert_pdf_to_dark(
            args.input,
            args.output,
            dpi=args.dpi,
            bg=palette['bg'],
            text=palette['text'],
            link=palette['link'],
            batch_size=args.batch_size,
        )
    except (RuntimeError, subprocess.CalledProcessError) as exc:
        # Missing tools, an unreadable PDF or a failed qpdf run are user
        # facing conditions: show the message, not a stack trace.
        print(f"error: {exc}", file=sys.stderr)
        return 1
    return 0
if __name__ == '__main__':
    # Run the CLI only when executed as a script; main()'s return value
    # becomes the process exit status.
    sys.exit(main())