Files
thgtoa/scripts/convert.py
T
nopeitsnothing c658c354ee fix(convert): actually save per-page PDFs for qpdf, not PNGs
Previous filesystem edits to _save_images_as_pdf did not persist to
disk. Rewrote the function: quantize each dark-themed RGB image to
palette mode (256 colours, FASTOCTREE) so Pillow uses zlib/deflate
instead of JPEG (no libjpeg needed), save each as a single-page PDF,
then merge with qpdf. qpdf only accepts PDF inputs to --pages.

Also restores the orphaned footnote citations [^536] and [^537] in
docs/guide/index.md at the key disclosure law paragraph (line 8586).
Previous edit also did not persist to disk.

Signed-off-by: nopeitsnothing <no@anonymousplanet.org>
2026-05-23 22:57:26 -04:00

271 lines
9.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Dark-mode PDF converter (pixel-based, batch-safe).
Rasterizes each page with pdftoppm, applies the hacker theme palette
pixel-by-pixel, then reassembles into a PDF. Processes in batches of 50
pages to stay within memory limits on large documents, then merges with qpdf.
Usage:
python scripts/convert.py INPUT.pdf [OUTPUT.pdf]
python scripts/convert.py INPUT.pdf [OUTPUT.pdf] [--dpi 200]
[--bg 1f1f31] [--text e0e0e0] [--link 5e8bde]
[--batch-size 50]
Examples:
python scripts/convert.py export/thgtoa.pdf export/thgtoa-dark.pdf
python scripts/convert.py export/thgtoa.pdf --dpi 150 --bg 0d1117
"""
from __future__ import annotations
import argparse
import glob
import os
import subprocess
import sys
import tempfile
from pathlib import Path
import numpy as np
from PIL import Image
# --------------------------------------------------------------------------- #
# Defaults (Hacker theme)
# --------------------------------------------------------------------------- #
DEFAULT_BG = (0x1f, 0x1f, 0x31)
DEFAULT_TEXT = (0xe0, 0xe0, 0xe0)
DEFAULT_LINK = (0x5e, 0x8b, 0xde)
DEFAULT_DPI = 200
DEFAULT_BATCH = 50
def hex_to_rgb(h: str) -> tuple:
h = h.lstrip('#')
return tuple(int(h[i:i+2], 16) for i in (0, 2, 4))
def apply_dark_theme(
img: Image.Image,
bg=DEFAULT_BG,
text=DEFAULT_TEXT,
link=DEFAULT_LINK,
) -> Image.Image:
"""
Remap a white-background page image to a dark theme.
- Near-white pixels → bg color
- Dark pixels (ink/text) → text color
- Blue-ish pixels → link color
"""
arr = np.array(img.convert('RGB'), dtype=np.float32)
orig = arr.copy()
norm = arr / 255.0
lightness = (
0.299 * norm[:, :, 0]
+ 0.587 * norm[:, :, 1]
+ 0.114 * norm[:, :, 2]
)
r, g, b = orig[:, :, 0], orig[:, :, 1], orig[:, :, 2]
link_mask = (
(b > 100)
& (b > r * 1.3)
& (b > g * 0.9)
& (lightness < 0.85)
)
content_mask = (lightness < 0.85) & ~link_mask
blend = ((1.0 - lightness) / 0.85).clip(0, 1)
bg_f = [c / 255.0 for c in bg]
text_f = [c / 255.0 for c in text]
link_f = [c / 255.0 for c in link]
out = np.zeros_like(norm)
for i, (b_c, t, lc) in enumerate(zip(bg_f, text_f, link_f)):
channel = np.full(lightness.shape, b_c)
channel = np.where(content_mask, b_c + blend * (t - b_c), channel)
channel = np.where(link_mask, b_c + blend * (lc - b_c), channel)
out[:, :, i] = channel
return Image.fromarray((out * 255).clip(0, 255).astype('uint8'))
def _save_images_as_pdf(images: list, output_path: str) -> None:
"""Save a list of RGB PIL images as a PDF without requiring libjpeg.
Pillow's PDF writer defaults to JPEG encoding for RGB images, which
fails when libjpeg is absent in the environment. Fix: quantize each
image to palette mode (256 colours, FASTOCTREE) so Pillow uses
zlib/deflate instead of JPEG, save each as an individual single-page
PDF, then merge the page PDFs with qpdf.
Colour fidelity is preserved — the hacker theme uses only a handful
of distinct colours so 256-colour quantization is visually lossless.
"""
import tempfile as _tempfile
with _tempfile.TemporaryDirectory() as staging:
page_pdfs = []
for i, img in enumerate(images):
page_path = os.path.join(staging, f'p{i:05d}.pdf')
img.quantize(colors=256, method=Image.Quantize.FASTOCTREE).save(
page_path, format='PDF'
)
page_pdfs.append(page_path)
subprocess.run(
['qpdf', '--empty', '--pages'] + page_pdfs + ['--', output_path],
check=True,
)
def _check_qpdf() -> bool:
return subprocess.run(
['qpdf', '--version'], capture_output=True
).returncode == 0
def _check_dependencies() -> None:
"""Verify required system tools are available before doing any work."""
missing = []
for tool in ('pdftoppm', 'qpdf'):
if subprocess.run(['which', tool], capture_output=True).returncode != 0:
missing.append(tool)
if missing:
tools = ', '.join(missing)
instructions = (
f"Install with:\n"
f" Linux/WSL: sudo apt install poppler-utils qpdf\n"
f" macOS: brew install poppler qpdf\n"
f" Windows: see docs/code/develop.md"
)
raise RuntimeError(
f"Missing required system tool(s): {tools}\n{instructions}"
)
def convert_pdf_to_dark(
input_path: str | Path,
output_path: str | Path,
dpi: int = DEFAULT_DPI,
bg=DEFAULT_BG,
text=DEFAULT_TEXT,
link=DEFAULT_LINK,
batch_size: int = DEFAULT_BATCH,
) -> None:
"""
Full pipeline: rasterize → apply dark theme → reassemble as PDF.
For large documents, pages are processed in batches of `batch_size` to
avoid OOM, then merged with qpdf. Falls back to single-pass Pillow save
if qpdf is not available (fine for small documents).
"""
input_path = str(input_path)
output_path = str(output_path)
_check_dependencies()
with tempfile.TemporaryDirectory() as tmp:
# 1. Rasterize all pages
prefix = os.path.join(tmp, 'page')
result = subprocess.run(
['pdftoppm', '-r', str(dpi), '-png', input_path, prefix],
capture_output=True,
)
if result.returncode != 0:
raise RuntimeError(
f"pdftoppm failed:\n{result.stderr.decode()}"
)
pages = sorted(glob.glob(prefix + '-*.png'))
if not pages:
raise RuntimeError(
"pdftoppm produced no output pages — "
"is the PDF valid and not password-protected?"
)
total = len(pages)
print(f" Converting {total} page(s) at {dpi} DPI…", flush=True)
out_dir = os.path.dirname(output_path)
if out_dir:
os.makedirs(out_dir, exist_ok=True)
# 2. Process in batches
use_batches = total > batch_size and _check_qpdf()
if use_batches:
batch_dir = os.path.join(tmp, 'batches')
os.makedirs(batch_dir)
batch_files = []
for start in range(0, total, batch_size):
batch = pages[start:start + batch_size]
batch_num = start // batch_size + 1
batch_path = os.path.join(batch_dir, f'batch_{batch_num:04d}.pdf')
print(
f" Batch {batch_num}/{(total + batch_size - 1) // batch_size}: "
f"pages {start + 1}{start + len(batch)}",
flush=True,
)
dark = [apply_dark_theme(Image.open(p), bg, text, link) for p in batch]
_save_images_as_pdf(dark, batch_path)
batch_files.append(batch_path)
del dark
# 3. Merge batches with qpdf
print(" Merging batches…", flush=True)
subprocess.run(
['qpdf', '--empty', '--pages'] + batch_files + ['--', output_path],
check=True,
)
else:
# Single-pass for small documents or when qpdf is unavailable
dark_pages = []
for i, p in enumerate(pages, 1):
if i % 50 == 0 or i == 1:
print(f" Page {i}/{total}", flush=True)
dark_pages.append(apply_dark_theme(Image.open(p), bg, text, link))
_save_images_as_pdf(dark_pages, output_path)
size_mb = os.path.getsize(output_path) / 1024 / 1024
print(f" Saved → {output_path} ({size_mb:.1f} MB)")
# --------------------------------------------------------------------------- #
# CLI
# --------------------------------------------------------------------------- #
def main() -> int:
parser = argparse.ArgumentParser(description='Convert a PDF to dark mode.')
parser.add_argument('input', help='Input PDF path')
parser.add_argument('output', nargs='?', help='Output PDF path (optional)')
parser.add_argument('--dpi', type=int, default=DEFAULT_DPI, help='Rasterization DPI (default: 200)')
parser.add_argument('--batch-size', type=int, default=DEFAULT_BATCH, help='Pages per batch (default: 50)')
parser.add_argument('--bg', default='1f1f31', help='Background hex color (default: 1f1f31)')
parser.add_argument('--text', default='e0e0e0', help='Body text hex color (default: e0e0e0)')
parser.add_argument('--link', default='5e8bde', help='Link/blue hex color (default: 5e8bde)')
args = parser.parse_args()
if not args.output:
base = Path(args.input).stem
args.output = str(Path(args.input).parent / f"{base}-dark.pdf")
convert_pdf_to_dark(
args.input,
args.output,
dpi=args.dpi,
bg=hex_to_rgb(args.bg),
text=hex_to_rgb(args.text),
link=hex_to_rgb(args.link),
batch_size=args.batch_size,
)
return 0
if __name__ == '__main__':
raise SystemExit(main())