Fix sloppy LLM code for PDF parsing

This commit is contained in:
Brent Schroeter 2026-01-15 21:27:04 +00:00
parent 1ca2238c5d
commit ce42ab58f1

View file

@ -6,7 +6,6 @@ microfiche scans.
import os
import urllib
from dataclasses import dataclass
from io import BytesIO
from typing import Optional
import pymupdf
@ -104,52 +103,36 @@ class ArchiveDoc:
leaves = []
# Open PDF from bytes
pdf_doc = pymupdf.open(stream=pdf_data, filetype="pdf")
with pymupdf.open(stream=pdf_data, filetype="pdf") as pdf:
for i, page in enumerate(pdf):
bound = page.bound()
im = page.get_pixmap(dpi=300).pil_image().convert("L")
scale_factor_x = im.size[0] / bound.width
scale_factor_y = im.size[1] / bound.height
try:
for page_num in range(len(pdf_doc)):
page = pdf_doc[page_num]
# Extract text blocks with coordinates
# Convert to TextBlock objects, discarding block_no and block_type
BLOCK_TYPE_TEXT = 0
text_blocks = [
TextBlock(
x0=int(x0),
y0=int(y0),
x1=int(x1),
y1=int(y1),
x0=int((x0 - bound.x0) * scale_factor_x),
y0=int((y0 - bound.y0) * scale_factor_y),
x1=int((x1 - bound.x0) * scale_factor_x),
y1=int((y1 - bound.y0) * scale_factor_y),
text=text,
)
for x0, y0, x1, y1, text, *_ in page.get_text("blocks")
# Block tuple schema (x0, y0, x1, y1, text, block_no, block_type) is
# documented here (accessed 2026-01-15):
# https://pymupdf.readthedocs.io/en/latest/textpage.html#TextPage.extractBLOCKS
for x0, y0, x1, y1, text, _, block_type in page.get_text("blocks")
if block_type == BLOCK_TYPE_TEXT
]
# Render page to image
# Use a matrix to scale appropriately (default is 72 DPI)
# Scale factor 4.44 gives approximately 320 DPI, which should produce
# images with long edge around 3200px for typical page sizes
mat = pymupdf.Matrix(4.44, 4.44)
pix = page.get_pixmap(matrix=mat, alpha=False)
# Convert PyMuPDF pixmap to PIL Image
img_data = pix.tobytes("ppm")
image = Image.open(BytesIO(img_data)).convert("L")
# Ensure long edge is no more than 3200 px
image.thumbnail((3200, 3200))
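# Page numbers are 1-indexed for human readability.
# NOTE(review): PyMuPDF's `page.number` is 0-based, so `str(page.number)`
# appears to need a `+ 1` to stay 1-indexed -- confirm.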
leaves.append(
ArchiveLeaf(
image=image,
page_number=str(page_num + 1),
image=im,
page_number=str(page.number),
text_blocks=text_blocks,
)
)
finally:
pdf_doc.close()
return leaves