Fix sloppy LLM code for PDF parsing

This commit is contained in:
Brent Schroeter 2026-01-15 21:27:04 +00:00
parent 1ca2238c5d
commit ce42ab58f1

View file

@ -6,7 +6,6 @@ microfiche scans.
import os import os
import urllib import urllib
from dataclasses import dataclass from dataclasses import dataclass
from io import BytesIO
from typing import Optional from typing import Optional
import pymupdf import pymupdf
@ -104,52 +103,36 @@ class ArchiveDoc:
leaves = [] leaves = []
# Open PDF from bytes with pymupdf.open(stream=pdf_data, filetype="pdf") as pdf:
pdf_doc = pymupdf.open(stream=pdf_data, filetype="pdf") for i, page in enumerate(pdf):
bound = page.bound()
im = page.get_pixmap(dpi=300).pil_image().convert("L")
scale_factor_x = im.size[0] / bound.width
scale_factor_y = im.size[1] / bound.height
try: BLOCK_TYPE_TEXT = 0
for page_num in range(len(pdf_doc)):
page = pdf_doc[page_num]
# Extract text blocks with coordinates
# Convert to TextBlock objects, discarding block_no and block_type
text_blocks = [ text_blocks = [
TextBlock( TextBlock(
x0=int(x0), x0=int((x0 - bound.x0) * scale_factor_x),
y0=int(y0), y0=int((y0 - bound.y0) * scale_factor_y),
x1=int(x1), x1=int((x1 - bound.x0) * scale_factor_x),
y1=int(y1), y1=int((y1 - bound.y0) * scale_factor_y),
text=text, text=text,
) )
for x0, y0, x1, y1, text, *_ in page.get_text("blocks") # Tuple schema is documented here (accessed 2026-01-15):
# https://pymupdf.readthedocs.io/en/latest/textpage.html#TextPage.extractText
for x0, y0, x1, y1, text, _, block_type in page.get_text("blocks")
if block_type == BLOCK_TYPE_TEXT
] ]
# Render page to image
# Use a matrix to scale appropriately (default is 72 DPI)
# Scale factor 4.44 gives approximately 320 DPI, which should produce
# images with long edge around 3200px for typical page sizes
mat = pymupdf.Matrix(4.44, 4.44)
pix = page.get_pixmap(matrix=mat, alpha=False)
# Convert PyMuPDF pixmap to PIL Image
img_data = pix.tobytes("ppm")
image = Image.open(BytesIO(img_data)).convert("L")
# Ensure long edge is no more than 3200 px
image.thumbnail((3200, 3200))
# Page numbers are 1-indexed for human readability
leaves.append( leaves.append(
ArchiveLeaf( ArchiveLeaf(
image=image, image=im,
page_number=str(page_num + 1), page_number=str(page.number),
text_blocks=text_blocks, text_blocks=text_blocks,
) )
) )
finally:
pdf_doc.close()
return leaves return leaves