fix sloppy llm code for pdf parsing

2026-01-15 21:27:04 +00:00 · 2026-01-15 21:27:04 +00:00 · ce42ab58f1
commit ce42ab58f1
parent 1ca2238c5d
1 changed files with 17 additions and 34 deletions
--- a/microqa/items.py
+++ b/microqa/items.py
@ -6,7 +6,6 @@ microfiche scans.
 import os
 import urllib
 from dataclasses import dataclass
 from io import BytesIO
 from typing import Optional
 import pymupdf
@ -104,52 +103,36 @@ class ArchiveDoc:
        leaves = []
-        # Open PDF from bytes
+        with pymupdf.open(stream=pdf_data, filetype="pdf") as pdf:
-        pdf_doc = pymupdf.open(stream=pdf_data, filetype="pdf")
+            for i, page in enumerate(pdf):
                bound = page.bound()
                im = page.get_pixmap(dpi=300).pil_image().convert("L")
                scale_factor_x = im.size[0] / bound.width
                scale_factor_y = im.size[1] / bound.height
-        try:
+                BLOCK_TYPE_TEXT = 0
            for page_num in range(len(pdf_doc)):
                page = pdf_doc[page_num]
                # Extract text blocks with coordinates
                # Convert to TextBlock objects, discarding block_no and block_type
                text_blocks = [
                    TextBlock(
-                        x0=int(x0),
+                        x0=int((x0 - bound.x0) * scale_factor_x),
-                        y0=int(y0),
+                        y0=int((y0 - bound.y0) * scale_factor_y),
-                        x1=int(x1),
+                        x1=int((x1 - bound.x0) * scale_factor_x),
-                        y1=int(y1),
+                        y1=int((y1 - bound.y0) * scale_factor_y),
                        text=text,
                    )
-                    for x0, y0, x1, y1, text, *_ in page.get_text("blocks")
+                    # Tuple schema is documented here (accessed 2026-01-15):
                    # https://pymupdf.readthedocs.io/en/latest/textpage.html#TextPage.extractBLOCKS
                    for x0, y0, x1, y1, text, _, block_type in page.get_text("blocks")
                    if block_type == BLOCK_TYPE_TEXT
                ]
                # Render page to image
                # Use a matrix to scale appropriately (default is 72 DPI)
                # Scale factor 4.44 gives approximately 320 DPI, which should produce
                # images with long edge around 3200px for typical page sizes
                mat = pymupdf.Matrix(4.44, 4.44)
                pix = page.get_pixmap(matrix=mat, alpha=False)
                # Convert PyMuPDF pixmap to PIL Image
                img_data = pix.tobytes("ppm")
                image = Image.open(BytesIO(img_data)).convert("L")
                # Ensure long edge is no more than 3200 px
                image.thumbnail((3200, 3200))
                # Page numbers are 1-indexed for human readability
                leaves.append(
                    ArchiveLeaf(
-                        image=image,
+                        image=im,
-                        page_number=str(page_num + 1),
+                        page_number=str(page.number),
                        text_blocks=text_blocks,
                    )
                )
        finally:
            pdf_doc.close()
        return leaves