Fix sloppy LLM-generated code for PDF parsing

2026-01-15 21:27:04 +00:00 · 2026-01-15 21:27:04 +00:00 · ce42ab58f1
commit ce42ab58f1
parent 1ca2238c5d
1 changed files with 17 additions and 34 deletions
--- a/microqa/items.py
+++ b/microqa/items.py
@ -6,7 +6,6 @@ microfiche scans.
 import os
 import urllib
 from dataclasses import dataclass
-from io import BytesIO
 from typing import Optional

 import pymupdf
@ -104,52 +103,36 @@ class ArchiveDoc:

        leaves = []

-        # Open PDF from bytes
-        pdf_doc = pymupdf.open(stream=pdf_data, filetype="pdf")
+        with pymupdf.open(stream=pdf_data, filetype="pdf") as pdf:
+            for i, page in enumerate(pdf):
+                bound = page.bound()
+                im = page.get_pixmap(dpi=300).pil_image().convert("L")
+                scale_factor_x = im.size[0] / bound.width
+                scale_factor_y = im.size[1] / bound.height

-        try:
-            for page_num in range(len(pdf_doc)):
-                page = pdf_doc[page_num]
-
-                # Extract text blocks with coordinates
-                # Convert to TextBlock objects, discarding block_no and block_type
+                BLOCK_TYPE_TEXT = 0
                text_blocks = [
                    TextBlock(
-                        x0=int(x0),
-                        y0=int(y0),
-                        x1=int(x1),
-                        y1=int(y1),
+                        x0=int((x0 - bound.x0) * scale_factor_x),
+                        y0=int((y0 - bound.y0) * scale_factor_y),
+                        x1=int((x1 - bound.x0) * scale_factor_x),
+                        y1=int((y1 - bound.y0) * scale_factor_y),
                        text=text,
                    )
-                    for x0, y0, x1, y1, text, *_ in page.get_text("blocks")
+                    # Tuple schema is documented under extractBLOCKS (accessed 2026-01-15):
+                    # https://pymupdf.readthedocs.io/en/latest/textpage.html#TextPage.extractBLOCKS
+                    for x0, y0, x1, y1, text, _, block_type in page.get_text("blocks")
+                    if block_type == BLOCK_TYPE_TEXT
                ]

-                # Render page to image
-                # Use a matrix to scale appropriately (default is 72 DPI)
-                # Scale factor 4.44 gives approximately 320 DPI, which should produce
-                # images with long edge around 3200px for typical page sizes
-                mat = pymupdf.Matrix(4.44, 4.44)
-                pix = page.get_pixmap(matrix=mat, alpha=False)
-
-                # Convert PyMuPDF pixmap to PIL Image
-                img_data = pix.tobytes("ppm")
-                image = Image.open(BytesIO(img_data)).convert("L")
-
-                # Ensure long edge is no more than 3200 px
-                image.thumbnail((3200, 3200))
-
-                # Page numbers are 1-indexed for human readability
                leaves.append(
                    ArchiveLeaf(
-                        image=image,
-                        page_number=str(page_num + 1),
+                        image=im,
+                        page_number=str(page.number),
                        text_blocks=text_blocks,
                    )
                )

-        finally:
-            pdf_doc.close()
-
        return leaves