diff --git a/microqa/items.py b/microqa/items.py index 5906381..ee72617 100644 --- a/microqa/items.py +++ b/microqa/items.py @@ -6,7 +6,6 @@ microfiche scans. import os import urllib from dataclasses import dataclass -from io import BytesIO from typing import Optional import pymupdf @@ -104,52 +103,36 @@ class ArchiveDoc: leaves = [] - # Open PDF from bytes - pdf_doc = pymupdf.open(stream=pdf_data, filetype="pdf") + with pymupdf.open(stream=pdf_data, filetype="pdf") as pdf: + for i, page in enumerate(pdf): + bound = page.bound() + im = page.get_pixmap(dpi=300).pil_image().convert("L") + scale_factor_x = im.size[0] / bound.width + scale_factor_y = im.size[1] / bound.height - try: - for page_num in range(len(pdf_doc)): - page = pdf_doc[page_num] - - # Extract text blocks with coordinates - # Convert to TextBlock objects, discarding block_no and block_type + BLOCK_TYPE_TEXT = 0 text_blocks = [ TextBlock( - x0=int(x0), - y0=int(y0), - x1=int(x1), - y1=int(y1), + x0=int((x0 - bound.x0) * scale_factor_x), + y0=int((y0 - bound.y0) * scale_factor_y), + x1=int((x1 - bound.x0) * scale_factor_x), + y1=int((y1 - bound.y0) * scale_factor_y), text=text, ) - for x0, y0, x1, y1, text, *_ in page.get_text("blocks") + # Tuple schema is documented here (accessed 2026-01-15): + # https://pymupdf.readthedocs.io/en/latest/textpage.html#TextPage.extractText + for x0, y0, x1, y1, text, _, block_type in page.get_text("blocks") + if block_type == BLOCK_TYPE_TEXT ] - # Render page to image - # Use a matrix to scale appropriately (default is 72 DPI) - # Scale factor 4.44 gives approximately 320 DPI, which should produce - # images with long edge around 3200px for typical page sizes - mat = pymupdf.Matrix(4.44, 4.44) - pix = page.get_pixmap(matrix=mat, alpha=False) - - # Convert PyMuPDF pixmap to PIL Image - img_data = pix.tobytes("ppm") - image = Image.open(BytesIO(img_data)).convert("L") - - # Ensure long edge is no more than 3200 px - image.thumbnail((3200, 3200)) - - # Page numbers are 1-indexed for human readability leaves.append( ArchiveLeaf( - image=image, - page_number=str(page_num + 1), + image=im, + page_number=str(page.number), text_blocks=text_blocks, ) ) - finally: - pdf_doc.close() - return leaves