reuse pdf ocr when available

This commit is contained in:
Brent Schroeter 2025-12-20 02:16:41 +00:00
parent ac7e93a75b
commit 3da76d4537
9 changed files with 1373 additions and 901 deletions

View file

@ -11,9 +11,25 @@ from microqa.engine import analyze_doc
def main():
parser = ArgumentParser()
parser.add_argument("--item-id")
parser.add_argument("--cpus", type=int, default=4)
parser.add_argument(
"--ocr-backend",
help="which local OCR backend to use when available text in archived PDF files is insufficient; one of 'tesseract' or 'paddleocr'",
default="tesseract",
)
parser.add_argument("--verbose", action="store_true")
args = parser.parse_args()
# Import OCR engine modules only as needed, to avoid unnecessary slow
# startups and/or missing dependency errors.
if args.ocr_backend == "tesseract":
from microqa.ocr.tesseract import TesseractOcrEngine
ocr_engine = TesseractOcrEngine(languages=["eng", "fra"])
elif args.ocr_backend == "paddleocr":
from microqa.ocr.paddleocr import PaddleOcrEngine
ocr_engine = PaddleOcrEngine(languages=["eng", "fra"])
cache_item(
args.item_id,
# Will not refetch if value is already cached.
@ -29,7 +45,10 @@ def main():
else item.docs
)
analyses = [
analyze_doc(doc, parallel=args.cpus, use_cache=True) for doc in minimal_docs
analyze_doc(
doc=doc, ocr_engine=ocr_engine, use_cache=True, verbose=args.verbose
)
for doc in minimal_docs
]
t_end = time()
@ -43,7 +62,7 @@ def main():
[
i
for i, page in enumerate(doc["pages"])
if not page["ocr_orientation_match"]
if 45 < page["page_angle"] < 315
]
for doc in analyses
],

24
main.py
View file

@ -30,8 +30,24 @@ def main():
help="script will attempt to analyze all items with a review date greater than or equal to this value (YYYYMMDD)",
default="20250701",
)
parser.add_argument(
"--ocr-backend",
help="which local OCR backend to use when available text in archived PDF files is insufficient; one of 'tesseract' or 'paddleocr'",
default="tesseract",
)
args = parser.parse_args()
# Import OCR engine modules only as needed, to avoid unnecessary slow
# startups and/or missing dependency errors.
if args.ocr_backend == "tesseract":
from microqa.ocr.tesseract import TesseractOcrEngine
ocr_engine = TesseractOcrEngine(languages=["eng", "fra"])
elif args.ocr_backend == "paddleocr":
from microqa.ocr.paddleocr import PaddleOcrEngine
ocr_engine = PaddleOcrEngine(languages=["eng", "fra"])
with sqlite3.connect(args.database) as conn:
cur = conn.cursor()
cur.execute("""
@ -51,7 +67,7 @@ create table if not exists pages (
id int primary key,
doc text not null,
page int not null,
orientation_match boolean not null,
page_angle float not null,
sharpness real not null,
is_blank boolean not null,
text_margin_px int not null
@ -95,7 +111,7 @@ order by review_date
[doc.name, item_id],
)
analysis = analyze_doc(
doc, parallel=args.cpus, verbose=True
doc=doc, ocr_engine=ocr_engine, verbose=True
)
for i, page in enumerate(analysis["pages"]):
cur.execute(
@ -103,7 +119,7 @@ order by review_date
insert into pages (
doc,
page,
orientation_match,
page_angle,
sharpness,
is_blank,
text_margin_px
@ -111,7 +127,7 @@ insert into pages (
[
doc.name,
i + 1,
page["ocr_orientation_match"],
page["page_angle"],
page["sharpness"],
page["blank"],
page["text_margin_px"],

View file

@ -1,57 +1,41 @@
from dataclasses import dataclass, field
from multiprocessing import Pool
from sys import stdout
import numpy as np
from PIL import Image, ImageFilter
from .items import ArchiveDoc
from .ocr.tesseract import OcrEngine
from .items import ArchiveDoc, ArchiveLeaf
from .ocr import OcrEngine, TextBlock
def analyze_doc(doc: ArchiveDoc, parallel=1, use_cache=False, verbose=False):
def analyze_doc(
doc: ArchiveDoc,
ocr_engine: OcrEngine,
use_cache: bool = False,
verbose: bool = False,
):
"""
Analyzes all pages in an ArchiveDoc for useful metrics such as sharpness,
orientation, presence of text overflows, and so on.
"""
if verbose:
print(f"Loading {doc.name}...")
stdout.flush()
tasks: PageAnalysisTask = [
PageAnalysisTask(im=leaf.image)
for leaf in doc.fetch_leaves(use_cache=use_cache)
]
all_leaves = doc.fetch_leaves(use_cache=use_cache)
if verbose:
print(f"Processing {len(tasks)} pages...", file=stdout)
print(f"Processing {len(all_leaves)} pages...", file=stdout)
stdout.flush()
if parallel > 1:
# Parallelize image processing and OCR of pages across up to n cores.
with Pool(parallel) as pool:
return {"pages": pool.map(analyze_page, tasks)}
return {"pages": [analyze_page(task) for task in tasks]}
@dataclass
class PageAnalysisTask:
"""
Attributes:
im PIL Image, pre-scaled using .thumbnail() to fit the long
edge to 3200 px.
ocr_langs Tesseract language codes (3 letters each, in a "+"-separated
list).
"""
im: Image.Image
ocr_langs: list[str] = field(default_factory=lambda: ["eng"])
def analyze_page(task):
im_cropped = task.im.crop(
analyzed_pages = []
for leaf in all_leaves:
im_cropped = leaf.image.crop(
(
task.im.size[0] * 0.1,
task.im.size[1] * 0.1,
task.im.size[0] * 0.9,
task.im.size[1] * 0.9,
leaf.image.size[0] * 0.1,
leaf.image.size[1] * 0.1,
leaf.image.size[0] * 0.9,
leaf.image.size[1] * 0.9,
)
)
@ -59,9 +43,15 @@ def analyze_page(task):
if is_blank:
max_sharpness = 1
ocr_orientation_match = True
text_margin_px = -1
page_angle = 0
else:
# Sharpness is determined by percentile of pixels that match some
# criteria, so it may vary significantly depending on which portion
# of the image is analyzed. In an effort to identify the sharpest
# edges, we split up the image into chunks and assume that the
# highest sharpness value obtained across all chunks is
# representative of the image as a whole.
max_sharpness = 0.0
if im_cropped.size[0] < im_cropped.size[1]:
# Page is in portrait orientation.
@ -87,62 +77,73 @@ def analyze_page(task):
),
)
OCR_SCALE = 1
# TODO: Refactor orientation detection logic into the OCR engine
# modules.
best_ocr_score = -1
best_ocr_words = None
best_ocr_orientation = -1
for orientation in range(4):
im_rotated = task.im.resize(
np.int_(np.array(task.im.size) * OCR_SCALE)
).rotate(90 * orientation, expand=True)
ocr, ocr_meta = OcrEngine.process(im_rotated, languages=task.ocr_langs)
if "page_angle" in ocr_meta:
# OCR engine automatically accounts for page rotation.
best_ocr_score = ocr.shape[0]
# PaddleOCR counts rotation as degrees, in the opposite
# direction as PIL's `Image.rotate()`
best_ocr_orientation = (
4 - round(((ocr_meta["page_angle"] + 360) % 360) / 90)
) % 4
best_ocr_words = ocr
break
if ocr.shape[0] > best_ocr_score:
best_ocr_score = ocr.shape[0]
best_ocr_orientation = orientation
best_ocr_words = ocr
if best_ocr_score > 50:
# Unlikely that another orientation will have more words, so
# stop eating up CPU.
break
if best_ocr_words.empty:
ocr_orientation_match = True
text_margin_px = -1
else:
ocr_orientation_match = best_ocr_orientation == 0
best_ocr_dims = OCR_SCALE * np.array(
task.im.size
if best_ocr_orientation % 2 == 0
else (task.im.size[1], task.im.size[0])
# OCR is computationally expensive, so we try to take advantage of
# the Tesseract data already parsed by the Internet Archive and
# embedded in the PDF, when possible. If there is not sufficient
# text in the PDF to be confident that the Archive's OCR
# postprocessing captured it all, then OCR is recomputed locally.
#
# In some instances, the Archive's OCR detects rotated text but
# parses it as gibberish. To partially mitigate this, we ignore all
# precomputed text blocks with a "portrait" aspect ratio. This will
# not necessarily help with text that is rotated 180 degrees, but in
# practice that case is rarely encountered. This will also not work
# well with non-latin scripts that are intended to be oriented
# vertically.
OCR_RECOMPUTE_THRESHOLD_WORDS = 30
if (
sum(
(
len(block.text.split())
for block in leaf.text_blocks
if block.x1 - block.x0 > block.y1 - block.y0
)
)
>= OCR_RECOMPUTE_THRESHOLD_WORDS
):
if verbose:
print("Using PDF text.")
ocred_leaf = leaf
page_angle = 0
else:
if verbose:
print("Using OCR.")
OCR_SCALE = 1
im_scaled = leaf.image.resize(
np.int_(np.array(leaf.image.size) * OCR_SCALE)
)
ocr_result = ocr_engine.process(im_scaled)
ocred_leaf = ArchiveLeaf(
image=leaf.image,
page_number=leaf.page_number,
text_blocks=[
TextBlock(
x0=int(block.x0 / OCR_SCALE),
y0=int(block.y0 / OCR_SCALE),
x1=int(block.x1 / OCR_SCALE),
y1=int(block.y1 / OCR_SCALE),
text=block.text,
)
for block in ocr_result.blocks
],
)
page_angle = ocr_result.page_angle
word_margins_all_directions = np.sort(
np.int_(
np.concat(
(
best_ocr_words["x0"].to_numpy(),
best_ocr_words["y0"].to_numpy(),
best_ocr_dims[0] - best_ocr_words["x1"].to_numpy(),
best_ocr_dims[1] - best_ocr_words["y1"].to_numpy(),
[
np.array(
[
block.x0,
block.y0,
leaf.image.size[0] - block.x1,
leaf.image.size[1] - block.y1,
]
)
for block in ocred_leaf.text_blocks
]
)
# Transform back into original image pixel density
/ OCR_SCALE
)
)
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
@ -153,16 +154,23 @@ def analyze_page(task):
else -1
)
return {
# Make sure the OCR engine is running with orientation detection.
assert page_angle is not None
analyzed_pages.append(
{
"blank": is_blank,
"ocr_orientation_match": ocr_orientation_match,
"size_analyzed": task.im.size,
"page_angle": page_angle,
"size_analyzed": leaf.image.size,
"sharpness": max_sharpness,
"text_margin_px": text_margin_px,
}
)
return {"pages": analyzed_pages}
def analyze_sharpness(im):
def analyze_sharpness(im: Image.Image):
"""
Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
1. The scale is not linear with respect to scan quality: anything above 0.1

View file

@ -3,18 +3,18 @@ Python utilities for structuring data and metadata pulled from archive.org
microfiche scans.
"""
import json
import os
import urllib
from contextlib import nullcontext
from dataclasses import dataclass
from io import BytesIO
from typing import Optional
from zipfile import ZipFile
import pymupdf
import requests
from PIL import Image
from .ocr import TextBlock
CACHE_DIR = "./archive_cache"
@ -38,16 +38,20 @@ class ArchiveLeaf:
presented to users, otherwise a (potentially empty)
string with the inferred page number as defined by the
document being scanned.
text_blocks List of text blocks extracted from PyMuPDF's
TextPage.extractBlocks() method.
"""
image: Image
image: Image.Image
page_number: Optional[str]
text_blocks: list[TextBlock]
@dataclass
class ArchiveDoc:
"""
Information pertaining to a single set of processed pages, of which there
Information pertaining to a single set of processed pages, of which there
may be multiple for any given ArchiveItem. For example, one SCOTUS case may
contain several briefs/petitions/etc., each presented as a distinct PDF but
all held within the parent `ArchiveItem`.
@ -80,62 +84,71 @@ class ArchiveDoc:
name: str
title: Optional[str]
def fetch_leaves(self, numbered_only=True, use_cache=False) -> list[ArchiveLeaf]:
def fetch_leaves(self, use_cache=False) -> list[ArchiveLeaf]:
"""
Fetch images and page number data for this document from archive.org,
over the Internet.
Fetch images and OCR text data for this document from archive.org PDF files.
Params:
numbered_only If `True`, discards any leaves with no corresponding
page number entries. Leaves for which the page
number is an empty string are retained.
use_cache If `True`, locally cached zip files under the
use_cache If `True`, locally cached PDF files under the
`./archive_cache` directory (relative to the working
directory) will be used instead of fetching over
HTTPS.
"""
if use_cache:
# Cached file names are derived from the percent-encoded version of
# `self.name`, so that there's no need to worry about directory
# separators or other disallowed characters in the file names
# defined by archive.org.
with open(
f"{CACHE_DIR}/{_url_encode(self.name)}_page_numbers.json", "r"
) as f:
page_nums = json.load(f)["pages"]
zip_reader_ctx = open(f"{CACHE_DIR}/{_url_encode(self.name)}_jp2.zip", "rb")
with open(f"{CACHE_DIR}/{_url_encode(self.name)}.pdf", "rb") as f:
pdf_data = f.read()
else:
page_nums = _fetch_page_nums(self.identifier, self.name)["pages"]
# Wrap in a context manager so that the reader can be used in a `with`
# block in the same way as a file accessed with `open()`.
zip_reader_ctx = nullcontext(
BytesIO(_fetch_jp2_zip(self.identifier, self.name))
)
pdf_data = _fetch_pdf(self.identifier, self.name)
leaves = []
with zip_reader_ctx as zip_reader, ZipFile(zip_reader) as jp_zip:
for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
for page_index, page_num_info in enumerate(page_nums):
if page_num_info["leafNum"] == leaf_num:
# Stop iterating and keep page_index set to the current
# value.
break
else:
# Indicate that leaf was not found in page_num list.
page_index = None
# Open PDF from bytes
pdf_doc = pymupdf.open(stream=pdf_data, filetype="pdf")
if not numbered_only or page_index is not None:
with jp_zip.open(file_name) as jp_file:
# Convert to single-channel greyscale ("L").
image = Image.open(jp_file).convert("L")
# Rescale long edge to no more than 3200 px.
try:
for page_num in range(len(pdf_doc)):
page = pdf_doc[page_num]
# Extract text blocks with coordinates
# Convert to TextBlock objects, discarding block_no and block_type
text_blocks = [
TextBlock(
x0=int(x0),
y0=int(y0),
x1=int(x1),
y1=int(y1),
text=text,
)
for x0, y0, x1, y1, text, *_ in page.get_text("blocks")
]
# Render page to image
# Use a matrix to scale appropriately (default is 72 DPI)
# Scale factor 4.44 gives approximately 320 DPI, which should produce
# images with long edge around 3200px for typical page sizes
mat = pymupdf.Matrix(4.44, 4.44)
pix = page.get_pixmap(matrix=mat, alpha=False)
# Convert PyMuPDF pixmap to PIL Image
img_data = pix.tobytes("ppm")
image = Image.open(BytesIO(img_data)).convert("L")
# Ensure long edge is no more than 3200 px
image.thumbnail((3200, 3200))
leaves.append(ArchiveLeaf(image=image, page_number=page_index))
# Page numbers are 1-indexed for human readability
leaves.append(
ArchiveLeaf(
image=image,
page_number=str(page_num + 1),
text_blocks=text_blocks,
)
)
finally:
pdf_doc.close()
return leaves
@ -201,20 +214,18 @@ def fetch_item(identifier: str, use_cache=False) -> ArchiveItem:
doc_names = [
# Strip suffix, to just leave the identifier, and title if present.
name[: -len("_jp2.zip")]
name[: -len(".pdf")]
for name in file_names
if name.lower().endswith("_jp2.zip")
# Exclude unprocessed scans, which are also named `..._jp2.zip`.
and name.lower() != f"{identifier.lower()}_micro_jp2.zip"
if name.lower().endswith(".pdf")
]
# Assert that all files we expect to find are actually present.
for doc_name in doc_names:
if f"{_url_encode(doc_name.lower())}_page_numbers.json" not in [
if f"{_url_encode(doc_name.lower())}.pdf" not in [
name.lower() for name in file_names
]:
raise Exception(
f"expected file not found: {_url_encode(doc_name.lower())}_page_numbers.zip"
f"expected file not found: {_url_encode(doc_name.lower())}.pdf"
)
return ArchiveItem(
@ -232,7 +243,7 @@ def fetch_item(identifier: str, use_cache=False) -> ArchiveItem:
def cache_item(identifier: str, overwrite=True):
"""
Load the relevant files for an `ArchiveItem` and its component `ArchiveDoc`s
Load the PDF files for an `ArchiveItem` and its component `ArchiveDoc`s
and store them within the `archive_cache` directory (relative to the working
directory). The `archive_cache` directory will be created if it does not
exist.
@ -249,16 +260,14 @@ def cache_item(identifier: str, overwrite=True):
for name in os.listdir(CACHE_DIR):
if _url_decode(name.lower()).startswith(identifier.lower()):
if not overwrite:
return
item = fetch_item(identifier)
for doc in item.docs:
page_nums = _fetch_page_nums(identifier, doc.name)
zip_file = _fetch_jp2_zip(identifier, doc.name)
with open(f"{CACHE_DIR}/{_url_encode(doc.name)}_page_numbers.json", "w") as f:
json.dump(page_nums, f)
with open(f"{CACHE_DIR}/{_url_encode(doc.name)}_jp2.zip", "wb") as f:
f.write(zip_file)
pdf_data = _fetch_pdf(identifier, doc.name)
with open(f"{CACHE_DIR}/{_url_encode(doc.name)}.pdf", "wb") as f:
f.write(pdf_data)
def _url_encode(string: str) -> str:
@ -278,31 +287,16 @@ def _url_decode(string: str) -> str:
return urllib.parse.unquote(string)
def _fetch_page_nums(identifier: str, doc_name: str) -> dict:
def _fetch_pdf(identifier: str, doc_name: str) -> bytes:
"""
Fetch JSON file with page number metadata for an `ArchiveDoc`.
Fetch PDF file for an `ArchiveDoc`.
"""
# `self.name` does not get percent-encoded, because it is derived from the
# `doc_name` does not get percent-encoded, because it is derived from the
# file path itself as defined by archive.org. Percent-encoding it further
# may result in a 404 error.
page_nums_resp = requests.get(
f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}_page_numbers.json"
resp = requests.get(
f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}.pdf"
)
page_nums_resp.raise_for_status()
return page_nums_resp.json()
def _fetch_jp2_zip(identifier: str, doc_name: str) -> bytes:
"""
Fetch zip file with processed page scans for an `ArchiveDoc`.
"""
# `self.name` does not get percent-encoded, because it is derived
# from the file path itself as defined by archive.org. Percent-
# encoding it further may result in a 404 error.
zip_resp = requests.get(
f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}_jp2.zip"
)
zip_resp.raise_for_status()
return zip_resp.content
resp.raise_for_status()
return resp.content

View file

@ -2,13 +2,74 @@
This module contains interchangeable engines for optical character recognition,
making it easy to swap implementations in and out based on speed and accuracy
advantages without rewriting business logic.
Each nested module exports a class named `OcrEngine` with a method named
`process()`, which accepts a PIL `Image` and list of languages, and which
returns a tuple containing a standardized `DataFrame` as well as a dictionary
containing any additional specialized metadata made available from the
underlying OCR engine. The `DataFrame` has columns
`["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in pixels
measured from the top left corner of the image. `x1` and `y1` values will be
greater than or equal to the corresponding `x0` and `y0` values.
"""
from dataclasses import dataclass
from typing import Optional
from PIL import Image
@dataclass
class TextBlock:
    """
    A single block of recognized text and its bounding box on a page image.

    All coordinates are in pixels, measured from the top left corner of the
    image, so `x1 >= x0` and `y1 >= y0`.

    Attributes:
        x0      Left coordinate of the bounding box, in pixels from the left
                edge of the image.
        y0      Top coordinate of the bounding box, in pixels from the top
                edge of the image.
        x1      Right coordinate of the bounding box, in pixels from the left
                edge of the image.
        y1      Bottom coordinate of the bounding box, in pixels from the top
                edge of the image.
        text    Text content of the block.
    """

    x0: int
    y0: int
    x1: int
    y1: int
    text: str
@dataclass
class OcrResult:
    """
    OCR data parsed from a single page.

    Attributes:
        blocks      Blocks of text detected on a page.
        page_angle  Optional detected rotation of the page, in degrees
                    clockwise relative to upright. `None` when the engine did
                    not perform angle detection (e.g. when it was constructed
                    with angle detection disabled).
    """

    blocks: list[TextBlock]
    page_angle: Optional[float]
class OcrEngine:
    """
    Abstract base class for interchangeable OCR processing backends.

    Subclasses implement `process()`; the constructor arguments are shared by
    every backend.

    Params:
        languages       List of ISO-639-3 language codes fed to the OCR
                        backend.
        detect_angle    Allows page angle detection to be enabled or disabled
                        for certain implementations. Defaults to True.
    """

    _detect_angle: bool
    _languages: list[str]

    def __init__(self, languages: list[str], detect_angle: bool = True):
        # Copy the list so later mutation by the caller cannot affect the
        # engine's configuration.
        self._languages = list(languages)
        self._detect_angle = detect_angle

    def process(self, image: Image.Image) -> OcrResult:
        """Parse a single page image. Must be overridden by each backend."""
        raise NotImplementedError()

View file

@ -1,11 +1,8 @@
import numpy as np
import pandas as pd
from paddleocr import PaddleOCR
from PIL import Image
# Reuse OCR instances per language.
instances: dict[str, PaddleOCR] = {}
from . import OcrEngine, OcrResult, TextBlock
def convert_language(iso639_3_code: str) -> str:
@ -21,16 +18,23 @@ def convert_language(iso639_3_code: str) -> str:
return iso639_3_code
class OcrEngine:
def process(
image: Image, languages: list[str] = ["eng"]
) -> tuple[pd.DataFrame, dict]:
class PaddleOcrEngine(OcrEngine):
# Dict of ISO 639-3 language code to PaddleOCR instance.
_ocr_instances: dict[str, PaddleOCR] = {}
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
for language in self._languages:
self._ocr_instances[language] = PaddleOCR(
use_doc_orientation_classify=True,
use_doc_unwarping=False,
use_textline_orientation=False,
lang=convert_language(language),
)
def process(self, image: Image.Image) -> OcrResult:
"""
Use `paddleocr` to parse an image to a `DataFrame` with columns
`["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
pixels measured from the top left corner of the image. `x1` and `y1`
values will be greater than or equal to the corresponding `x0` and `y0`
values.
Use `paddleocr` to parse an image.
Compared to Tesseract, PaddleOCR is more accurate at low image
resolutions and able to perform one-shot page angle detection. However,
@ -46,33 +50,23 @@ class OcrEngine:
language and choose the result it thinks is best. Thus, it's
recommended to stick to one language if at all possible.
Troubleshooting: The PaddlePaddle core package has/had a bug triggering
segfaults on ARM systems. Installing the nightly development build of
`paddlepaddle` may be necessary to avoid it. Refer to:
Note: Though it works well when it works, PaddlePaddle has a tendency to
segfault and generally has been found to be buggy and unreliable.
Installing the nightly development build of `paddlepaddle` may help.
Refer to:
- [PaddleOCR issue 16609](https://github.com/PaddlePaddle/PaddleOCR/issues/16609)
- [PaddlePaddle PR 75731](https://github.com/PaddlePaddle/Paddle/pull/75731)
Params:
image PIL image data.
languages List of ISO-639-3 language codes fed to the OCR backend.
"""
best_result = None
for language in languages:
if language in instances:
ocr_instance = instances[language]
else:
ocr_instance = PaddleOCR(
use_doc_orientation_classify=True,
use_doc_unwarping=False,
use_textline_orientation=False,
lang=convert_language(language),
for language in self._languages:
[res] = self._ocr_instances[language].predict(
np.array(image.convert("RGB"))
)
instances[language] = ocr_instance
[res] = ocr_instance.predict(np.array(image.convert("RGB")))
if best_result is None or len(res["rec_texts"]) > len(
best_result["rec_texts"]
):
@ -83,17 +77,16 @@ class OcrEngine:
res = best_result
return (
pd.DataFrame(
{
"text": res["rec_texts"],
"x0": [x0 for [x0, _, _, _] in res["rec_boxes"]],
"y0": [y0 for [_, y0, _, _] in res["rec_boxes"]],
"x1": [x1 for [_, _, x1, _] in res["rec_boxes"]],
"y1": [y1 for [_, _, _, y1] in res["rec_boxes"]],
}
),
{
"page_angle": res["doc_preprocessor_res"]["angle"],
},
return OcrResult(
blocks=[
TextBlock(
text=res["rec_texts"][i],
x0=res["rec_boxes"][i][0],
y0=res["rec_boxes"][i][1],
x1=res["rec_boxes"][i][2],
y1=res["rec_boxes"][i][3],
)
for i, _ in enumerate(res["rec_texts"])
],
page_angle=(360 - res["doc_preprocessor_res"]["angle"]) % 360,
)

View file

@ -1,16 +1,13 @@
import pandas as pd
import pytesseract
from PIL import Image
from . import OcrEngine, OcrResult, TextBlock
class OcrEngine:
def process(image: Image, languages: list[str]) -> tuple[pd.DataFrame, dict]:
class TesseractOcrEngine(OcrEngine):
def process(self, image: Image.Image) -> OcrResult:
"""
Use `pytesseract` to parse an image to a `DataFrame` with columns
`["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
pixels measured from the top left corner of the image. `x1` and `y1`
values will be greater than or equal to the corresponding `x0` and `y0`
values.
Use `pytesseract` to parse an image.
Note: Each Tesseract command runs single-threaded, so speed can be
improved up to ~4x by distributing pages across processes running in
@ -22,34 +19,97 @@ class OcrEngine:
Params:
image PIL image data.
languages List of ISO-639-3 language codes fed to the OCR backend.
"""
blocks_best = []
angle_best = None
angles = [0, 90, 180, 270] if self._detect_angle else [0]
for angle in angles:
# Rotate the image counter-clockwise, since we care about
# keeping track of the angle from the upright position *to*
# the original position, not *from*.
rotated_image = image.rotate(360 - angle, expand=True)
df = pytesseract.image_to_data(
image,
lang="+".join(languages),
rotated_image,
lang="+".join(self._languages),
config="--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0",
output_type=pytesseract.Output.DATAFRAME,
)
).fillna({"text": ""})
# Exclude words with relatively low confidence ratings.
# Exclude blocks with relatively low confidence ratings.
df = df[df["conf"] > 80]
# Attempt to exclude words that seem vertically oriented.
# Exclude empty words
df = df[df["text"] != ""]
# Attempt to exclude blocks that seem vertically oriented.
# TODO: Will this work for non-Latin scripts? Probably not all.
df = df[(df["width"] / df["height"]) > 0.8]
return (
pd.DataFrame(
{
"text": df["text"],
"x0": df["left"],
"y0": df["top"],
"x1": df["left"] + df["width"],
"y1": df["top"] + df["height"],
}
print(
[
TextBlock(
# Rotate X and Y coordinates back to match the original image.
*_box_after_rotation(
int(row["left"]),
int(row["top"]),
int(row["left"] + row["width"]),
int(row["top"] + row["height"]),
*rotated_image.size,
angle,
),
# We don't use any page-level metadata from the Tesseract output.
{},
text=row["text"],
)
for _, row in df.iterrows()
]
)
if angle_best is None or df.shape[0] > len(blocks_best):
angle_best = angle
blocks_best = [
TextBlock(
# Rotate X and Y coordinates back to match the original image.
*_box_after_rotation(
int(row["left"]),
int(row["top"]),
int(row["left"] + row["width"]),
int(row["top"] + row["height"]),
*rotated_image.size,
angle,
),
text=row["text"],
)
for _, row in df.iterrows()
]
return OcrResult(
blocks=blocks_best, page_angle=angle_best if self._detect_angle else None
)
def _box_after_rotation(
x0: int,
y0: int,
x1: int,
y1: int,
image_width: int,
image_height: int,
degrees_clockwise: int,
) -> tuple[int, int, int, int]:
"""
Given the corners of a box in an image, returns the corners of an equivalent
box if the image is rotated by some multiple of 90 degrees. Both input and
output coordinates are expected to be top left followed by bottom right,
where the origin is at the top left.
"""
angle = ((degrees_clockwise % 360) + 360) % 360
if angle == 0:
return x0, y0, x1, y1
if angle == 90:
return image_height - y1, x0, image_height - y0, x1
if angle == 180:
return image_width - x1, image_height - y1, image_width - x0, image_height - y0
if angle == 270:
return y0, image_width - x1, y1, image_width - x0
else:
raise Exception("_box_after_rotation() only accepts multiples of 90 degrees")

View file

@ -6,10 +6,12 @@ readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"numpy>=2.3.2",
"paddleocr>=3.2.0",
"paddlepaddle>=3.2.0",
"paddleocr>=3.3.0",
"paddlepaddle>=3.2.2",
"pandas>=2.3.1",
"pillow>=11.3.0",
"psycopg[binary]>=3.2.12",
"pymupdf>=1.26.6",
"pytesseract>=0.3.13",
"requests>=2.32.4",
]

1491
uv.lock generated

File diff suppressed because it is too large Load diff