reuse pdf ocr when available

2025-12-20 02:16:41 +00:00 · 2025-12-20 02:16:41 +00:00 · 3da76d4537
commit 3da76d4537
parent ac7e93a75b
9 changed files with 1373 additions and 901 deletions
--- a/diagnostics.py
+++ b/diagnostics.py
@ -11,9 +11,25 @@ from microqa.engine import analyze_doc
 def main():
    parser = ArgumentParser()
    parser.add_argument("--item-id")
-    parser.add_argument("--cpus", type=int, default=4)
+    parser.add_argument(
        "--ocr-backend",
        help="which local OCR backend to use when available text in archived PDF files is insufficient; one of 'tesseract' or 'paddleocr'",
        default="tesseract",
    )
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()
    # Import OCR engine modules only as needed, to avoid unnecessary slow
    # startups and/or missing dependency errors.
    if args.ocr_backend == "tesseract":
        from microqa.ocr.tesseract import TesseractOcrEngine
        ocr_engine = TesseractOcrEngine(languages=["eng", "fra"])
    elif args.ocr_backend == "paddleocr":
        from microqa.ocr.paddleocr import PaddleOcrEngine
        ocr_engine = PaddleOcrEngine(languages=["eng", "fra"])
    cache_item(
        args.item_id,
        # Will not refetch if value is already cached.
@ -29,7 +45,10 @@ def main():
        else item.docs
    )
    analyses = [
-        analyze_doc(doc, parallel=args.cpus, use_cache=True) for doc in minimal_docs
+        analyze_doc(
            doc=doc, ocr_engine=ocr_engine, use_cache=True, verbose=args.verbose
        )
        for doc in minimal_docs
    ]
    t_end = time()
@ -43,7 +62,7 @@ def main():
                    [
                        i
                        for i, page in enumerate(doc["pages"])
-                        if not page["ocr_orientation_match"]
+                        if 45 < page["page_angle"] < 315
                    ]
                    for doc in analyses
                ],
--- a/main.py
+++ b/main.py
@ -30,8 +30,24 @@ def main():
        help="script will attempt to analyze all items with a review date greater than or equal to this value (YYYYMMDD)",
        default="20250701",
    )
    parser.add_argument(
        "--ocr-backend",
        help="which local OCR backend to use when available text in archived PDF files is insufficient; one of 'tesseract' or 'paddleocr'",
        default="tesseract",
    )
    args = parser.parse_args()
    # Import OCR engine modules only as needed, to avoid unnecessary slow
    # startups and/or missing dependency errors.
    if args.ocr_backend == "tesseract":
        from microqa.ocr.tesseract import TesseractOcrEngine
        ocr_engine = TesseractOcrEngine(languages=["eng", "fra"])
    elif args.ocr_backend == "paddleocr":
        from microqa.ocr.paddleocr import PaddleOcrEngine
        ocr_engine = PaddleOcrEngine(languages=["eng", "fra"])
    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        cur.execute("""
@ -51,7 +67,7 @@ create table if not exists pages (
    id int primary key,
    doc text not null,
    page int not null,
-    orientation_match boolean not null,
+    page_angle float not null,
    sharpness real not null,
    is_blank boolean not null,
    text_margin_px int not null
@ -95,7 +111,7 @@ order by review_date
                                [doc.name, item_id],
                            )
                            analysis = analyze_doc(
-                                doc, parallel=args.cpus, verbose=True
+                                doc=doc, ocr_engine=ocr_engine, verbose=True
                            )
                            for i, page in enumerate(analysis["pages"]):
                                cur.execute(
@ -103,7 +119,7 @@ order by review_date
 insert into pages (
    doc,
    page,
-    orientation_match,
+    page_angle,
    sharpness,
    is_blank,
    text_margin_px
@ -111,7 +127,7 @@ insert into pages (
                                    [
                                        doc.name,
                                        i + 1,
-                                        page["ocr_orientation_match"],
+                                        page["page_angle"],
                                        page["sharpness"],
                                        page["blank"],
                                        page["text_margin_px"],
--- a/microqa/engine.py
+++ b/microqa/engine.py
@ -1,57 +1,41 @@
 from dataclasses import dataclass, field
 from multiprocessing import Pool
 from sys import stdout
 import numpy as np
 from PIL import Image, ImageFilter
-from .items import ArchiveDoc
+from .items import ArchiveDoc, ArchiveLeaf
-from .ocr.tesseract import OcrEngine
+from .ocr import OcrEngine, TextBlock
-def analyze_doc(doc: ArchiveDoc, parallel=1, use_cache=False, verbose=False):
+def analyze_doc(
    doc: ArchiveDoc,
    ocr_engine: OcrEngine,
    use_cache: bool = False,
    verbose: bool = False,
 ):
    """
    Analyzes all pages in an ArchiveDoc for useful metrics such as sharpness,
    orientation, presence of text overflows, and so on.
    """
    if verbose:
        print(f"Loading {doc.name}...")
        stdout.flush()
-    tasks: PageAnalysisTask = [
+    all_leaves = doc.fetch_leaves(use_cache=use_cache)
        PageAnalysisTask(im=leaf.image)
        for leaf in doc.fetch_leaves(use_cache=use_cache)
    ]
    if verbose:
-        print(f"Processing {len(tasks)} pages...", file=stdout)
+        print(f"Processing {len(all_leaves)} pages...", file=stdout)
        stdout.flush()
-    if parallel > 1:
+    analyzed_pages = []
-        # Parallelize image processing and OCR of pages across up to n cores.
+    for leaf in all_leaves:
-        with Pool(parallel) as pool:
+        im_cropped = leaf.image.crop(
            return {"pages": pool.map(analyze_page, tasks)}
    return {"pages": [analyze_page(task) for task in tasks]}
@dataclass
 class PageAnalysisTask:
    """
    Attributes:
        im          PIL Image, pre-scaled using .thumbnail() to fit the long
                    edge to 3200 px.
        ocr_langs   Tesseract language codes (3 letters each, in a "+"-separated
                    list).
    """
    im: Image.Image
    ocr_langs: list[str] = field(default_factory=lambda: ["eng"])
 def analyze_page(task):
    im_cropped = task.im.crop(
            (
-            task.im.size[0] * 0.1,
+                leaf.image.size[0] * 0.1,
-            task.im.size[1] * 0.1,
+                leaf.image.size[1] * 0.1,
-            task.im.size[0] * 0.9,
+                leaf.image.size[0] * 0.9,
-            task.im.size[1] * 0.9,
+                leaf.image.size[1] * 0.9,
            )
        )
@ -59,9 +43,15 @@ def analyze_page(task):
        if is_blank:
            max_sharpness = 1
        ocr_orientation_match = True
            text_margin_px = -1
            page_angle = 0
        else:
            # Sharpness is determined by percentile of pixels that match some
            # criteria, so it may vary significantly depending on which portion
            # of the image is analyzed. In an effort to identify the sharpest
            # edges, we split up the image into chunks and assume that the
            # highest sharpness value obtained across all chunks is
            # representative of the image as a whole.
            max_sharpness = 0.0
            if im_cropped.size[0] < im_cropped.size[1]:
                # Page is in portrait orientation.
@ -87,62 +77,73 @@ def analyze_page(task):
                        ),
                    )
-        OCR_SCALE = 1
+            # OCR is computationally expensive, so we try to take advantage of
-        # TODO: Refactor orientation detection logic into the OCR engine
+            # the Tesseract data already parsed by the Internet Archive and
-        # modules.
+            # embedded in the PDF, when possible. If there is not sufficient
-        best_ocr_score = -1
+            # text in the PDF to be confident that the Archive's OCR
-        best_ocr_words = None
+            # postprocessing captured it all, then OCR is recomputed locally.
-        best_ocr_orientation = -1
+            #
-        for orientation in range(4):
+            # In some instances, the Archive's OCR detects rotated text but
-            im_rotated = task.im.resize(
+            # parses it as gibberish. To partially mitigate this, we ignore all
-                np.int_(np.array(task.im.size) * OCR_SCALE)
+            # precomputed text blocks with a "portrait" aspect ratio. This will
-            ).rotate(90 * orientation, expand=True)
+            # not necessarily help with text that is rotated 180 degrees, but in
-            ocr, ocr_meta = OcrEngine.process(im_rotated, languages=task.ocr_langs)
+            # practice that case is rarely encountered. This will also not work
-
+            # well with non-latin scripts that are intended to be oriented
-            if "page_angle" in ocr_meta:
+            # vertically.
-                # OCR engine automatically accounts for page rotation.
+            OCR_RECOMPUTE_THRESHOLD_WORDS = 30
-                best_ocr_score = ocr.shape[0]
+            if (
-                # PaddleOCR counts rotation as degrees, in the opposite
+                sum(
-                # direction as PIL's `Image.rotate()`
+                    (
-                best_ocr_orientation = (
+                        len(block.text.split())
-                    4 - round(((ocr_meta["page_angle"] + 360) % 360) / 90)
+                        for block in leaf.text_blocks
-                ) % 4
+                        if block.x1 - block.x0 > block.y1 - block.y0
                best_ocr_words = ocr
                break
            if ocr.shape[0] > best_ocr_score:
                best_ocr_score = ocr.shape[0]
                best_ocr_orientation = orientation
                best_ocr_words = ocr
            if best_ocr_score > 50:
                # Unlikely that another orientation will have more words, so
                # stop eating up CPU.
                break
        if best_ocr_words.empty:
            ocr_orientation_match = True
            text_margin_px = -1
        else:
            ocr_orientation_match = best_ocr_orientation == 0
            best_ocr_dims = OCR_SCALE * np.array(
                task.im.size
                if best_ocr_orientation % 2 == 0
                else (task.im.size[1], task.im.size[0])
                    )
                )
                >= OCR_RECOMPUTE_THRESHOLD_WORDS
            ):
                if verbose:
                    print("Using PDF text.")
                ocred_leaf = leaf
                page_angle = 0
            else:
                if verbose:
                    print("Using OCR.")
                OCR_SCALE = 1
                im_scaled = leaf.image.resize(
                    np.int_(np.array(leaf.image.size) * OCR_SCALE)
                )
                ocr_result = ocr_engine.process(im_scaled)
                ocred_leaf = ArchiveLeaf(
                    image=leaf.image,
                    page_number=leaf.page_number,
                    text_blocks=[
                        TextBlock(
                            x0=int(block.x0 / OCR_SCALE),
                            y0=int(block.y0 / OCR_SCALE),
                            x1=int(block.x1 / OCR_SCALE),
                            y1=int(block.y1 / OCR_SCALE),
                            text=block.text,
                        )
                        for block in ocr_result.blocks
                    ],
                )
                page_angle = ocr_result.page_angle
            word_margins_all_directions = np.sort(
                np.int_(
                    np.concat(
-                        (
+                        [
-                            best_ocr_words["x0"].to_numpy(),
+                            np.array(
-                            best_ocr_words["y0"].to_numpy(),
+                                [
-                            best_ocr_dims[0] - best_ocr_words["x1"].to_numpy(),
+                                    block.x0,
-                            best_ocr_dims[1] - best_ocr_words["y1"].to_numpy(),
+                                    block.y0,
                                    leaf.image.size[0] - block.x1,
                                    leaf.image.size[1] - block.y1,
                                ]
                            )
                            for block in ocred_leaf.text_blocks
                        ]
                    )
                    # Transform back into original image pixel density
                    / OCR_SCALE
                )
            )
            # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
@ -153,16 +154,23 @@ def analyze_page(task):
                else -1
            )
-    return {
+        # Make sure the OCR engine is running with orientation detection.
        assert page_angle is not None
        analyzed_pages.append(
            {
                "blank": is_blank,
-        "ocr_orientation_match": ocr_orientation_match,
+                "page_angle": page_angle,
-        "size_analyzed": task.im.size,
+                "size_analyzed": leaf.image.size,
                "sharpness": max_sharpness,
                "text_margin_px": text_margin_px,
            }
        )
    return {"pages": analyzed_pages}
-def analyze_sharpness(im):
+def analyze_sharpness(im: Image.Image):
    """
    Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
    1. The scale is not linear with respect to scan quality: anything above 0.1
--- a/microqa/items.py
+++ b/microqa/items.py
@ -3,18 +3,18 @@ Python utilities for structuring data and metadata pulled from archive.org
 microfiche scans.
 """
 import json
 import os
 import urllib
 from contextlib import nullcontext
 from dataclasses import dataclass
 from io import BytesIO
 from typing import Optional
 from zipfile import ZipFile
 import pymupdf
 import requests
 from PIL import Image
 from .ocr import TextBlock
 CACHE_DIR = "./archive_cache"
@ -38,16 +38,20 @@ class ArchiveLeaf:
                        presented to users, otherwise a (potentially empty)
                        string with the inferred page number as defined by the
                        document being scanned.
        text_blocks     List of text blocks extracted from PyMuPDF's
                        TextPage.extractBlocks() method.
    """
-    image: Image
+    image: Image.Image
    page_number: Optional[str]
    text_blocks: list[TextBlock]
@dataclass
 class ArchiveDoc:
    """
-    Information pertaining to a single set of processed pages, of which there
+    Information pertaining to a single set of processed pages, of which there
    may be multiple for any given ArchiveItem. For example, one SCOTUS case may
    contain several briefs/petitions/etc., each presented as a distinct PDF but
    all held within the parent `ArchiveItem`.
@ -80,62 +84,71 @@ class ArchiveDoc:
    name: str
    title: Optional[str]
-    def fetch_leaves(self, numbered_only=True, use_cache=False) -> list[ArchiveLeaf]:
+    def fetch_leaves(self, use_cache=False) -> list[ArchiveLeaf]:
        """
-        Fetch images and page number data for this document from archive.org,
+        Fetch images and OCR text data for this document from archive.org PDF files.
        over the Internet.
        Params:
-            numbered_only   If `True`, discards any leaves with no corresponding
+            use_cache   If `True`, locally cached PDF files under the
                            page number entries. Leaves for which the page
                            number is an empty string are retained.
            use_cache       If `True`, locally cached zip files under the
                        `./archive_cache` directory (relative to the working
                        directory) will be used instead of fetching over
                        HTTPS.
        """
        if use_cache:
-            # Cached file names are derived from the percent-encoded verison of
+            with open(f"{CACHE_DIR}/{_url_encode(self.name)}.pdf", "rb") as f:
-            # `self.name`, so that there's no need to worry about directory
+                pdf_data = f.read()
            # separators or other disallowed characters in the file names
            # defined by archive.org.
            with open(
                f"{CACHE_DIR}/{_url_encode(self.name)}_page_numbers.json", "r"
            ) as f:
                page_nums = json.load(f)["pages"]
            zip_reader_ctx = open(f"{CACHE_DIR}/{_url_encode(self.name)}_jp2.zip", "rb")
        else:
-            page_nums = _fetch_page_nums(self.identifier, self.name)["pages"]
+            pdf_data = _fetch_pdf(self.identifier, self.name)
            # Wrap in a context manager so that the reader can be used in a `with`
            # block in the same way as a file accessed with `open()`.
            zip_reader_ctx = nullcontext(
                BytesIO(_fetch_jp2_zip(self.identifier, self.name))
            )
        leaves = []
-        with zip_reader_ctx as zip_reader, ZipFile(zip_reader) as jp_zip:
+        # Open PDF from bytes
-            for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
+        pdf_doc = pymupdf.open(stream=pdf_data, filetype="pdf")
                for page_index, page_num_info in enumerate(page_nums):
                    if page_num_info["leafNum"] == leaf_num:
                        # Stop iterating and keep page_index set to the current
                        # value.
                        break
                else:
                    # Indicate that leaf was not found in page_num list.
                    page_index = None
-                if not numbered_only or page_index is not None:
+        try:
-                    with jp_zip.open(file_name) as jp_file:
+            for page_num in range(len(pdf_doc)):
-                        # Convert to single-channel greyscale ("L").
+                page = pdf_doc[page_num]
-                        image = Image.open(jp_file).convert("L")
+
-                        # Rescale long edge to no more than 3200 px.
+                # Extract text blocks with coordinates
                # Convert to TextBlock objects, discarding block_no and block_type
                text_blocks = [
                    TextBlock(
                        x0=int(x0),
                        y0=int(y0),
                        x1=int(x1),
                        y1=int(y1),
                        text=text,
                    )
                    for x0, y0, x1, y1, text, *_ in page.get_text("blocks")
                ]
                # Render page to image
                # Use a matrix to scale appropriately (default is 72 DPI)
                # Scale factor 4.44 gives approximately 320 DPI, which should produce
                # images with long edge around 3200px for typical page sizes
                mat = pymupdf.Matrix(4.44, 4.44)
                pix = page.get_pixmap(matrix=mat, alpha=False)
                # Convert PyMuPDF pixmap to PIL Image
                img_data = pix.tobytes("ppm")
                image = Image.open(BytesIO(img_data)).convert("L")
                # Ensure long edge is no more than 3200 px
                image.thumbnail((3200, 3200))
-                        leaves.append(ArchiveLeaf(image=image, page_number=page_index))
+
                # Page numbers are 1-indexed for human readability
                leaves.append(
                    ArchiveLeaf(
                        image=image,
                        page_number=str(page_num + 1),
                        text_blocks=text_blocks,
                    )
                )
        finally:
            pdf_doc.close()
        return leaves
@ -201,20 +214,18 @@ def fetch_item(identifier: str, use_cache=False) -> ArchiveItem:
    doc_names = [
        # Strip suffix, to just leave the identifier, and title if present.
-        name[: -len("_jp2.zip")]
+        name[: -len(".pdf")]
        for name in file_names
-        if name.lower().endswith("_jp2.zip")
+        if name.lower().endswith(".pdf")
        # Exclude unprocessed scans, which are also named `..._jp2.zip`.
        and name.lower() != f"{identifier.lower()}_micro_jp2.zip"
    ]
    # Assert that all files we expect to find are actually present.
    for doc_name in doc_names:
-        if f"{_url_encode(doc_name.lower())}_page_numbers.json" not in [
+        if f"{_url_encode(doc_name.lower())}.pdf" not in [
            name.lower() for name in file_names
        ]:
            raise Exception(
-                f"expected file not found: {_url_encode(doc_name.lower())}_page_numbers.zip"
+                f"expected file not found: {_url_encode(doc_name.lower())}.pdf"
            )
    return ArchiveItem(
@ -232,7 +243,7 @@ def fetch_item(identifier: str, use_cache=False) -> ArchiveItem:
 def cache_item(identifier: str, overwrite=True):
    """
-    Load the relevant files for an `ArchiveItem` and its component `ArchiveDoc`s
+    Load the PDF files for an `ArchiveItem` and its component `ArchiveDoc`s
    and store them within the `archive_cache` directory (relative to the working
    directory). The `archive_cache` directory will be created if it does not
    exist.
@ -249,16 +260,14 @@ def cache_item(identifier: str, overwrite=True):
    for name in os.listdir(CACHE_DIR):
        if _url_decode(name.lower()).startswith(identifier.lower()):
            if not overwrite:
                return
    item = fetch_item(identifier)
    for doc in item.docs:
-        page_nums = _fetch_page_nums(identifier, doc.name)
+        pdf_data = _fetch_pdf(identifier, doc.name)
-        zip_file = _fetch_jp2_zip(identifier, doc.name)
+        with open(f"{CACHE_DIR}/{_url_encode(doc.name)}.pdf", "wb") as f:
-        with open(f"{CACHE_DIR}/{_url_encode(doc.name)}_page_numbers.json", "w") as f:
+            f.write(pdf_data)
            json.dump(page_nums, f)
        with open(f"{CACHE_DIR}/{_url_encode(doc.name)}_jp2.zip", "wb") as f:
            f.write(zip_file)
 def _url_encode(string: str) -> str:
@ -278,31 +287,16 @@ def _url_decode(string: str) -> str:
    return urllib.parse.unquote(string)
-def _fetch_page_nums(identifier: str, doc_name: str) -> dict:
+def _fetch_pdf(identifier: str, doc_name: str) -> bytes:
    """
-    Fetch JSON file with page number metadata for an `ArchiveDoc`.
+    Fetch PDF file for an `ArchiveDoc`.
    """
-    # `self.name` does not get percent-encoded, because it is derived from the
+    # `doc_name` does not get percent-encoded, because it is derived from the
-    # file path itself as defined by archive.org. Percent- encoding it further
+    # file path itself as defined by archive.org. Percent-encoding it further
    # may result in a 404 error.
-    page_nums_resp = requests.get(
+    resp = requests.get(
-        f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}_page_numbers.json"
+        f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}.pdf"
    )
-    page_nums_resp.raise_for_status()
+    resp.raise_for_status()
-    return page_nums_resp.json()
+    return resp.content
 def _fetch_jp2_zip(identifier: str, doc_name: str) -> bytes:
    """
    Fetch zip file with processed page scans for an `ArchiveDoc`.
    """
    # `self.name` does not get percent-encoded, because it is derived
    # from the file path itself as defined by archive.org. Percent-
    # encoding it further may result in a 404 error.
    zip_resp = requests.get(
        f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}_jp2.zip"
    )
    zip_resp.raise_for_status()
    return zip_resp.content
--- a/microqa/ocr/init.py
+++ b/microqa/ocr/init.py
@ -2,13 +2,74 @@
 This module contains interchangeable engines for optical character recognition,
 making it easy to swap implementations in and out based on speed and accuracy
 advantages without rewriting business logic.
 Each nested module exports a subclass of `OcrEngine` (for example
 `TesseractOcrEngine` or `PaddleOcrEngine`), which is constructed with a list
 of languages and provides a method named `process()`. That method accepts a
 PIL `Image` and returns an `OcrResult`, which holds a list of `TextBlock`
 objects and an optional page angle. Each `TextBlock` has fields
 `text`, `x0`, `y0`, `x1`, and `y1`, where X and Y coordinates are in pixels
 measured from the top left corner of the image. `x1` and `y1` values will be
 greater than or equal to the corresponding `x0` and `y0` values.
 """
 from dataclasses import dataclass
 from typing import Optional
 from PIL import Image
@dataclass
class TextBlock:
    """
    A rectangular bounding box on a page image together with the text found
    inside it. All coordinates are expressed in pixels.

    Attributes:
        x0      Left edge of the bounding box.
        y0      Top edge of the bounding box.
        x1      Right edge of the bounding box, measured from the left of the
                image.
        y1      Bottom edge of the bounding box, measured from the top of the
                image.
        text    Text content of the block.
    """

    x0: int
    y0: int
    x1: int
    y1: int
    text: str
@dataclass
class OcrResult:
    """
    OCR data parsed from a single page.

    Attributes:
        blocks      Blocks of text detected on a page.
        page_angle  Optional detected rotation of the page, in degrees clockwise
                    relative to upright. `None` (the default) means that no
                    angle was reported.
    """

    blocks: list["TextBlock"]
    # Defaults to None so that a result can be built without angle information;
    # passing the angle explicitly keeps working exactly as before.
    page_angle: Optional[float] = None
 class OcrEngine:
    """
    Base class for interchangeable OCR processing backends. Subclasses are
    expected to override `process()`; the implementation here always raises
    `NotImplementedError`.

    Params:
        languages       ISO-639-3 language codes fed to the OCR backend. The
                        values are copied into a new list, so later changes to
                        the caller's collection do not affect the engine.
        detect_angle    Allows page angle detection to be enabled or disabled
                        for certain implementations. Defaults to True.

    Raises:
        TypeError       If `languages` is a single string rather than a
                        collection of language codes.
    """

    _detect_angle: bool
    _languages: list[str]

    def __init__(self, languages: list[str], detect_angle: bool = True):
        if isinstance(languages, str):
            # A bare string would otherwise be split into single characters.
            raise TypeError(
                "languages must be a collection of language codes, not a str"
            )
        self._detect_angle = detect_angle
        # `list()` rather than `.copy()`, so that tuples and other iterables
        # are accepted in addition to lists.
        self._languages = list(languages)

    def process(self, image: "Image.Image") -> "OcrResult":
        """
        Run OCR on a single page image. Must be overridden by subclasses.

        Params:
            image   PIL image data.
        """
        raise NotImplementedError()
--- a/microqa/ocr/paddleocr.py
+++ b/microqa/ocr/paddleocr.py
@ -1,11 +1,8 @@
 import numpy as np
 import pandas as pd
 from paddleocr import PaddleOCR
 from PIL import Image
-
+from . import OcrEngine, OcrResult, TextBlock
 # Reuse OCR instances per language.
 instances: dict[str, PaddleOCR] = {}
 def convert_language(iso639_3_code: str) -> str:
@ -21,16 +18,23 @@ def convert_language(iso639_3_code: str) -> str:
    return iso639_3_code
-class OcrEngine:
+class PaddleOcrEngine(OcrEngine):
-    def process(
+    # Dict of ISO 639-3 language code to PaddleOCR instance.
-        image: Image, languages: list[str] = ["eng"]
+    _ocr_instances: dict[str, PaddleOCR] = {}
-    ) -> tuple[pd.DataFrame, dict]:
+
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        for language in self._languages:
            self._ocr_instances[language] = PaddleOCR(
                use_doc_orientation_classify=True,
                use_doc_unwarping=False,
                use_textline_orientation=False,
                lang=convert_language(language),
            )
    def process(self, image: Image.Image) -> OcrResult:
        """
-        Use `paddleocr` to parse an image to a `DataFrame` with columns
+        Use `paddleocr` to parse an image.
        `["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
        pixels measured from the top left corner of the image. `x1` and `y1`
        values will be greater than or equal to the corresponding `x0` and `y0`
        values.
        Compared to Tesseract, PaddleOCR is more accurate at low image
        resolutions and able to perform one-shot page angle detection. However,
@ -46,33 +50,23 @@ class OcrEngine:
        language and choose the result it thinks is best. Thus, it's
        recommended to stick to one language if at all possible.
-        Troubleshooting: The PaddlePaddle core package has/had a bug triggering
+        Note: Though it works well when it works, PaddlePaddle has a tendency to
-        segfaults on ARM systems. Installing the nightly development build of
+        segfault and generally has been found to be buggy and unreliable.
-        `paddlepaddle` may be necessary to avoid it. Refer to:
+        Installing the nightly development build of `paddlepaddle` may help.
        Refer to:
        - [PaddleOCR issue 16609](https://github.com/PaddlePaddle/PaddleOCR/issues/16609)
        - [PaddlePaddle PR 75731](https://github.com/PaddlePaddle/Paddle/pull/75731)
        Params:
            image       PIL image data.
            languages   List of ISO-639-3 language codes fed to the OCR backend.
        """
        best_result = None
-        for language in languages:
+        for language in self._languages:
-            if language in instances:
+            [res] = self._ocr_instances[language].predict(
-                ocr_instance = instances[language]
+                np.array(image.convert("RGB"))
            else:
                ocr_instance = PaddleOCR(
                    use_doc_orientation_classify=True,
                    use_doc_unwarping=False,
                    use_textline_orientation=False,
                    lang=convert_language(language),
            )
                instances[language] = ocr_instance
            [res] = ocr_instance.predict(np.array(image.convert("RGB")))
            if best_result is None or len(res["rec_texts"]) > len(
                best_result["rec_texts"]
            ):
@ -83,17 +77,16 @@ class OcrEngine:
        res = best_result
-        return (
+        return OcrResult(
-            pd.DataFrame(
+            blocks=[
-                {
+                TextBlock(
-                    "text": res["rec_texts"],
+                    text=res["rec_texts"][i],
-                    "x0": [x0 for [x0, _, _, _] in res["rec_boxes"]],
+                    x0=res["rec_boxes"][i][0],
-                    "y0": [y0 for [_, y0, _, _] in res["rec_boxes"]],
+                    y0=res["rec_boxes"][i][1],
-                    "x1": [x1 for [_, _, x1, _] in res["rec_boxes"]],
+                    x1=res["rec_boxes"][i][2],
-                    "y1": [y1 for [_, _, _, y1] in res["rec_boxes"]],
+                    y1=res["rec_boxes"][i][3],
-                }
+                )
-            ),
+                for i, _ in enumerate(res["rec_texts"])
-            {
+            ],
-                "page_angle": res["doc_preprocessor_res"]["angle"],
+            page_angle=(360 - res["doc_preprocessor_res"]["angle"]) % 360,
            },
        )
--- a/microqa/ocr/tesseract.py
+++ b/microqa/ocr/tesseract.py
@ -1,16 +1,13 @@
 import pandas as pd
 import pytesseract
 from PIL import Image
 from . import OcrEngine, OcrResult, TextBlock
-class OcrEngine:
+
-    def process(image: Image, languages: list[str]) -> tuple[pd.DataFrame, dict]:
+class TesseractOcrEngine(OcrEngine):
    def process(self, image: Image.Image) -> OcrResult:
        """
-        Use `pytesseract` to parse an image to a `DataFrame` with columns
+        Use `pytesseract` to parse an image.
        `["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
        pixels measured from the top left corner of the image. `x1` and `y1`
        values will be greater than or equal to the corresponding `x0` and `y0`
        values.
        Note: Each Tesseract command runs single-threaded, so speed can be
        improved up to ~4x by distributing pages across processes running in
@ -22,34 +19,97 @@ class OcrEngine:
        Params:
            image   PIL image data.
            languages   List of ISO-639-3 language codes fed to the OCR backend.
        """
        blocks_best = []
        angle_best = None
        angles = [0, 90, 180, 270] if self._detect_angle else [0]
        for angle in angles:
            # Rotate the image counter-clockwise, since we care about
            # keeping track of the angle from the upright position *to*
            # the original position, not *from*.
            rotated_image = image.rotate(360 - angle, expand=True)
            df = pytesseract.image_to_data(
-            image,
+                rotated_image,
-            lang="+".join(languages),
+                lang="+".join(self._languages),
                config="--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0",
                output_type=pytesseract.Output.DATAFRAME,
-        )
+            ).fillna({"text": ""})
-        # Exclude words with relatively low confidence ratings.
+            # Exclude blocks with relatively low confidence ratings.
            df = df[df["conf"] > 80]
-        # Attempt to exclude words that seem vertically oriented.
+            # Exclude empty words
            df = df[df["text"] != ""]
            # Attempt to exclude blocks that seem vertically oriented.
            # TODO: Will this work for non-Latin scripts? Probably not all.
            df = df[(df["width"] / df["height"]) > 0.8]
-        return (
+            print(
-            pd.DataFrame(
+                [
-                {
+                    TextBlock(
-                    "text": df["text"],
+                        # Rotate X and Y coordinates back to match the original image.
-                    "x0": df["left"],
+                        *_box_after_rotation(
-                    "y0": df["top"],
+                            int(row["left"]),
-                    "x1": df["left"] + df["width"],
+                            int(row["top"]),
-                    "y1": df["top"] + df["height"],
+                            int(row["left"] + row["width"]),
-                }
+                            int(row["top"] + row["height"]),
                            *rotated_image.size,
                            angle,
                        ),
-            # We don't use any page-level metadata from the Tesseract output.
+                        text=row["text"],
            {},
                    )
                    for _, row in df.iterrows()
                ]
            )
            if angle_best is None or df.shape[0] > len(blocks_best):
                angle_best = angle
                blocks_best = [
                    TextBlock(
                        # Rotate X and Y coordinates back to match the original image.
                        *_box_after_rotation(
                            int(row["left"]),
                            int(row["top"]),
                            int(row["left"] + row["width"]),
                            int(row["top"] + row["height"]),
                            *rotated_image.size,
                            angle,
                        ),
                        text=row["text"],
                    )
                    for _, row in df.iterrows()
                ]
        return OcrResult(
            blocks=blocks_best, page_angle=angle_best if self._detect_angle else None
        )
 def _box_after_rotation(
    x0: int,
    y0: int,
    x1: int,
    y1: int,
    image_width: int,
    image_height: int,
    degrees_clockwise: int,
 ) -> tuple[int, int, int, int]:
    """
    Given the corners of a box in an image, returns the corners of an equivalent
    box if the image is rotated by some multiple of 90 degrees. Both input and
    output coordinates are expected to be top left followed by bottom right,
    where the origin is at the top left.
    """
    angle = ((degrees_clockwise % 360) + 360) % 360
    if angle == 0:
        return x0, y0, x1, y1
    if angle == 90:
        return image_height - y1, x0, image_height - y0, x1
    if angle == 180:
        return image_width - x1, image_height - y1, image_width - x0, image_height - y0
    if angle == 270:
        return y0, image_width - x1, y1, image_width - x0
    else:
        raise Exception("_box_after_rotation() only accepts multiples of 90 degrees")
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,10 +6,12 @@ readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
    "numpy>=2.3.2",
-    "paddleocr>=3.2.0",
+    "paddleocr>=3.3.0",
-    "paddlepaddle>=3.2.0",
+    "paddlepaddle>=3.2.2",
    "pandas>=2.3.1",
    "pillow>=11.3.0",
    "psycopg[binary]>=3.2.12",
    "pymupdf>=1.26.6",
    "pytesseract>=0.3.13",
    "requests>=2.32.4",
 ]
--- a/uv.lock
+++ b/uv.lock