Re-run OCR as needed to screen false positives

This commit is contained in:
Brent Schroeter 2026-01-15 21:33:57 +00:00
parent 9a94719dc1
commit f0a4ef253b
3 changed files with 194 additions and 100 deletions

View file

@ -62,18 +62,33 @@ def main():
[
i
for i, page in enumerate(doc["pages"])
if 45 < page["page_angle"] < 315
if 30 < page["page_angle"] < 330
]
for doc in analyses
],
"sharpness_max": max(
*[page["sharpness"] for doc in analyses for page in doc["pages"]]
[
page["sharpness"]
for doc in analyses
for page in doc["pages"]
if page["sharpness"] is not None
]
),
"sharpness_median": np.median(
[page["sharpness"] for doc in analyses for page in doc["pages"]]
[
page["sharpness"]
for doc in analyses
for page in doc["pages"]
if page["sharpness"] is not None
]
).tolist(),
"sharpness_min": min(
*[page["sharpness"] for doc in analyses for page in doc["pages"]]
[
page["sharpness"]
for doc in analyses
for page in doc["pages"]
if page["sharpness"] is not None
]
),
}
)

View file

@ -275,7 +275,7 @@ insert into phono.items (ia_id, review_date, oai_updatedate, url)
[
doc["identifier"],
doc.get("review_date"),
max(*[datetime.fromisoformat(t) for t in doc["oai_updatedate"]]),
max([datetime.fromisoformat(t) for t in doc["oai_updatedate"]]),
]
for doc in batch
],

View file

@ -1,10 +1,11 @@
from sys import stdout
from typing import Optional
import numpy as np
from PIL import Image
from .items import ArchiveDoc, ArchiveLeaf
from .ocr import OcrEngine, TextBlock
from .ocr import OcrEngine, OcrResult, TextBlock
def analyze_doc(
@ -30,99 +31,64 @@ def analyze_doc(
analyzed_pages = []
for leaf in all_leaves:
im, is_blank = normalize_contrast_for_text(leaf.image)
im_cropped = im.crop(
(
im.size[0] * 0.1,
im.size[1] * 0.1,
im.size[0] * 0.9,
im.size[1] * 0.9,
im_normalized, is_blank = normalize_contrast_for_text(leaf.image)
if is_blank:
analyzed_pages.append(
{
"is_blank": True,
"page_angle": 0,
"size_analyzed": leaf.image.size,
"sharpness": None,
"text_margin_px": None,
}
)
)
sharpness = analyze_sharpness(im_cropped)
# OCR is computationally expensive, so we try to take advantage of
# the Tesseract data already parsed by the Internet Archive and
# embedded in the PDF, when possible. If there is not sufficient
# text in the PDF to be confident that the Archive's OCR
# postprocessing captured it all, then OCR is recomputed locally.
#
# In some instances, the Archive's OCR detects rotated text but
# parses it as gibberish. To partially mitigate this, we ignore all
# precomputed text blocks with a "portrait" aspect ratio. This will
# not necessarily help with text that is rotated 180 degrees, but in
# practice that case is rarely encountered. This will also not work
# well with non-latin scripts that are intended to be oriented
# vertically.
OCR_RECOMPUTE_THRESHOLD_WORDS = 30
if (
sum(
(
len(block.text.split())
for block in leaf.text_blocks
if block.x1 - block.x0 > block.y1 - block.y0
)
)
>= OCR_RECOMPUTE_THRESHOLD_WORDS
):
ocred_leaf = leaf
page_angle = 0
else:
OCR_SCALE = 1
im_scaled = im.resize(np.int_(np.array(im.size) * OCR_SCALE))
ocr_result = ocr_engine.process(im_scaled)
ocred_leaf = ArchiveLeaf(
image=im,
page_number=leaf.page_number,
text_blocks=[
TextBlock(
x0=int(block.x0 / OCR_SCALE),
y0=int(block.y0 / OCR_SCALE),
x1=int(block.x1 / OCR_SCALE),
y1=int(block.y1 / OCR_SCALE),
text=block.text,
sharpness = analyze_sharpness(
# Exclude edges, which typically include the page border.
im_normalized.crop(
(
im_normalized.size[0] * 0.15,
im_normalized.size[1] * 0.15,
im_normalized.size[0] * 0.85,
im_normalized.size[1] * 0.85,
)
for block in ocr_result.blocks
],
)
)
ocr_result, leaf = compute_ocr(leaf, im_normalized, ocr_engine)
page_angle = 0 if ocr_result is None else ocr_result.page_angle
text_margin_px = compute_text_margin_px(
im_normalized.size, leaf.text_blocks
)
# If OCR turns up issues based on the PDF's original text boxes,
# re-run it ourselves to help weed out false positives.
CLIPPING_THRESHOLD_PX = 30
ROT_THRESHOLD_DEG = 30
if ocr_result is None and (
ROT_THRESHOLD_DEG < page_angle < 360 - ROT_THRESHOLD_DEG
or text_margin_px < CLIPPING_THRESHOLD_PX
):
ocr_result, leaf = compute_ocr(
leaf, im_normalized, ocr_engine, force_recompute=True
)
assert ocr_result is not None, (
"compute_ocr(..., force_recompute=True) should always return an OcrResult"
)
page_angle = ocr_result.page_angle
word_margins_all_directions = (
np.sort(
np.concat(
[
np.array(
[
block.x0,
block.y0,
im.size[0] - block.x1,
im.size[1] - block.y1,
]
)
for block in ocred_leaf.text_blocks
]
).astype(np.int_)
)
if len(ocred_leaf.text_blocks) > 0
else np.array([])
)
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
SKIP_WORDS = 2
text_margin_px = int(
word_margins_all_directions[SKIP_WORDS]
if word_margins_all_directions.shape[0] > SKIP_WORDS
else -1
text_margin_px = compute_text_margin_px(
im_normalized.size, leaf.text_blocks
)
# Make sure the OCR engine is running with orientation detection.
assert page_angle is not None
assert page_angle is not None, (
"OCR engine should be running with page orientation detection"
)
analyzed_pages.append(
{
"is_blank": is_blank,
"is_blank": False,
"page_angle": page_angle,
"size_analyzed": im.size,
"size_analyzed": leaf.image.size,
"sharpness": sharpness,
"text_margin_px": text_margin_px,
}
@ -154,13 +120,15 @@ def normalize_contrast_for_text(im: Image.Image) -> tuple[Image.Image, bool]:
(normalized_image, is_blank)
"""
pixel_values = np.asarray(
# Exclude edges, which typically include the page border.
im.crop(
(
im.size[0] * 0.1,
im.size[1] * 0.1,
im.size[0] * 0.9,
im.size[1] * 0.9,
im.size[0] * 0.15,
im.size[1] * 0.15,
im.size[0] * 0.85,
im.size[1] * 0.85,
)
)
)
@ -189,4 +157,115 @@ def analyze_sharpness(im: Image.Image) -> float:
# computation based on https://stackoverflow.com/a/26014796.
grad_y, grad_x = np.gradient(np.asarray(im))
return float(np.clip(np.quantile(np.sqrt(grad_x**2 + grad_y**2), 0.99) / 255, 0, 1))
def compute_ocr(
    leaf: ArchiveLeaf,
    im_normalized: Image.Image,
    ocr_engine: OcrEngine,
    force_recompute: bool = False,
) -> tuple[Optional[OcrResult], ArchiveLeaf]:
    """
    Extract text blocks for a page, reusing embedded PDF text when possible.

    OCR is computationally expensive, so we try to take advantage of
    the Tesseract data already parsed by the Internet Archive and
    embedded in the PDF, when possible. If there is not sufficient
    text in the PDF to be confident that the Archive's OCR
    postprocessing captured it all, then OCR is recomputed locally.

    In some instances, the Archive's OCR detects rotated text but
    parses it as gibberish. To partially mitigate this, we ignore all
    precomputed text blocks with a "portrait" aspect ratio. This will
    not necessarily help with text that is rotated 180 degrees, and will
    not work well with non-latin scripts that are intended to be oriented
    vertically.

    Params:
        leaf                Information for the document page.
        im_normalized       Contrast-normalized image.
        ocr_engine          Engine to use as needed for OCR.
        force_recompute     If `True`, OCR is re-run even if there is already
                            text data associated with the leaf.

    Returns:
        Tuple of `OcrResult` and a new `ArchiveLeaf` if OCR was recomputed;
        otherwise tuple of `None` and the original `ArchiveLeaf` if existing
        text data was reused.
    """
    if not force_recompute:
        PDF_OCR_THRESHOLD_WORDS = 30
        # Only count words in "landscape" blocks; see the rotated-text
        # caveat in the docstring above.
        pdf_word_count = sum(
            len(block.text.split())
            for block in leaf.text_blocks
            if block.x1 - block.x0 > block.y1 - block.y0
        )
        if pdf_word_count >= PDF_OCR_THRESHOLD_WORDS:
            return None, leaf
    ocr_result = ocr_engine.process(im_normalized)
    return ocr_result, ArchiveLeaf(
        image=leaf.image,
        page_number=leaf.page_number,
        text_blocks=[
            TextBlock(
                x0=int(block.x0),
                y0=int(block.y0),
                x1=int(block.x1),
                y1=int(block.y1),
                text=block.text,
            )
            for block in ocr_result.blocks
        ],
    )
def compute_text_margin_px(
    im_size: tuple[int, int], text_blocks: list[TextBlock]
) -> Optional[int]:
    """
    Infer the margins between OCR'ed text blocks and the edge of their page's
    bounds. This helps to detect if adjacent content may be cropped out.

    Params:
        im_size         Dimensions of the page image, in pixels.
        text_blocks     List of text blocks detected by OCR.

    Returns:
        Integer pixel count if text is present; otherwise `None`.
    """
    if not text_blocks:
        return None
    # For each block, measure its distance to all four page edges.
    # np.concatenate (rather than the NumPy-2.0-only np.concat alias) keeps
    # this compatible with older NumPy releases.
    word_margins_all_directions = np.sort(
        np.concatenate(
            [
                np.array(
                    [
                        block.x0,
                        block.y0,
                        im_size[0] - block.x1,
                        im_size[1] - block.y1,
                    ]
                )
                for block in text_blocks
            ]
        ).astype(np.int_)
    )
    # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
    SKIP_WORDS = 2
    if word_margins_all_directions.shape[0] <= SKIP_WORDS:
        return None
    return int(word_margins_all_directions[SKIP_WORDS])