re-run ocr as needed to screen false positives

2026-01-15 21:33:57 +00:00 · 2026-01-15 21:33:57 +00:00 · f0a4ef253b
commit f0a4ef253b
parent 9a94719dc1
3 changed files with 194 additions and 100 deletions
--- a/diagnostics.py
+++ b/diagnostics.py
@ -62,18 +62,33 @@ def main():
                    [
                        i
                        for i, page in enumerate(doc["pages"])
-                        if 45 < page["page_angle"] < 315
+                        if 30 < page["page_angle"] < 330
                    ]
                    for doc in analyses
                ],
                "sharpness_max": max(
-                    *[page["sharpness"] for doc in analyses for page in doc["pages"]]
+                    [
                        page["sharpness"]
                        for doc in analyses
                        for page in doc["pages"]
                        if page["sharpness"] is not None
                    ]
                ),
                "sharpness_median": np.median(
-                    [page["sharpness"] for doc in analyses for page in doc["pages"]]
+                    [
                        page["sharpness"]
                        for doc in analyses
                        for page in doc["pages"]
                        if page["sharpness"] is not None
                    ]
                ).tolist(),
                "sharpness_min": min(
-                    *[page["sharpness"] for doc in analyses for page in doc["pages"]]
+                    [
                        page["sharpness"]
                        for doc in analyses
                        for page in doc["pages"]
                        if page["sharpness"] is not None
                    ]
                ),
            }
        )
--- a/main.py
+++ b/main.py
@ -275,7 +275,7 @@ insert into phono.items (ia_id, review_date, oai_updatedate, url)
                [
                    doc["identifier"],
                    doc.get("review_date"),
-                    max(*[datetime.fromisoformat(t) for t in doc["oai_updatedate"]]),
+                    max([datetime.fromisoformat(t) for t in doc["oai_updatedate"]]),
                ]
                for doc in batch
            ],
--- a/microqa/engine.py
+++ b/microqa/engine.py
@ -1,10 +1,11 @@
 from sys import stdout
 from typing import Optional
 import numpy as np
 from PIL import Image
 from .items import ArchiveDoc, ArchiveLeaf
-from .ocr import OcrEngine, TextBlock
+from .ocr import OcrEngine, OcrResult, TextBlock
 def analyze_doc(
@ -30,103 +31,68 @@ def analyze_doc(
    analyzed_pages = []
    for leaf in all_leaves:
-        im, is_blank = normalize_contrast_for_text(leaf.image)
+        im_normalized, is_blank = normalize_contrast_for_text(leaf.image)
-
+        if is_blank:
-        im_cropped = im.crop(
+            analyzed_pages.append(
-            (
+                {
-                im.size[0] * 0.1,
+                    "is_blank": True,
-                im.size[1] * 0.1,
+                    "page_angle": 0,
-                im.size[0] * 0.9,
+                    "size_analyzed": leaf.image.size,
-                im.size[1] * 0.9,
+                    "sharpness": None,
                    "text_margin_px": None,
                }
            )
-        )
+        else:
-        sharpness = analyze_sharpness(im_cropped)
+            sharpness = analyze_sharpness(
-
+                # Exclude edges, which typically include the page border.
-        # OCR is computationally expensive, so we try to take advantage of
+                im_normalized.crop(
-        # the Tesseract data already parsed by the Internet Archive and
+                    (
-        # embedded in the PDF, when possible. If there is not sufficient
+                        im_normalized.size[0] * 0.15,
-        # text in the PDF to be confident that the Archive's OCR
+                        im_normalized.size[1] * 0.15,
-        # postprocessing captured it all, then OCR is recomputed locally.
+                        im_normalized.size[0] * 0.85,
-        #
+                        im_normalized.size[1] * 0.85,
-        # In some instances, the Archive's OCR detects rotated text but
+                    )
        # parses it as gibberish. To partially mitigate this, we ignore all
        # precomputed text blocks with a "portrait" aspect ratio. This will
        # not necessarily help with text that is rotated 180 degrees, but in
        # practice that case is rarely encountered. This will also not work
        # well with non-latin scripts that are intended to be oriented
        # vertically.
        OCR_RECOMPUTE_THRESHOLD_WORDS = 30
        if (
            sum(
                (
                    len(block.text.split())
                    for block in leaf.text_blocks
                    if block.x1 - block.x0 > block.y1 - block.y0
                )
            )
-            >= OCR_RECOMPUTE_THRESHOLD_WORDS
+
-        ):
+            ocr_result, leaf = compute_ocr(leaf, im_normalized, ocr_engine)
-            ocred_leaf = leaf
+            page_angle = 0 if ocr_result is None else ocr_result.page_angle
-            page_angle = 0
+            text_margin_px = compute_text_margin_px(
-        else:
+                im_normalized.size, leaf.text_blocks
            OCR_SCALE = 1
            im_scaled = im.resize(np.int_(np.array(im.size) * OCR_SCALE))
            ocr_result = ocr_engine.process(im_scaled)
            ocred_leaf = ArchiveLeaf(
                image=im,
                page_number=leaf.page_number,
                text_blocks=[
                    TextBlock(
                        x0=int(block.x0 / OCR_SCALE),
                        y0=int(block.y0 / OCR_SCALE),
                        x1=int(block.x1 / OCR_SCALE),
                        y1=int(block.y1 / OCR_SCALE),
                        text=block.text,
                    )
                    for block in ocr_result.blocks
                ],
            )
            page_angle = ocr_result.page_angle
-        word_margins_all_directions = (
+            # If OCR turns up issues based on the PDF's original text boxes,
-            np.sort(
+            # re-run it ourselves to help weed out false positives.
-                np.concat(
+            CLIPPING_THRESHOLD_PX = 30
-                    [
+            ROT_THRESHOLD_DEG = 30
-                        np.array(
+            if ocr_result is None and (
-                            [
+                ROT_THRESHOLD_DEG < page_angle < 360 - ROT_THRESHOLD_DEG
-                                block.x0,
+                or text_margin_px < CLIPPING_THRESHOLD_PX
-                                block.y0,
+            ):
-                                im.size[0] - block.x1,
+                ocr_result, leaf = compute_ocr(
-                                im.size[1] - block.y1,
+                    leaf, im_normalized, ocr_engine, force_recompute=True
-                            ]
+                )
-                        )
+                assert ocr_result is not None, (
-                        for block in ocred_leaf.text_blocks
+                    "compute_ocr(..., force_recompute=True) should always return an OcrResult"
-                    ]
+                )
-                ).astype(np.int_)
+                page_angle = ocr_result.page_angle
                text_margin_px = compute_text_margin_px(
                    im_normalized.size, leaf.text_blocks
                )
            assert page_angle is not None, (
                "OCR engine should be running with page orientation detection"
            )
            if len(ocred_leaf.text_blocks) > 0
            else np.array([])
        )
        # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
        SKIP_WORDS = 2
        text_margin_px = int(
            word_margins_all_directions[SKIP_WORDS]
            if word_margins_all_directions.shape[0] > SKIP_WORDS
            else -1
        )
-        # Make sure the OCR engine is running with orientation detection.
+            analyzed_pages.append(
-        assert page_angle is not None
+                {
-
+                    "is_blank": False,
-        analyzed_pages.append(
+                    "page_angle": page_angle,
-            {
+                    "size_analyzed": leaf.image.size,
-                "is_blank": is_blank,
+                    "sharpness": sharpness,
-                "page_angle": page_angle,
+                    "text_margin_px": text_margin_px,
-                "size_analyzed": im.size,
+                }
-                "sharpness": sharpness,
+            )
                "text_margin_px": text_margin_px,
            }
        )
    return {"pages": analyzed_pages}
@ -154,13 +120,15 @@ def normalize_contrast_for_text(im: Image.Image) -> tuple[Image.Image, bool]:
        (normalized_image, is_blank)
    """
    pixel_values = np.asarray(
        # Exclude edges, which typically include the page border.
        im.crop(
            (
-                im.size[0] * 0.1,
+                im.size[0] * 0.15,
-                im.size[1] * 0.1,
+                im.size[1] * 0.15,
-                im.size[0] * 0.9,
+                im.size[0] * 0.85,
-                im.size[1] * 0.9,
+                im.size[1] * 0.85,
            )
        )
    )
@ -189,4 +157,115 @@ def analyze_sharpness(im: Image.Image) -> float:
    # computation based on https://stackoverflow.com/a/26014796.
    grad_y, grad_x = np.gradient(np.asarray(im))
    return float(np.clip(np.quantile(np.sqrt(grad_x**2 + grad_y**2), 0.99) / 255, 0, 1))
 def compute_ocr(
    leaf: ArchiveLeaf,
    im_normalized: Image.Image,
    ocr_engine: OcrEngine,
    force_recompute: bool = False,
 ) -> tuple[Optional[OcrResult], ArchiveLeaf]:
    """
    Reuse the text blocks already attached to a leaf, or re-run OCR locally.

    OCR is computationally expensive, so we try to take advantage of
    the Tesseract data already parsed by the Internet Archive and
    embedded in the PDF, when possible. If there is not sufficient
    text in the PDF to be confident that the Archive's OCR
    postprocessing captured it all, then OCR is recomputed locally.

    In some instances, the Archive's OCR detects rotated text but
    parses it as gibberish. To partially mitigate this, we ignore all
    precomputed text blocks with a "portrait" aspect ratio. This will
    not necessarily help with text that is rotated 180 degrees, and will
    not work well with non-latin scripts that are intended to be oriented
    vertically.

    Params:
        leaf                Information for the document page.
        im_normalized       Contrast-normalized image.
        ocr_engine          Engine to use as needed for OCR.
        force_recompute     If `True`, OCR is re-run even if there is already
                            text data associated with the leaf.

    Returns:
        Tuple of `OcrResult` and a new `ArchiveLeaf` (same image and page
        number, text blocks taken from the OCR result) if OCR was recomputed;
        otherwise tuple of `None` and the unmodified input leaf if existing
        text data was reused.
    """
    if not force_recompute:
        # Minimum number of words in "landscape" blocks for the existing
        # text data to be reused as-is.
        PDF_OCR_THRESHOLD_WORDS = 30
        pdf_word_count = sum(
            len(block.text.split())
            for block in leaf.text_blocks
            # Only blocks that are wider than they are tall are counted.
            if block.x1 - block.x0 > block.y1 - block.y0
        )
        if pdf_word_count >= PDF_OCR_THRESHOLD_WORDS:
            return None, leaf

    ocr_result = ocr_engine.process(im_normalized)
    # Block coordinates are coerced to integers before being stored on the
    # replacement leaf.
    recomputed_blocks = [
        TextBlock(
            x0=int(block.x0),
            y0=int(block.y0),
            x1=int(block.x1),
            y1=int(block.y1),
            text=block.text,
        )
        for block in ocr_result.blocks
    ]
    return ocr_result, ArchiveLeaf(
        image=leaf.image,
        page_number=leaf.page_number,
        text_blocks=recomputed_blocks,
    )
 def compute_text_margin_px(
    im_size: tuple[int, int], text_blocks: list[TextBlock]
 ) -> Optional[int]:
    """
    Infer the margins between OCR'ed text blocks and the edge of their page's
    bounds. This helps to detect if adjacent content may be cropped out.

    Every block contributes four distances: its left and top coordinates,
    plus the gaps between its right/bottom coordinates and the image width
    and height. The distances from all blocks are pooled and sorted, and the
    value at index `SKIP_WORDS` is returned, so the smallest `SKIP_WORDS`
    distances are disregarded.

    Params:
        im_size         Dimensions of the page image, in pixels, as
                        (width, height).
        text_blocks     List of text blocks detected by OCR.

    Returns:
        Integer pixel count if text is present; otherwise `None`.
    """
    # Skip the n smallest edge distances, to help ignore stray OCR artifacts.
    SKIP_WORDS = 2
    if not text_blocks:
        return None
    # `np.concatenate` is used rather than the `np.concat` alias because the
    # alias is only available in NumPy 2.0 and later.
    word_margins_all_directions = np.sort(
        np.concatenate(
            [
                np.array(
                    [
                        block.x0,
                        block.y0,
                        im_size[0] - block.x1,
                        im_size[1] - block.y1,
                    ]
                )
                for block in text_blocks
            ]
        ).astype(np.int_)
    )
    if word_margins_all_directions.shape[0] <= SKIP_WORDS:
        return None
    return int(word_margins_all_directions[SKIP_WORDS])