"""Page-quality analysis engine: per-page blankness, sharpness, orientation,
and text-margin metrics for archive documents."""
import re
from sys import stdout
from typing import Optional
import numpy as np
from PIL import Image
from .items import ArchiveDoc, ArchiveLeaf
from .ocr import OcrEngine, OcrResult, TextBlock
def analyze_doc(
    doc: ArchiveDoc,
    ocr_engine: OcrEngine,
    use_cache: bool = False,
    verbose: bool = False,
):
    """
    Analyzes all pages in an ArchiveDoc for useful metrics such as sharpness,
    orientation, presence of text overflows, and so on.

    Params:
        doc         Document whose leaves (pages) will be analyzed.
        ocr_engine  Engine used whenever OCR must be (re)computed.
        use_cache   Forwarded to `doc.fetch_leaves`.
        verbose     If `True`, progress messages are printed to stdout.

    Returns:
        Dict with key "pages": one metrics dict per leaf, with keys
        "is_blank", "page_angle", "size_analyzed", "sharpness", and
        "text_margin_px".
    """
    if verbose:
        print(f"Loading {doc.name}...")
        stdout.flush()

    all_leaves = doc.fetch_leaves(use_cache=use_cache)

    if verbose:
        print(f"Processing {len(all_leaves)} pages...", file=stdout)
        stdout.flush()

    analyzed_pages = []
    for leaf in all_leaves:
        im_normalized, is_blank = normalize_contrast_for_text(leaf.image)
        if is_blank:
            # Blank pages get placeholder metrics; sharpness/margins are
            # meaningless without content.
            analyzed_pages.append(
                {
                    "is_blank": True,
                    "page_angle": 0,
                    "size_analyzed": leaf.image.size,
                    "sharpness": None,
                    "text_margin_px": None,
                }
            )
        else:
            sharpness = analyze_sharpness(
                # Exclude edges, which typically include the page border.
                im_normalized.crop(
                    (
                        im_normalized.size[0] * 0.15,
                        im_normalized.size[1] * 0.15,
                        im_normalized.size[0] * 0.85,
                        im_normalized.size[1] * 0.85,
                    )
                )
            )
            ocr_result, leaf = compute_ocr(leaf, im_normalized, ocr_engine)
            page_angle = 0 if ocr_result is None else ocr_result.page_angle
            text_margin_px = compute_text_margin_px(
                im_normalized.size, leaf.text_blocks
            )
            # If OCR turns up issues based on the PDF's original text boxes,
            # re-run it ourselves to help weed out false positives.
            CLIPPING_THRESHOLD_PX = 30
            ROT_THRESHOLD_DEG = 30
            # NOTE(review): when `ocr_result is None`, `page_angle` is always
            # 0 here, so the rotation clause below can never trigger a re-run
            # on its own — confirm whether precomputed PDF data was meant to
            # supply an angle.
            if ocr_result is None and (
                ROT_THRESHOLD_DEG < page_angle < 360 - ROT_THRESHOLD_DEG
                # `compute_text_margin_px` returns None when no substantive
                # text was found; comparing None against an int raises
                # TypeError, so guard explicitly (no margin -> no re-run).
                or (
                    text_margin_px is not None
                    and text_margin_px < CLIPPING_THRESHOLD_PX
                )
            ):
                ocr_result, leaf = compute_ocr(
                    leaf, im_normalized, ocr_engine, force_recompute=True
                )
                assert ocr_result is not None, (
                    "compute_ocr(..., force_recompute=True) should always return an OcrResult"
                )
                page_angle = ocr_result.page_angle
                text_margin_px = compute_text_margin_px(
                    im_normalized.size, leaf.text_blocks
                )
            assert page_angle is not None, (
                "OCR engine should be running with page orientation detection"
            )
            analyzed_pages.append(
                {
                    "is_blank": False,
                    "page_angle": page_angle,
                    "size_analyzed": leaf.image.size,
                    "sharpness": sharpness,
                    "text_margin_px": text_margin_px,
                }
            )

    return {"pages": analyzed_pages}
def normalize_contrast_for_text(im: Image.Image) -> tuple[Image.Image, bool]:
    """
    Most of the pages being analyzed, and virtually all of the pages we care
    about for the purposes of QA, primarily contain text on a contrasting
    background. We can therefore typically assume that it is reasonable to boost
    contrast so that the lightest pixels are nearly white and the darkest pixels
    are nearly black. This can help make analysis more consistent across leaves
    with varying contrast ratios due to varied scanner settings, contrast ratios
    of the original documents, or weathering/fading of the physical fiche.

    Processed leaves usually contain some amount of margin around the edges
    where the backlight of the scanning rig is visible through the unexposed
    region of the negative, so contrast detection is heavily center-weighted.

    Params:
        im  Scan image as a PIL `Image`. (Pixels are converted to a numpy
            array internally; assumes a single-channel/grayscale scan —
            TODO confirm against callers.)

    Returns:
        (normalized_image, is_blank) — if the page appears blank, the
        original image is returned unmodified and `is_blank` is `True`.
    """
    pixel_values = np.asarray(
        # Exclude edges, which typically include the page border.
        im.crop(
            (
                im.size[0] * 0.15,
                im.size[1] * 0.15,
                im.size[0] * 0.85,
                im.size[1] * 0.85,
            )
        )
    )
    # To avoid extreme outliers, use quantiles instead of absolute extrema.
    extrema = (np.quantile(pixel_values, 0.002), np.quantile(pixel_values, 0.998))
    if extrema[1] - extrema[0] < 64:
        # Assume there is essentially no content here and return the original.
        return im, True
    # Apply a rudimentary tone curve to the image, with the goal that the
    # extrema we just calculated will evaluate to values "pretty close to" 0%
    # and 100% of the available range.
    return im.point(
        lambda x: np.interp(x, (0, extrema[0], extrema[1], 255), (0, 8, 247, 255))
    ), False
def analyze_sharpness(im: Image.Image) -> float:
    """
    Attempts to quantify the sharpness of an image, on a scale of 0 to 1.
    """
    # Inferring sharpness from the peak intensity of an edge-detection or
    # high-pass filter tends to give inconsistent baselines across documents;
    # a direct gradient computation has worked much better for us.
    # Based on https://stackoverflow.com/a/26014796.
    pixels = np.asarray(im)
    grad_y, grad_x = np.gradient(pixels)
    magnitude = np.hypot(grad_x, grad_y)
    score = np.quantile(magnitude, 0.99) / 255
    return float(np.clip(score, 0, 1))
def compute_ocr(
    leaf: ArchiveLeaf,
    im_normalized: Image.Image,
    ocr_engine: OcrEngine,
    force_recompute: bool = False,
) -> tuple[Optional[OcrResult], ArchiveLeaf]:
    """
    OCR is computationally expensive, so we try to take advantage of
    the Tesseract data already parsed by the Internet Archive and
    embedded in the PDF, when possible. If there is not sufficient
    text in the PDF to be confident that the Archive's OCR
    postprocessing captured it all, then OCR is recomputed locally.

    In some instances, the Archive's OCR detects rotated text but
    parses it as gibberish. To partially mitigate this, we ignore all
    precomputed text blocks with a "portrait" aspect ratio. This will
    not necessarily help with text that is rotated 180 degrees, and will
    not work well with non-latin scripts that are intended to be oriented
    vertically.

    Params:
        leaf             Information for the document page.
        im_normalized    Contrast-normalized image.
        ocr_engine       Engine to use as needed for OCR.
        force_recompute  If `True`, OCR is re-run even if there is already
                         text data associated with the leaf.

    Returns:
        Tuple of `OcrResult` and `ArchiveLeaf` if OCR was recomputed; otherwise
        tuple of `None` and `ArchiveLeaf` if existing text data was reused.
    """
    if not force_recompute:
        PDF_OCR_THRESHOLD_WORDS = 30
        # Count words only in "landscape" blocks; portrait blocks are often
        # rotated-text gibberish (see docstring).
        pdf_word_count = sum(
            len(block.text.split())
            for block in leaf.text_blocks
            if block.x1 - block.x0 > block.y1 - block.y0
        )
        if pdf_word_count >= PDF_OCR_THRESHOLD_WORDS:
            return None, leaf
    ocr_result = ocr_engine.process(im_normalized)
    # Build a fresh leaf whose text blocks come from our own OCR pass,
    # with coordinates coerced to int for downstream arithmetic.
    return ocr_result, ArchiveLeaf(
        image=leaf.image,
        page_number=leaf.page_number,
        text_blocks=[
            TextBlock(
                x0=int(block.x0),
                y0=int(block.y0),
                x1=int(block.x1),
                y1=int(block.y1),
                text=block.text,
            )
            for block in ocr_result.blocks
        ],
    )
def compute_text_margin_px(
    im_size: tuple[int, int], text_blocks: list[TextBlock]
) -> Optional[int]:
    """
    Infer the margins between OCR'ed text blocks and the edge of their page's
    bounds. This helps to detect if adjacent content may be cropped out.

    Params:
        im_size      Dimensions of the page image, in pixels.
        text_blocks  List of text blocks detected by OCR.

    Returns:
        Integer pixel count if text is present; otherwise `None`.
    """
    # Per block: distances from its bounding box to each of the four edges.
    block_margins = [
        np.array(
            [
                block.x0,
                block.y0,
                im_size[0] - block.x1,
                im_size[1] - block.y1,
            ]
        )
        for block in text_blocks
        # Exclude text without clear alphanumeric substance.
        if len(re.sub(r"[^a-zA-Z0-9]", "", block.text)) > 1
    ]
    # Use `np.concatenate` rather than `np.concat`: the latter is an alias
    # introduced in NumPy 2.0 and is unavailable on older versions.
    word_margins_all_directions = (
        np.sort(np.concatenate(block_margins).astype(np.int_))
        if len(block_margins) > 0
        else np.array([])
    )
    # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
    SKIP_WORDS = 2
    return (
        int(word_margins_all_directions[SKIP_WORDS])
        if word_margins_all_directions.shape[0] > SKIP_WORDS
        else None
    )