MicroQA/microqa/engine.py
2026-01-15 21:31:55 +00:00

188 lines
6.5 KiB
Python

from sys import stdout
import numpy as np
from PIL import Image, ImageChops, ImageFilter
from .items import ArchiveDoc, ArchiveLeaf
from .ocr import OcrEngine, TextBlock
# Minimum number of precomputed ("landscape") words required to trust the
# Archive's embedded OCR instead of recomputing locally.
OCR_RECOMPUTE_THRESHOLD_WORDS = 30
# Factor by which pages are scaled before local OCR. Block coordinates are
# mapped back into the unscaled image's space afterwards.
OCR_SCALE = 1
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
SKIP_WORDS = 2


def analyze_doc(
    doc: ArchiveDoc,
    ocr_engine: OcrEngine,
    use_cache: bool = False,
    verbose: bool = False,
):
    """
    Analyzes all pages in an ArchiveDoc for useful metrics such as sharpness,
    orientation, presence of text overflows, and so on.

    Params:
        doc         Document whose leaves (pages) will be fetched and analyzed.
        ocr_engine  Engine used to recompute OCR when the precomputed text
                    embedded in the document is insufficient.
        use_cache   Forwarded to ``doc.fetch_leaves``.
        verbose     Print progress messages to stdout.

    Returns:
        ``{"pages": [...]}`` with one dict per page containing "is_blank",
        "page_angle", "size_analyzed", "sharpness", and "text_margin_px"
        (-1 when too little text was found to measure a margin).

    Raises:
        AssertionError: if the OCR engine reports no page angle (i.e. it is
            not running with orientation detection enabled).
    """
    if verbose:
        print(f"Loading {doc.name}...", file=stdout)
        stdout.flush()
    all_leaves = doc.fetch_leaves(use_cache=use_cache)
    if verbose:
        print(f"Processing {len(all_leaves)} pages...", file=stdout)
        stdout.flush()
    analyzed_pages = []
    for leaf in all_leaves:
        im, is_blank = normalize_contrast_for_text(leaf.image)
        # Sharpness is sampled from the center 80% of the page so that the
        # scanner-rig margins do not dominate the measurement.
        im_cropped = im.crop(
            (
                im.size[0] * 0.1,
                im.size[1] * 0.1,
                im.size[0] * 0.9,
                im.size[1] * 0.9,
            )
        )
        sharpness = analyze_sharpness(im_cropped)
        # OCR is computationally expensive, so we try to take advantage of
        # the Tesseract data already parsed by the Internet Archive and
        # embedded in the PDF, when possible. If there is not sufficient
        # text in the PDF to be confident that the Archive's OCR
        # postprocessing captured it all, then OCR is recomputed locally.
        if _precomputed_word_count(leaf) >= OCR_RECOMPUTE_THRESHOLD_WORDS:
            ocred_leaf = leaf
            page_angle = 0
        else:
            ocred_leaf, page_angle = _recompute_ocr(leaf, im, ocr_engine)
        text_margin_px = _text_margin_px(ocred_leaf, im.size)
        # Make sure the OCR engine is running with orientation detection.
        # (Explicit raise instead of a bare `assert` so the check survives
        # running under `python -O`.)
        if page_angle is None:
            raise AssertionError("OCR engine did not report a page angle")
        analyzed_pages.append(
            {
                "is_blank": is_blank,
                "page_angle": page_angle,
                "size_analyzed": im.size,
                "sharpness": sharpness,
                "text_margin_px": text_margin_px,
            }
        )
    return {"pages": analyzed_pages}


def _precomputed_word_count(leaf: ArchiveLeaf) -> int:
    """
    Count the words in the leaf's precomputed "landscape" text blocks.

    In some instances, the Archive's OCR detects rotated text but parses it
    as gibberish. To partially mitigate this, we ignore all precomputed text
    blocks with a "portrait" aspect ratio. This will not necessarily help
    with text that is rotated 180 degrees, but in practice that case is
    rarely encountered. This will also not work well with non-latin scripts
    that are intended to be oriented vertically.
    """
    return sum(
        len(block.text.split())
        for block in leaf.text_blocks
        if block.x1 - block.x0 > block.y1 - block.y0
    )


def _recompute_ocr(leaf: ArchiveLeaf, im, ocr_engine: OcrEngine):
    """
    Run local OCR on ``im`` and return ``(ocred_leaf, page_angle)``.

    The image is scaled by OCR_SCALE before processing, and the detected
    text-block coordinates are mapped back into the unscaled image's space.
    """
    # Pillow's resize expects a plain (width, height) int tuple; int()
    # truncates the same way the previous np.int_ conversion did.
    scaled_size = (int(im.size[0] * OCR_SCALE), int(im.size[1] * OCR_SCALE))
    ocr_result = ocr_engine.process(im.resize(scaled_size))
    ocred_leaf = ArchiveLeaf(
        image=im,
        page_number=leaf.page_number,
        text_blocks=[
            TextBlock(
                x0=int(block.x0 / OCR_SCALE),
                y0=int(block.y0 / OCR_SCALE),
                x1=int(block.x1 / OCR_SCALE),
                y1=int(block.y1 / OCR_SCALE),
                text=block.text,
            )
            for block in ocr_result.blocks
        ],
    )
    return ocred_leaf, ocr_result.page_angle


def _text_margin_px(leaf: ArchiveLeaf, size) -> int:
    """
    Distance in pixels from detected text to the nearest page edge.

    The SKIP_WORDS smallest margins are skipped to help ignore stray OCR
    artifacts. Returns -1 when there are too few text blocks to measure.
    """
    if not leaf.text_blocks:
        return -1
    # For every block, record its margin to each of the four page edges,
    # then sort all margins together.
    margins = np.sort(
        # np.concatenate rather than np.concat: the latter is a NumPy >= 2.0
        # alias only, while concatenate works on every supported version.
        np.concatenate(
            [
                np.array(
                    [
                        block.x0,
                        block.y0,
                        size[0] - block.x1,
                        size[1] - block.y1,
                    ]
                )
                for block in leaf.text_blocks
            ]
        ).astype(np.int_)
    )
    if margins.shape[0] > SKIP_WORDS:
        return int(margins[SKIP_WORDS])
    return -1
def normalize_contrast_for_text(im: Image.Image) -> tuple[Image.Image, bool]:
    """
    Most of the pages being analyzed, and virtually all of the pages we care
    about for the purposes of QA, primarily contain text on a contrasting
    background. We can therefore typically assume that it is reasonable to boost
    contrast so that the lightest pixels are nearly white and the darkest pixels
    are nearly black. This can help make analysis more consistent across leaves
    with varying contrast ratios due to varied scanner settings, contrast ratios
    of the original documents, or weathering/fading of the physical fiche.

    Processed leaves usually contain some amount of margin around the edges
    where the backlight of the scanning rig is visible through the unexposed
    region of the negative, so contrast detection is heavily center-weighted.

    Params:
        im  Scan image as a PIL ``Image`` (not a numpy array; the body calls
            ``im.crop``/``im.point``). Pixel values are assumed to be 8-bit
            (0-255).

    Returns:
        (normalized_image, is_blank). When the center of the page shows almost
        no dynamic range the page is assumed blank and ``im`` is returned
        unmodified with ``is_blank`` set to True.
    """
    # Sample only the center 80% of the page; the edges are dominated by
    # scanner backlight rather than content.
    pixel_values = np.asarray(
        im.crop(
            (
                im.size[0] * 0.1,
                im.size[1] * 0.1,
                im.size[0] * 0.9,
                im.size[1] * 0.9,
            )
        )
    )
    # To avoid extreme outliers, use quantiles instead of absolute extrema.
    dark, light = np.quantile(pixel_values, 0.002), np.quantile(pixel_values, 0.998)
    if light - dark < 64:
        # Assume there is essentially no content here and return the original.
        return im, True
    # Apply a rudimentary tone curve to the image, with the goal that the
    # extrema we just calculated will evaluate to values "pretty close to" 0%
    # and 100% of the available range.
    return im.point(
        lambda x: np.interp(x, (0, dark, light, 255), (0, 8, 247, 255))
    ), False
def analyze_sharpness(im: Image.Image):
    """
    Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
    1, by measuring peak intensity of a high-pass filter.
    """
    # High-pass = original minus a heavily blurred (low-pass) copy; only fine
    # detail such as character edges survives the subtraction.
    low_pass = im.filter(ImageFilter.GaussianBlur(8))
    high_pass = ImageChops.difference(im, low_pass)
    # Use the 99.9th percentile rather than the true maximum to ignore stray
    # outlier pixels, and normalize by the 8-bit full-scale value.
    return np.quantile(high_pass, 0.999) / 255