from sys import stdout
|
2025-10-04 15:09:16 -07:00
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
from PIL import Image, ImageFilter
|
|
|
|
|
|
2025-12-20 02:16:41 +00:00
|
|
|
from .items import ArchiveDoc, ArchiveLeaf
|
|
|
|
|
from .ocr import OcrEngine, TextBlock


def analyze_doc(
    doc: ArchiveDoc,
    ocr_engine: OcrEngine,
    use_cache: bool = False,
    verbose: bool = False,
):
    """
    Analyzes all pages in an ArchiveDoc for useful metrics such as sharpness,
    orientation, presence of text overflows, and so on.

    Args:
        doc: The document whose pages (leaves) will be analyzed.
        ocr_engine: Engine used to recompute OCR when the text embedded in
            the PDF is too sparse to be trusted.
        use_cache: Passed through to ``doc.fetch_leaves``; when True,
            previously fetched leaves may be reused.
        verbose: When True, progress messages are printed to stdout.

    Returns:
        A dict with one key, ``"pages"``: a list containing, per page, a dict
        with ``"blank"``, ``"page_angle"``, ``"size_analyzed"``,
        ``"sharpness"``, and ``"text_margin_px"``.
    """
    if verbose:
        print(f"Loading {doc.name}...")
        stdout.flush()

    all_leaves = doc.fetch_leaves(use_cache=use_cache)

    if verbose:
        print(f"Processing {len(all_leaves)} pages...", file=stdout)
        stdout.flush()

    analyzed_pages = []
    for leaf in all_leaves:
        # Ignore a 10% border on every side so scanner edges and page
        # curvature do not pollute the metrics.
        im_cropped = leaf.image.crop(
            (
                leaf.image.size[0] * 0.1,
                leaf.image.size[1] * 0.1,
                leaf.image.size[0] * 0.9,
                leaf.image.size[1] * 0.9,
            )
        )

        # A page is considered blank when even its darkest pixel is brighter
        # than 80% of full white. NOTE(review): getextrema()[0] is the band
        # minimum only for single-band images; presumably leaves are
        # grayscale -- confirm against the fetch pipeline.
        is_blank = im_cropped.getextrema()[0] > 255 * 0.8

        if is_blank:
            # Blank pages get sentinel metrics and skip the expensive work.
            max_sharpness = 1.0
            text_margin_px = -1
            page_angle = 0
        else:
            # Sharpness is determined by percentile of pixels that match some
            # criteria, so it may vary significantly depending on which portion
            # of the image is analyzed. In an effort to identify the sharpest
            # edges, we split up the image into chunks and assume that the
            # highest sharpness value obtained across all chunks is
            # representative of the image as a whole.
            max_sharpness = 0.0
            if im_cropped.size[0] < im_cropped.size[1]:
                # Page is in portrait orientation.
                segments_x = 2
                segments_y = 3
            else:
                # Page is in landscape orientation.
                segments_x = 3
                segments_y = 2
            for i in range(segments_x):
                for j in range(segments_y):
                    max_sharpness = max(
                        max_sharpness,
                        analyze_sharpness(
                            im_cropped.crop(
                                (
                                    im_cropped.size[0] / segments_x * i,
                                    im_cropped.size[1] / segments_y * j,
                                    im_cropped.size[0] / segments_x * (i + 1),
                                    im_cropped.size[1] / segments_y * (j + 1),
                                )
                            )
                        ),
                    )

            # OCR is computationally expensive, so we try to take advantage of
            # the Tesseract data already parsed by the Internet Archive and
            # embedded in the PDF, when possible. If there is not sufficient
            # text in the PDF to be confident that the Archive's OCR
            # postprocessing captured it all, then OCR is recomputed locally.
            #
            # In some instances, the Archive's OCR detects rotated text but
            # parses it as gibberish. To partially mitigate this, we ignore all
            # precomputed text blocks with a "portrait" aspect ratio. This will
            # not necessarily help with text that is rotated 180 degrees, but in
            # practice that case is rarely encountered. This will also not work
            # well with non-latin scripts that are intended to be oriented
            # vertically.
            OCR_RECOMPUTE_THRESHOLD_WORDS = 30
            if (
                sum(
                    len(block.text.split())
                    for block in leaf.text_blocks
                    if block.x1 - block.x0 > block.y1 - block.y0
                )
                >= OCR_RECOMPUTE_THRESHOLD_WORDS
            ):
                if verbose:
                    print("Using PDF text.")
                ocred_leaf = leaf
                page_angle = 0
            else:
                if verbose:
                    print("Using OCR.")
                # Scale factor applied to the image before OCR; block
                # coordinates are mapped back to original-image space below.
                OCR_SCALE = 1
                im_scaled = leaf.image.resize(
                    np.int_(np.array(leaf.image.size) * OCR_SCALE)
                )
                ocr_result = ocr_engine.process(im_scaled)
                ocred_leaf = ArchiveLeaf(
                    image=leaf.image,
                    page_number=leaf.page_number,
                    text_blocks=[
                        TextBlock(
                            x0=int(block.x0 / OCR_SCALE),
                            y0=int(block.y0 / OCR_SCALE),
                            x1=int(block.x1 / OCR_SCALE),
                            y1=int(block.y1 / OCR_SCALE),
                            text=block.text,
                        )
                        for block in ocr_result.blocks
                    ],
                )
                page_angle = ocr_result.page_angle

            # Distance from every text block to each of the four page edges,
            # sorted ascending.
            if ocred_leaf.text_blocks:
                word_margins_all_directions = np.sort(
                    np.int_(
                        # BUGFIX: was np.concat, which only exists in
                        # numpy >= 2.0; np.concatenate is the portable name.
                        np.concatenate(
                            [
                                np.array(
                                    [
                                        block.x0,
                                        block.y0,
                                        leaf.image.size[0] - block.x1,
                                        leaf.image.size[1] - block.y1,
                                    ]
                                )
                                for block in ocred_leaf.text_blocks
                            ]
                        )
                    )
                )
            else:
                # BUGFIX: np.concatenate raises ValueError on an empty list;
                # a page with no recognized text has no measurable margin and
                # falls through to the -1 sentinel below.
                word_margins_all_directions = np.array([], dtype=int)

            # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
            SKIP_WORDS = 2
            text_margin_px = int(
                word_margins_all_directions[SKIP_WORDS]
                if word_margins_all_directions.shape[0] > SKIP_WORDS
                else -1
            )

        # Make sure the OCR engine is running with orientation detection.
        assert page_angle is not None

        analyzed_pages.append(
            {
                "blank": is_blank,
                "page_angle": page_angle,
                "size_analyzed": leaf.image.size,
                "sharpness": max_sharpness,
                "text_margin_px": text_margin_px,
            }
        )

    return {"pages": analyzed_pages}


def analyze_sharpness(im: Image.Image) -> float:
    """
    Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
    1. The scale is not linear with respect to scan quality: anything above 0.1
    is usually fine.

    Args:
        im: Image to analyze. Presumably a single-band (grayscale) page crop
            -- TODO confirm; multi-band images are filtered per band.

    Returns:
        Median edge intensity in [0.0, 1.0]; 0.0 when no pixel clears the
        edge-detection threshold (e.g. a uniform image).
    """
    arr = np.asarray(im)

    # Normalize contrast based on brightest and darkest pixels. For example,
    # NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80% fall
    # between 10% brightness and 90% brightness. In practice, a value around
    # 0.02 seems to work fairly well.
    NORM_QUANTILE = 0.03
    pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
        arr, NORM_QUANTILE
    )
    if pixel_range == 0:
        # Degenerate (near-uniform) image: skip the rescale to avoid
        # dividing by zero; the shift below still applies.
        arr_normalized = arr
    else:
        arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
    arr_normalized = (
        arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE
    )
    arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255)

    # "Sharpness" is determined by measuring the median intensity of pixels
    # near edges, after an edge detection filter has been applied to the image.
    edges_arr = np.asarray(
        Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
    )
    EDGE_THRESHOLD = 8
    edge_pixels = edges_arr[edges_arr > EDGE_THRESHOLD]
    if edge_pixels.size == 0:
        # BUGFIX: np.median of an empty selection is NaN (and emits a
        # RuntimeWarning). An image with no detectable edges is simply
        # maximally unsharp.
        return 0.0
    return float(np.median(edge_pixels) / 255)
|