import re
from dataclasses import dataclass
from multiprocessing import Pool
from sys import stdout

import numpy as np
import pytesseract
from PIL import Image, ImageFilter

from archive_item import ArchiveDoc


|
def analyze_doc(
    doc: ArchiveDoc, ocr_langs="eng+fra", parallel=1, use_cache=False, verbose=False
):
    """
    Analyze every page (leaf) image of an archive document.

    Args:
        doc        Document whose leaves will be analyzed.
        ocr_langs  Tesseract language codes (3 letters each, "+"-separated).
        parallel   Number of worker processes; 1 runs pages sequentially.
        use_cache  Passed through to doc.fetch_leaves().
        verbose    Print progress messages to stdout.

    Returns:
        {"pages": [<analyze_page result dict>, ...]} in page order.
    """
    if verbose:
        print(f"Loading {doc.name}...")
        stdout.flush()

    # Fixed annotation: this is a list of tasks, not a single task.
    tasks: list[PageAnalysisTask] = [
        PageAnalysisTask(im=leaf.image, ocr_langs=ocr_langs)
        for leaf in doc.fetch_leaves(use_cache=use_cache)
    ]

    if verbose:
        print(f"Processing {len(tasks)} pages...")
        stdout.flush()

    if parallel > 1:
        # Parallelize image processing and OCR of pages across up to n cores.
        with Pool(parallel) as pool:
            return {"pages": pool.map(analyze_page, tasks)}

    return {"pages": [analyze_page(task) for task in tasks]}


@dataclass
class PageAnalysisTask:
    """
    Work unit describing one page to be processed by analyze_page().

    Attributes:
        im         PIL Image, pre-scaled using .thumbnail() to fit the long
                   edge to 3200 px.
        ocr_langs  Tesseract language codes (3 letters each, in a
                   "+"-separated list).
    """

    im: Image.Image
    ocr_langs: str = "eng+fra"


def analyze_page(task):
    """
    Analyze a single page image: blankness, sharpness, OCR orientation, and
    the margin between the recognized text and the page edge.

    Args:
        task  A PageAnalysisTask.

    Returns:
        dict with keys:
            blank                  True if the page interior appears empty.
            ocr_orientation_match  True if OCR found the most words with the
                                   page unrotated (or found no words at all).
            size_analyzed          (width, height) of the analyzed image.
            sharpness              Best per-segment sharpness (see
                                   analyze_sharpness); 1.0 for blank pages.
            text_margin_px         Distance in original-image pixels from a
                                   retained word to the nearest page edge,
                                   or -1 when unknown.
    """
    # Ignore a 10% border on every side so scanner edges, shadows, and punch
    # holes do not dominate the blankness and sharpness measurements.
    im_cropped = task.im.crop(
        (
            task.im.size[0] * 0.1,
            task.im.size[1] * 0.1,
            task.im.size[0] * 0.9,
            task.im.size[1] * 0.9,
        )
    )

    # Blank when even the darkest interior pixel is above 80% brightness.
    # (getextrema()[0] is the minimum for a single-band image; assumes the
    # input is grayscale — TODO confirm against caller.)
    is_blank = im_cropped.getextrema()[0] > 255 * 0.8

    if is_blank:
        # Skip the expensive analysis and report neutral values.
        # (1.0 rather than int 1, consistent with the float sharpness below.)
        max_sharpness = 1.0
        ocr_orientation_match = True
        text_margin_px = -1
    else:
        # Measure sharpness on a grid of segments (2x3 portrait, 3x2
        # landscape) and keep the best: a scan is acceptable when at least
        # one region is in focus.
        max_sharpness = 0.0
        if im_cropped.size[0] < im_cropped.size[1]:
            # Page is in portrait orientation.
            segments_x = 2
            segments_y = 3
        else:
            # Page is in landscape orientation.
            segments_x = 3
            segments_y = 2
        for i in range(segments_x):
            for j in range(segments_y):
                max_sharpness = max(
                    max_sharpness,
                    analyze_sharpness(
                        im_cropped.crop(
                            (
                                im_cropped.size[0] / segments_x * i,
                                im_cropped.size[1] / segments_y * j,
                                im_cropped.size[0] / segments_x * (i + 1),
                                im_cropped.size[1] / segments_y * (j + 1),
                            )
                        )
                    ),
                )

        # Run OCR at each of the four 90-degree rotations and keep the one
        # that yields the most confident, horizontally-oriented words.
        OCR_SCALE = 1
        best_ocr_score = -1
        best_ocr_words = None
        best_ocr_orientation = -1
        for orientation in range(4):
            # resize() requires a 2-tuple, so convert the scaled ndarray.
            im_rotated = task.im.resize(
                tuple(np.int_(np.array(task.im.size) * OCR_SCALE))
            ).rotate(90 * orientation, expand=True)
            ocr = pytesseract.image_to_data(
                im_rotated,
                lang=task.ocr_langs,
                config=f"--oem 1 --dpi {int(300 * OCR_SCALE)} --tessdata-dir ./data/tessdata_fast-4.1.0",
                output_type=pytesseract.Output.DATAFRAME,
            ).fillna({"text": ""})
            # Keep only words that Tesseract is confident in, and which are
            # oriented horizontally.
            words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
            # Keep only alphabetical words of 4 or more characters.
            words = words[
                words.apply(
                    lambda row: re.fullmatch(r"[a-zA-Z]{4,}", str(row["text"]))
                    is not None,
                    axis=1,
                )
            ]
            if words.shape[0] > best_ocr_score:
                best_ocr_score = words.shape[0]
                best_ocr_orientation = orientation
                best_ocr_words = words
            if best_ocr_score > 50:
                # Unlikely that another orientation will have more words, so
                # stop eating up CPU.
                break

        if best_ocr_words.empty:
            # No usable words in any orientation; nothing to contradict the
            # current orientation, and no margin can be measured.
            ocr_orientation_match = True
            text_margin_px = -1
        else:
            ocr_orientation_match = best_ocr_orientation == 0

            # Dimensions of the (possibly rotated) image the winning OCR
            # pass actually saw; odd orientations swap width and height.
            best_ocr_dims = OCR_SCALE * np.array(
                task.im.size
                if best_ocr_orientation % 2 == 0
                else (task.im.size[1], task.im.size[0])
            )

            # Distance of every word box to each of the four page edges,
            # sorted ascending. np.concatenate rather than np.concat: the
            # latter is only a NumPy >= 2.0 alias.
            word_margins_all_directions = np.sort(
                np.int_(
                    np.concatenate(
                        (
                            best_ocr_words["left"].to_numpy(),
                            best_ocr_words["top"].to_numpy(),
                            best_ocr_dims[0]
                            - (
                                best_ocr_words["left"] + best_ocr_words["width"]
                            ).to_numpy(),
                            best_ocr_dims[1]
                            - (
                                best_ocr_words["top"] + best_ocr_words["height"]
                            ).to_numpy(),
                        )
                    )
                    # Transform back into original image pixel density
                    / OCR_SCALE
                )
            )
            # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
            SKIP_WORDS = 2
            text_margin_px = int(
                word_margins_all_directions[SKIP_WORDS]
                if word_margins_all_directions.shape[0] > SKIP_WORDS
                else -1
            )

    return {
        "blank": is_blank,
        "ocr_orientation_match": ocr_orientation_match,
        "size_analyzed": task.im.size,
        "sharpness": max_sharpness,
        "text_margin_px": text_margin_px,
    }


def analyze_sharpness(im):
    """
    Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
    1. The scale is not linear with respect to scan quality: anything above 0.1
    is usually fine.

    Args:
        im  Single-band (grayscale) PIL image — assumed; TODO confirm caller.

    Returns:
        float in [0, 1]; 0.0 when no edge pixels are detected at all.
    """
    arr = np.asarray(im)

    # Normalize contrast based on brightest and darkest pixels. For example,
    # NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80% fall
    # between 10% brightness and 90% brightness. In practice, a value around
    # 0.02 seems to work fairly well.
    NORM_QUANTILE = 0.03
    pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
        arr, NORM_QUANTILE
    )
    if pixel_range == 0:
        # Degenerate (near-constant) image: skip normalization.
        arr_normalized = arr
    else:
        # Rescale so the [NORM_QUANTILE, 1 - NORM_QUANTILE] pixel quantiles
        # map onto that same brightness range in [0, 1].
        arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
        arr_normalized = (
            arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE
        )
    # NOTE(review): on the pixel_range == 0 path this clips raw uint8 values
    # to {0, 1} before scaling, effectively binarizing the image — confirm
    # that is intended for constant images.
    arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255)

    # "Sharpness" is determined by measuring the median intensity of pixels
    # near edges, after an edge detection filter has been applied to the image.
    edges_arr = np.asarray(
        Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
    )
    EDGE_THRESHOLD = 8
    edge_pixels = edges_arr[edges_arr > EDGE_THRESHOLD]
    if edge_pixels.size == 0:
        # No detectable edges (e.g. a uniform image). Previously np.median on
        # the empty selection returned NaN with a RuntimeWarning; report a
        # definite "not sharp" instead.
        return 0.0
    return np.median(edge_pixels) / 255