# MicroQA/microqa/ocr/tesseract.py

import pandas as pd
import pytesseract
from PIL import Image


class OcrEngine:
    """
    OCR backend that extracts word-level bounding boxes from page images using
    Tesseract (via `pytesseract`).
    """

    # `process` takes no `self`, so it must be a static method; otherwise
    # calling it on an instance would bind the instance to `image`.
    @staticmethod
    def process(
        image: Image.Image, languages: list[str]
    ) -> tuple[pd.DataFrame, dict]:
        """
        Use `pytesseract` to parse an image to a `DataFrame` with columns
        `["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
        pixels measured from the top left corner of the image. `x1` and `y1`
        values will be greater than or equal to the corresponding `x0` and `y0`
        values.

        Only words with a Tesseract confidence above 80 and a width-to-height
        ratio above 0.8 are kept. Rows keep the index labels assigned by the
        raw Tesseract output, so the index of the result is generally not
        contiguous.

        Note: Each Tesseract command runs single-threaded, so speed can be
        improved up to ~4x by distributing pages across processes running in
        parallel. In practice (at least on the M4 Pro chip) Tesseract seems to
        reach peak overall speed when running across approximately 6 processes
        in parallel. Increasing the CPU count further can cause performance to
        degrade sharply (regardless of RAM availability).

        Params:

            image       PIL image data.

            languages   List of ISO-639-3 language codes fed to the OCR backend.

        Returns:

            A `(words, metadata)` tuple. `words` is the `DataFrame` described
            above; `metadata` is always an empty `dict`.
        """

        df = pytesseract.image_to_data(
            image,
            lang="+".join(languages),
            config="--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0",
            output_type=pytesseract.Output.DATAFRAME,
            # The TSV output is parsed with `pandas.read_csv`. Without these
            # options, recognized words such as "NA" or "null" are turned into
            # NaN, and a page containing only numeric words yields a numeric
            # `text` column (e.g. "007" becomes 7). Keep the text verbatim.
            pandas_config={"dtype": {"text": str}, "keep_default_na": False},
        )

        # Exclude words with relatively low confidence ratings. This also drops
        # the non-word (page/block/paragraph/line) rows, which Tesseract
        # reports with a confidence of -1.
        df = df[df["conf"] > 80]

        # Attempt to exclude words that seem vertically oriented.
        # TODO: Will this work for non-Latin scripts? Probably not all.
        df = df[(df["width"] / df["height"]) > 0.8]

        return (
            pd.DataFrame(
                {
                    "text": df["text"],
                    "x0": df["left"],
                    "y0": df["top"],
                    # Convert (left, top, width, height) to corner coordinates.
                    "x1": df["left"] + df["width"],
                    "y1": df["top"] + df["height"],
                }
            ),
            # We don't use any page-level metadata from the Tesseract output.
            {},
        )