# MicroQA/microqa/ocr/tesseract.py

import pandas as pd
import pytesseract
from PIL import Image


class OcrEngine:
    """
    OCR engine backed by Tesseract (via `pytesseract`).

    The engine is stateless: `process` is a static method, so it can be called
    either on the class (`OcrEngine.process(...)`) or on an instance.
    """

    @staticmethod
    def process(
        image: Image.Image, languages: list[str]
    ) -> tuple[pd.DataFrame, dict]:
        """
        Use `pytesseract` to parse an image to a `DataFrame` with columns
        `["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
        pixels measured from the top left corner of the image. `x1` and `y1`
        values will be greater than or equal to the corresponding `x0` and `y0`
        values.

        Only words whose Tesseract confidence is strictly greater than 80 and
        whose bounding box has a width-to-height ratio strictly greater than
        0.8 are kept.

        Note: Each Tesseract command runs single-threaded, so speed can be
        improved up to ~4x by distributing pages across processes running in
        parallel. In practice (at least on the M4 Pro chip) Tesseract seems to
        reach peak overall speed when running across approximately 6 processes
        in parallel. Increasing the CPU count further can cause performance to
        degrade sharply (regardless of RAM availability).

        Params:

            image       PIL image data.

            languages   List of ISO-639-3 language codes fed to the OCR backend.
                        Codes are joined with "+" before being passed on.

        Returns:

            A 2-tuple of the word `DataFrame` described above and a dict of
            page-level metadata. The metadata dict is always empty for this
            engine. The returned frame keeps the row index of the surviving
            rows from the raw Tesseract output (it is not reset).
        """

        # NOTE: The tessdata directory is a relative path, so it is resolved
        # against the current working directory of the calling process.
        words = pytesseract.image_to_data(
            image,
            lang="+".join(languages),
            config="--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0",
            output_type=pytesseract.Output.DATAFRAME,
        )

        # Exclude words with relatively low confidence ratings.
        words = words[words["conf"] > 80]

        # Attempt to exclude words that seem vertically oriented: a box much
        # taller than it is wide is treated as rotated text.
        # TODO: Will this work for non-Latin scripts? Probably not all.
        words = words[(words["width"] / words["height"]) > 0.8]

        # Convert Tesseract's (left, top, width, height) boxes to corner
        # coordinates.
        boxes = pd.DataFrame(
            {
                "text": words["text"],
                "x0": words["left"],
                "y0": words["top"],
                "x1": words["left"] + words["width"],
                "y1": words["top"] + words["height"],
            }
        )

        # We don't use any page-level metadata from the Tesseract output.
        return boxes, {}
add interchangeable ocr engines 2025-11-07 05:41:18 +00:00
			`import pandas as pd`
			`import pytesseract`
			`from PIL import Image`


			`class OcrEngine:`
			`def process(image: Image, languages: list[str]) -> tuple[pd.DataFrame, dict]:`
			`"""`
			Use `pytesseract` to parse an image to a `DataFrame` with columns
			`["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
			pixels measured from the top left corner of the image. `x1` and `y1`
			values will be greater than or equal to the corresponding `x0` and `y0`
			`values.`

			`Note: Each Tesseract command runs single-threaded, so speed can be`
			`improved up to ~4x by distributing pages across processes running in`
			`parallel. In practice (at least on the M4 Pro chip) Tesseract seems to`
			`reach peak overall speed when running across approximately 6 processes`
			`in parallel. Increasing the CPU count further can cause performance to`
			`degrade sharply (regardless of RAM availability).`

			`Params:`

			`image PIL image data.`

			`languages List of ISO-639-3 language codes fed to the OCR backend.`
			`"""`

			`df = pytesseract.image_to_data(`
			`image,`
			`lang="+".join(languages),`
			`config="--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0",`
			`output_type=pytesseract.Output.DATAFRAME,`
			`)`

			`# Exclude words with relatively low confidence ratings.`
			`df = df[df["conf"] > 80]`

			`# Attempt to exclude words that seem vertically oriented.`
			`# TODO: Will this work for non-Latin scripts? Probably not all.`
			`df = df[(df["width"] / df["height"]) > 0.8]`

			`return (`
			`pd.DataFrame(`
			`{`
			`"text": df["text"],`
			`"x0": df["left"],`
			`"y0": df["top"],`
			`"x1": df["left"] + df["width"],`
			`"y1": df["top"] + df["height"],`
			`}`
			`),`
			`# We don't use any page-level metadata from the Tesseract output.`
			`{},`
			`)`