# MicroQA — microqa/ocr/tesseract.py
import pytesseract
from PIL import Image
from . import OcrEngine, OcrResult, TextBlock
class TesseractOcrEngine(OcrEngine):
    """OCR engine backed by the Tesseract CLI via `pytesseract`."""

    def process(self, image: Image.Image) -> OcrResult:
        """
        Use `pytesseract` to parse an image.

        Note: Each Tesseract command runs single-threaded, so speed can be
        improved up to ~4x by distributing pages across processes running in
        parallel. In practice (at least on the M4 Pro chip) Tesseract seems to
        reach peak overall speed when running across approximately 6 processes
        in parallel. Increasing the CPU count further can cause performance to
        degrade sharply (regardless of RAM availability).

        Params:
            image   PIL image data.
        """
        blocks_best = []
        angle_best = None
        angles = [0, 90, 180, 270] if self._detect_angle else [0]
        for angle in angles:
            # Rotate the image counter-clockwise, since we care about
            # keeping track of the angle from the upright position *to*
            # the original position, not *from*.
            rotated_image = image.rotate(360 - angle, expand=True)
            df = pytesseract.image_to_data(
                rotated_image,
                lang="+".join(self._languages),
                config="--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0",
                output_type=pytesseract.Output.DATAFRAME,
            ).fillna({"text": ""})
            # Exclude blocks with relatively low confidence ratings.
            df = df[df["conf"] > 80]
            # Exclude empty words.
            df = df[df["text"] != ""]
            # Attempt to exclude blocks that seem vertically oriented.
            # TODO: Will this work for non-Latin scripts? Probably not all.
            df = df[(df["width"] / df["height"]) > 0.8]
            # Build the candidate blocks exactly once. (A previous version
            # built this list twice: once inside a leftover debug `print()`
            # and again for the comparison below.)
            blocks = [
                TextBlock(
                    # Rotate X and Y coordinates back to match the original image.
                    *_box_after_rotation(
                        int(row["left"]),
                        int(row["top"]),
                        int(row["left"] + row["width"]),
                        int(row["top"] + row["height"]),
                        *rotated_image.size,
                        angle,
                    ),
                    text=row["text"],
                )
                for _, row in df.iterrows()
            ]
            # Keep whichever orientation yields the most surviving words.
            if angle_best is None or len(blocks) > len(blocks_best):
                angle_best = angle
                blocks_best = blocks
        return OcrResult(
            blocks=blocks_best, page_angle=angle_best if self._detect_angle else None
        )
def _box_after_rotation(
x0: int,
y0: int,
x1: int,
y1: int,
image_width: int,
image_height: int,
degrees_clockwise: int,
) -> tuple[int, int, int, int]:
"""
Given the corners of a box in an image, returns the corners of an equivalent
box if the image is rotated by some multiple of 90 degrees. Both input and
output coordinates are expected to be top left followed by bottom right,
where the origin is at the top left.
"""
angle = ((degrees_clockwise % 360) + 360) % 360
if angle == 0:
return x0, y0, x1, y1
if angle == 90:
return image_height - y1, x0, image_height - y0, x1
if angle == 180:
return image_width - x1, image_height - y1, image_width - x0, image_height - y0
if angle == 270:
return y0, image_width - x1, y1, image_width - x0
else:
raise Exception("_box_after_rotation() only accepts multiples of 90 degrees")