# MicroQA/microqa/ocr/tesseract.py
# Repository listing metadata: 56 lines, 2 KiB, Python; 2025-11-07 05:41:18 +00:00
import pandas as pd
import pytesseract
from PIL import Image
class OcrEngine:
    """OCR backend that extracts word-level bounding boxes via `pytesseract`."""

    # Words with a Tesseract confidence at or below this value are discarded.
    _MIN_CONFIDENCE = 80

    # Boxes whose width/height ratio is at or below this value are discarded as
    # a heuristic for vertically oriented text.
    _MIN_ASPECT_RATIO = 0.8

    # Tesseract CLI flags: `--oem 1` selects the OCR engine mode and
    # `--tessdata-dir` points at the trained-data files. The directory is a
    # relative path, so it is resolved against the process's current working
    # directory.
    _TESSERACT_CONFIG = "--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0"

    @staticmethod
    def process(image: Image.Image, languages: list[str]) -> tuple[pd.DataFrame, dict]:
        """
        Use `pytesseract` to parse an image to a `DataFrame` with columns
        `["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
        pixels measured from the top left corner of the image. `x1` and `y1`
        values will be greater than or equal to the corresponding `x0` and `y0`
        values.

        Declared as a static method: it reads no instance state, so it can be
        called either as `OcrEngine.process(...)` or on an instance.

        Note: Each Tesseract command runs single-threaded, so speed can be
        improved up to ~4x by distributing pages across processes running in
        parallel. In practice (at least on the M4 Pro chip) Tesseract seems to
        reach peak overall speed when running across approximately 6 processes
        in parallel. Increasing the CPU count further can cause performance to
        degrade sharply (regardless of RAM availability).

        Params:

        image       PIL image data.

        languages   List of ISO-639-3 language codes fed to the OCR backend.
                    The codes are joined with "+" to form Tesseract's `lang`
                    argument.

        Returns:

        A `(words, metadata)` tuple. `words` is the `DataFrame` described
        above, containing only the rows that passed the confidence and
        aspect-ratio filters (the row index of the raw Tesseract output is
        retained). `metadata` is always an empty dict.
        """
        raw = pytesseract.image_to_data(
            image,
            lang="+".join(languages),
            config=OcrEngine._TESSERACT_CONFIG,
            output_type=pytesseract.Output.DATAFRAME,
        )

        # Exclude words with relatively low confidence ratings.
        confident = raw[raw["conf"] > OcrEngine._MIN_CONFIDENCE]

        # Attempt to exclude words that seem vertically oriented.
        # TODO: Will this work for non-Latin scripts? Probably not all.
        aspect_ratio = confident["width"] / confident["height"]
        words = confident[aspect_ratio > OcrEngine._MIN_ASPECT_RATIO]

        boxes = pd.DataFrame(
            {
                "text": words["text"],
                "x0": words["left"],
                "y0": words["top"],
                # Tesseract reports left/top plus width/height; convert to the
                # opposite corner.
                "x1": words["left"] + words["width"],
                "y1": words["top"] + words["height"],
            }
        )

        # We don't use any page-level metadata from the Tesseract output.
        return boxes, {}