import pandas as pd
import pytesseract
from PIL import Image


class OcrEngine:
    """OCR backend that extracts word-level bounding boxes via Tesseract."""

    @staticmethod
    def process(
        image: Image.Image, languages: list[str]
    ) -> tuple[pd.DataFrame, dict]:
        """
        Use `pytesseract` to parse an image to a `DataFrame` with columns
        `["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
        pixels measured from the top left corner of the image. `x1` and `y1`
        values will be greater than or equal to the corresponding `x0` and
        `y0` values.

        Words are dropped when Tesseract's confidence is 80 or lower, or when
        their bounding box has a width/height ratio of 0.8 or lower (a
        heuristic for vertically oriented text).

        Note: Each Tesseract command runs single-threaded, so speed can be
        improved up to ~4x by distributing pages across processes running in
        parallel. In practice (at least on the M4 Pro chip) Tesseract seems to
        reach peak overall speed when running across approximately 6
        processes in parallel. Increasing the CPU count further can cause
        performance to degrade sharply (regardless of RAM availability).

        Params:
            image
                PIL image data.
            languages
                List of ISO-639-3 language codes fed to the OCR backend. They
                are joined with "+" into Tesseract's `lang` argument.

        Returns:
            A `(words, metadata)` tuple. `words` is the `DataFrame` described
            above; it keeps the row index of the raw Tesseract output, so the
            index may contain gaps. `metadata` is always an empty dict.
        """
        df = pytesseract.image_to_data(
            image,
            lang="+".join(languages),
            # Tessdata location is relative to the current working directory.
            config="--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0",
            output_type=pytesseract.Output.DATAFRAME,
        )

        # Exclude words with relatively low confidence ratings.
        df = df[df["conf"] > 80]

        # Attempt to exclude words that seem vertically oriented: a box that
        # is noticeably taller than it is wide is treated as rotated text.
        # TODO: Will this work for non-Latin scripts? Probably not all.
        df = df[(df["width"] / df["height"]) > 0.8]

        words = pd.DataFrame(
            {
                "text": df["text"],
                "x0": df["left"],
                "y0": df["top"],
                # Tesseract reports left/top plus size; convert to corners.
                "x1": df["left"] + df["width"],
                "y1": df["top"] + df["height"],
            }
        )

        # We don't use any page-level metadata from the Tesseract output.
        return words, {}