55 lines
2 KiB
Python
55 lines
2 KiB
Python
import pandas as pd
|
|
import pytesseract
|
|
from PIL import Image
|
|
|
|
|
|
class OcrEngine:
    """OCR backend that extracts word-level bounding boxes via Tesseract."""

    # `process` takes no `self`, so it must be a static method; without the
    # decorator, calling it on an instance would bind the instance to `image`.
    @staticmethod
    def process(
        image: "Image.Image", languages: list[str]
    ) -> tuple[pd.DataFrame, dict]:
        """
        Use `pytesseract` to parse an image to a `DataFrame` with columns
        `["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
        pixels measured from the top left corner of the image. `x1` and `y1`
        values will be greater than or equal to the corresponding `x0` and `y0`
        values.

        Note: Each Tesseract command runs single-threaded, so speed can be
        improved up to ~4x by distributing pages across processes running in
        parallel. In practice (at least on the M4 Pro chip) Tesseract seems to
        reach peak overall speed when running across approximately 6 processes
        in parallel. Increasing the CPU count further can cause performance to
        degrade sharply (regardless of RAM availability).

        Params:

            image       PIL image data (a `PIL.Image.Image` instance; the name
                        `Image` imported by this file is the PIL module).

            languages   List of ISO-639-3 language codes fed to the OCR
                        backend. They are joined with "+" to form Tesseract's
                        `lang` argument.

        Returns:

            A `(words, metadata)` tuple. `words` is the `DataFrame` described
            above; its index is whatever survives filtering of the raw
            Tesseract rows (it is not reset). `metadata` is always an empty
            dict.
        """

        df = pytesseract.image_to_data(
            image,
            lang="+".join(languages),
            # NOTE: the tessdata directory is relative to the current working
            # directory of the process, not to this file.
            config="--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0",
            output_type=pytesseract.Output.DATAFRAME,
        )

        # Exclude words with relatively low confidence ratings. Rows that do
        # not describe a word (page/block/paragraph/line entries) are reported
        # by Tesseract with a confidence of -1, so they are dropped here too.
        df = df[df["conf"] > 80]

        # Attempt to exclude words that seem vertically oriented, i.e. whose
        # box is noticeably taller than it is wide.
        # NOTE(review): this likely also drops short, narrow words (e.g. a
        # lone "I" or "1") -- confirm that is acceptable.
        # TODO: Will this work for non-Latin scripts? Probably not all.
        df = df[(df["width"] / df["height"]) > 0.8]

        return (
            pd.DataFrame(
                {
                    "text": df["text"],
                    "x0": df["left"],
                    "y0": df["top"],
                    # Convert Tesseract's (left, top, width, height) boxes to
                    # corner coordinates.
                    "x1": df["left"] + df["width"],
                    "y1": df["top"] + df["height"],
                }
            ),
            # We don't use any page-level metadata from the Tesseract output.
            {},
        )
|