2025-11-07 05:41:18 +00:00
|
|
|
import pytesseract
|
|
|
|
|
from PIL import Image
|
|
|
|
|
|
2025-12-20 02:16:41 +00:00
|
|
|
from . import OcrEngine, OcrResult, TextBlock
|
2025-11-07 05:41:18 +00:00
|
|
|
|
2025-12-20 02:16:41 +00:00
|
|
|
|
|
|
|
|
class TesseractOcrEngine(OcrEngine):
    """
    OCR engine that recognises text by calling Tesseract through `pytesseract`.

    Reads `self._languages` and `self._detect_angle`, which are not set in this
    class (presumably by `OcrEngine` -- confirm against the base class).
    """

    def process(self, image: Image.Image) -> OcrResult:
        """
        Use `pytesseract` to parse an image.

        When `self._detect_angle` is truthy the page is OCR'd at 0, 90, 180 and
        270 degrees and the orientation that yields the most words surviving
        the filters below is kept; otherwise only 0 degrees is tried. Ties go
        to the first angle tried.

        Note: Each Tesseract command runs single-threaded, so speed can be
        improved up to ~4x by distributing pages across processes running in
        parallel. In practice (at least on the M4 Pro chip) Tesseract seems to
        reach peak overall speed when running across approximately 6 processes
        in parallel. Increasing the CPU count further can cause performance to
        degrade sharply (regardless of RAM availability).

        Params:
            image   PIL image data.

        Returns:
            An `OcrResult` whose `blocks` are the accepted words, with boxes
            expressed in the coordinate system of the *original* image, and
            whose `page_angle` is the winning angle (clockwise degrees from
            upright to the original orientation), or `None` when angle
            detection is disabled.
        """
        # Loop-invariant Tesseract arguments.
        languages = "+".join(self._languages)
        config = "--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0"

        angles = [0, 90, 180, 270] if self._detect_angle else [0]

        angle_best = None
        words_best = None
        size_best = None

        for angle in angles:
            # Rotate the image counter-clockwise, since we care about
            # keeping track of the angle from the upright position *to*
            # the original position, not *from*. `Image.rotate()` takes its
            # angle in degrees counter-clockwise, so `angle` is passed as-is;
            # the boxes are rotated clockwise by the same amount afterwards
            # to land back in the original image's coordinates.
            rotated_image = image.rotate(angle, expand=True)

            df = pytesseract.image_to_data(
                rotated_image,
                lang=languages,
                config=config,
                output_type=pytesseract.Output.DATAFRAME,
            ).fillna({"text": ""})

            # Exclude blocks with relatively low confidence ratings.
            df = df[df["conf"] > 80]

            # Exclude empty words
            df = df[df["text"] != ""]

            # Attempt to exclude blocks that seem vertically oriented.
            # TODO: Will this work for non-Latin scripts? Probably not all.
            df = df[(df["width"] / df["height"]) > 0.8]

            # Keep the orientation with the most accepted words. Only the
            # winner is remembered; TextBlocks are built once, after the loop.
            if words_best is None or len(df) > len(words_best):
                angle_best = angle
                words_best = df
                size_best = rotated_image.size

        blocks_best = []
        if words_best is not None:
            blocks_best = [
                TextBlock(
                    # Rotate X and Y coordinates back to match the original image.
                    *_box_after_rotation(
                        int(row["left"]),
                        int(row["top"]),
                        int(row["left"] + row["width"]),
                        int(row["top"] + row["height"]),
                        *size_best,
                        angle_best,
                    ),
                    text=row["text"],
                )
                for _, row in words_best.iterrows()
            ]

        return OcrResult(
            blocks=blocks_best,
            page_angle=angle_best if self._detect_angle else None,
        )
|
2025-12-20 02:16:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def _box_after_rotation(
|
|
|
|
|
x0: int,
|
|
|
|
|
y0: int,
|
|
|
|
|
x1: int,
|
|
|
|
|
y1: int,
|
|
|
|
|
image_width: int,
|
|
|
|
|
image_height: int,
|
|
|
|
|
degrees_clockwise: int,
|
|
|
|
|
) -> tuple[int, int, int, int]:
|
|
|
|
|
"""
|
|
|
|
|
Given the corners of a box in an image, returns the corners of an equivalent
|
|
|
|
|
box if the image is rotated by some multiple of 90 degrees. Both input and
|
|
|
|
|
output coordinates are expected to be top left followed by bottom right,
|
|
|
|
|
where the origin is at the top left.
|
|
|
|
|
"""
|
|
|
|
|
angle = ((degrees_clockwise % 360) + 360) % 360
|
|
|
|
|
if angle == 0:
|
|
|
|
|
return x0, y0, x1, y1
|
|
|
|
|
if angle == 90:
|
|
|
|
|
return image_height - y1, x0, image_height - y0, x1
|
|
|
|
|
if angle == 180:
|
|
|
|
|
return image_width - x1, image_height - y1, image_width - x0, image_height - y0
|
|
|
|
|
if angle == 270:
|
|
|
|
|
return y0, image_width - x1, y1, image_width - x0
|
|
|
|
|
else:
|
|
|
|
|
raise Exception("_box_after_rotation() only accepts multiples of 90 degrees")
|