# MicroQA — microqa/ocr/tesseract.py
import pytesseract
from PIL import Image
from . import OcrEngine, OcrResult, TextBlock
class TesseractOcrEngine(OcrEngine):
    """OCR engine backed by the Tesseract CLI via `pytesseract`."""

    def process(self, image: Image.Image) -> OcrResult:
        """
        Use `pytesseract` to parse an image.

        Note: Each Tesseract command runs single-threaded, so speed can be
        improved up to ~4x by distributing pages across processes running in
        parallel. In practice (at least on the M4 Pro chip) Tesseract seems to
        reach peak overall speed when running across approximately 6 processes
        in parallel. Increasing the CPU count further can cause performance to
        degrade sharply (regardless of RAM availability).

        Params:
            image   PIL image data.
        """
        blocks_best = []
        angle_best = None
        angles = [0, 90, 180, 270] if self._detect_angle else [0]
        for angle in angles:
            # Rotate the image counter-clockwise, since we care about
            # keeping track of the angle from the upright position *to*
            # the original position, not *from*.
            rotated_image = image.rotate(360 - angle, expand=True)
            df = pytesseract.image_to_data(
                rotated_image,
                lang="+".join(self._languages),
                config="--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0",
                output_type=pytesseract.Output.DATAFRAME,
            ).fillna({"text": ""})
            # Exclude blocks with relatively low confidence ratings.
            df = df[df["conf"] > 80]
            # Exclude empty words.
            df = df[df["text"] != ""]
            # Attempt to exclude blocks that seem vertically oriented.
            # TODO: Will this work for non-Latin scripts? Probably not all.
            df = df[(df["width"] / df["height"]) > 0.8]
            # Build the candidate blocks exactly once. (A previous version
            # built this list twice: once inside a leftover debug `print()`
            # and again for the comparison below.)
            blocks = [
                TextBlock(
                    # Rotate X and Y coordinates back to match the original image.
                    *_box_after_rotation(
                        int(row["left"]),
                        int(row["top"]),
                        int(row["left"] + row["width"]),
                        int(row["top"] + row["height"]),
                        *rotated_image.size,
                        angle,
                    ),
                    text=row["text"],
                )
                for _, row in df.iterrows()
            ]
            # Keep whichever orientation yields the most surviving words.
            if angle_best is None or len(blocks) > len(blocks_best):
                angle_best = angle
                blocks_best = blocks
        return OcrResult(
            blocks=blocks_best, page_angle=angle_best if self._detect_angle else None
        )
def _box_after_rotation(
x0: int,
y0: int,
x1: int,
y1: int,
image_width: int,
image_height: int,
degrees_clockwise: int,
) -> tuple[int, int, int, int]:
"""
Given the corners of a box in an image, returns the corners of an equivalent
box if the image is rotated by some multiple of 90 degrees. Both input and
output coordinates are expected to be top left followed by bottom right,
where the origin is at the top left.
"""
angle = ((degrees_clockwise % 360) + 360) % 360
if angle == 0:
return x0, y0, x1, y1
if angle == 90:
return image_height - y1, x0, image_height - y0, x1
if angle == 180:
return image_width - x1, image_height - y1, image_width - x0, image_height - y0
if angle == 270:
return y0, image_width - x1, y1, image_width - x0
else:
raise Exception("_box_after_rotation() only accepts multiples of 90 degrees")