import pytesseract
from PIL import Image

from . import OcrEngine, OcrResult, TextBlock

# Command-line options handed to Tesseract on every call. The tessdata path is
# relative, so it is resolved against the process's working directory.
_TESSERACT_CONFIG = "--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0"

# Words must score strictly above this Tesseract confidence to be kept.
_MIN_CONFIDENCE = 80

# Words must have a width/height ratio strictly above this to be kept.
_MIN_ASPECT_RATIO = 0.8


class TesseractOcrEngine(OcrEngine):
    """OCR engine that extracts word boxes from an image via `pytesseract`."""

    def process(self, image: Image.Image) -> OcrResult:
        """
        Use `pytesseract` to parse an image.

        When `self._detect_angle` is truthy the image is tried at 0, 90, 180
        and 270 degrees and the orientation that yields the most accepted
        words wins (ties keep the earliest angle tried); otherwise only the
        image as given is processed.

        Note: Each Tesseract command runs single-threaded, so speed can be
        improved up to ~4x by distributing pages across processes running in
        parallel. In practice (at least on the M4 Pro chip) Tesseract seems to
        reach peak overall speed when running across approximately 6 processes
        in parallel. Increasing the CPU count further can cause performance to
        degrade sharply (regardless of RAM availability).

        Params:
            image   PIL image data.

        Returns:
            An `OcrResult` whose `blocks` are the accepted words, with box
            coordinates expressed in the coordinate system of the *original*
            (unrotated) `image`, and whose `page_angle` is the winning angle,
            or `None` when angle detection is disabled.
        """
        blocks_best: list[TextBlock] = []
        angle_best = None
        angles = [0, 90, 180, 270] if self._detect_angle else [0]

        for angle in angles:
            # `Image.rotate` turns the image *counter-clockwise*, so passing
            # `360 - angle` turns it `angle` degrees clockwise. `page_angle`
            # is therefore the clockwise turn that, applied to the input,
            # gave the best OCR result.
            # NOTE(review): confirm this is the convention consumers of
            # `page_angle` expect.
            rotated_image = image.rotate(360 - angle, expand=True)

            df = pytesseract.image_to_data(
                rotated_image,
                lang="+".join(self._languages),
                config=_TESSERACT_CONFIG,
                output_type=pytesseract.Output.DATAFRAME,
            ).fillna({"text": ""})

            # Exclude blocks with relatively low confidence ratings.
            df = df[df["conf"] > _MIN_CONFIDENCE]

            # Exclude empty words.
            df = df[df["text"] != ""]

            # Attempt to exclude blocks that seem vertically oriented.
            # TODO: Will this work for non-Latin scripts? Probably not all.
            df = df[(df["width"] / df["height"]) > _MIN_ASPECT_RATIO]

            if angle_best is None or df.shape[0] > len(blocks_best):
                angle_best = angle
                blocks_best = [
                    TextBlock(
                        # Map the box from the rotated image back onto the
                        # original image. The image was turned `angle` degrees
                        # clockwise above, so undoing that is a further
                        # `360 - angle` degrees clockwise (passing `angle`
                        # here would rotate the box the wrong way for 90/270).
                        *_box_after_rotation(
                            int(row["left"]),
                            int(row["top"]),
                            int(row["left"] + row["width"]),
                            int(row["top"] + row["height"]),
                            *rotated_image.size,
                            360 - angle,
                        ),
                        text=row["text"],
                    )
                    for _, row in df.iterrows()
                ]

        return OcrResult(
            blocks=blocks_best,
            page_angle=angle_best if self._detect_angle else None,
        )


def _box_after_rotation(
    x0: int,
    y0: int,
    x1: int,
    y1: int,
    image_width: int,
    image_height: int,
    degrees_clockwise: int,
) -> tuple[int, int, int, int]:
    """
    Given the corners of a box in an image, returns the corners of an
    equivalent box if the image is rotated by some multiple of 90 degrees.

    Both input and output coordinates are expected to be top left followed by
    bottom right, where the origin is at the top left.

    Params:
        x0, y0              Top-left corner of the box before rotation.
        x1, y1              Bottom-right corner of the box before rotation.
        image_width         Width of the image before rotation.
        image_height        Height of the image before rotation.
        degrees_clockwise   Clockwise rotation to apply; any multiple of 90,
                            including negative values and values >= 360.

    Returns:
        `(x0, y0, x1, y1)` of the box in the rotated image.

    Raises:
        ValueError: if `degrees_clockwise` is not a multiple of 90.
    """
    # Python's `%` already yields a non-negative result for a positive
    # modulus, so this normalises negative angles too.
    angle = degrees_clockwise % 360

    if angle == 0:
        return x0, y0, x1, y1
    if angle == 90:
        return image_height - y1, x0, image_height - y0, x1
    if angle == 180:
        return image_width - x1, image_height - y1, image_width - x0, image_height - y0
    if angle == 270:
        return y0, image_width - x1, y1, image_width - x0

    raise ValueError("_box_after_rotation() only accepts multiples of 90 degrees")