MicroQA/microqa/ocr/paddleocr.py

93 lines
3.3 KiB
Python
Raw Permalink Normal View History

2025-11-07 05:41:18 +00:00
import numpy as np
from paddleocr import PaddleOCR
from PIL import Image
2025-12-20 02:16:41 +00:00
from . import OcrEngine, OcrResult, TextBlock
2025-11-07 05:41:18 +00:00
def convert_language(iso639_3_code: str) -> str:
    """
    Format an ISO 639-3 language code for PaddleOCR.

    Codes that have an entry in the mapping table are translated; every other
    code is returned unchanged.

    Params:
        iso639_3_code   ISO 639-3 language code (e.g. "eng").

    Returns:
        The mapped language code, or `iso639_3_code` itself when no mapping
        exists.
    """
    # TODO: Add remaining language code mappings (refer to:
    # https://github.com/PaddlePaddle/PaddleOCR/blob/eaede685bcaf22f287edf8865f4dd8d374acb75e/paddleocr/_pipelines/ocr.py#L306).
    mappings = {"eng": "en", "fra": "fr"}
    # Fall back to the input so unmapped codes pass through untouched.
    return mappings.get(iso639_3_code, iso639_3_code)
2025-12-20 02:16:41 +00:00
class PaddleOcrEngine(OcrEngine):
    """
    OCR engine that parses page images with `paddleocr`, using one PaddleOCR
    instance per configured language.
    """

    # Dict of ISO 639-3 language code to PaddleOCR instance.
    # NOTE: This dict lives on the class, so it is shared by every
    # PaddleOcrEngine instance; constructing another engine replaces the
    # entries for the languages that engine is given.
    _ocr_instances: dict[str, PaddleOCR] = {}

    def __init__(self, *args, **kwargs):
        """
        Initialize the base engine, then create a PaddleOCR instance for each
        language in `self._languages`.

        All arguments are forwarded unchanged to the `OcrEngine` constructor.
        """
        super().__init__(*args, **kwargs)
        for language in self._languages:
            self._ocr_instances[language] = PaddleOCR(
                use_doc_orientation_classify=True,
                use_doc_unwarping=False,
                use_textline_orientation=False,
                lang=convert_language(language),
            )

    def process(self, image: Image.Image) -> OcrResult:
        """
        Use `paddleocr` to parse an image.

        Compared to Tesseract, PaddleOCR is more accurate at low image
        resolutions and able to perform one-shot page angle detection. However,
        it is at least 10x slower per CPU core, as it is implemented with a deep
        learning model designed for GPU acceleration.

        PaddleOCR runs multi-core out of the box, so it is not necessary to
        process multiple pages in parallel to achieve good speed (though it
        still helps a little because the algorithm performs some single-threaded
        work at various points).

        Note: Specifying multiple languages will cause OCR to run once for each
        language and keep the result containing the most recognized text lines
        (the earliest language wins ties). Thus, it's recommended to stick to
        one language if at all possible.

        Note: Though it works well when it works, PaddlePaddle has a tendency to
        segfault and generally has been found to be buggy and unreliable.
        Installing the nightly development build of `paddlepaddle` may help.
        Refer to:

        - [PaddleOCR issue 16609](https://github.com/PaddlePaddle/PaddleOCR/issues/16609)
        - [PaddlePaddle PR 75731](https://github.com/PaddlePaddle/Paddle/pull/75731)

        Params:
            image   PIL image data.

        Returns:
            An `OcrResult` holding one `TextBlock` per recognized text line and
            the page angle derived from PaddleOCR's document preprocessor.

        Raises:
            ValueError  If the engine has no languages configured.
        """
        # The RGB conversion does not depend on the language, so do it once.
        rgb_image = image.convert("RGB")

        best_result = None
        for language in self._languages:
            # Build a fresh array per call so every PaddleOCR instance receives
            # its own buffer, exactly as before the conversion was hoisted.
            [res] = self._ocr_instances[language].predict(np.array(rgb_image))
            # Strict `>` keeps the earliest result when line counts are equal.
            if best_result is None or len(res["rec_texts"]) > len(
                best_result["rec_texts"]
            ):
                best_result = res

        if best_result is None:
            # The loop body never ran, i.e. `self._languages` was empty.
            raise ValueError("no languages specified")

        return OcrResult(
            blocks=[
                # Each box is indexed as (x0, y0, x1, y1).
                TextBlock(
                    text=text,
                    x0=box[0],
                    y0=box[1],
                    x1=box[2],
                    y1=box[3],
                )
                for text, box in zip(
                    best_result["rec_texts"], best_result["rec_boxes"]
                )
            ],
            # Negate the reported angle modulo 360.
            page_angle=(360 - best_result["doc_preprocessor_res"]["angle"]) % 360,
        )