2025-11-07 05:41:18 +00:00
|
|
|
import numpy as np
|
|
|
|
|
from paddleocr import PaddleOCR
|
|
|
|
|
from PIL import Image
|
|
|
|
|
|
2025-12-20 02:16:41 +00:00
|
|
|
from . import OcrEngine, OcrResult, TextBlock
|
2025-11-07 05:41:18 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_language(iso639_3_code: str) -> str:
    """
    Format an ISO 639-3 language code for PaddleOCR.

    Codes that have an entry in the mapping table below are translated to the
    corresponding PaddleOCR language name; any other code is returned
    unchanged.

    Params:

      iso639_3_code  ISO 639-3 language code (e.g. "eng").

    Returns the string to pass as PaddleOCR's `lang` argument.
    """

    # TODO: Add remaining language code mappings (refer to:
    # https://github.com/PaddlePaddle/PaddleOCR/blob/eaede685bcaf22f287edf8865f4dd8d374acb75e/paddleocr/_pipelines/ocr.py#L306).
    mappings = {"eng": "en", "fra": "fr"}

    # Unmapped codes fall through as-is.
    return mappings.get(iso639_3_code, iso639_3_code)
|
|
|
|
|
|
|
|
|
|
|
2025-12-20 02:16:41 +00:00
|
|
|
class PaddleOcrEngine(OcrEngine):
    """
    OCR engine implemented with `paddleocr`.

    One PaddleOCR instance is kept per configured language; `process` runs
    each of them on the page and keeps the result with the most text lines.
    """

    # Dict of ISO 639-3 language code to PaddleOCR instance. This is defined on
    # the class, so the same dict is shared by every engine instance.
    _ocr_instances: dict[str, PaddleOCR] = {}

    def __init__(self, *args, **kwargs):
        """
        Initialise the base engine and make sure a PaddleOCR instance exists
        for every language in `self._languages`.

        All arguments are forwarded unchanged to `OcrEngine.__init__`
        (presumably what populates `self._languages` - confirm against the
        base class).
        """

        super().__init__(*args, **kwargs)

        for language in self._languages:
            # Every instance is built with the same settings, so one that is
            # already in the shared dict can be reused instead of rebuilding
            # it each time another engine is constructed.
            if language in self._ocr_instances:
                continue

            self._ocr_instances[language] = PaddleOCR(
                use_doc_orientation_classify=True,
                use_doc_unwarping=False,
                use_textline_orientation=False,
                lang=convert_language(language),
            )

    def process(self, image: Image.Image) -> OcrResult:
        """
        Use `paddleocr` to parse an image.

        Compared to Tesseract, PaddleOCR is more accurate at low image
        resolutions and able to perform one-shot page angle detection. However,
        it is at least 10x slower per CPU core, as it is implemented with a deep
        learning model designed for GPU acceleration.

        PaddleOCR runs multi-core out of the box, so it is not necessary to
        process multiple pages in parallel to achieve good speed (though it
        still helps a little because the algorithm performs some single-threaded
        work at various points).

        Note: Specifying multiple languages will cause OCR to run once for each
        language and keep the result containing the most recognised text lines
        (the first language wins a tie). Thus, it's recommended to stick to one
        language if at all possible.

        Note: Though it works well when it works, PaddlePaddle has a tendency to
        segfault and generally has been found to be buggy and unreliable.
        Installing the nightly development build of `paddlepaddle` may help.
        Refer to:
        - [PaddleOCR issue 16609](https://github.com/PaddlePaddle/PaddleOCR/issues/16609)
        - [PaddlePaddle PR 75731](https://github.com/PaddlePaddle/Paddle/pull/75731)

        Params:

          image  PIL image data.

        Returns an `OcrResult` with one `TextBlock` per recognised line and the
        page angle derived from PaddleOCR's document preprocessor output.

        Raises `ValueError` if no languages are configured.
        """

        # The pixel data is the same for every language, so convert it once
        # rather than on each loop iteration.
        pixels = np.array(image.convert("RGB"))

        best_result = None
        for language in self._languages:
            # A single input image is expected to yield exactly one result.
            [res] = self._ocr_instances[language].predict(pixels)
            if best_result is None or len(res["rec_texts"]) > len(
                best_result["rec_texts"]
            ):
                best_result = res

        if best_result is None:
            raise ValueError("no languages specified")

        return OcrResult(
            blocks=[
                TextBlock(
                    text=text,
                    x0=box[0],
                    y0=box[1],
                    x1=box[2],
                    y1=box[3],
                )
                for text, box in zip(
                    best_result["rec_texts"], best_result["rec_boxes"]
                )
            ],
            # Invert the angle reported by the document preprocessor, keeping
            # the value within [0, 360).
            page_angle=(360 - best_result["doc_preprocessor_res"]["angle"]) % 360,
        )
|