100 lines
3.7 KiB
Python
100 lines
3.7 KiB
Python
|
|
import numpy as np
|
||
|
|
import pandas as pd
|
||
|
|
from paddleocr import PaddleOCR
|
||
|
|
from PIL import Image
|
||
|
|
|
||
|
|
|
||
|
|
# Reuse OCR instances per language: maps the language code passed to
# `OcrEngine.process` to the `PaddleOCR` instance created for that code, so
# an instance is constructed at most once per language and then reused.
instances: dict[str, PaddleOCR] = {}
|
||
|
|
|
||
|
|
|
||
|
|
def convert_language(iso639_3_code: str) -> str:
    """
    Translate an ISO 639-3 language code into the identifier PaddleOCR uses.

    Codes that have no entry in the mapping table are returned unchanged.
    """

    # TODO: Add remaining language code mappings (refer to:
    # https://github.com/PaddlePaddle/PaddleOCR/blob/eaede685bcaf22f287edf8865f4dd8d374acb75e/paddleocr/_pipelines/ocr.py#L306).
    paddle_codes = {"eng": "en", "fra": "fr"}

    # Fall back to the input itself when no translation is known.
    return paddle_codes.get(iso639_3_code, iso639_3_code)
|
||
|
|
|
||
|
|
|
||
|
|
class OcrEngine:
    """
    OCR backend implemented with `paddleocr`.

    Both methods are static: no per-object state is kept. `PaddleOCR`
    instances are created lazily and stored in the module-level `instances`
    cache, keyed by the language code they were requested with.
    """

    @staticmethod
    def _get_instance(language: str) -> "PaddleOCR":
        """
        Return the cached `PaddleOCR` instance for `language`, creating and
        caching it on first use.
        """
        ocr_instance = instances.get(language)
        if ocr_instance is None:
            ocr_instance = PaddleOCR(
                use_doc_orientation_classify=True,
                use_doc_unwarping=False,
                use_textline_orientation=False,
                lang=convert_language(language),
            )
            instances[language] = ocr_instance
        return ocr_instance

    # NOTE: the `["eng"]` default is kept for interface compatibility; it is
    # only ever read here, never mutated, so sharing it across calls is safe.
    @staticmethod
    def process(
        image: "Image.Image", languages: list[str] = ["eng"]
    ) -> tuple[pd.DataFrame, dict]:
        """
        Use `paddleocr` to parse an image to a `DataFrame` with columns
        `["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
        pixels measured from the top left corner of the image. `x1` and `y1`
        values will be greater than or equal to the corresponding `x0` and `y0`
        values.

        Compared to Tesseract, PaddleOCR is more accurate at low image
        resolutions and able to perform one-shot page angle detection. However,
        it is at least 10x slower per CPU core, as it is implemented with a deep
        learning model designed for GPU acceleration.

        PaddleOCR runs multi-core out of the box, so it is not necessary to
        process multiple pages in parallel to achieve good speed (though it
        still helps a little because the algorithm performs some single-threaded
        work at various points).

        Note: Specifying multiple languages will cause OCR to run once for each
        language and choose the result it thinks is best (the one with the most
        recognized text entries; the earliest language wins ties). Thus, it's
        recommended to stick to one language if at all possible.

        Troubleshooting: The PaddlePaddle core package has/had a bug triggering
        segfaults on ARM systems. Installing the nightly development build of
        `paddlepaddle` may be necessary to avoid it. Refer to:
        - [PaddleOCR issue 16609](https://github.com/PaddlePaddle/PaddleOCR/issues/16609)
        - [PaddlePaddle PR 75731](https://github.com/PaddlePaddle/Paddle/pull/75731)

        Params:

            image      PIL image data.

            languages  List of ISO-639-3 language codes fed to the OCR backend.

        Returns:

            A tuple `(dataframe, metadata)`, where `metadata` is a dict holding
            `"page_angle"`, taken from the `"angle"` entry of the chosen
            result's `"doc_preprocessor_res"`.

        Raises:

            ValueError  If `languages` is empty.
        """

        # Fail fast, before any image conversion or model construction.
        if not languages:
            raise ValueError("no languages specified")

        # The pixel data is the same for every language, so convert it once.
        pixels = np.array(image.convert("RGB"))

        best_result = None
        for language in languages:
            [candidate] = OcrEngine._get_instance(language).predict(pixels)
            # Strict ">" keeps the earliest language's result on ties.
            if best_result is None or len(candidate["rec_texts"]) > len(
                best_result["rec_texts"]
            ):
                best_result = candidate

        # Each entry of "rec_boxes" unpacks as [x0, y0, x1, y1].
        boxes = best_result["rec_boxes"]
        frame = pd.DataFrame(
            {
                "text": best_result["rec_texts"],
                "x0": [x0 for [x0, _, _, _] in boxes],
                "y0": [y0 for [_, y0, _, _] in boxes],
                "x1": [x1 for [_, _, x1, _] in boxes],
                "y1": [y1 for [_, _, _, y1] in boxes],
            }
        )
        metadata = {
            "page_angle": best_result["doc_preprocessor_res"]["angle"],
        }
        return frame, metadata
|