MicroQA/microqa/ocr/paddleocr.py

import numpy as np
import pandas as pd
from paddleocr import PaddleOCR
from PIL import Image


# Reuse OCR instances per language: constructing a `PaddleOCR` object is done
# at most once per language code. The cache is keyed by the language code
# exactly as it is passed to `OcrEngine.process` (i.e. before conversion by
# `convert_language`) and is filled lazily on first use of each language.
instances: dict[str, PaddleOCR] = {}


def convert_language(iso639_3_code: str) -> str:
    """
    Translate an ISO 639-3 language code into the code PaddleOCR expects.

    Codes without a known translation are handed back unchanged.
    """

    # TODO: Add remaining language code mappings (refer to:
    # https://github.com/PaddlePaddle/PaddleOCR/blob/eaede685bcaf22f287edf8865f4dd8d374acb75e/paddleocr/_pipelines/ocr.py#L306).
    paddle_codes = {"eng": "en", "fra": "fr"}
    return paddle_codes.get(iso639_3_code, iso639_3_code)


class OcrEngine:
    """
    OCR engine backed by PaddleOCR.

    `process` uses no instance state; it is a static method so that it can be
    called either on the class (`OcrEngine.process(...)`) or on an instance.
    """

    @staticmethod
    def process(
        image: Image.Image, languages: list[str] = ["eng"]
    ) -> tuple[pd.DataFrame, dict]:
        """
        Use `paddleocr` to parse an image to a `DataFrame` with columns
        `["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
        pixels measured from the top left corner of the image. `x1` and `y1`
        values will be greater than or equal to the corresponding `x0` and `y0`
        values.

        Compared to Tesseract, PaddleOCR is more accurate at low image
        resolutions and able to perform one-shot page angle detection. However,
        it is at least 10x slower per CPU core, as it is implemented with a deep
        learning model designed for GPU acceleration.

        PaddleOCR runs multi-core out of the box, so it is not necessary to
        process multiple pages in parallel to achieve good speed (though it
        still helps a little because the algorithm performs some single-threaded
        work at various points).

        Note: Specifying multiple languages will cause OCR to run once for each
        language and keep the result with the most recognized text lines (on a
        tie, the language listed first wins). Thus, it's recommended to stick
        to one language if at all possible.

        Troubleshooting: The PaddlePaddle core package has/had a bug triggering
        segfaults on ARM systems. Installing the nightly development build of
        `paddlepaddle` may be necessary to avoid it. Refer to:
        - [PaddleOCR issue 16609](https://github.com/PaddlePaddle/PaddleOCR/issues/16609)
        - [PaddlePaddle PR 75731](https://github.com/PaddlePaddle/Paddle/pull/75731)

        Params:

            image       PIL image data.

            languages   List of ISO-639-3 language codes fed to the OCR backend.
                        The default list is only read, never modified.

        Returns:

            A tuple `(frame, metadata)`, where `frame` is the `DataFrame`
            described above and `metadata` is a dict with the key
            `"page_angle"`, taken from the document preprocessor result of the
            selected OCR run.

        Raises:

            ValueError  If `languages` is empty.
        """

        # The RGB conversion does not depend on the language, so do it once.
        # A fresh array is still built for every `predict` call so that OCR
        # runs never share a pixel buffer.
        rgb_image = image.convert("RGB")

        best_result = None
        for language in languages:
            # PaddleOCR instances are cached at module level, keyed by the
            # ISO 639-3 code, and created on first use of a language.
            ocr_instance = instances.get(language)
            if ocr_instance is None:
                ocr_instance = PaddleOCR(
                    use_doc_orientation_classify=True,
                    use_doc_unwarping=False,
                    use_textline_orientation=False,
                    lang=convert_language(language),
                )
                instances[language] = ocr_instance

            # A single input image is expected to yield exactly one result.
            [res] = ocr_instance.predict(np.array(rgb_image))

            # Keep the run that recognized the most text lines; the strict
            # comparison means the earliest language wins ties.
            if best_result is None or len(res["rec_texts"]) > len(
                best_result["rec_texts"]
            ):
                best_result = res

        if best_result is None:
            # The loop body never ran, i.e. no language was given.
            raise ValueError("no languages specified")

        boxes = best_result["rec_boxes"]

        return (
            pd.DataFrame(
                {
                    "text": best_result["rec_texts"],
                    # Each box unpacks as [x0, y0, x1, y1].
                    "x0": [x0 for [x0, _, _, _] in boxes],
                    "y0": [y0 for [_, y0, _, _] in boxes],
                    "x1": [x1 for [_, _, x1, _] in boxes],
                    "y1": [y1 for [_, _, _, y1] in boxes],
                }
            ),
            {
                "page_angle": best_result["doc_preprocessor_res"]["angle"],
            },
        )
add interchangeable ocr engines 2025-11-07 05:41:18 +00:00			`import numpy as np`
			`import pandas as pd`
			`from paddleocr import PaddleOCR`
			`from PIL import Image`


			`# Reuse OCR instances per language.`
			`instances: dict[str, PaddleOCR] = {}`


			`def convert_language(iso639_3_code: str) -> str:`
			`"""`
			`Format an ISO 639-3 language code for PaddleOCR.`
			`"""`

			`# TODO: Add remaining language code mappings (refer to:`
			`# https://github.com/PaddlePaddle/PaddleOCR/blob/eaede685bcaf22f287edf8865f4dd8d374acb75e/paddleocr/_pipelines/ocr.py#L306).`
			`mappings = {"eng": "en", "fra": "fr"}`
			`if iso639_3_code in mappings:`
			`return mappings[iso639_3_code]`
			`return iso639_3_code`


			`class OcrEngine:`
			`def process(`
			`image: Image, languages: list[str] = ["eng"]`
			`) -> tuple[pd.DataFrame, dict]:`
			`"""`
			Use `paddleocr` to parse an image to a `DataFrame` with columns
			`["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
			pixels measured from the top left corner of the image. `x1` and `y1`
			values will be greater than or equal to the corresponding `x0` and `y0`
			`values.`

			`Compared to Tesseract, PaddleOCR is more accurate at low image`
			`resolutions and able to perform one-shot page angle detection. However,`
			`it is at least 10x slower per CPU core, as it is implemented with a deep`
			`learning model designed for GPU acceleration.`

			`PaddleOCR runs multi-core out of the box, so it is not necessary to`
			`process multiple pages in parallel to achieve good speed (though it`
			`still helps a little because the algorithm performs some single-threaded`
			`work at various points).`

			`Note: Specifying multiple languages will cause OCR to run once for each`
			`language and choose the result it thinks is best. Thus, it's`
			`recommended to stick to one language if at all possible.`

			`Troubleshooting: The PaddlePaddle core package has/had a bug triggering`
			`segfaults on ARM systems. Installing the nightly development build of`
			`paddlepaddle` may be necessary to avoid it. Refer to:
			`- [PaddleOCR issue 16609](https://github.com/PaddlePaddle/PaddleOCR/issues/16609)`
			`- [PaddlePaddle PR 75731](https://github.com/PaddlePaddle/Paddle/pull/75731)`

			`Params:`

			`image PIL image data.`

			`languages List of ISO-639-3 language codes fed to the OCR backend.`
			`"""`

			`best_result = None`
			`for language in languages:`
			`if language in instances:`
			`ocr_instance = instances[language]`
			`else:`
			`ocr_instance = PaddleOCR(`
			`use_doc_orientation_classify=True,`
			`use_doc_unwarping=False,`
			`use_textline_orientation=False,`
			`lang=convert_language(language),`
			`)`
			`instances[language] = ocr_instance`

			`[res] = ocr_instance.predict(np.array(image.convert("RGB")))`
			`if best_result is None or len(res["rec_texts"]) > len(`
			`best_result["rec_texts"]`
			`):`
			`best_result = res`

			`if best_result is None:`
			`raise Exception("no languages specified")`

			`res = best_result`

			`return (`
			`pd.DataFrame(`
			`{`
			`"text": res["rec_texts"],`
			`"x0": [x0 for [x0, _, _, _] in res["rec_boxes"]],`
			`"y0": [y0 for [_, y0, _, _] in res["rec_boxes"]],`
			`"x1": [x1 for [_, _, x1, _] in res["rec_boxes"]],`
			`"y1": [y1 for [_, _, _, y1] in res["rec_boxes"]],`
			`}`
			`),`
			`{`
			`"page_angle": res["doc_preprocessor_res"]["angle"],`
			`},`
			`)`