reuse pdf ocr when available

This commit is contained in:
Brent Schroeter 2025-12-20 02:16:41 +00:00
parent ac7e93a75b
commit 3da76d4537
9 changed files with 1373 additions and 901 deletions

View file

@ -11,9 +11,25 @@ from microqa.engine import analyze_doc
def main():
parser = ArgumentParser()
parser.add_argument("--item-id")
parser.add_argument("--cpus", type=int, default=4)
parser.add_argument(
"--ocr-backend",
help="which local OCR backend to use when available text in archived PDF files is insufficient; one of 'tesseract' or 'paddleocr'",
default="tesseract",
)
parser.add_argument("--verbose", action="store_true")
args = parser.parse_args()
# Import OCR engine modules only as needed, to avoid unnecessary slow
# startups and/or missing dependency errors.
if args.ocr_backend == "tesseract":
from microqa.ocr.tesseract import TesseractOcrEngine
ocr_engine = TesseractOcrEngine(languages=["eng", "fra"])
elif args.ocr_backend == "paddleocr":
from microqa.ocr.paddleocr import PaddleOcrEngine
ocr_engine = PaddleOcrEngine(languages=["eng", "fra"])
cache_item(
args.item_id,
# Will not refetch if value is already cached.
@ -29,7 +45,10 @@ def main():
else item.docs
)
analyses = [
analyze_doc(doc, parallel=args.cpus, use_cache=True) for doc in minimal_docs
analyze_doc(
doc=doc, ocr_engine=ocr_engine, use_cache=True, verbose=args.verbose
)
for doc in minimal_docs
]
t_end = time()
@ -43,7 +62,7 @@ def main():
[
i
for i, page in enumerate(doc["pages"])
if not page["ocr_orientation_match"]
if 45 < page["page_angle"] < 315
]
for doc in analyses
],

24
main.py
View file

@ -30,8 +30,24 @@ def main():
help="script will attempt to analyze all items with a review date greater than or equal to this value (YYYYMMDD)",
default="20250701",
)
parser.add_argument(
"--ocr-backend",
help="which local OCR backend to use when available text in archived PDF files is insufficient; one of 'tesseract' or 'paddleocr'",
default="tesseract",
)
args = parser.parse_args()
# Import OCR engine modules only as needed, to avoid unnecessary slow
# startups and/or missing dependency errors.
if args.ocr_backend == "tesseract":
from microqa.ocr.tesseract import TesseractOcrEngine
ocr_engine = TesseractOcrEngine(languages=["eng", "fra"])
elif args.ocr_backend == "paddleocr":
from microqa.ocr.paddleocr import PaddleOcrEngine
ocr_engine = PaddleOcrEngine(languages=["eng", "fra"])
with sqlite3.connect(args.database) as conn:
cur = conn.cursor()
cur.execute("""
@ -51,7 +67,7 @@ create table if not exists pages (
id int primary key,
doc text not null,
page int not null,
orientation_match boolean not null,
page_angle float not null,
sharpness real not null,
is_blank boolean not null,
text_margin_px int not null
@ -95,7 +111,7 @@ order by review_date
[doc.name, item_id],
)
analysis = analyze_doc(
doc, parallel=args.cpus, verbose=True
doc=doc, ocr_engine=ocr_engine, verbose=True
)
for i, page in enumerate(analysis["pages"]):
cur.execute(
@ -103,7 +119,7 @@ order by review_date
insert into pages (
doc,
page,
orientation_match,
page_angle,
sharpness,
is_blank,
text_margin_px
@ -111,7 +127,7 @@ insert into pages (
[
doc.name,
i + 1,
page["ocr_orientation_match"],
page["page_angle"],
page["sharpness"],
page["blank"],
page["text_margin_px"],

View file

@ -1,57 +1,41 @@
from dataclasses import dataclass, field
from multiprocessing import Pool
from sys import stdout
import numpy as np
from PIL import Image, ImageFilter
from .items import ArchiveDoc
from .ocr.tesseract import OcrEngine
from .items import ArchiveDoc, ArchiveLeaf
from .ocr import OcrEngine, TextBlock
def analyze_doc(doc: ArchiveDoc, parallel=1, use_cache=False, verbose=False):
def analyze_doc(
doc: ArchiveDoc,
ocr_engine: OcrEngine,
use_cache: bool = False,
verbose: bool = False,
):
"""
Analyzes all pages in an ArchiveDoc for useful metrics such as sharpness,
orientation, presence of text overflows, and so on.
"""
if verbose:
print(f"Loading {doc.name}...")
stdout.flush()
tasks: PageAnalysisTask = [
PageAnalysisTask(im=leaf.image)
for leaf in doc.fetch_leaves(use_cache=use_cache)
]
all_leaves = doc.fetch_leaves(use_cache=use_cache)
if verbose:
print(f"Processing {len(tasks)} pages...", file=stdout)
print(f"Processing {len(all_leaves)} pages...", file=stdout)
stdout.flush()
if parallel > 1:
# Parallelize image processing and OCR of pages across up to n cores.
with Pool(parallel) as pool:
return {"pages": pool.map(analyze_page, tasks)}
return {"pages": [analyze_page(task) for task in tasks]}
@dataclass
class PageAnalysisTask:
"""
Attributes:
im PIL Image, pre-scaled using .thumbnail() to fit the long
edge to 3200 px.
ocr_langs Tesseract language codes (3 letters each, in a "+"-separated
list).
"""
im: Image.Image
ocr_langs: list[str] = field(default_factory=lambda: ["eng"])
def analyze_page(task):
im_cropped = task.im.crop(
analyzed_pages = []
for leaf in all_leaves:
im_cropped = leaf.image.crop(
(
task.im.size[0] * 0.1,
task.im.size[1] * 0.1,
task.im.size[0] * 0.9,
task.im.size[1] * 0.9,
leaf.image.size[0] * 0.1,
leaf.image.size[1] * 0.1,
leaf.image.size[0] * 0.9,
leaf.image.size[1] * 0.9,
)
)
@ -59,9 +43,15 @@ def analyze_page(task):
if is_blank:
max_sharpness = 1
ocr_orientation_match = True
text_margin_px = -1
page_angle = 0
else:
# Sharpness is determined by percentile of pixels that match some
# criteria, so it may vary significantly depending on which portion
# of the image is analyzed. In an effort to identify the sharpest
# edges, we split up the image into chunks and assume that the
# highest sharpness value obtained across all chunks is
# representative of the image as a whole.
max_sharpness = 0.0
if im_cropped.size[0] < im_cropped.size[1]:
# Page is in portrait orientation.
@ -87,62 +77,73 @@ def analyze_page(task):
),
)
OCR_SCALE = 1
# TODO: Refactor orientation detection logic into the OCR engine
# modules.
best_ocr_score = -1
best_ocr_words = None
best_ocr_orientation = -1
for orientation in range(4):
im_rotated = task.im.resize(
np.int_(np.array(task.im.size) * OCR_SCALE)
).rotate(90 * orientation, expand=True)
ocr, ocr_meta = OcrEngine.process(im_rotated, languages=task.ocr_langs)
if "page_angle" in ocr_meta:
# OCR engine automatically accounts for page rotation.
best_ocr_score = ocr.shape[0]
# PaddleOCR counts rotation as degrees, in the opposite
# direction as PIL's `Image.rotate()`
best_ocr_orientation = (
4 - round(((ocr_meta["page_angle"] + 360) % 360) / 90)
) % 4
best_ocr_words = ocr
break
if ocr.shape[0] > best_ocr_score:
best_ocr_score = ocr.shape[0]
best_ocr_orientation = orientation
best_ocr_words = ocr
if best_ocr_score > 50:
# Unlikely that another orientation will have more words, so
# stop eating up CPU.
break
if best_ocr_words.empty:
ocr_orientation_match = True
text_margin_px = -1
else:
ocr_orientation_match = best_ocr_orientation == 0
best_ocr_dims = OCR_SCALE * np.array(
task.im.size
if best_ocr_orientation % 2 == 0
else (task.im.size[1], task.im.size[0])
# OCR is computationally expensive, so we try to take advantage of
# the Tesseract data already parsed by the Internet Archive and
# embedded in the PDF, when possible. If there is not sufficient
# text in the PDF to be confident that the Archive's OCR
# postprocessing captured it all, then OCR is recomputed locally.
#
# In some instances, the Archive's OCR detects rotated text but
# parses it as gibberish. To partially mitigate this, we ignore all
# precomputed text blocks with a "portrait" aspect ratio. This will
# not necessarily help with text that is rotated 180 degrees, but in
# practice that case is rarely encountered. This will also not work
# well with non-latin scripts that are intended to be oriented
# vertically.
OCR_RECOMPUTE_THRESHOLD_WORDS = 30
if (
sum(
(
len(block.text.split())
for block in leaf.text_blocks
if block.x1 - block.x0 > block.y1 - block.y0
)
)
>= OCR_RECOMPUTE_THRESHOLD_WORDS
):
if verbose:
print("Using PDF text.")
ocred_leaf = leaf
page_angle = 0
else:
if verbose:
print("Using OCR.")
OCR_SCALE = 1
im_scaled = leaf.image.resize(
np.int_(np.array(leaf.image.size) * OCR_SCALE)
)
ocr_result = ocr_engine.process(im_scaled)
ocred_leaf = ArchiveLeaf(
image=leaf.image,
page_number=leaf.page_number,
text_blocks=[
TextBlock(
x0=int(block.x0 / OCR_SCALE),
y0=int(block.y0 / OCR_SCALE),
x1=int(block.x1 / OCR_SCALE),
y1=int(block.y1 / OCR_SCALE),
text=block.text,
)
for block in ocr_result.blocks
],
)
page_angle = ocr_result.page_angle
word_margins_all_directions = np.sort(
np.int_(
np.concat(
(
best_ocr_words["x0"].to_numpy(),
best_ocr_words["y0"].to_numpy(),
best_ocr_dims[0] - best_ocr_words["x1"].to_numpy(),
best_ocr_dims[1] - best_ocr_words["y1"].to_numpy(),
[
np.array(
[
block.x0,
block.y0,
leaf.image.size[0] - block.x1,
leaf.image.size[1] - block.y1,
]
)
for block in ocred_leaf.text_blocks
]
)
# Transform back into original image pixel density
/ OCR_SCALE
)
)
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
@ -153,16 +154,23 @@ def analyze_page(task):
else -1
)
return {
# Make sure the OCR engine is running with orientation detection.
assert page_angle is not None
analyzed_pages.append(
{
"blank": is_blank,
"ocr_orientation_match": ocr_orientation_match,
"size_analyzed": task.im.size,
"page_angle": page_angle,
"size_analyzed": leaf.image.size,
"sharpness": max_sharpness,
"text_margin_px": text_margin_px,
}
)
return {"pages": analyzed_pages}
def analyze_sharpness(im):
def analyze_sharpness(im: Image.Image):
"""
Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
1. The scale is not linear with respect to scan quality: anything above 0.1

View file

@ -3,18 +3,18 @@ Python utilities for structuring data and metadata pulled from archive.org
microfiche scans.
"""
import json
import os
import urllib
from contextlib import nullcontext
from dataclasses import dataclass
from io import BytesIO
from typing import Optional
from zipfile import ZipFile
import pymupdf
import requests
from PIL import Image
from .ocr import TextBlock
CACHE_DIR = "./archive_cache"
@ -38,16 +38,20 @@ class ArchiveLeaf:
presented to users, otherwise a (potentially empty)
string with the inferred page number as defined by the
document being scanned.
text_blocks List of text blocks extracted from PyMuPDF's
TextPage.extractBlocks() method.
"""
image: Image
image: Image.Image
page_number: Optional[str]
text_blocks: list[TextBlock]
@dataclass
class ArchiveDoc:
"""
Information pertaining to a single set of processed pages, of which there
Information pertaining to a single set of processed pages, of which there
may be multiple for any given ArchiveItem. For example, one SCOTUS case may
contain several briefs/petitions/etc., each presented as a distinct PDF but
all held within the parent `ArchiveItem`.
@ -80,62 +84,71 @@ class ArchiveDoc:
name: str
title: Optional[str]
def fetch_leaves(self, numbered_only=True, use_cache=False) -> list[ArchiveLeaf]:
def fetch_leaves(self, use_cache=False) -> list[ArchiveLeaf]:
"""
Fetch images and page number data for this document from archive.org,
over the Internet.
Fetch images and OCR text data for this document from archive.org PDF files.
Params:
numbered_only If `True`, discards any leaves with no corresponding
page number entries. Leaves for which the page
number is an empty string are retained.
use_cache If `True`, locally cached zip files under the
use_cache If `True`, locally cached PDF files under the
`./archive_cache` directory (relative to the working
directory) will be used instead of fetching over
HTTPS.
"""
if use_cache:
# Cached file names are derived from the percent-encoded version of
# `self.name`, so that there's no need to worry about directory
# separators or other disallowed characters in the file names
# defined by archive.org.
with open(
f"{CACHE_DIR}/{_url_encode(self.name)}_page_numbers.json", "r"
) as f:
page_nums = json.load(f)["pages"]
zip_reader_ctx = open(f"{CACHE_DIR}/{_url_encode(self.name)}_jp2.zip", "rb")
with open(f"{CACHE_DIR}/{_url_encode(self.name)}.pdf", "rb") as f:
pdf_data = f.read()
else:
page_nums = _fetch_page_nums(self.identifier, self.name)["pages"]
# Wrap in a context manager so that the reader can be used in a `with`
# block in the same way as a file accessed with `open()`.
zip_reader_ctx = nullcontext(
BytesIO(_fetch_jp2_zip(self.identifier, self.name))
)
pdf_data = _fetch_pdf(self.identifier, self.name)
leaves = []
with zip_reader_ctx as zip_reader, ZipFile(zip_reader) as jp_zip:
for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
for page_index, page_num_info in enumerate(page_nums):
if page_num_info["leafNum"] == leaf_num:
# Stop iterating and keep page_index set to the current
# value.
break
else:
# Indicate that leaf was not found in page_num list.
page_index = None
# Open PDF from bytes
pdf_doc = pymupdf.open(stream=pdf_data, filetype="pdf")
if not numbered_only or page_index is not None:
with jp_zip.open(file_name) as jp_file:
# Convert to single-channel greyscale ("L").
image = Image.open(jp_file).convert("L")
# Rescale long edge to no more than 3200 px.
try:
for page_num in range(len(pdf_doc)):
page = pdf_doc[page_num]
# Extract text blocks with coordinates
# Convert to TextBlock objects, discarding block_no and block_type
text_blocks = [
TextBlock(
x0=int(x0),
y0=int(y0),
x1=int(x1),
y1=int(y1),
text=text,
)
for x0, y0, x1, y1, text, *_ in page.get_text("blocks")
]
# Render page to image
# Use a matrix to scale appropriately (default is 72 DPI)
# Scale factor 4.44 gives approximately 320 DPI, which should produce
# images with long edge around 3200px for typical page sizes
mat = pymupdf.Matrix(4.44, 4.44)
pix = page.get_pixmap(matrix=mat, alpha=False)
# Convert PyMuPDF pixmap to PIL Image
img_data = pix.tobytes("ppm")
image = Image.open(BytesIO(img_data)).convert("L")
# Ensure long edge is no more than 3200 px
image.thumbnail((3200, 3200))
leaves.append(ArchiveLeaf(image=image, page_number=page_index))
# Page numbers are 1-indexed for human readability
leaves.append(
ArchiveLeaf(
image=image,
page_number=str(page_num + 1),
text_blocks=text_blocks,
)
)
finally:
pdf_doc.close()
return leaves
@ -201,20 +214,18 @@ def fetch_item(identifier: str, use_cache=False) -> ArchiveItem:
doc_names = [
# Strip suffix, to just leave the identifier, and title if present.
name[: -len("_jp2.zip")]
name[: -len(".pdf")]
for name in file_names
if name.lower().endswith("_jp2.zip")
# Exclude unprocessed scans, which are also named `..._jp2.zip`.
and name.lower() != f"{identifier.lower()}_micro_jp2.zip"
if name.lower().endswith(".pdf")
]
# Assert that all files we expect to find are actually present.
for doc_name in doc_names:
if f"{_url_encode(doc_name.lower())}_page_numbers.json" not in [
if f"{_url_encode(doc_name.lower())}.pdf" not in [
name.lower() for name in file_names
]:
raise Exception(
f"expected file not found: {_url_encode(doc_name.lower())}_page_numbers.zip"
f"expected file not found: {_url_encode(doc_name.lower())}.pdf"
)
return ArchiveItem(
@ -232,7 +243,7 @@ def fetch_item(identifier: str, use_cache=False) -> ArchiveItem:
def cache_item(identifier: str, overwrite=True):
"""
Load the relevant files for an `ArchiveItem` and its component `ArchiveDoc`s
Load the PDF files for an `ArchiveItem` and its component `ArchiveDoc`s
and store them within the `archive_cache` directory (relative to the working
directory). The `archive_cache` directory will be created if it does not
exist.
@ -249,16 +260,14 @@ def cache_item(identifier: str, overwrite=True):
for name in os.listdir(CACHE_DIR):
if _url_decode(name.lower()).startswith(identifier.lower()):
if not overwrite:
return
item = fetch_item(identifier)
for doc in item.docs:
page_nums = _fetch_page_nums(identifier, doc.name)
zip_file = _fetch_jp2_zip(identifier, doc.name)
with open(f"{CACHE_DIR}/{_url_encode(doc.name)}_page_numbers.json", "w") as f:
json.dump(page_nums, f)
with open(f"{CACHE_DIR}/{_url_encode(doc.name)}_jp2.zip", "wb") as f:
f.write(zip_file)
pdf_data = _fetch_pdf(identifier, doc.name)
with open(f"{CACHE_DIR}/{_url_encode(doc.name)}.pdf", "wb") as f:
f.write(pdf_data)
def _url_encode(string: str) -> str:
@ -278,31 +287,16 @@ def _url_decode(string: str) -> str:
return urllib.parse.unquote(string)
def _fetch_page_nums(identifier: str, doc_name: str) -> dict:
def _fetch_pdf(identifier: str, doc_name: str) -> bytes:
"""
Fetch JSON file with page number metadata for an `ArchiveDoc`.
Fetch PDF file for an `ArchiveDoc`.
"""
# `self.name` does not get percent-encoded, because it is derived from the
# `doc_name` does not get percent-encoded, because it is derived from the
# file path itself as defined by archive.org. Percent-encoding it further
# may result in a 404 error.
page_nums_resp = requests.get(
f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}_page_numbers.json"
resp = requests.get(
f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}.pdf"
)
page_nums_resp.raise_for_status()
return page_nums_resp.json()
def _fetch_jp2_zip(identifier: str, doc_name: str) -> bytes:
"""
Fetch zip file with processed page scans for an `ArchiveDoc`.
"""
# `self.name` does not get percent-encoded, because it is derived
# from the file path itself as defined by archive.org. Percent-
# encoding it further may result in a 404 error.
zip_resp = requests.get(
f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}_jp2.zip"
)
zip_resp.raise_for_status()
return zip_resp.content
resp.raise_for_status()
return resp.content

View file

@ -2,13 +2,74 @@
This module contains interchangeable engines for optical character recognition,
making it easy to swap implementations in and out based on speed and accuracy
advantages without rewriting business logic.
Each nested module exports a class named `OcrEngine` with a method named
`process()`, which accepts a PIL `Image` and list of languages, and which
returns a tuple containing a standardized `DataFrame` as well as a dictionary
containing any additional specialized metadata made available from the
underlying OCR engine. The `DataFrame` has columns
`["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in pixels
measured from the top left corner of the image. `x1` and `y1` values will be
greater than or equal to the corresponding `x0` and `y0` values.
"""
from dataclasses import dataclass
from typing import Optional
from PIL import Image
@dataclass
class TextBlock:
    """
    A single block of recognized text and its bounding box on a page image.

    All coordinates are in pixels, measured from the top left corner of the
    image, so `x1 >= x0` and `y1 >= y0`.

    Attributes:
        x0      Left coordinate of the bounding box, in pixels from the left
                edge of the image.
        y0      Top coordinate of the bounding box, in pixels from the top
                edge of the image.
        x1      Right coordinate of the bounding box, in pixels from the left
                edge of the image.
        y1      Bottom coordinate of the bounding box, in pixels from the top
                edge of the image.
        text    Text content of the block.
    """

    x0: int
    y0: int
    x1: int
    y1: int
    text: str
@dataclass
class OcrResult:
    """
    OCR data parsed from a single page.

    Attributes:
        blocks      Blocks of text detected on a page.
        page_angle  Optional detected rotation of the page, in degrees
                    clockwise relative to upright. `None` when the engine did
                    not perform angle detection (e.g. when it was constructed
                    with angle detection disabled).
    """

    blocks: list[TextBlock]
    page_angle: Optional[float]
class OcrEngine:
    """
    Abstract base class for interchangeable OCR processing backends.

    Subclasses implement `process()`; the constructor arguments are shared by
    every backend.

    Params:
        languages       List of ISO-639-3 language codes fed to the OCR
                        backend.
        detect_angle    Allows page angle detection to be enabled or disabled
                        for certain implementations. Defaults to True.
    """

    _detect_angle: bool
    _languages: list[str]

    def __init__(self, languages: list[str], detect_angle: bool = True):
        # Copy the list so later mutation by the caller cannot affect the
        # engine's configuration.
        self._languages = list(languages)
        self._detect_angle = detect_angle

    def process(self, image: Image.Image) -> OcrResult:
        """Parse a single page image. Must be overridden by each backend."""
        raise NotImplementedError()

View file

@ -1,11 +1,8 @@
import numpy as np
import pandas as pd
from paddleocr import PaddleOCR
from PIL import Image
# Reuse OCR instances per language.
instances: dict[str, PaddleOCR] = {}
from . import OcrEngine, OcrResult, TextBlock
def convert_language(iso639_3_code: str) -> str:
@ -21,16 +18,23 @@ def convert_language(iso639_3_code: str) -> str:
return iso639_3_code
class OcrEngine:
def process(
image: Image, languages: list[str] = ["eng"]
) -> tuple[pd.DataFrame, dict]:
class PaddleOcrEngine(OcrEngine):
# Dict of ISO 639-3 language code to PaddleOCR instance.
_ocr_instances: dict[str, PaddleOCR] = {}
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
for language in self._languages:
self._ocr_instances[language] = PaddleOCR(
use_doc_orientation_classify=True,
use_doc_unwarping=False,
use_textline_orientation=False,
lang=convert_language(language),
)
def process(self, image: Image.Image) -> OcrResult:
"""
Use `paddleocr` to parse an image to a `DataFrame` with columns
`["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
pixels measured from the top left corner of the image. `x1` and `y1`
values will be greater than or equal to the corresponding `x0` and `y0`
values.
Use `paddleocr` to parse an image.
Compared to Tesseract, PaddleOCR is more accurate at low image
resolutions and able to perform one-shot page angle detection. However,
@ -46,33 +50,23 @@ class OcrEngine:
language and choose the result it thinks is best. Thus, it's
recommended to stick to one language if at all possible.
Troubleshooting: The PaddlePaddle core package has/had a bug triggering
segfaults on ARM systems. Installing the nightly development build of
`paddlepaddle` may be necessary to avoid it. Refer to:
Note: Though it works well when it works, PaddlePaddle has a tendency to
segfault and generally has been found to be buggy and unreliable.
Installing the nightly development build of `paddlepaddle` may help.
Refer to:
- [PaddleOCR issue 16609](https://github.com/PaddlePaddle/PaddleOCR/issues/16609)
- [PaddlePaddle PR 75731](https://github.com/PaddlePaddle/Paddle/pull/75731)
Params:
image PIL image data.
languages List of ISO-639-3 language codes fed to the OCR backend.
"""
best_result = None
for language in languages:
if language in instances:
ocr_instance = instances[language]
else:
ocr_instance = PaddleOCR(
use_doc_orientation_classify=True,
use_doc_unwarping=False,
use_textline_orientation=False,
lang=convert_language(language),
for language in self._languages:
[res] = self._ocr_instances[language].predict(
np.array(image.convert("RGB"))
)
instances[language] = ocr_instance
[res] = ocr_instance.predict(np.array(image.convert("RGB")))
if best_result is None or len(res["rec_texts"]) > len(
best_result["rec_texts"]
):
@ -83,17 +77,16 @@ class OcrEngine:
res = best_result
return (
pd.DataFrame(
{
"text": res["rec_texts"],
"x0": [x0 for [x0, _, _, _] in res["rec_boxes"]],
"y0": [y0 for [_, y0, _, _] in res["rec_boxes"]],
"x1": [x1 for [_, _, x1, _] in res["rec_boxes"]],
"y1": [y1 for [_, _, _, y1] in res["rec_boxes"]],
}
),
{
"page_angle": res["doc_preprocessor_res"]["angle"],
},
return OcrResult(
blocks=[
TextBlock(
text=res["rec_texts"][i],
x0=res["rec_boxes"][i][0],
y0=res["rec_boxes"][i][1],
x1=res["rec_boxes"][i][2],
y1=res["rec_boxes"][i][3],
)
for i, _ in enumerate(res["rec_texts"])
],
page_angle=(360 - res["doc_preprocessor_res"]["angle"]) % 360,
)

View file

@ -1,16 +1,13 @@
import pandas as pd
import pytesseract
from PIL import Image
from . import OcrEngine, OcrResult, TextBlock
class OcrEngine:
def process(image: Image, languages: list[str]) -> tuple[pd.DataFrame, dict]:
class TesseractOcrEngine(OcrEngine):
def process(self, image: Image.Image) -> OcrResult:
"""
Use `pytesseract` to parse an image to a `DataFrame` with columns
`["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
pixels measured from the top left corner of the image. `x1` and `y1`
values will be greater than or equal to the corresponding `x0` and `y0`
values.
Use `pytesseract` to parse an image.
Note: Each Tesseract command runs single-threaded, so speed can be
improved up to ~4x by distributing pages across processes running in
@ -22,34 +19,97 @@ class OcrEngine:
Params:
image PIL image data.
languages List of ISO-639-3 language codes fed to the OCR backend.
"""
blocks_best = []
angle_best = None
angles = [0, 90, 180, 270] if self._detect_angle else [0]
for angle in angles:
# Rotate the image counter-clockwise, since we care about
# keeping track of the angle from the upright position *to*
# the original position, not *from*.
rotated_image = image.rotate(360 - angle, expand=True)
df = pytesseract.image_to_data(
image,
lang="+".join(languages),
rotated_image,
lang="+".join(self._languages),
config="--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0",
output_type=pytesseract.Output.DATAFRAME,
)
).fillna({"text": ""})
# Exclude words with relatively low confidence ratings.
# Exclude blocks with relatively low confidence ratings.
df = df[df["conf"] > 80]
# Attempt to exclude words that seem vertically oriented.
# Exclude empty words
df = df[df["text"] != ""]
# Attempt to exclude blocks that seem vertically oriented.
# TODO: Will this work for non-Latin scripts? Probably not all.
df = df[(df["width"] / df["height"]) > 0.8]
return (
pd.DataFrame(
{
"text": df["text"],
"x0": df["left"],
"y0": df["top"],
"x1": df["left"] + df["width"],
"y1": df["top"] + df["height"],
}
print(
[
TextBlock(
# Rotate X and Y coordinates back to match the original image.
*_box_after_rotation(
int(row["left"]),
int(row["top"]),
int(row["left"] + row["width"]),
int(row["top"] + row["height"]),
*rotated_image.size,
angle,
),
# We don't use any page-level metadata from the Tesseract output.
{},
text=row["text"],
)
for _, row in df.iterrows()
]
)
if angle_best is None or df.shape[0] > len(blocks_best):
angle_best = angle
blocks_best = [
TextBlock(
# Rotate X and Y coordinates back to match the original image.
*_box_after_rotation(
int(row["left"]),
int(row["top"]),
int(row["left"] + row["width"]),
int(row["top"] + row["height"]),
*rotated_image.size,
angle,
),
text=row["text"],
)
for _, row in df.iterrows()
]
return OcrResult(
blocks=blocks_best, page_angle=angle_best if self._detect_angle else None
)
def _box_after_rotation(
x0: int,
y0: int,
x1: int,
y1: int,
image_width: int,
image_height: int,
degrees_clockwise: int,
) -> tuple[int, int, int, int]:
"""
Given the corners of a box in an image, returns the corners of an equivalent
box if the image is rotated by some multiple of 90 degrees. Both input and
output coordinates are expected to be top left followed by bottom right,
where the origin is at the top left.
"""
angle = ((degrees_clockwise % 360) + 360) % 360
if angle == 0:
return x0, y0, x1, y1
if angle == 90:
return image_height - y1, x0, image_height - y0, x1
if angle == 180:
return image_width - x1, image_height - y1, image_width - x0, image_height - y0
if angle == 270:
return y0, image_width - x1, y1, image_width - x0
else:
raise Exception("_box_after_rotation() only accepts multiples of 90 degrees")

View file

@ -6,10 +6,12 @@ readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"numpy>=2.3.2",
"paddleocr>=3.2.0",
"paddlepaddle>=3.2.0",
"paddleocr>=3.3.0",
"paddlepaddle>=3.2.2",
"pandas>=2.3.1",
"pillow>=11.3.0",
"psycopg[binary]>=3.2.12",
"pymupdf>=1.26.6",
"pytesseract>=0.3.13",
"requests>=2.32.4",
]

1491
uv.lock generated

File diff suppressed because it is too large Load diff