reuse pdf ocr when available
This commit is contained in:
parent
ac7e93a75b
commit
3da76d4537
9 changed files with 1373 additions and 901 deletions
|
|
@ -11,9 +11,25 @@ from microqa.engine import analyze_doc
|
||||||
def main():
|
def main():
|
||||||
parser = ArgumentParser()
|
parser = ArgumentParser()
|
||||||
parser.add_argument("--item-id")
|
parser.add_argument("--item-id")
|
||||||
parser.add_argument("--cpus", type=int, default=4)
|
parser.add_argument(
|
||||||
|
"--ocr-backend",
|
||||||
|
help="which local OCR backend to use when available text in archived PDF files is insufficient; one of 'tesseract' or 'paddleocr'",
|
||||||
|
default="tesseract",
|
||||||
|
)
|
||||||
|
parser.add_argument("--verbose", action="store_true")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Import OCR engine modules only as needed, to avoid unnecessary slow
|
||||||
|
# startups and/or missing dependency errors.
|
||||||
|
if args.ocr_backend == "tesseract":
|
||||||
|
from microqa.ocr.tesseract import TesseractOcrEngine
|
||||||
|
|
||||||
|
ocr_engine = TesseractOcrEngine(languages=["eng", "fra"])
|
||||||
|
elif args.ocr_backend == "paddleocr":
|
||||||
|
from microqa.ocr.paddleocr import PaddleOcrEngine
|
||||||
|
|
||||||
|
ocr_engine = PaddleOcrEngine(languages=["eng", "fra"])
|
||||||
|
|
||||||
cache_item(
|
cache_item(
|
||||||
args.item_id,
|
args.item_id,
|
||||||
# Will not refetch if value is already cached.
|
# Will not refetch if value is already cached.
|
||||||
|
|
@ -29,7 +45,10 @@ def main():
|
||||||
else item.docs
|
else item.docs
|
||||||
)
|
)
|
||||||
analyses = [
|
analyses = [
|
||||||
analyze_doc(doc, parallel=args.cpus, use_cache=True) for doc in minimal_docs
|
analyze_doc(
|
||||||
|
doc=doc, ocr_engine=ocr_engine, use_cache=True, verbose=args.verbose
|
||||||
|
)
|
||||||
|
for doc in minimal_docs
|
||||||
]
|
]
|
||||||
|
|
||||||
t_end = time()
|
t_end = time()
|
||||||
|
|
@ -43,7 +62,7 @@ def main():
|
||||||
[
|
[
|
||||||
i
|
i
|
||||||
for i, page in enumerate(doc["pages"])
|
for i, page in enumerate(doc["pages"])
|
||||||
if not page["ocr_orientation_match"]
|
if 45 < page["page_angle"] < 315
|
||||||
]
|
]
|
||||||
for doc in analyses
|
for doc in analyses
|
||||||
],
|
],
|
||||||
|
|
|
||||||
24
main.py
24
main.py
|
|
@ -30,8 +30,24 @@ def main():
|
||||||
help="script will attempt to analyze all items with a review date greater than or equal to this value (YYYYMMDD)",
|
help="script will attempt to analyze all items with a review date greater than or equal to this value (YYYYMMDD)",
|
||||||
default="20250701",
|
default="20250701",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--ocr-backend",
|
||||||
|
help="which local OCR backend to use when available text in archived PDF files is insufficient; one of 'tesseract' or 'paddleocr'",
|
||||||
|
default="tesseract",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Import OCR engine modules only as needed, to avoid unnecessary slow
|
||||||
|
# startups and/or missing dependency errors.
|
||||||
|
if args.ocr_backend == "tesseract":
|
||||||
|
from microqa.ocr.tesseract import TesseractOcrEngine
|
||||||
|
|
||||||
|
ocr_engine = TesseractOcrEngine(languages=["eng", "fra"])
|
||||||
|
elif args.ocr_backend == "paddleocr":
|
||||||
|
from microqa.ocr.paddleocr import PaddleOcrEngine
|
||||||
|
|
||||||
|
ocr_engine = PaddleOcrEngine(languages=["eng", "fra"])
|
||||||
|
|
||||||
with sqlite3.connect(args.database) as conn:
|
with sqlite3.connect(args.database) as conn:
|
||||||
cur = conn.cursor()
|
cur = conn.cursor()
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
|
|
@ -51,7 +67,7 @@ create table if not exists pages (
|
||||||
id int primary key,
|
id int primary key,
|
||||||
doc text not null,
|
doc text not null,
|
||||||
page int not null,
|
page int not null,
|
||||||
orientation_match boolean not null,
|
page_angle float not null,
|
||||||
sharpness real not null,
|
sharpness real not null,
|
||||||
is_blank boolean not null,
|
is_blank boolean not null,
|
||||||
text_margin_px int not null
|
text_margin_px int not null
|
||||||
|
|
@ -95,7 +111,7 @@ order by review_date
|
||||||
[doc.name, item_id],
|
[doc.name, item_id],
|
||||||
)
|
)
|
||||||
analysis = analyze_doc(
|
analysis = analyze_doc(
|
||||||
doc, parallel=args.cpus, verbose=True
|
doc=doc, ocr_engine=ocr_engine, verbose=True
|
||||||
)
|
)
|
||||||
for i, page in enumerate(analysis["pages"]):
|
for i, page in enumerate(analysis["pages"]):
|
||||||
cur.execute(
|
cur.execute(
|
||||||
|
|
@ -103,7 +119,7 @@ order by review_date
|
||||||
insert into pages (
|
insert into pages (
|
||||||
doc,
|
doc,
|
||||||
page,
|
page,
|
||||||
orientation_match,
|
page_angle,
|
||||||
sharpness,
|
sharpness,
|
||||||
is_blank,
|
is_blank,
|
||||||
text_margin_px
|
text_margin_px
|
||||||
|
|
@ -111,7 +127,7 @@ insert into pages (
|
||||||
[
|
[
|
||||||
doc.name,
|
doc.name,
|
||||||
i + 1,
|
i + 1,
|
||||||
page["ocr_orientation_match"],
|
page["page_angle"],
|
||||||
page["sharpness"],
|
page["sharpness"],
|
||||||
page["blank"],
|
page["blank"],
|
||||||
page["text_margin_px"],
|
page["text_margin_px"],
|
||||||
|
|
|
||||||
|
|
@ -1,57 +1,41 @@
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from multiprocessing import Pool
|
|
||||||
from sys import stdout
|
from sys import stdout
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from PIL import Image, ImageFilter
|
from PIL import Image, ImageFilter
|
||||||
|
|
||||||
from .items import ArchiveDoc
|
from .items import ArchiveDoc, ArchiveLeaf
|
||||||
from .ocr.tesseract import OcrEngine
|
from .ocr import OcrEngine, TextBlock
|
||||||
|
|
||||||
|
|
||||||
def analyze_doc(doc: ArchiveDoc, parallel=1, use_cache=False, verbose=False):
|
def analyze_doc(
|
||||||
|
doc: ArchiveDoc,
|
||||||
|
ocr_engine: OcrEngine,
|
||||||
|
use_cache: bool = False,
|
||||||
|
verbose: bool = False,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Analyzes all pages in an ArchiveDoc for useful metrics such as sharpness,
|
||||||
|
orientation, presence of text overflows, and so on.
|
||||||
|
"""
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"Loading {doc.name}...")
|
print(f"Loading {doc.name}...")
|
||||||
stdout.flush()
|
stdout.flush()
|
||||||
|
|
||||||
tasks: PageAnalysisTask = [
|
all_leaves = doc.fetch_leaves(use_cache=use_cache)
|
||||||
PageAnalysisTask(im=leaf.image)
|
|
||||||
for leaf in doc.fetch_leaves(use_cache=use_cache)
|
|
||||||
]
|
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"Processing {len(tasks)} pages...", file=stdout)
|
print(f"Processing {len(all_leaves)} pages...", file=stdout)
|
||||||
stdout.flush()
|
stdout.flush()
|
||||||
|
|
||||||
if parallel > 1:
|
analyzed_pages = []
|
||||||
# Parallelize image processing and OCR of pages across up to n cores.
|
for leaf in all_leaves:
|
||||||
with Pool(parallel) as pool:
|
im_cropped = leaf.image.crop(
|
||||||
return {"pages": pool.map(analyze_page, tasks)}
|
|
||||||
|
|
||||||
return {"pages": [analyze_page(task) for task in tasks]}
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class PageAnalysisTask:
|
|
||||||
"""
|
|
||||||
Attributes:
|
|
||||||
im PIL Image, pre-scaled using .thumbnail() to fit the long
|
|
||||||
edge to 3200 px.
|
|
||||||
ocr_langs Tesseract language codes (3 letters each, in a "+"-separated
|
|
||||||
list).
|
|
||||||
"""
|
|
||||||
|
|
||||||
im: Image.Image
|
|
||||||
ocr_langs: list[str] = field(default_factory=lambda: ["eng"])
|
|
||||||
|
|
||||||
|
|
||||||
def analyze_page(task):
|
|
||||||
im_cropped = task.im.crop(
|
|
||||||
(
|
(
|
||||||
task.im.size[0] * 0.1,
|
leaf.image.size[0] * 0.1,
|
||||||
task.im.size[1] * 0.1,
|
leaf.image.size[1] * 0.1,
|
||||||
task.im.size[0] * 0.9,
|
leaf.image.size[0] * 0.9,
|
||||||
task.im.size[1] * 0.9,
|
leaf.image.size[1] * 0.9,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -59,9 +43,15 @@ def analyze_page(task):
|
||||||
|
|
||||||
if is_blank:
|
if is_blank:
|
||||||
max_sharpness = 1
|
max_sharpness = 1
|
||||||
ocr_orientation_match = True
|
|
||||||
text_margin_px = -1
|
text_margin_px = -1
|
||||||
|
page_angle = 0
|
||||||
else:
|
else:
|
||||||
|
# Sharpness is determined by percentile of pixels that match some
|
||||||
|
# criteria, so it may vary significantly depending on which portion
|
||||||
|
# of the image is analyzed. In an effort to identify the sharpest
|
||||||
|
# edges, we split up the image into chunks and assume that the
|
||||||
|
# highest sharpness value obtained across all chunks is
|
||||||
|
# representative of the image as a whole.
|
||||||
max_sharpness = 0.0
|
max_sharpness = 0.0
|
||||||
if im_cropped.size[0] < im_cropped.size[1]:
|
if im_cropped.size[0] < im_cropped.size[1]:
|
||||||
# Page is in portrait orientation.
|
# Page is in portrait orientation.
|
||||||
|
|
@ -87,62 +77,73 @@ def analyze_page(task):
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
OCR_SCALE = 1
|
# OCR is computationally expensive, so we try to take advantage of
|
||||||
# TODO: Refactor orientation detection logic into the OCR engine
|
# the Tesseract data already parsed by the Internet Archive and
|
||||||
# modules.
|
# embedded in the PDF, when possible. If there is not sufficient
|
||||||
best_ocr_score = -1
|
# text in the PDF to be confident that the Archive's OCR
|
||||||
best_ocr_words = None
|
# postprocessing captured it all, then OCR is recomputed locally.
|
||||||
best_ocr_orientation = -1
|
#
|
||||||
for orientation in range(4):
|
# In some instances, the Archive's OCR detects rotated text but
|
||||||
im_rotated = task.im.resize(
|
# parses it as gibberish. To partially mitigate this, we ignore all
|
||||||
np.int_(np.array(task.im.size) * OCR_SCALE)
|
# precomputed text blocks with a "portrait" aspect ratio. This will
|
||||||
).rotate(90 * orientation, expand=True)
|
# not necessarily help with text that is rotated 180 degrees, but in
|
||||||
ocr, ocr_meta = OcrEngine.process(im_rotated, languages=task.ocr_langs)
|
# practice that case is rarely encountered. This will also not work
|
||||||
|
# well with non-latin scripts that are intended to be oriented
|
||||||
if "page_angle" in ocr_meta:
|
# vertically.
|
||||||
# OCR engine automatically accounts for page rotation.
|
OCR_RECOMPUTE_THRESHOLD_WORDS = 30
|
||||||
best_ocr_score = ocr.shape[0]
|
if (
|
||||||
# PaddleOCR counts rotation as degrees, in the opposite
|
sum(
|
||||||
# direction as PIL's `Image.rotate()`
|
(
|
||||||
best_ocr_orientation = (
|
len(block.text.split())
|
||||||
4 - round(((ocr_meta["page_angle"] + 360) % 360) / 90)
|
for block in leaf.text_blocks
|
||||||
) % 4
|
if block.x1 - block.x0 > block.y1 - block.y0
|
||||||
best_ocr_words = ocr
|
|
||||||
break
|
|
||||||
|
|
||||||
if ocr.shape[0] > best_ocr_score:
|
|
||||||
best_ocr_score = ocr.shape[0]
|
|
||||||
best_ocr_orientation = orientation
|
|
||||||
best_ocr_words = ocr
|
|
||||||
if best_ocr_score > 50:
|
|
||||||
# Unlikely that another orientation will have more words, so
|
|
||||||
# stop eating up CPU.
|
|
||||||
break
|
|
||||||
|
|
||||||
if best_ocr_words.empty:
|
|
||||||
ocr_orientation_match = True
|
|
||||||
text_margin_px = -1
|
|
||||||
else:
|
|
||||||
ocr_orientation_match = best_ocr_orientation == 0
|
|
||||||
|
|
||||||
best_ocr_dims = OCR_SCALE * np.array(
|
|
||||||
task.im.size
|
|
||||||
if best_ocr_orientation % 2 == 0
|
|
||||||
else (task.im.size[1], task.im.size[0])
|
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
>= OCR_RECOMPUTE_THRESHOLD_WORDS
|
||||||
|
):
|
||||||
|
if verbose:
|
||||||
|
print("Using PDF text.")
|
||||||
|
ocred_leaf = leaf
|
||||||
|
page_angle = 0
|
||||||
|
else:
|
||||||
|
if verbose:
|
||||||
|
print("Using OCR.")
|
||||||
|
OCR_SCALE = 1
|
||||||
|
im_scaled = leaf.image.resize(
|
||||||
|
np.int_(np.array(leaf.image.size) * OCR_SCALE)
|
||||||
|
)
|
||||||
|
ocr_result = ocr_engine.process(im_scaled)
|
||||||
|
ocred_leaf = ArchiveLeaf(
|
||||||
|
image=leaf.image,
|
||||||
|
page_number=leaf.page_number,
|
||||||
|
text_blocks=[
|
||||||
|
TextBlock(
|
||||||
|
x0=int(block.x0 / OCR_SCALE),
|
||||||
|
y0=int(block.y0 / OCR_SCALE),
|
||||||
|
x1=int(block.x1 / OCR_SCALE),
|
||||||
|
y1=int(block.y1 / OCR_SCALE),
|
||||||
|
text=block.text,
|
||||||
|
)
|
||||||
|
for block in ocr_result.blocks
|
||||||
|
],
|
||||||
|
)
|
||||||
|
page_angle = ocr_result.page_angle
|
||||||
|
|
||||||
word_margins_all_directions = np.sort(
|
word_margins_all_directions = np.sort(
|
||||||
np.int_(
|
np.int_(
|
||||||
np.concat(
|
np.concat(
|
||||||
(
|
[
|
||||||
best_ocr_words["x0"].to_numpy(),
|
np.array(
|
||||||
best_ocr_words["y0"].to_numpy(),
|
[
|
||||||
best_ocr_dims[0] - best_ocr_words["x1"].to_numpy(),
|
block.x0,
|
||||||
best_ocr_dims[1] - best_ocr_words["y1"].to_numpy(),
|
block.y0,
|
||||||
|
leaf.image.size[0] - block.x1,
|
||||||
|
leaf.image.size[1] - block.y1,
|
||||||
|
]
|
||||||
)
|
)
|
||||||
|
for block in ocred_leaf.text_blocks
|
||||||
|
]
|
||||||
)
|
)
|
||||||
# Transform back into original image pixel density
|
|
||||||
/ OCR_SCALE
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
||||||
|
|
@ -153,16 +154,23 @@ def analyze_page(task):
|
||||||
else -1
|
else -1
|
||||||
)
|
)
|
||||||
|
|
||||||
return {
|
# Make sure the OCR engine is running with orientation detection.
|
||||||
|
assert page_angle is not None
|
||||||
|
|
||||||
|
analyzed_pages.append(
|
||||||
|
{
|
||||||
"blank": is_blank,
|
"blank": is_blank,
|
||||||
"ocr_orientation_match": ocr_orientation_match,
|
"page_angle": page_angle,
|
||||||
"size_analyzed": task.im.size,
|
"size_analyzed": leaf.image.size,
|
||||||
"sharpness": max_sharpness,
|
"sharpness": max_sharpness,
|
||||||
"text_margin_px": text_margin_px,
|
"text_margin_px": text_margin_px,
|
||||||
}
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return {"pages": analyzed_pages}
|
||||||
|
|
||||||
|
|
||||||
def analyze_sharpness(im):
|
def analyze_sharpness(im: Image.Image):
|
||||||
"""
|
"""
|
||||||
Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
|
Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
|
||||||
1. The scale is not linear with respect to scan quality: anything above 0.1
|
1. The scale is not linear with respect to scan quality: anything above 0.1
|
||||||
|
|
|
||||||
156
microqa/items.py
156
microqa/items.py
|
|
@ -3,18 +3,18 @@ Python utilities for structuring data and metadata pulled from archive.org
|
||||||
microfiche scans.
|
microfiche scans.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
import os
|
||||||
import urllib
|
import urllib
|
||||||
from contextlib import nullcontext
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from zipfile import ZipFile
|
|
||||||
|
|
||||||
|
import pymupdf
|
||||||
import requests
|
import requests
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
from .ocr import TextBlock
|
||||||
|
|
||||||
|
|
||||||
CACHE_DIR = "./archive_cache"
|
CACHE_DIR = "./archive_cache"
|
||||||
|
|
||||||
|
|
@ -38,16 +38,20 @@ class ArchiveLeaf:
|
||||||
presented to users, otherwise a (potentially empty)
|
presented to users, otherwise a (potentially empty)
|
||||||
string with the inferred page number as defined by the
|
string with the inferred page number as defined by the
|
||||||
document being scanned.
|
document being scanned.
|
||||||
|
|
||||||
|
text_blocks List of text blocks extracted from PyMuPDF's
|
||||||
|
TextPage.extractBlocks() method.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
image: Image
|
image: Image.Image
|
||||||
page_number: Optional[str]
|
page_number: Optional[str]
|
||||||
|
text_blocks: list[TextBlock]
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ArchiveDoc:
|
class ArchiveDoc:
|
||||||
"""
|
"""
|
||||||
Information pertaining to a single set of processed pages, of which there
|
Information pertaining to a single set of processed pages, of which there
|
||||||
may be multiple for any given ArchiveItem. For example, one SCOTUS case may
|
may be multiple for any given ArchiveItem. For example, one SCOTUS case may
|
||||||
contain several briefs/petitions/etc., each presented as a distinct PDF but
|
contain several briefs/petitions/etc., each presented as a distinct PDF but
|
||||||
all held within the parent `ArchiveItem`.
|
all held within the parent `ArchiveItem`.
|
||||||
|
|
@ -80,62 +84,71 @@ class ArchiveDoc:
|
||||||
name: str
|
name: str
|
||||||
title: Optional[str]
|
title: Optional[str]
|
||||||
|
|
||||||
def fetch_leaves(self, numbered_only=True, use_cache=False) -> list[ArchiveLeaf]:
|
def fetch_leaves(self, use_cache=False) -> list[ArchiveLeaf]:
|
||||||
"""
|
"""
|
||||||
Fetch images and page number data for this document from archive.org,
|
Fetch images and OCR text data for this document from archive.org PDF files.
|
||||||
over the Internet.
|
|
||||||
|
|
||||||
Params:
|
Params:
|
||||||
|
|
||||||
numbered_only If `True`, discards any leaves with no corresponding
|
use_cache If `True`, locally cached PDF files under the
|
||||||
page number entries. Leaves for which the page
|
|
||||||
number is an empty string are retained.
|
|
||||||
use_cache If `True`, locally cached zip files under the
|
|
||||||
`./archive_cache` directory (relative to the working
|
`./archive_cache` directory (relative to the working
|
||||||
directory) will be used instead of fetching over
|
directory) will be used instead of fetching over
|
||||||
HTTPS.
|
HTTPS.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if use_cache:
|
if use_cache:
|
||||||
# Cached file names are derived from the percent-encoded version of
|
with open(f"{CACHE_DIR}/{_url_encode(self.name)}.pdf", "rb") as f:
|
||||||
# `self.name`, so that there's no need to worry about directory
|
pdf_data = f.read()
|
||||||
# separators or other disallowed characters in the file names
|
|
||||||
# defined by archive.org.
|
|
||||||
with open(
|
|
||||||
f"{CACHE_DIR}/{_url_encode(self.name)}_page_numbers.json", "r"
|
|
||||||
) as f:
|
|
||||||
page_nums = json.load(f)["pages"]
|
|
||||||
zip_reader_ctx = open(f"{CACHE_DIR}/{_url_encode(self.name)}_jp2.zip", "rb")
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
page_nums = _fetch_page_nums(self.identifier, self.name)["pages"]
|
pdf_data = _fetch_pdf(self.identifier, self.name)
|
||||||
|
|
||||||
# Wrap in a context manager so that the reader can be used in a `with`
|
|
||||||
# block in the same way as a file accessed with `open()`.
|
|
||||||
zip_reader_ctx = nullcontext(
|
|
||||||
BytesIO(_fetch_jp2_zip(self.identifier, self.name))
|
|
||||||
)
|
|
||||||
|
|
||||||
leaves = []
|
leaves = []
|
||||||
|
|
||||||
with zip_reader_ctx as zip_reader, ZipFile(zip_reader) as jp_zip:
|
# Open PDF from bytes
|
||||||
for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
|
pdf_doc = pymupdf.open(stream=pdf_data, filetype="pdf")
|
||||||
for page_index, page_num_info in enumerate(page_nums):
|
|
||||||
if page_num_info["leafNum"] == leaf_num:
|
|
||||||
# Stop iterating and keep page_index set to the current
|
|
||||||
# value.
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
# Indicate that leaf was not found in page_num list.
|
|
||||||
page_index = None
|
|
||||||
|
|
||||||
if not numbered_only or page_index is not None:
|
try:
|
||||||
with jp_zip.open(file_name) as jp_file:
|
for page_num in range(len(pdf_doc)):
|
||||||
# Convert to single-channel greyscale ("L").
|
page = pdf_doc[page_num]
|
||||||
image = Image.open(jp_file).convert("L")
|
|
||||||
# Rescale long edge to no more than 3200 px.
|
# Extract text blocks with coordinates
|
||||||
|
# Convert to TextBlock objects, discarding block_no and block_type
|
||||||
|
text_blocks = [
|
||||||
|
TextBlock(
|
||||||
|
x0=int(x0),
|
||||||
|
y0=int(y0),
|
||||||
|
x1=int(x1),
|
||||||
|
y1=int(y1),
|
||||||
|
text=text,
|
||||||
|
)
|
||||||
|
for x0, y0, x1, y1, text, *_ in page.get_text("blocks")
|
||||||
|
]
|
||||||
|
|
||||||
|
# Render page to image
|
||||||
|
# Use a matrix to scale appropriately (default is 72 DPI)
|
||||||
|
# Scale factor 4.44 gives approximately 320 DPI, which should produce
|
||||||
|
# images with long edge around 3200px for typical page sizes
|
||||||
|
mat = pymupdf.Matrix(4.44, 4.44)
|
||||||
|
pix = page.get_pixmap(matrix=mat, alpha=False)
|
||||||
|
|
||||||
|
# Convert PyMuPDF pixmap to PIL Image
|
||||||
|
img_data = pix.tobytes("ppm")
|
||||||
|
image = Image.open(BytesIO(img_data)).convert("L")
|
||||||
|
|
||||||
|
# Ensure long edge is no more than 3200 px
|
||||||
image.thumbnail((3200, 3200))
|
image.thumbnail((3200, 3200))
|
||||||
leaves.append(ArchiveLeaf(image=image, page_number=page_index))
|
|
||||||
|
# Page numbers are 1-indexed for human readability
|
||||||
|
leaves.append(
|
||||||
|
ArchiveLeaf(
|
||||||
|
image=image,
|
||||||
|
page_number=str(page_num + 1),
|
||||||
|
text_blocks=text_blocks,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
finally:
|
||||||
|
pdf_doc.close()
|
||||||
|
|
||||||
return leaves
|
return leaves
|
||||||
|
|
||||||
|
|
@ -201,20 +214,18 @@ def fetch_item(identifier: str, use_cache=False) -> ArchiveItem:
|
||||||
|
|
||||||
doc_names = [
|
doc_names = [
|
||||||
# Strip suffix, to just leave the identifier, and title if present.
|
# Strip suffix, to just leave the identifier, and title if present.
|
||||||
name[: -len("_jp2.zip")]
|
name[: -len(".pdf")]
|
||||||
for name in file_names
|
for name in file_names
|
||||||
if name.lower().endswith("_jp2.zip")
|
if name.lower().endswith(".pdf")
|
||||||
# Exclude unprocessed scans, which are also named `..._jp2.zip`.
|
|
||||||
and name.lower() != f"{identifier.lower()}_micro_jp2.zip"
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# Assert that all files we expect to find are actually present.
|
# Assert that all files we expect to find are actually present.
|
||||||
for doc_name in doc_names:
|
for doc_name in doc_names:
|
||||||
if f"{_url_encode(doc_name.lower())}_page_numbers.json" not in [
|
if f"{_url_encode(doc_name.lower())}.pdf" not in [
|
||||||
name.lower() for name in file_names
|
name.lower() for name in file_names
|
||||||
]:
|
]:
|
||||||
raise Exception(
|
raise Exception(
|
||||||
f"expected file not found: {_url_encode(doc_name.lower())}_page_numbers.zip"
|
f"expected file not found: {_url_encode(doc_name.lower())}.pdf"
|
||||||
)
|
)
|
||||||
|
|
||||||
return ArchiveItem(
|
return ArchiveItem(
|
||||||
|
|
@ -232,7 +243,7 @@ def fetch_item(identifier: str, use_cache=False) -> ArchiveItem:
|
||||||
|
|
||||||
def cache_item(identifier: str, overwrite=True):
|
def cache_item(identifier: str, overwrite=True):
|
||||||
"""
|
"""
|
||||||
Load the relevant files for an `ArchiveItem` and its component `ArchiveDoc`s
|
Load the PDF files for an `ArchiveItem` and its component `ArchiveDoc`s
|
||||||
and store them within the `archive_cache` directory (relative to the working
|
and store them within the `archive_cache` directory (relative to the working
|
||||||
directory). The `archive_cache` directory will be created if it does not
|
directory). The `archive_cache` directory will be created if it does not
|
||||||
exist.
|
exist.
|
||||||
|
|
@ -249,16 +260,14 @@ def cache_item(identifier: str, overwrite=True):
|
||||||
|
|
||||||
for name in os.listdir(CACHE_DIR):
|
for name in os.listdir(CACHE_DIR):
|
||||||
if _url_decode(name.lower()).startswith(identifier.lower()):
|
if _url_decode(name.lower()).startswith(identifier.lower()):
|
||||||
|
if not overwrite:
|
||||||
return
|
return
|
||||||
|
|
||||||
item = fetch_item(identifier)
|
item = fetch_item(identifier)
|
||||||
for doc in item.docs:
|
for doc in item.docs:
|
||||||
page_nums = _fetch_page_nums(identifier, doc.name)
|
pdf_data = _fetch_pdf(identifier, doc.name)
|
||||||
zip_file = _fetch_jp2_zip(identifier, doc.name)
|
with open(f"{CACHE_DIR}/{_url_encode(doc.name)}.pdf", "wb") as f:
|
||||||
with open(f"{CACHE_DIR}/{_url_encode(doc.name)}_page_numbers.json", "w") as f:
|
f.write(pdf_data)
|
||||||
json.dump(page_nums, f)
|
|
||||||
with open(f"{CACHE_DIR}/{_url_encode(doc.name)}_jp2.zip", "wb") as f:
|
|
||||||
f.write(zip_file)
|
|
||||||
|
|
||||||
|
|
||||||
def _url_encode(string: str) -> str:
|
def _url_encode(string: str) -> str:
|
||||||
|
|
@ -278,31 +287,16 @@ def _url_decode(string: str) -> str:
|
||||||
return urllib.parse.unquote(string)
|
return urllib.parse.unquote(string)
|
||||||
|
|
||||||
|
|
||||||
def _fetch_page_nums(identifier: str, doc_name: str) -> dict:
|
def _fetch_pdf(identifier: str, doc_name: str) -> bytes:
|
||||||
"""
|
"""
|
||||||
Fetch JSON file with page number metadata for an `ArchiveDoc`.
|
Fetch PDF file for an `ArchiveDoc`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# `self.name` does not get percent-encoded, because it is derived from the
|
# `doc_name` does not get percent-encoded, because it is derived from the
|
||||||
# file path itself as defined by archive.org. Percent- encoding it further
|
# file path itself as defined by archive.org. Percent-encoding it further
|
||||||
# may result in a 404 error.
|
# may result in a 404 error.
|
||||||
page_nums_resp = requests.get(
|
resp = requests.get(
|
||||||
f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}_page_numbers.json"
|
f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}.pdf"
|
||||||
)
|
)
|
||||||
page_nums_resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
return page_nums_resp.json()
|
return resp.content
|
||||||
|
|
||||||
|
|
||||||
def _fetch_jp2_zip(identifier: str, doc_name: str) -> bytes:
|
|
||||||
"""
|
|
||||||
Fetch zip file with processed page scans for an `ArchiveDoc`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# `self.name` does not get percent-encoded, because it is derived
|
|
||||||
# from the file path itself as defined by archive.org. Percent-
|
|
||||||
# encoding it further may result in a 404 error.
|
|
||||||
zip_resp = requests.get(
|
|
||||||
f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}_jp2.zip"
|
|
||||||
)
|
|
||||||
zip_resp.raise_for_status()
|
|
||||||
return zip_resp.content
|
|
||||||
|
|
|
||||||
|
|
@ -2,13 +2,74 @@
|
||||||
This module contains interchangeable engines for optical character recognition,
|
This module contains interchangeable engines for optical character recognition,
|
||||||
making it easy to swap implementations in and out based on speed and accuracy
|
making it easy to swap implementations in and out based on speed and accuracy
|
||||||
advantages without rewriting business logic.
|
advantages without rewriting business logic.
|
||||||
|
|
||||||
Each nested module exports a class named `OcrEngine` with a method named
|
|
||||||
`process()`, which accepts a PIL `Image` and list of languages, and which
|
|
||||||
returns a tuple containing a standardized `DataFrame` as well as a dictionary
|
|
||||||
containing any additional specialized metadata made available from the
|
|
||||||
underlying OCR engine. The `DataFrame` has columns
|
|
||||||
`["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in pixels
|
|
||||||
measured from the top left corner of the image. `x1` and `y1` values will be
|
|
||||||
greater than or equal to the corresponding `x0` and `y0` values.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class TextBlock:
|
||||||
|
"""
|
||||||
|
Attributes:
|
||||||
|
|
||||||
|
x0 Left coordinate of the bounding box, in pixels.
|
||||||
|
|
||||||
|
y0 Top coordinate of the bounding box, in pixels.
|
||||||
|
|
||||||
|
x1 Right coordinate of the bounding box, in pixels from left of
|
||||||
|
image.
|
||||||
|
|
||||||
|
y1 Bottom coordinate of the bounding box, in pixels from top of
|
||||||
|
image.
|
||||||
|
|
||||||
|
text Text content of the block.
|
||||||
|
"""
|
||||||
|
|
||||||
|
x0: int
|
||||||
|
y0: int
|
||||||
|
x1: int
|
||||||
|
y1: int
|
||||||
|
text: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class OcrResult:
|
||||||
|
"""
|
||||||
|
OCR data parsed from a single page.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
|
||||||
|
blocks Blocks of text detected on a page.
|
||||||
|
|
||||||
|
page_angle Optional detected rotation of the page, in degrees clockwise
|
||||||
|
relative to upright.
|
||||||
|
"""
|
||||||
|
|
||||||
|
blocks: list[TextBlock]
|
||||||
|
page_angle: Optional[float]
|
||||||
|
|
||||||
|
|
||||||
|
class OcrEngine:
|
||||||
|
"""
|
||||||
|
Abstract class for interchangeable OCR processing backends.
|
||||||
|
|
||||||
|
Params:
|
||||||
|
|
||||||
|
detect_angle Allows page angle detection to be enabled or disabled
|
||||||
|
for certain implementations. Defaults to True.
|
||||||
|
|
||||||
|
languages List of ISO-639-3 language codes fed to the OCR backend.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_detect_angle: bool
|
||||||
|
_languages: list[str]
|
||||||
|
|
||||||
|
def __init__(self, languages: list[str], detect_angle: bool = True):
|
||||||
|
self._detect_angle = detect_angle
|
||||||
|
self._languages = languages.copy()
|
||||||
|
|
||||||
|
def process(self, image: Image.Image) -> OcrResult:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,8 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
|
||||||
from paddleocr import PaddleOCR
|
from paddleocr import PaddleOCR
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
from . import OcrEngine, OcrResult, TextBlock
|
||||||
# Reuse OCR instances per language.
|
|
||||||
instances: dict[str, PaddleOCR] = {}
|
|
||||||
|
|
||||||
|
|
||||||
def convert_language(iso639_3_code: str) -> str:
|
def convert_language(iso639_3_code: str) -> str:
|
||||||
|
|
@ -21,16 +18,23 @@ def convert_language(iso639_3_code: str) -> str:
|
||||||
return iso639_3_code
|
return iso639_3_code
|
||||||
|
|
||||||
|
|
||||||
class OcrEngine:
|
class PaddleOcrEngine(OcrEngine):
|
||||||
def process(
|
# Dict of ISO 639-3 language code to PaddleOCR instance.
|
||||||
image: Image, languages: list[str] = ["eng"]
|
_ocr_instances: dict[str, PaddleOCR] = {}
|
||||||
) -> tuple[pd.DataFrame, dict]:
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
for language in self._languages:
|
||||||
|
self._ocr_instances[language] = PaddleOCR(
|
||||||
|
use_doc_orientation_classify=True,
|
||||||
|
use_doc_unwarping=False,
|
||||||
|
use_textline_orientation=False,
|
||||||
|
lang=convert_language(language),
|
||||||
|
)
|
||||||
|
|
||||||
|
def process(self, image: Image.Image) -> OcrResult:
|
||||||
"""
|
"""
|
||||||
Use `paddleocr` to parse an image to a `DataFrame` with columns
|
Use `paddleocr` to parse an image.
|
||||||
`["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
|
|
||||||
pixels measured from the top left corner of the image. `x1` and `y1`
|
|
||||||
values will be greater than or equal to the corresponding `x0` and `y0`
|
|
||||||
values.
|
|
||||||
|
|
||||||
Compared to Tesseract, PaddleOCR is more accurate at low image
|
Compared to Tesseract, PaddleOCR is more accurate at low image
|
||||||
resolutions and able to perform one-shot page angle detection. However,
|
resolutions and able to perform one-shot page angle detection. However,
|
||||||
|
|
@ -46,33 +50,23 @@ class OcrEngine:
|
||||||
language and choose the result it thinks is best. Thus, it's
|
language and choose the result it thinks is best. Thus, it's
|
||||||
recommended to stick to one language if at all possible.
|
recommended to stick to one language if at all possible.
|
||||||
|
|
||||||
Troubleshooting: The PaddlePaddle core package has/had a bug triggering
|
Note: Though it works well when it works, PaddlePaddle has a tendency to
|
||||||
segfaults on ARM systems. Installing the nightly development build of
|
segfault and generally has been found to be buggy and unreliable.
|
||||||
`paddlepaddle` may be necessary to avoid it. Refer to:
|
Installing the nightly development build of `paddlepaddle` may help.
|
||||||
|
Refer to:
|
||||||
- [PaddleOCR issue 16609](https://github.com/PaddlePaddle/PaddleOCR/issues/16609)
|
- [PaddleOCR issue 16609](https://github.com/PaddlePaddle/PaddleOCR/issues/16609)
|
||||||
- [PaddlePaddle PR 75731](https://github.com/PaddlePaddle/Paddle/pull/75731)
|
- [PaddlePaddle PR 75731](https://github.com/PaddlePaddle/Paddle/pull/75731)
|
||||||
|
|
||||||
Params:
|
Params:
|
||||||
|
|
||||||
image PIL image data.
|
image PIL image data.
|
||||||
|
|
||||||
languages List of ISO-639-3 language codes fed to the OCR backend.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
best_result = None
|
best_result = None
|
||||||
for language in languages:
|
for language in self._languages:
|
||||||
if language in instances:
|
[res] = self._ocr_instances[language].predict(
|
||||||
ocr_instance = instances[language]
|
np.array(image.convert("RGB"))
|
||||||
else:
|
|
||||||
ocr_instance = PaddleOCR(
|
|
||||||
use_doc_orientation_classify=True,
|
|
||||||
use_doc_unwarping=False,
|
|
||||||
use_textline_orientation=False,
|
|
||||||
lang=convert_language(language),
|
|
||||||
)
|
)
|
||||||
instances[language] = ocr_instance
|
|
||||||
|
|
||||||
[res] = ocr_instance.predict(np.array(image.convert("RGB")))
|
|
||||||
if best_result is None or len(res["rec_texts"]) > len(
|
if best_result is None or len(res["rec_texts"]) > len(
|
||||||
best_result["rec_texts"]
|
best_result["rec_texts"]
|
||||||
):
|
):
|
||||||
|
|
@ -83,17 +77,16 @@ class OcrEngine:
|
||||||
|
|
||||||
res = best_result
|
res = best_result
|
||||||
|
|
||||||
return (
|
return OcrResult(
|
||||||
pd.DataFrame(
|
blocks=[
|
||||||
{
|
TextBlock(
|
||||||
"text": res["rec_texts"],
|
text=res["rec_texts"][i],
|
||||||
"x0": [x0 for [x0, _, _, _] in res["rec_boxes"]],
|
x0=res["rec_boxes"][i][0],
|
||||||
"y0": [y0 for [_, y0, _, _] in res["rec_boxes"]],
|
y0=res["rec_boxes"][i][1],
|
||||||
"x1": [x1 for [_, _, x1, _] in res["rec_boxes"]],
|
x1=res["rec_boxes"][i][2],
|
||||||
"y1": [y1 for [_, _, _, y1] in res["rec_boxes"]],
|
y1=res["rec_boxes"][i][3],
|
||||||
}
|
)
|
||||||
),
|
for i, _ in enumerate(res["rec_texts"])
|
||||||
{
|
],
|
||||||
"page_angle": res["doc_preprocessor_res"]["angle"],
|
page_angle=(360 - res["doc_preprocessor_res"]["angle"]) % 360,
|
||||||
},
|
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,13 @@
|
||||||
import pandas as pd
|
|
||||||
import pytesseract
|
import pytesseract
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
from . import OcrEngine, OcrResult, TextBlock
|
||||||
|
|
||||||
class OcrEngine:
|
|
||||||
def process(image: Image, languages: list[str]) -> tuple[pd.DataFrame, dict]:
|
class TesseractOcrEngine(OcrEngine):
|
||||||
|
def process(self, image: Image.Image) -> OcrResult:
|
||||||
"""
|
"""
|
||||||
Use `pytesseract` to parse an image to a `DataFrame` with columns
|
Use `pytesseract` to parse an image.
|
||||||
`["text", "x0", "y0", "x1", "y1"]`, where X and Y coordinates are in
|
|
||||||
pixels measured from the top left corner of the image. `x1` and `y1`
|
|
||||||
values will be greater than or equal to the corresponding `x0` and `y0`
|
|
||||||
values.
|
|
||||||
|
|
||||||
Note: Each Tesseract command runs single-threaded, so speed can be
|
Note: Each Tesseract command runs single-threaded, so speed can be
|
||||||
improved up to ~4x by distributing pages across processes running in
|
improved up to ~4x by distributing pages across processes running in
|
||||||
|
|
@ -22,34 +19,97 @@ class OcrEngine:
|
||||||
Params:
|
Params:
|
||||||
|
|
||||||
image PIL image data.
|
image PIL image data.
|
||||||
|
|
||||||
languages List of ISO-639-3 language codes fed to the OCR backend.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
blocks_best = []
|
||||||
|
angle_best = None
|
||||||
|
|
||||||
|
angles = [0, 90, 180, 270] if self._detect_angle else [0]
|
||||||
|
for angle in angles:
|
||||||
|
# Rotate the image counter-clockwise, since we care about
|
||||||
|
# keeping track of the angle from the upright position *to*
|
||||||
|
# the original position, not *from*.
|
||||||
|
rotated_image = image.rotate(360 - angle, expand=True)
|
||||||
df = pytesseract.image_to_data(
|
df = pytesseract.image_to_data(
|
||||||
image,
|
rotated_image,
|
||||||
lang="+".join(languages),
|
lang="+".join(self._languages),
|
||||||
config="--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0",
|
config="--oem 1 --tessdata-dir ./data/tessdata_fast-4.1.0",
|
||||||
output_type=pytesseract.Output.DATAFRAME,
|
output_type=pytesseract.Output.DATAFRAME,
|
||||||
)
|
).fillna({"text": ""})
|
||||||
|
|
||||||
# Exclude words with relatively low confidence ratings.
|
# Exclude blocks with relatively low confidence ratings.
|
||||||
df = df[df["conf"] > 80]
|
df = df[df["conf"] > 80]
|
||||||
|
|
||||||
# Attempt to exclude words that seem vertically oriented.
|
# Exclude empty words
|
||||||
|
df = df[df["text"] != ""]
|
||||||
|
|
||||||
|
# Attempt to exclude blocks that seem vertically oriented.
|
||||||
# TODO: Will this work for non-Latin scripts? Probably not all.
|
# TODO: Will this work for non-Latin scripts? Probably not all.
|
||||||
df = df[(df["width"] / df["height"]) > 0.8]
|
df = df[(df["width"] / df["height"]) > 0.8]
|
||||||
|
|
||||||
return (
|
print(
|
||||||
pd.DataFrame(
|
[
|
||||||
{
|
TextBlock(
|
||||||
"text": df["text"],
|
# Rotate X and Y coordinates back to match the original image.
|
||||||
"x0": df["left"],
|
*_box_after_rotation(
|
||||||
"y0": df["top"],
|
int(row["left"]),
|
||||||
"x1": df["left"] + df["width"],
|
int(row["top"]),
|
||||||
"y1": df["top"] + df["height"],
|
int(row["left"] + row["width"]),
|
||||||
}
|
int(row["top"] + row["height"]),
|
||||||
|
*rotated_image.size,
|
||||||
|
angle,
|
||||||
),
|
),
|
||||||
# We don't use any page-level metadata from the Tesseract output.
|
text=row["text"],
|
||||||
{},
|
|
||||||
)
|
)
|
||||||
|
for _, row in df.iterrows()
|
||||||
|
]
|
||||||
|
)
|
||||||
|
if angle_best is None or df.shape[0] > len(blocks_best):
|
||||||
|
angle_best = angle
|
||||||
|
blocks_best = [
|
||||||
|
TextBlock(
|
||||||
|
# Rotate X and Y coordinates back to match the original image.
|
||||||
|
*_box_after_rotation(
|
||||||
|
int(row["left"]),
|
||||||
|
int(row["top"]),
|
||||||
|
int(row["left"] + row["width"]),
|
||||||
|
int(row["top"] + row["height"]),
|
||||||
|
*rotated_image.size,
|
||||||
|
angle,
|
||||||
|
),
|
||||||
|
text=row["text"],
|
||||||
|
)
|
||||||
|
for _, row in df.iterrows()
|
||||||
|
]
|
||||||
|
|
||||||
|
return OcrResult(
|
||||||
|
blocks=blocks_best, page_angle=angle_best if self._detect_angle else None
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _box_after_rotation(
|
||||||
|
x0: int,
|
||||||
|
y0: int,
|
||||||
|
x1: int,
|
||||||
|
y1: int,
|
||||||
|
image_width: int,
|
||||||
|
image_height: int,
|
||||||
|
degrees_clockwise: int,
|
||||||
|
) -> tuple[int, int, int, int]:
|
||||||
|
"""
|
||||||
|
Given the corners of a box in an image, returns the corners of an equivalent
|
||||||
|
box if the image is rotated by some multiple of 90 degrees. Both input and
|
||||||
|
output coordinates are expected to be top left followed by bottom right,
|
||||||
|
where the origin is at the top left.
|
||||||
|
"""
|
||||||
|
angle = ((degrees_clockwise % 360) + 360) % 360
|
||||||
|
if angle == 0:
|
||||||
|
return x0, y0, x1, y1
|
||||||
|
if angle == 90:
|
||||||
|
return image_height - y1, x0, image_height - y0, x1
|
||||||
|
if angle == 180:
|
||||||
|
return image_width - x1, image_height - y1, image_width - x0, image_height - y0
|
||||||
|
if angle == 270:
|
||||||
|
return y0, image_width - x1, y1, image_width - x0
|
||||||
|
else:
|
||||||
|
raise Exception("_box_after_rotation() only accepts multiples of 90 degrees")
|
||||||
|
|
|
||||||
|
|
@ -6,10 +6,12 @@ readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"numpy>=2.3.2",
|
"numpy>=2.3.2",
|
||||||
"paddleocr>=3.2.0",
|
"paddleocr>=3.3.0",
|
||||||
"paddlepaddle>=3.2.0",
|
"paddlepaddle>=3.2.2",
|
||||||
"pandas>=2.3.1",
|
"pandas>=2.3.1",
|
||||||
"pillow>=11.3.0",
|
"pillow>=11.3.0",
|
||||||
|
"psycopg[binary]>=3.2.12",
|
||||||
|
"pymupdf>=1.26.6",
|
||||||
"pytesseract>=0.3.13",
|
"pytesseract>=0.3.13",
|
||||||
"requests>=2.32.4",
|
"requests>=2.32.4",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue