Re-run OCR as needed to screen false positives

This commit is contained in:
Brent Schroeter 2026-01-15 21:33:57 +00:00
parent 9a94719dc1
commit f0a4ef253b
3 changed files with 194 additions and 100 deletions

View file

@ -62,18 +62,33 @@ def main():
[
i
for i, page in enumerate(doc["pages"])
if 45 < page["page_angle"] < 315
if 30 < page["page_angle"] < 330
]
for doc in analyses
],
"sharpness_max": max(
*[page["sharpness"] for doc in analyses for page in doc["pages"]]
[
page["sharpness"]
for doc in analyses
for page in doc["pages"]
if page["sharpness"] is not None
]
),
"sharpness_median": np.median(
[page["sharpness"] for doc in analyses for page in doc["pages"]]
[
page["sharpness"]
for doc in analyses
for page in doc["pages"]
if page["sharpness"] is not None
]
).tolist(),
"sharpness_min": min(
*[page["sharpness"] for doc in analyses for page in doc["pages"]]
[
page["sharpness"]
for doc in analyses
for page in doc["pages"]
if page["sharpness"] is not None
]
),
}
)

View file

@ -275,7 +275,7 @@ insert into phono.items (ia_id, review_date, oai_updatedate, url)
[
doc["identifier"],
doc.get("review_date"),
max(*[datetime.fromisoformat(t) for t in doc["oai_updatedate"]]),
max([datetime.fromisoformat(t) for t in doc["oai_updatedate"]]),
]
for doc in batch
],

View file

@ -1,10 +1,11 @@
from sys import stdout
from typing import Optional
import numpy as np
from PIL import Image
from .items import ArchiveDoc, ArchiveLeaf
from .ocr import OcrEngine, TextBlock
from .ocr import OcrEngine, OcrResult, TextBlock
def analyze_doc(
@ -30,99 +31,64 @@ def analyze_doc(
analyzed_pages = []
for leaf in all_leaves:
im, is_blank = normalize_contrast_for_text(leaf.image)
im_cropped = im.crop(
(
im.size[0] * 0.1,
im.size[1] * 0.1,
im.size[0] * 0.9,
im.size[1] * 0.9,
im_normalized, is_blank = normalize_contrast_for_text(leaf.image)
if is_blank:
analyzed_pages.append(
{
"is_blank": True,
"page_angle": 0,
"size_analyzed": leaf.image.size,
"sharpness": None,
"text_margin_px": None,
}
)
)
sharpness = analyze_sharpness(im_cropped)
# OCR is computationally expensive, so we try to take advantage of
# the Tesseract data already parsed by the Internet Archive and
# embedded in the PDF, when possible. If there is not sufficient
# text in the PDF to be confident that the Archive's OCR
# postprocessing captured it all, then OCR is recomputed locally.
#
# In some instances, the Archive's OCR detects rotated text but
# parses it as gibberish. To partially mitigate this, we ignore all
# precomputed text blocks with a "portrait" aspect ratio. This will
# not necessarily help with text that is rotated 180 degrees, but in
# practice that case is rarely encountered. This will also not work
# well with non-latin scripts that are intended to be oriented
# vertically.
OCR_RECOMPUTE_THRESHOLD_WORDS = 30
if (
sum(
(
len(block.text.split())
for block in leaf.text_blocks
if block.x1 - block.x0 > block.y1 - block.y0
)
)
>= OCR_RECOMPUTE_THRESHOLD_WORDS
):
ocred_leaf = leaf
page_angle = 0
else:
OCR_SCALE = 1
im_scaled = im.resize(np.int_(np.array(im.size) * OCR_SCALE))
ocr_result = ocr_engine.process(im_scaled)
ocred_leaf = ArchiveLeaf(
image=im,
page_number=leaf.page_number,
text_blocks=[
TextBlock(
x0=int(block.x0 / OCR_SCALE),
y0=int(block.y0 / OCR_SCALE),
x1=int(block.x1 / OCR_SCALE),
y1=int(block.y1 / OCR_SCALE),
text=block.text,
sharpness = analyze_sharpness(
# Exclude edges, which typically include the page border.
im_normalized.crop(
(
im_normalized.size[0] * 0.15,
im_normalized.size[1] * 0.15,
im_normalized.size[0] * 0.85,
im_normalized.size[1] * 0.85,
)
for block in ocr_result.blocks
],
)
)
ocr_result, leaf = compute_ocr(leaf, im_normalized, ocr_engine)
page_angle = 0 if ocr_result is None else ocr_result.page_angle
text_margin_px = compute_text_margin_px(
im_normalized.size, leaf.text_blocks
)
# If OCR turns up issues based on the PDF's original text boxes,
# re-run it ourselves to help weed out false positives.
CLIPPING_THRESHOLD_PX = 30
ROT_THRESHOLD_DEG = 30
if ocr_result is None and (
ROT_THRESHOLD_DEG < page_angle < 360 - ROT_THRESHOLD_DEG
or text_margin_px < CLIPPING_THRESHOLD_PX
):
ocr_result, leaf = compute_ocr(
leaf, im_normalized, ocr_engine, force_recompute=True
)
assert ocr_result is not None, (
"compute_ocr(..., force_recompute=True) should always return an OcrResult"
)
page_angle = ocr_result.page_angle
word_margins_all_directions = (
np.sort(
np.concat(
[
np.array(
[
block.x0,
block.y0,
im.size[0] - block.x1,
im.size[1] - block.y1,
]
)
for block in ocred_leaf.text_blocks
]
).astype(np.int_)
)
if len(ocred_leaf.text_blocks) > 0
else np.array([])
)
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
SKIP_WORDS = 2
text_margin_px = int(
word_margins_all_directions[SKIP_WORDS]
if word_margins_all_directions.shape[0] > SKIP_WORDS
else -1
text_margin_px = compute_text_margin_px(
im_normalized.size, leaf.text_blocks
)
# Make sure the OCR engine is running with orientation detection.
assert page_angle is not None
assert page_angle is not None, (
"OCR engine should be running with page orientation detection"
)
analyzed_pages.append(
{
"is_blank": is_blank,
"is_blank": False,
"page_angle": page_angle,
"size_analyzed": im.size,
"size_analyzed": leaf.image.size,
"sharpness": sharpness,
"text_margin_px": text_margin_px,
}
@ -154,13 +120,15 @@ def normalize_contrast_for_text(im: Image.Image) -> tuple[Image.Image, bool]:
(normalized_image, is_blank)
"""
pixel_values = np.asarray(
# Exclude edges, which typically include the page border.
im.crop(
(
im.size[0] * 0.1,
im.size[1] * 0.1,
im.size[0] * 0.9,
im.size[1] * 0.9,
im.size[0] * 0.15,
im.size[1] * 0.15,
im.size[0] * 0.85,
im.size[1] * 0.85,
)
)
)
@ -189,4 +157,115 @@ def analyze_sharpness(im: Image.Image) -> float:
# computation based on https://stackoverflow.com/a/26014796.
grad_y, grad_x = np.gradient(np.asarray(im))
return float(np.clip(np.quantile(np.sqrt(grad_x**2 + grad_y**2), 0.99) / 255, 0, 1))
def compute_ocr(
    leaf: ArchiveLeaf,
    im_normalized: Image.Image,
    ocr_engine: OcrEngine,
    force_recompute: bool = False,
) -> tuple[Optional[OcrResult], ArchiveLeaf]:
    """
    Extract text blocks for a page, reusing embedded PDF text when possible.

    OCR is computationally expensive, so we try to take advantage of
    the Tesseract data already parsed by the Internet Archive and
    embedded in the PDF, when possible. If there is not sufficient
    text in the PDF to be confident that the Archive's OCR
    postprocessing captured it all, then OCR is recomputed locally.

    In some instances, the Archive's OCR detects rotated text but
    parses it as gibberish. To partially mitigate this, we ignore all
    precomputed text blocks with a "portrait" aspect ratio. This will
    not necessarily help with text that is rotated 180 degrees, and will
    not work well with non-latin scripts that are intended to be oriented
    vertically.

    Params:
        leaf                Information for the document page.
        im_normalized       Contrast-normalized image.
        ocr_engine          Engine to use as needed for OCR.
        force_recompute     If `True`, OCR is re-run even if there is already
                            text data associated with the leaf.

    Returns:
        Tuple of `OcrResult` and a new `ArchiveLeaf` if OCR was recomputed;
        otherwise tuple of `None` and the original `ArchiveLeaf` if existing
        text data was reused.
    """
    if not force_recompute:
        PDF_OCR_THRESHOLD_WORDS = 30
        # Only count words in "landscape" blocks; see the rotated-text
        # caveat in the docstring above.
        pdf_word_count = sum(
            len(block.text.split())
            for block in leaf.text_blocks
            if block.x1 - block.x0 > block.y1 - block.y0
        )
        if pdf_word_count >= PDF_OCR_THRESHOLD_WORDS:
            return None, leaf
    ocr_result = ocr_engine.process(im_normalized)
    return ocr_result, ArchiveLeaf(
        image=leaf.image,
        page_number=leaf.page_number,
        text_blocks=[
            TextBlock(
                x0=int(block.x0),
                y0=int(block.y0),
                x1=int(block.x1),
                y1=int(block.y1),
                text=block.text,
            )
            for block in ocr_result.blocks
        ],
    )
def compute_text_margin_px(
    im_size: tuple[int, int], text_blocks: list[TextBlock]
) -> Optional[int]:
    """
    Infer the margins between OCR'ed text blocks and the edge of their page's
    bounds. This helps to detect if adjacent content may be cropped out.

    Params:
        im_size         Dimensions of the page image, in pixels.
        text_blocks     List of text blocks detected by OCR.

    Returns:
        Integer pixel count if text is present; otherwise `None`.
    """
    if not text_blocks:
        return None
    # For each block, measure its distance to all four page edges.
    # np.concatenate (rather than the NumPy-2.0-only np.concat alias) keeps
    # this compatible with older NumPy releases.
    word_margins_all_directions = np.sort(
        np.concatenate(
            [
                np.array(
                    [
                        block.x0,
                        block.y0,
                        im_size[0] - block.x1,
                        im_size[1] - block.y1,
                    ]
                )
                for block in text_blocks
            ]
        ).astype(np.int_)
    )
    # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
    SKIP_WORDS = 2
    if word_margins_all_directions.shape[0] <= SKIP_WORDS:
        return None
    return int(word_margins_all_directions[SKIP_WORDS])