2025-10-04 18:03:03 -07:00
|
|
|
from sys import stdout
|
2025-10-04 15:09:16 -07:00
|
|
|
|
|
|
|
|
import numpy as np
|
2026-01-15 21:31:24 +00:00
|
|
|
from PIL import Image
|
2025-10-04 15:09:16 -07:00
|
|
|
|
2025-12-20 02:16:41 +00:00
|
|
|
from .items import ArchiveDoc, ArchiveLeaf
|
|
|
|
|
from .ocr import OcrEngine, TextBlock
|
2025-10-04 15:09:16 -07:00
|
|
|
|
|
|
|
|
|
2025-12-20 02:16:41 +00:00
|
|
|
def analyze_doc(
    doc: ArchiveDoc,
    ocr_engine: OcrEngine,
    use_cache: bool = False,
    verbose: bool = False,
):
    """
    Analyzes all pages in an ArchiveDoc for useful metrics such as sharpness,
    orientation, presence of text overflows, and so on.

    Params:

        doc         Document whose leaves (pages) are fetched and analyzed.

        ocr_engine  Engine used to recompute OCR locally when the precomputed
                    text embedded in the document appears insufficient. It is
                    expected to run with orientation detection enabled
                    (asserted below via `page_angle`).

        use_cache   Forwarded to `doc.fetch_leaves()`.

        verbose     When True, progress messages are printed to stdout.

    Returns:

        {"pages": [...]} — one dict per leaf, with keys "is_blank",
        "page_angle", "size_analyzed", "sharpness", and "text_margin_px".
    """
    if verbose:
        print(f"Loading {doc.name}...")
        stdout.flush()

    all_leaves = doc.fetch_leaves(use_cache=use_cache)

    if verbose:
        print(f"Processing {len(all_leaves)} pages...", file=stdout)
        stdout.flush()

    analyzed_pages = []
    for leaf in all_leaves:
        im, is_blank = normalize_contrast_for_text(leaf.image)

        # Sharpness is measured on the central 80% of the page (in each
        # dimension) so that scanner margins and page edges do not skew the
        # metric.
        im_cropped = im.crop(
            (
                im.size[0] * 0.1,
                im.size[1] * 0.1,
                im.size[0] * 0.9,
                im.size[1] * 0.9,
            )
        )
        sharpness = analyze_sharpness(im_cropped)

        # OCR is computationally expensive, so we try to take advantage of
        # the Tesseract data already parsed by the Internet Archive and
        # embedded in the PDF, when possible. If there is not sufficient
        # text in the PDF to be confident that the Archive's OCR
        # postprocessing captured it all, then OCR is recomputed locally.
        #
        # In some instances, the Archive's OCR detects rotated text but
        # parses it as gibberish. To partially mitigate this, we ignore all
        # precomputed text blocks with a "portrait" aspect ratio. This will
        # not necessarily help with text that is rotated 180 degrees, but in
        # practice that case is rarely encountered. This will also not work
        # well with non-latin scripts that are intended to be oriented
        # vertically.
        OCR_RECOMPUTE_THRESHOLD_WORDS = 30
        if (
            sum(
                (
                    len(block.text.split())
                    for block in leaf.text_blocks
                    if block.x1 - block.x0 > block.y1 - block.y0
                )
            )
            >= OCR_RECOMPUTE_THRESHOLD_WORDS
        ):
            # Enough landscape-oriented text was precomputed: trust it, and
            # assume the page is upright.
            ocred_leaf = leaf
            page_angle = 0
        else:
            # Scale factor applied before local OCR. Currently 1 (a no-op);
            # kept as a tuning knob.
            OCR_SCALE = 1
            im_scaled = im.resize(np.int_(np.array(im.size) * OCR_SCALE))
            ocr_result = ocr_engine.process(im_scaled)
            # Wrap the OCR output in an ArchiveLeaf, mapping block
            # coordinates back into the unscaled image's pixel space.
            ocred_leaf = ArchiveLeaf(
                image=im,
                page_number=leaf.page_number,
                text_blocks=[
                    TextBlock(
                        x0=int(block.x0 / OCR_SCALE),
                        y0=int(block.y0 / OCR_SCALE),
                        x1=int(block.x1 / OCR_SCALE),
                        y1=int(block.y1 / OCR_SCALE),
                        text=block.text,
                    )
                    for block in ocr_result.blocks
                ],
            )
            page_angle = ocr_result.page_angle

        # Distances from every text block to each of the four page edges,
        # sorted ascending so the tightest margins come first. Empty array
        # when the page has no text blocks at all.
        word_margins_all_directions = (
            np.sort(
                np.concat(
                    [
                        np.array(
                            [
                                block.x0,
                                block.y0,
                                im.size[0] - block.x1,
                                im.size[1] - block.y1,
                            ]
                        )
                        for block in ocred_leaf.text_blocks
                    ]
                ).astype(np.int_)
            )
            if len(ocred_leaf.text_blocks) > 0
            else np.array([])
        )

        # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
        SKIP_WORDS = 2
        # -1 is a sentinel meaning "too little text on the page to measure".
        text_margin_px = int(
            word_margins_all_directions[SKIP_WORDS]
            if word_margins_all_directions.shape[0] > SKIP_WORDS
            else -1
        )

        # Make sure the OCR engine is running with orientation detection.
        assert page_angle is not None

        analyzed_pages.append(
            {
                "is_blank": is_blank,
                "page_angle": page_angle,
                "size_analyzed": im.size,
                "sharpness": sharpness,
                "text_margin_px": text_margin_px,
            }
        )

    return {"pages": analyzed_pages}
|
2025-10-04 15:09:16 -07:00
|
|
|
|
|
|
|
|
|
2025-12-20 08:58:49 +00:00
|
|
|
def normalize_contrast_for_text(im: Image.Image) -> tuple[Image.Image, bool]:
    """
    Most of the pages being analyzed, and virtually all of the pages we care
    about for the purposes of QA, primarily contain text on a contrasting
    background. We can therefore typically assume that it is reasonable to boost
    contrast so that the lightest pixels are nearly white and the darkest pixels
    are nearly black. This can help make analysis more consistent across leaves
    with varying contrast ratios due to varied scanner settings, contrast ratios
    of the original documents, or weathering/fading of the physical fiche.

    Processed leaves usually contain some amount of margin around the edges
    where the backlight of the scanning rig is visible through the unexposed
    region of the negative, so contrast detection is heavily center-weighted.

    Params:

        im  Scan image as a PIL `Image`. The tone curve maps the 0-255 range,
            so 8-bit (e.g. grayscale "L" mode) scans are assumed.

    Returns:

        (normalized_image, is_blank) — `is_blank` is True when the center of
        the page shows so little contrast that it is assumed to hold no
        content, in which case the image is returned unmodified.
    """
    # Center-weighting: sample pixel statistics from the middle 80% of the
    # page in each dimension only.
    pixel_values = np.asarray(
        im.crop(
            (
                im.size[0] * 0.1,
                im.size[1] * 0.1,
                im.size[0] * 0.9,
                im.size[1] * 0.9,
            )
        )
    )

    # To avoid extreme outliers, use quantiles instead of absolute extrema.
    extrema = (np.quantile(pixel_values, 0.002), np.quantile(pixel_values, 0.998))
    if extrema[1] - extrema[0] < 64:
        # Assume there is essentially no content here and return the original.
        return im, True

    # Apply a rudimentary tone curve to the image, with the goal that the
    # extrema we just calculated will evaluate to values "pretty close to" 0%
    # and 100% of the available range.
    return im.point(
        lambda x: np.interp(x, (0, extrema[0], extrema[1], 255), (0, 8, 247, 255))
    ), False
|
|
|
|
|
|
|
|
|
|
|
2026-01-15 21:31:24 +00:00
|
|
|
def analyze_sharpness(im: "Image.Image") -> float:
    """
    Attempts to quantify the sharpness of an image, on a scale of 0 to 1.

    Params:

        im  Grayscale image, or any 2-dimensional array-like accepted by
            `np.asarray()` (the two-way gradient unpack below requires
            exactly two dimensions).

    Returns:

        The 99th-percentile gradient magnitude of the pixel values,
        normalized by 255 and clipped to [0, 1]. Higher means crisper edges.
    """
    # Inferring sharpness by measuring the peak intensity of an edge detection/
    # high pass filter over the image tends to produce different baseline
    # results across documents. We've had much more luck with a direct gradient
    # computation based on https://stackoverflow.com/a/26014796.
    grad_y, grad_x = np.gradient(np.asarray(im))
    return float(np.clip(np.quantile(np.sqrt(grad_x**2 + grad_y**2), 0.99) / 255, 0, 1))
|
2025-12-20 08:58:49 +00:00
|
|
|
"""
|