from sys import stdout
|
2025-10-04 15:09:16 -07:00
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
from PIL import Image, ImageFilter
|
|
|
|
|
|
2025-12-20 02:16:41 +00:00
|
|
|
from .items import ArchiveDoc, ArchiveLeaf
|
|
|
|
|
from .ocr import OcrEngine, TextBlock


def analyze_doc(
    doc: ArchiveDoc,
    ocr_engine: OcrEngine,
    use_cache: bool = False,
    verbose: bool = False,
):
    """
    Analyzes all pages in an ArchiveDoc for useful metrics such as sharpness,
    orientation, presence of text overflows, and so on.

    Args:
        doc: The document whose pages (leaves) will be analyzed.
        ocr_engine: Engine used to recompute OCR when the text embedded in
            the PDF is too sparse to be trusted.
        use_cache: Passed through to ``doc.fetch_leaves``; when True,
            previously fetched leaves may be reused.
        verbose: When True, progress messages are printed to stdout.

    Returns:
        A dict with one key, ``"pages"``: a list containing, per page, a dict
        with ``"blank"``, ``"page_angle"``, ``"size_analyzed"``,
        ``"sharpness"``, and ``"text_margin_px"``.
    """
    if verbose:
        print(f"Loading {doc.name}...")
        stdout.flush()

    all_leaves = doc.fetch_leaves(use_cache=use_cache)

    if verbose:
        print(f"Processing {len(all_leaves)} pages...", file=stdout)
        stdout.flush()

    analyzed_pages = []
    for leaf in all_leaves:
        # Ignore a 10% border on every side so scanner edges and page
        # curvature do not pollute the metrics.
        im_cropped = leaf.image.crop(
            (
                leaf.image.size[0] * 0.1,
                leaf.image.size[1] * 0.1,
                leaf.image.size[0] * 0.9,
                leaf.image.size[1] * 0.9,
            )
        )

        # A page is considered blank when even its darkest pixel is brighter
        # than 80% of full white. NOTE(review): getextrema()[0] is the band
        # minimum only for single-band images; presumably leaves are
        # grayscale -- confirm against the fetch pipeline.
        is_blank = im_cropped.getextrema()[0] > 255 * 0.8

        if is_blank:
            # Blank pages get sentinel metrics and skip the expensive work.
            max_sharpness = 1.0
            text_margin_px = -1
            page_angle = 0
        else:
            # Sharpness is determined by percentile of pixels that match some
            # criteria, so it may vary significantly depending on which portion
            # of the image is analyzed. In an effort to identify the sharpest
            # edges, we split up the image into chunks and assume that the
            # highest sharpness value obtained across all chunks is
            # representative of the image as a whole.
            max_sharpness = 0.0
            if im_cropped.size[0] < im_cropped.size[1]:
                # Page is in portrait orientation.
                segments_x = 2
                segments_y = 3
            else:
                # Page is in landscape orientation.
                segments_x = 3
                segments_y = 2
            for i in range(segments_x):
                for j in range(segments_y):
                    max_sharpness = max(
                        max_sharpness,
                        analyze_sharpness(
                            im_cropped.crop(
                                (
                                    im_cropped.size[0] / segments_x * i,
                                    im_cropped.size[1] / segments_y * j,
                                    im_cropped.size[0] / segments_x * (i + 1),
                                    im_cropped.size[1] / segments_y * (j + 1),
                                )
                            )
                        ),
                    )

            # OCR is computationally expensive, so we try to take advantage of
            # the Tesseract data already parsed by the Internet Archive and
            # embedded in the PDF, when possible. If there is not sufficient
            # text in the PDF to be confident that the Archive's OCR
            # postprocessing captured it all, then OCR is recomputed locally.
            #
            # In some instances, the Archive's OCR detects rotated text but
            # parses it as gibberish. To partially mitigate this, we ignore all
            # precomputed text blocks with a "portrait" aspect ratio. This will
            # not necessarily help with text that is rotated 180 degrees, but in
            # practice that case is rarely encountered. This will also not work
            # well with non-latin scripts that are intended to be oriented
            # vertically.
            OCR_RECOMPUTE_THRESHOLD_WORDS = 30
            if (
                sum(
                    len(block.text.split())
                    for block in leaf.text_blocks
                    if block.x1 - block.x0 > block.y1 - block.y0
                )
                >= OCR_RECOMPUTE_THRESHOLD_WORDS
            ):
                if verbose:
                    print("Using PDF text.")
                ocred_leaf = leaf
                page_angle = 0
            else:
                if verbose:
                    print("Using OCR.")
                # Scale factor applied to the image before OCR; block
                # coordinates are mapped back to original-image space below.
                OCR_SCALE = 1
                im_scaled = leaf.image.resize(
                    np.int_(np.array(leaf.image.size) * OCR_SCALE)
                )
                ocr_result = ocr_engine.process(im_scaled)
                ocred_leaf = ArchiveLeaf(
                    image=leaf.image,
                    page_number=leaf.page_number,
                    text_blocks=[
                        TextBlock(
                            x0=int(block.x0 / OCR_SCALE),
                            y0=int(block.y0 / OCR_SCALE),
                            x1=int(block.x1 / OCR_SCALE),
                            y1=int(block.y1 / OCR_SCALE),
                            text=block.text,
                        )
                        for block in ocr_result.blocks
                    ],
                )
                page_angle = ocr_result.page_angle

            # Distance from every text block to each of the four page edges,
            # sorted ascending.
            if ocred_leaf.text_blocks:
                word_margins_all_directions = np.sort(
                    np.int_(
                        # BUGFIX: was np.concat, which only exists in
                        # numpy >= 2.0; np.concatenate is the portable name.
                        np.concatenate(
                            [
                                np.array(
                                    [
                                        block.x0,
                                        block.y0,
                                        leaf.image.size[0] - block.x1,
                                        leaf.image.size[1] - block.y1,
                                    ]
                                )
                                for block in ocred_leaf.text_blocks
                            ]
                        )
                    )
                )
            else:
                # BUGFIX: np.concatenate raises ValueError on an empty list;
                # a page with no recognized text has no measurable margin and
                # falls through to the -1 sentinel below.
                word_margins_all_directions = np.array([], dtype=int)

            # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
            SKIP_WORDS = 2
            text_margin_px = int(
                word_margins_all_directions[SKIP_WORDS]
                if word_margins_all_directions.shape[0] > SKIP_WORDS
                else -1
            )

        # Make sure the OCR engine is running with orientation detection.
        assert page_angle is not None

        analyzed_pages.append(
            {
                "blank": is_blank,
                "page_angle": page_angle,
                "size_analyzed": leaf.image.size,
                "sharpness": max_sharpness,
                "text_margin_px": text_margin_px,
            }
        )

    return {"pages": analyzed_pages}


def analyze_sharpness(im: Image.Image) -> float:
    """
    Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
    1. The scale is not linear with respect to scan quality: anything above 0.1
    is usually fine.

    Args:
        im: Image to analyze. Presumably a single-band (grayscale) page crop
            -- TODO confirm; multi-band images are filtered per band.

    Returns:
        Median edge intensity in [0.0, 1.0]; 0.0 when no pixel clears the
        edge-detection threshold (e.g. a uniform image).
    """
    arr = np.asarray(im)

    # Normalize contrast based on brightest and darkest pixels. For example,
    # NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80% fall
    # between 10% brightness and 90% brightness. In practice, a value around
    # 0.02 seems to work fairly well.
    NORM_QUANTILE = 0.03
    pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
        arr, NORM_QUANTILE
    )
    if pixel_range == 0:
        # Degenerate (near-uniform) image: skip the rescale to avoid
        # dividing by zero; the shift below still applies.
        arr_normalized = arr
    else:
        arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
    arr_normalized = (
        arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE
    )
    arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255)

    # "Sharpness" is determined by measuring the median intensity of pixels
    # near edges, after an edge detection filter has been applied to the image.
    edges_arr = np.asarray(
        Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
    )
    EDGE_THRESHOLD = 8
    edge_pixels = edges_arr[edges_arr > EDGE_THRESHOLD]
    if edge_pixels.size == 0:
        # BUGFIX: np.median of an empty selection is NaN (and emits a
        # RuntimeWarning). An image with no detectable edges is simply
        # maximally unsharp.
        return 0.0
    return float(np.median(edge_pixels) / 255)
|