271 lines
9.1 KiB
Python
271 lines
9.1 KiB
Python
from sys import stdout
|
|
from typing import Optional
|
|
|
|
import numpy as np
|
|
from PIL import Image
|
|
|
|
from .items import ArchiveDoc, ArchiveLeaf
|
|
from .ocr import OcrEngine, OcrResult, TextBlock
|
|
|
|
|
|
def analyze_doc(
    doc: ArchiveDoc,
    ocr_engine: OcrEngine,
    use_cache: bool = False,
    verbose: bool = False,
):
    """
    Analyzes all pages in an ArchiveDoc for useful metrics such as sharpness,
    orientation, presence of text overflows, and so on.

    Params:

        doc         Document whose leaves (pages) will be analyzed.

        ocr_engine  Engine used for OCR when a page's embedded text data is
                    missing or suspect (see `compute_ocr`).

        use_cache   Passed through to `doc.fetch_leaves()`.

        verbose     If `True`, progress messages are printed to stdout.

    Returns:

        Dict with a single "pages" key, holding one metrics dict per page with
        keys "is_blank", "page_angle", "size_analyzed", "sharpness", and
        "text_margin_px".
    """

    if verbose:
        print(f"Loading {doc.name}...")
        stdout.flush()

    all_leaves = doc.fetch_leaves(use_cache=use_cache)

    if verbose:
        print(f"Processing {len(all_leaves)} pages...", file=stdout)
        stdout.flush()

    analyzed_pages = []
    for leaf in all_leaves:
        im_normalized, is_blank = normalize_contrast_for_text(leaf.image)
        if is_blank:
            analyzed_pages.append(
                {
                    "is_blank": True,
                    "page_angle": 0,
                    "size_analyzed": leaf.image.size,
                    "sharpness": None,
                    "text_margin_px": None,
                }
            )
            continue

        sharpness = analyze_sharpness(
            # Exclude edges, which typically include the page border.
            im_normalized.crop(
                (
                    im_normalized.size[0] * 0.15,
                    im_normalized.size[1] * 0.15,
                    im_normalized.size[0] * 0.85,
                    im_normalized.size[1] * 0.85,
                )
            )
        )

        ocr_result, leaf = compute_ocr(leaf, im_normalized, ocr_engine)
        page_angle = 0 if ocr_result is None else ocr_result.page_angle
        text_margin_px = compute_text_margin_px(
            im_normalized.size, leaf.text_blocks
        )

        # If OCR turns up issues based on the PDF's original text boxes,
        # re-run it ourselves to help weed out false positives.
        CLIPPING_THRESHOLD_PX = 30
        ROT_THRESHOLD_DEG = 30
        # NOTE(review): when `ocr_result is None`, `page_angle` is always 0,
        # so the rotation clause below can never trigger a re-run on its own;
        # only the clipping check can. Confirm whether that is intentional.
        if ocr_result is None and (
            ROT_THRESHOLD_DEG < page_angle < 360 - ROT_THRESHOLD_DEG
            # `text_margin_px` is `None` when too few text blocks were found
            # to infer a margin; comparing None with an int would raise
            # TypeError, so guard explicitly.
            or (
                text_margin_px is not None
                and text_margin_px < CLIPPING_THRESHOLD_PX
            )
        ):
            ocr_result, leaf = compute_ocr(
                leaf, im_normalized, ocr_engine, force_recompute=True
            )
            assert ocr_result is not None, (
                "compute_ocr(..., force_recompute=True) should always return an OcrResult"
            )
            page_angle = ocr_result.page_angle
            text_margin_px = compute_text_margin_px(
                im_normalized.size, leaf.text_blocks
            )

        assert page_angle is not None, (
            "OCR engine should be running with page orientation detection"
        )

        analyzed_pages.append(
            {
                "is_blank": False,
                "page_angle": page_angle,
                "size_analyzed": leaf.image.size,
                "sharpness": sharpness,
                "text_margin_px": text_margin_px,
            }
        )

    return {"pages": analyzed_pages}
|
|
|
|
|
|
def normalize_contrast_for_text(im: Image.Image) -> tuple[Image.Image, bool]:
    """
    Most of the pages being analyzed, and virtually all of the pages we care
    about for the purposes of QA, primarily contain text on a contrasting
    background. We can therefore typically assume that it is reasonable to boost
    contrast so that the lightest pixels are nearly white and the darkest pixels
    are nearly black. This can help make analysis more consistent across leaves
    with varying contrast ratios due to varied scanner settings, contrast ratios
    of the original documents, or weathering/fading of the physical fiche.

    Processed leaves usually contain some amount of margin around the edges
    where the backlight of the scanning rig is visible through the unexposed
    region of the negative, so contrast detection is heavily center-weighted.

    Params:

        im  Scan image as a PIL `Image`. (The earlier note suggesting a numpy
            array was incorrect: this function uses `Image.crop()` and
            `Image.point()`, which only exist on PIL images.)

    Returns:

        (normalized_image, is_blank)
    """

    pixel_values = np.asarray(
        # Exclude edges, which typically include the page border.
        im.crop(
            (
                im.size[0] * 0.15,
                im.size[1] * 0.15,
                im.size[0] * 0.85,
                im.size[1] * 0.85,
            )
        )
    )
    # To avoid extreme outliers, use quantiles instead of absolute extrema.
    darkest = np.quantile(pixel_values, 0.002)
    lightest = np.quantile(pixel_values, 0.998)
    if lightest - darkest < 64:
        # Assume there is essentially no content here and return the original.
        return im, True

    # Apply a rudimentary tone curve to the image, with the goal that the
    # extrema we just calculated will evaluate to values "pretty close to" 0%
    # and 100% of the available range.
    return im.point(
        lambda x: np.interp(x, (0, darkest, lightest, 255), (0, 8, 247, 255))
    ), False
|
|
|
|
|
|
def analyze_sharpness(im: Image.Image) -> float:
    """
    Estimates the sharpness of an image, as a value in the range [0, 1].

    Rather than measuring the peak intensity of an edge-detection/high-pass
    filter pass over the image -- which tends to produce different baseline
    results across documents -- this measures pixel-intensity gradients
    directly, per https://stackoverflow.com/a/26014796.
    """

    dy, dx = np.gradient(np.asarray(im))
    gradient_magnitudes = np.sqrt(dx**2 + dy**2)
    # Take a near-maximum (99th percentile) so that a handful of extreme
    # pixels cannot dominate the score, then scale to the 8-bit value range.
    score = np.quantile(gradient_magnitudes, 0.99) / 255
    return float(np.clip(score, 0, 1))
|
|
|
|
|
|
def compute_ocr(
    leaf: ArchiveLeaf,
    im_normalized: Image.Image,
    ocr_engine: OcrEngine,
    force_recompute: bool = False,
) -> tuple[Optional[OcrResult], ArchiveLeaf]:
    """
    OCR is computationally expensive, so we try to take advantage of
    the Tesseract data already parsed by the Internet Archive and
    embedded in the PDF, when possible. If there is not sufficient
    text in the PDF to be confident that the Archive's OCR
    postprocessing captured it all, then OCR is recomputed locally.

    In some instances, the Archive's OCR detects rotated text but
    parses it as gibberish. To partially mitigate this, we ignore all
    precomputed text blocks with a "portrait" aspect ratio. This will
    not necessarily help with text that is rotated 180 degrees, and will
    not work well with non-latin scripts that are intended to be oriented
    vertically.

    Params:

        leaf             Information for the document page.

        im_normalized    Contrast-normalized image.

        ocr_engine       Engine to use as needed for OCR.

        force_recompute  If `True`, OCR is re-run even if there is already
                         text data associated with the leaf.

    Returns:

        Tuple of `OcrResult` and `ArchiveLeaf` if OCR was recomputed; otherwise
        tuple of `None` and `ArchiveLeaf` if existing text data was reused.
    """

    if not force_recompute:
        PDF_OCR_THRESHOLD_WORDS = 30
        # Count words only in landscape-shaped blocks; portrait-shaped blocks
        # are skipped to mitigate the rotated-gibberish issue described above.
        pdf_word_count = sum(
            len(block.text.split())
            for block in leaf.text_blocks
            if block.x1 - block.x0 > block.y1 - block.y0
        )
        if pdf_word_count >= PDF_OCR_THRESHOLD_WORDS:
            return None, leaf

    ocr_result = ocr_engine.process(im_normalized)
    # Rebuild the leaf with the freshly OCR'ed blocks (coordinates coerced to
    # int) rather than mutating the one that was passed in.
    return ocr_result, ArchiveLeaf(
        image=leaf.image,
        page_number=leaf.page_number,
        text_blocks=[
            TextBlock(
                x0=int(block.x0),
                y0=int(block.y0),
                x1=int(block.x1),
                y1=int(block.y1),
                text=block.text,
            )
            for block in ocr_result.blocks
        ],
    )
|
|
|
|
|
|
def compute_text_margin_px(
    im_size: tuple[int, int], text_blocks: list[TextBlock]
) -> Optional[int]:
    """
    Infer the margins between OCR'ed text blocks and the edge of their page's
    bounds. This helps to detect if adjacent content may be cropped out.

    Params:

        im_size      Dimensions of the page image, in pixels.

        text_blocks  List of text blocks detected by OCR.

    Returns:

        Integer pixel count if text is present; otherwise `None`.
    """

    # For every block, collect its distance to each of the four page edges,
    # then sort ascending so the tightest margins come first.
    if text_blocks:
        word_margins_all_directions = np.sort(
            # Use `np.concatenate` rather than the `np.concat` alias, which
            # only exists in NumPy >= 2.0.
            np.concatenate(
                [
                    np.array(
                        [
                            block.x0,
                            block.y0,
                            im_size[0] - block.x1,
                            im_size[1] - block.y1,
                        ]
                    )
                    for block in text_blocks
                ]
            ).astype(np.int_)
        )
    else:
        word_margins_all_directions = np.array([])

    # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
    SKIP_WORDS = 2
    if word_margins_all_directions.shape[0] <= SKIP_WORDS:
        return None
    return int(word_margins_all_directions[SKIP_WORDS])
|