Re-run OCR as needed to screen false positives
This commit is contained in:
parent
9a94719dc1
commit
f0a4ef253b
3 changed files with 194 additions and 100 deletions
|
|
@@ -62,18 +62,33 @@ def main():
|
|||
[
|
||||
i
|
||||
for i, page in enumerate(doc["pages"])
|
||||
if 45 < page["page_angle"] < 315
|
||||
if 30 < page["page_angle"] < 330
|
||||
]
|
||||
for doc in analyses
|
||||
],
|
||||
"sharpness_max": max(
|
||||
*[page["sharpness"] for doc in analyses for page in doc["pages"]]
|
||||
[
|
||||
page["sharpness"]
|
||||
for doc in analyses
|
||||
for page in doc["pages"]
|
||||
if page["sharpness"] is not None
|
||||
]
|
||||
),
|
||||
"sharpness_median": np.median(
|
||||
[page["sharpness"] for doc in analyses for page in doc["pages"]]
|
||||
[
|
||||
page["sharpness"]
|
||||
for doc in analyses
|
||||
for page in doc["pages"]
|
||||
if page["sharpness"] is not None
|
||||
]
|
||||
).tolist(),
|
||||
"sharpness_min": min(
|
||||
*[page["sharpness"] for doc in analyses for page in doc["pages"]]
|
||||
[
|
||||
page["sharpness"]
|
||||
for doc in analyses
|
||||
for page in doc["pages"]
|
||||
if page["sharpness"] is not None
|
||||
]
|
||||
),
|
||||
}
|
||||
)
|
||||
|
|
|
|||
2
main.py
2
main.py
|
|
@@ -275,7 +275,7 @@ insert into phono.items (ia_id, review_date, oai_updatedate, url)
|
|||
[
|
||||
doc["identifier"],
|
||||
doc.get("review_date"),
|
||||
max(*[datetime.fromisoformat(t) for t in doc["oai_updatedate"]]),
|
||||
max([datetime.fromisoformat(t) for t in doc["oai_updatedate"]]),
|
||||
]
|
||||
for doc in batch
|
||||
],
|
||||
|
|
|
|||
|
|
@@ -1,10 +1,11 @@
|
|||
from sys import stdout
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from .items import ArchiveDoc, ArchiveLeaf
|
||||
from .ocr import OcrEngine, TextBlock
|
||||
from .ocr import OcrEngine, OcrResult, TextBlock
|
||||
|
||||
|
||||
def analyze_doc(
|
||||
|
|
@@ -30,99 +31,64 @@ def analyze_doc(
|
|||
|
||||
analyzed_pages = []
|
||||
for leaf in all_leaves:
|
||||
im, is_blank = normalize_contrast_for_text(leaf.image)
|
||||
|
||||
im_cropped = im.crop(
|
||||
(
|
||||
im.size[0] * 0.1,
|
||||
im.size[1] * 0.1,
|
||||
im.size[0] * 0.9,
|
||||
im.size[1] * 0.9,
|
||||
im_normalized, is_blank = normalize_contrast_for_text(leaf.image)
|
||||
if is_blank:
|
||||
analyzed_pages.append(
|
||||
{
|
||||
"is_blank": True,
|
||||
"page_angle": 0,
|
||||
"size_analyzed": leaf.image.size,
|
||||
"sharpness": None,
|
||||
"text_margin_px": None,
|
||||
}
|
||||
)
|
||||
)
|
||||
sharpness = analyze_sharpness(im_cropped)
|
||||
|
||||
# OCR is computationally expensive, so we try to take advantage of
|
||||
# the Tesseract data already parsed by the Internet Archive and
|
||||
# embedded in the PDF, when possible. If there is not sufficient
|
||||
# text in the PDF to be confident that the Archive's OCR
|
||||
# postprocessing captured it all, then OCR is recomputed locally.
|
||||
#
|
||||
# In some instances, the Archive's OCR detects rotated text but
|
||||
# parses it as gibberish. To partially mitigate this, we ignore all
|
||||
# precomputed text blocks with a "portrait" aspect ratio. This will
|
||||
# not necessarily help with text that is rotated 180 degrees, but in
|
||||
# practice that case is rarely encountered. This will also not work
|
||||
# well with non-latin scripts that are intended to be oriented
|
||||
# vertically.
|
||||
OCR_RECOMPUTE_THRESHOLD_WORDS = 30
|
||||
if (
|
||||
sum(
|
||||
(
|
||||
len(block.text.split())
|
||||
for block in leaf.text_blocks
|
||||
if block.x1 - block.x0 > block.y1 - block.y0
|
||||
)
|
||||
)
|
||||
>= OCR_RECOMPUTE_THRESHOLD_WORDS
|
||||
):
|
||||
ocred_leaf = leaf
|
||||
page_angle = 0
|
||||
else:
|
||||
OCR_SCALE = 1
|
||||
im_scaled = im.resize(np.int_(np.array(im.size) * OCR_SCALE))
|
||||
ocr_result = ocr_engine.process(im_scaled)
|
||||
ocred_leaf = ArchiveLeaf(
|
||||
image=im,
|
||||
page_number=leaf.page_number,
|
||||
text_blocks=[
|
||||
TextBlock(
|
||||
x0=int(block.x0 / OCR_SCALE),
|
||||
y0=int(block.y0 / OCR_SCALE),
|
||||
x1=int(block.x1 / OCR_SCALE),
|
||||
y1=int(block.y1 / OCR_SCALE),
|
||||
text=block.text,
|
||||
sharpness = analyze_sharpness(
|
||||
# Exclude edges, which typically include the page border.
|
||||
im_normalized.crop(
|
||||
(
|
||||
im_normalized.size[0] * 0.15,
|
||||
im_normalized.size[1] * 0.15,
|
||||
im_normalized.size[0] * 0.85,
|
||||
im_normalized.size[1] * 0.85,
|
||||
)
|
||||
for block in ocr_result.blocks
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
ocr_result, leaf = compute_ocr(leaf, im_normalized, ocr_engine)
|
||||
page_angle = 0 if ocr_result is None else ocr_result.page_angle
|
||||
text_margin_px = compute_text_margin_px(
|
||||
im_normalized.size, leaf.text_blocks
|
||||
)
|
||||
|
||||
# If OCR turns up issues based on the PDF's original text boxes,
|
||||
# re-run it ourselves to help weed out false positives.
|
||||
CLIPPING_THRESHOLD_PX = 30
|
||||
ROT_THRESHOLD_DEG = 30
|
||||
if ocr_result is None and (
|
||||
ROT_THRESHOLD_DEG < page_angle < 360 - ROT_THRESHOLD_DEG
|
||||
or text_margin_px < CLIPPING_THRESHOLD_PX
|
||||
):
|
||||
ocr_result, leaf = compute_ocr(
|
||||
leaf, im_normalized, ocr_engine, force_recompute=True
|
||||
)
|
||||
assert ocr_result is not None, (
|
||||
"compute_ocr(..., force_recompute=True) should always return an OcrResult"
|
||||
)
|
||||
page_angle = ocr_result.page_angle
|
||||
|
||||
word_margins_all_directions = (
|
||||
np.sort(
|
||||
np.concat(
|
||||
[
|
||||
np.array(
|
||||
[
|
||||
block.x0,
|
||||
block.y0,
|
||||
im.size[0] - block.x1,
|
||||
im.size[1] - block.y1,
|
||||
]
|
||||
)
|
||||
for block in ocred_leaf.text_blocks
|
||||
]
|
||||
).astype(np.int_)
|
||||
)
|
||||
if len(ocred_leaf.text_blocks) > 0
|
||||
else np.array([])
|
||||
)
|
||||
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
||||
SKIP_WORDS = 2
|
||||
text_margin_px = int(
|
||||
word_margins_all_directions[SKIP_WORDS]
|
||||
if word_margins_all_directions.shape[0] > SKIP_WORDS
|
||||
else -1
|
||||
text_margin_px = compute_text_margin_px(
|
||||
im_normalized.size, leaf.text_blocks
|
||||
)
|
||||
|
||||
# Make sure the OCR engine is running with orientation detection.
|
||||
assert page_angle is not None
|
||||
assert page_angle is not None, (
|
||||
"OCR engine should be running with page orientation detection"
|
||||
)
|
||||
|
||||
analyzed_pages.append(
|
||||
{
|
||||
"is_blank": is_blank,
|
||||
"is_blank": False,
|
||||
"page_angle": page_angle,
|
||||
"size_analyzed": im.size,
|
||||
"size_analyzed": leaf.image.size,
|
||||
"sharpness": sharpness,
|
||||
"text_margin_px": text_margin_px,
|
||||
}
|
||||
|
|
@@ -154,13 +120,15 @@ def normalize_contrast_for_text(im: Image.Image) -> tuple[Image.Image, bool]:
|
|||
|
||||
(normalized_image, is_blank)
|
||||
"""
|
||||
|
||||
pixel_values = np.asarray(
|
||||
# Exclude edges, which typically include the page border.
|
||||
im.crop(
|
||||
(
|
||||
im.size[0] * 0.1,
|
||||
im.size[1] * 0.1,
|
||||
im.size[0] * 0.9,
|
||||
im.size[1] * 0.9,
|
||||
im.size[0] * 0.15,
|
||||
im.size[1] * 0.15,
|
||||
im.size[0] * 0.85,
|
||||
im.size[1] * 0.85,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
|
@@ -189,4 +157,115 @@ def analyze_sharpness(im: Image.Image) -> float:
|
|||
# computation based on https://stackoverflow.com/a/26014796.
|
||||
grad_y, grad_x = np.gradient(np.asarray(im))
|
||||
return float(np.clip(np.quantile(np.sqrt(grad_x**2 + grad_y**2), 0.99) / 255, 0, 1))
|
||||
|
||||
|
||||
def compute_ocr(
    leaf: ArchiveLeaf,
    im_normalized: Image.Image,
    ocr_engine: OcrEngine,
    force_recompute: bool = False,
) -> tuple[Optional[OcrResult], ArchiveLeaf]:
    """
    Reuse existing PDF text data for a page, or recompute OCR locally.

    OCR is computationally expensive, so we try to take advantage of
    the Tesseract data already parsed by the Internet Archive and
    embedded in the PDF, when possible. If there is not sufficient
    text in the PDF to be confident that the Archive's OCR
    postprocessing captured it all, then OCR is recomputed locally.

    In some instances, the Archive's OCR detects rotated text but
    parses it as gibberish. To partially mitigate this, we ignore all
    precomputed text blocks with a "portrait" aspect ratio. This will
    not necessarily help with text that is rotated 180 degrees, and will
    not work well with non-latin scripts that are intended to be oriented
    vertically.

    Params:

        leaf              Information for the document page.

        im_normalized     Contrast-normalized image.

        ocr_engine        Engine to use as needed for OCR.

        force_recompute   If `True`, OCR is re-run even if there is already
                          text data associated with the leaf.

    Returns:

        Tuple of `OcrResult` and `ArchiveLeaf` if OCR was recomputed; otherwise
        tuple of `None` and `ArchiveLeaf` if existing text data was reused.
    """

    if not force_recompute:
        PDF_OCR_THRESHOLD_WORDS = 30
        # Only count words in "landscape" blocks; portrait-aspect blocks are
        # likely rotated text that the Archive's OCR parsed as gibberish.
        pdf_word_count = sum(
            len(block.text.split())
            for block in leaf.text_blocks
            if block.x1 - block.x0 > block.y1 - block.y0
        )
        if pdf_word_count >= PDF_OCR_THRESHOLD_WORDS:
            # Enough trustworthy text already embedded in the PDF; reuse it.
            return None, leaf

    ocr_result = ocr_engine.process(im_normalized)
    return ocr_result, ArchiveLeaf(
        image=leaf.image,
        page_number=leaf.page_number,
        text_blocks=[
            TextBlock(
                x0=int(block.x0),
                y0=int(block.y0),
                x1=int(block.x1),
                y1=int(block.y1),
                text=block.text,
            )
            for block in ocr_result.blocks
        ],
    )
|
||||
|
||||
|
||||
def compute_text_margin_px(
    im_size: tuple[int, int], text_blocks: list[TextBlock]
) -> Optional[int]:
    """
    Infer the margins between OCR'ed text blocks and the edge of their page's
    bounds. This helps to detect if adjacent content may be cropped out.

    Params:

        im_size       Dimensions of the page image, in pixels.

        text_blocks   List of text blocks detected by OCR.

    Returns:

        Integer pixel count if text is present; otherwise `None`.
    """

    if not text_blocks:
        return None

    # Each block contributes its distance to all four page edges.
    word_margins_all_directions = np.sort(
        # `np.concat` is only available as an alias in NumPy >= 2.0;
        # `np.concatenate` is the portable name.
        np.concatenate(
            [
                np.array(
                    [
                        block.x0,
                        block.y0,
                        im_size[0] - block.x1,
                        im_size[1] - block.y1,
                    ]
                )
                for block in text_blocks
            ]
        ).astype(np.int_)
    )
    # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
    SKIP_WORDS = 2
    return (
        int(word_margins_all_directions[SKIP_WORDS])
        if word_margins_all_directions.shape[0] > SKIP_WORDS
        else None
    )
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue