re-run ocr as needed to screen false positives
This commit is contained in:
parent
9a94719dc1
commit
f0a4ef253b
3 changed files with 194 additions and 100 deletions
|
|
@ -62,18 +62,33 @@ def main():
|
||||||
[
|
[
|
||||||
i
|
i
|
||||||
for i, page in enumerate(doc["pages"])
|
for i, page in enumerate(doc["pages"])
|
||||||
if 45 < page["page_angle"] < 315
|
if 30 < page["page_angle"] < 330
|
||||||
]
|
]
|
||||||
for doc in analyses
|
for doc in analyses
|
||||||
],
|
],
|
||||||
"sharpness_max": max(
|
"sharpness_max": max(
|
||||||
*[page["sharpness"] for doc in analyses for page in doc["pages"]]
|
[
|
||||||
|
page["sharpness"]
|
||||||
|
for doc in analyses
|
||||||
|
for page in doc["pages"]
|
||||||
|
if page["sharpness"] is not None
|
||||||
|
]
|
||||||
),
|
),
|
||||||
"sharpness_median": np.median(
|
"sharpness_median": np.median(
|
||||||
[page["sharpness"] for doc in analyses for page in doc["pages"]]
|
[
|
||||||
|
page["sharpness"]
|
||||||
|
for doc in analyses
|
||||||
|
for page in doc["pages"]
|
||||||
|
if page["sharpness"] is not None
|
||||||
|
]
|
||||||
).tolist(),
|
).tolist(),
|
||||||
"sharpness_min": min(
|
"sharpness_min": min(
|
||||||
*[page["sharpness"] for doc in analyses for page in doc["pages"]]
|
[
|
||||||
|
page["sharpness"]
|
||||||
|
for doc in analyses
|
||||||
|
for page in doc["pages"]
|
||||||
|
if page["sharpness"] is not None
|
||||||
|
]
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
|
||||||
2
main.py
2
main.py
|
|
@ -275,7 +275,7 @@ insert into phono.items (ia_id, review_date, oai_updatedate, url)
|
||||||
[
|
[
|
||||||
doc["identifier"],
|
doc["identifier"],
|
||||||
doc.get("review_date"),
|
doc.get("review_date"),
|
||||||
max(*[datetime.fromisoformat(t) for t in doc["oai_updatedate"]]),
|
max([datetime.fromisoformat(t) for t in doc["oai_updatedate"]]),
|
||||||
]
|
]
|
||||||
for doc in batch
|
for doc in batch
|
||||||
],
|
],
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,11 @@
|
||||||
from sys import stdout
|
from sys import stdout
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from .items import ArchiveDoc, ArchiveLeaf
|
from .items import ArchiveDoc, ArchiveLeaf
|
||||||
from .ocr import OcrEngine, TextBlock
|
from .ocr import OcrEngine, OcrResult, TextBlock
|
||||||
|
|
||||||
|
|
||||||
def analyze_doc(
|
def analyze_doc(
|
||||||
|
|
@ -30,103 +31,68 @@ def analyze_doc(
|
||||||
|
|
||||||
analyzed_pages = []
|
analyzed_pages = []
|
||||||
for leaf in all_leaves:
|
for leaf in all_leaves:
|
||||||
im, is_blank = normalize_contrast_for_text(leaf.image)
|
im_normalized, is_blank = normalize_contrast_for_text(leaf.image)
|
||||||
|
if is_blank:
|
||||||
im_cropped = im.crop(
|
analyzed_pages.append(
|
||||||
(
|
{
|
||||||
im.size[0] * 0.1,
|
"is_blank": True,
|
||||||
im.size[1] * 0.1,
|
"page_angle": 0,
|
||||||
im.size[0] * 0.9,
|
"size_analyzed": leaf.image.size,
|
||||||
im.size[1] * 0.9,
|
"sharpness": None,
|
||||||
|
"text_margin_px": None,
|
||||||
|
}
|
||||||
)
|
)
|
||||||
)
|
else:
|
||||||
sharpness = analyze_sharpness(im_cropped)
|
sharpness = analyze_sharpness(
|
||||||
|
# Exclude edges, which typically include the page border.
|
||||||
# OCR is computationally expensive, so we try to take advantage of
|
im_normalized.crop(
|
||||||
# the Tesseract data already parsed by the Internet Archive and
|
(
|
||||||
# embedded in the PDF, when possible. If there is not sufficient
|
im_normalized.size[0] * 0.15,
|
||||||
# text in the PDF to be confident that the Archive's OCR
|
im_normalized.size[1] * 0.15,
|
||||||
# postprocessing captured it all, then OCR is recomputed locally.
|
im_normalized.size[0] * 0.85,
|
||||||
#
|
im_normalized.size[1] * 0.85,
|
||||||
# In some instances, the Archive's OCR detects rotated text but
|
)
|
||||||
# parses it as gibberish. To partially mitigate this, we ignore all
|
|
||||||
# precomputed text blocks with a "portrait" aspect ratio. This will
|
|
||||||
# not necessarily help with text that is rotated 180 degrees, but in
|
|
||||||
# practice that case is rarely encountered. This will also not work
|
|
||||||
# well with non-latin scripts that are intended to be oriented
|
|
||||||
# vertically.
|
|
||||||
OCR_RECOMPUTE_THRESHOLD_WORDS = 30
|
|
||||||
if (
|
|
||||||
sum(
|
|
||||||
(
|
|
||||||
len(block.text.split())
|
|
||||||
for block in leaf.text_blocks
|
|
||||||
if block.x1 - block.x0 > block.y1 - block.y0
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
>= OCR_RECOMPUTE_THRESHOLD_WORDS
|
|
||||||
):
|
ocr_result, leaf = compute_ocr(leaf, im_normalized, ocr_engine)
|
||||||
ocred_leaf = leaf
|
page_angle = 0 if ocr_result is None else ocr_result.page_angle
|
||||||
page_angle = 0
|
text_margin_px = compute_text_margin_px(
|
||||||
else:
|
im_normalized.size, leaf.text_blocks
|
||||||
OCR_SCALE = 1
|
|
||||||
im_scaled = im.resize(np.int_(np.array(im.size) * OCR_SCALE))
|
|
||||||
ocr_result = ocr_engine.process(im_scaled)
|
|
||||||
ocred_leaf = ArchiveLeaf(
|
|
||||||
image=im,
|
|
||||||
page_number=leaf.page_number,
|
|
||||||
text_blocks=[
|
|
||||||
TextBlock(
|
|
||||||
x0=int(block.x0 / OCR_SCALE),
|
|
||||||
y0=int(block.y0 / OCR_SCALE),
|
|
||||||
x1=int(block.x1 / OCR_SCALE),
|
|
||||||
y1=int(block.y1 / OCR_SCALE),
|
|
||||||
text=block.text,
|
|
||||||
)
|
|
||||||
for block in ocr_result.blocks
|
|
||||||
],
|
|
||||||
)
|
)
|
||||||
page_angle = ocr_result.page_angle
|
|
||||||
|
|
||||||
word_margins_all_directions = (
|
# If OCR turns up issues based on the PDF's original text boxes,
|
||||||
np.sort(
|
# re-run it ourselves to help weed out false positives.
|
||||||
np.concat(
|
CLIPPING_THRESHOLD_PX = 30
|
||||||
[
|
ROT_THRESHOLD_DEG = 30
|
||||||
np.array(
|
if ocr_result is None and (
|
||||||
[
|
ROT_THRESHOLD_DEG < page_angle < 360 - ROT_THRESHOLD_DEG
|
||||||
block.x0,
|
or text_margin_px < CLIPPING_THRESHOLD_PX
|
||||||
block.y0,
|
):
|
||||||
im.size[0] - block.x1,
|
ocr_result, leaf = compute_ocr(
|
||||||
im.size[1] - block.y1,
|
leaf, im_normalized, ocr_engine, force_recompute=True
|
||||||
]
|
)
|
||||||
)
|
assert ocr_result is not None, (
|
||||||
for block in ocred_leaf.text_blocks
|
"compute_ocr(..., force_recompute=True) should always return an OcrResult"
|
||||||
]
|
)
|
||||||
).astype(np.int_)
|
page_angle = ocr_result.page_angle
|
||||||
|
text_margin_px = compute_text_margin_px(
|
||||||
|
im_normalized.size, leaf.text_blocks
|
||||||
|
)
|
||||||
|
|
||||||
|
assert page_angle is not None, (
|
||||||
|
"OCR engine should be running with page orientation detection"
|
||||||
)
|
)
|
||||||
if len(ocred_leaf.text_blocks) > 0
|
|
||||||
else np.array([])
|
|
||||||
)
|
|
||||||
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
|
||||||
SKIP_WORDS = 2
|
|
||||||
text_margin_px = int(
|
|
||||||
word_margins_all_directions[SKIP_WORDS]
|
|
||||||
if word_margins_all_directions.shape[0] > SKIP_WORDS
|
|
||||||
else -1
|
|
||||||
)
|
|
||||||
|
|
||||||
# Make sure the OCR engine is running with orientation detection.
|
analyzed_pages.append(
|
||||||
assert page_angle is not None
|
{
|
||||||
|
"is_blank": False,
|
||||||
analyzed_pages.append(
|
"page_angle": page_angle,
|
||||||
{
|
"size_analyzed": leaf.image.size,
|
||||||
"is_blank": is_blank,
|
"sharpness": sharpness,
|
||||||
"page_angle": page_angle,
|
"text_margin_px": text_margin_px,
|
||||||
"size_analyzed": im.size,
|
}
|
||||||
"sharpness": sharpness,
|
)
|
||||||
"text_margin_px": text_margin_px,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
return {"pages": analyzed_pages}
|
return {"pages": analyzed_pages}
|
||||||
|
|
||||||
|
|
@ -154,13 +120,15 @@ def normalize_contrast_for_text(im: Image.Image) -> tuple[Image.Image, bool]:
|
||||||
|
|
||||||
(normalized_image, is_blank)
|
(normalized_image, is_blank)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pixel_values = np.asarray(
|
pixel_values = np.asarray(
|
||||||
|
# Exclude edges, which typically include the page border.
|
||||||
im.crop(
|
im.crop(
|
||||||
(
|
(
|
||||||
im.size[0] * 0.1,
|
im.size[0] * 0.15,
|
||||||
im.size[1] * 0.1,
|
im.size[1] * 0.15,
|
||||||
im.size[0] * 0.9,
|
im.size[0] * 0.85,
|
||||||
im.size[1] * 0.9,
|
im.size[1] * 0.85,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
@ -189,4 +157,115 @@ def analyze_sharpness(im: Image.Image) -> float:
|
||||||
# computation based on https://stackoverflow.com/a/26014796.
|
# computation based on https://stackoverflow.com/a/26014796.
|
||||||
grad_y, grad_x = np.gradient(np.asarray(im))
|
grad_y, grad_x = np.gradient(np.asarray(im))
|
||||||
return float(np.clip(np.quantile(np.sqrt(grad_x**2 + grad_y**2), 0.99) / 255, 0, 1))
|
return float(np.clip(np.quantile(np.sqrt(grad_x**2 + grad_y**2), 0.99) / 255, 0, 1))
|
||||||
|
|
||||||
|
|
||||||
|
def compute_ocr(
    leaf: ArchiveLeaf,
    im_normalized: Image.Image,
    ocr_engine: OcrEngine,
    force_recompute: bool = False,
) -> tuple[Optional[OcrResult], ArchiveLeaf]:
    """
    OCR is computationally expensive, so we try to take advantage of
    the Tesseract data already parsed by the Internet Archive and
    embedded in the PDF, when possible. If there is not sufficient
    text in the PDF to be confident that the Archive's OCR
    postprocessing captured it all, then OCR is recomputed locally.

    In some instances, the Archive's OCR detects rotated text but
    parses it as gibberish. To partially mitigate this, we ignore all
    precomputed text blocks with a "portrait" aspect ratio. This will
    not necessarily help with text that is rotated 180 degrees, and will
    not work well with non-latin scripts that are intended to be oriented
    vertically.

    Params:

    leaf                Information for the document page.

    im_normalized       Contrast-normalized image.

    ocr_engine          Engine to use as needed for OCR.

    force_recompute     If `True`, OCR is re-run even if there is already
                        text data associated with the leaf.

    Returns:

    Tuple of `OcrResult` and `ArchiveLeaf` if OCR was recomputed; otherwise
    tuple of `None` and the unmodified input `ArchiveLeaf` if existing text
    data was reused. A recomputed leaf keeps the original `leaf.image` and
    `leaf.page_number`; only its text blocks are replaced.
    """

    if not force_recompute:
        PDF_OCR_THRESHOLD_WORDS = 30
        # Only "landscape" blocks (wider than tall) count towards the
        # threshold; see the docstring for the rationale.
        pdf_word_count = sum(
            len(block.text.split())
            for block in leaf.text_blocks
            if block.x1 - block.x0 > block.y1 - block.y0
        )
        if pdf_word_count >= PDF_OCR_THRESHOLD_WORDS:
            return None, leaf

    ocr_result = ocr_engine.process(im_normalized)
    return ocr_result, ArchiveLeaf(
        image=leaf.image,
        page_number=leaf.page_number,
        text_blocks=[
            TextBlock(
                # Coordinates are truncated to whole pixels.
                x0=int(block.x0),
                y0=int(block.y0),
                x1=int(block.x1),
                y1=int(block.y1),
                text=block.text,
            )
            for block in ocr_result.blocks
        ],
    )
|
||||||
|
|
||||||
|
|
||||||
|
def compute_text_margin_px(
    im_size: tuple[int, int], text_blocks: list[TextBlock]
) -> Optional[int]:
    """
    Infer the margins between OCR'ed text blocks and the edge of their page's
    bounds. This helps to detect if adjacent content may be cropped out.

    Params:

    im_size         Dimensions of the page image, in pixels.

    text_blocks     List of text blocks detected by OCR.

    Returns:

    Integer pixel count if text is present; otherwise `None`.
    """

    # Skip the n smallest margin values, to help ignore stray OCR artifacts.
    # Note that every block contributes four values (left, top, right,
    # bottom), so this counts margins rather than whole blocks.
    SKIP_WORDS = 2

    if not text_blocks:
        return None

    # `np.concatenate` is used instead of the `np.concat` alias because the
    # alias only exists in NumPy 2.0 and later.
    word_margins_all_directions = np.sort(
        np.concatenate(
            [
                np.array(
                    [
                        block.x0,
                        block.y0,
                        im_size[0] - block.x1,
                        im_size[1] - block.y1,
                    ]
                )
                for block in text_blocks
            ]
        ).astype(np.int_)
    )
    return (
        int(word_margins_all_directions[SKIP_WORDS])
        if word_margins_all_directions.shape[0] > SKIP_WORDS
        else None
    )
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue