diff --git a/diagnostics.py b/diagnostics.py index c6171ad..4e06936 100644 --- a/diagnostics.py +++ b/diagnostics.py @@ -62,18 +62,33 @@ def main(): [ i for i, page in enumerate(doc["pages"]) - if 45 < page["page_angle"] < 315 + if 30 < page["page_angle"] < 330 ] for doc in analyses ], "sharpness_max": max( - *[page["sharpness"] for doc in analyses for page in doc["pages"]] + [ + page["sharpness"] + for doc in analyses + for page in doc["pages"] + if page["sharpness"] is not None + ] ), "sharpness_median": np.median( - [page["sharpness"] for doc in analyses for page in doc["pages"]] + [ + page["sharpness"] + for doc in analyses + for page in doc["pages"] + if page["sharpness"] is not None + ] ).tolist(), "sharpness_min": min( - *[page["sharpness"] for doc in analyses for page in doc["pages"]] + [ + page["sharpness"] + for doc in analyses + for page in doc["pages"] + if page["sharpness"] is not None + ] ), } ) diff --git a/main.py b/main.py index d7853a0..c45d77b 100644 --- a/main.py +++ b/main.py @@ -275,7 +275,7 @@ insert into phono.items (ia_id, review_date, oai_updatedate, url) [ doc["identifier"], doc.get("review_date"), - max(*[datetime.fromisoformat(t) for t in doc["oai_updatedate"]]), + max([datetime.fromisoformat(t) for t in doc["oai_updatedate"]]), ] for doc in batch ], diff --git a/microqa/engine.py b/microqa/engine.py index b88ec74..57fa07e 100644 --- a/microqa/engine.py +++ b/microqa/engine.py @@ -1,10 +1,11 @@ from sys import stdout +from typing import Optional import numpy as np from PIL import Image from .items import ArchiveDoc, ArchiveLeaf -from .ocr import OcrEngine, TextBlock +from .ocr import OcrEngine, OcrResult, TextBlock def analyze_doc( @@ -30,103 +31,68 @@ def analyze_doc( analyzed_pages = [] for leaf in all_leaves: - im, is_blank = normalize_contrast_for_text(leaf.image) - - im_cropped = im.crop( - ( - im.size[0] * 0.1, - im.size[1] * 0.1, - im.size[0] * 0.9, - im.size[1] * 0.9, + im_normalized, is_blank = 
normalize_contrast_for_text(leaf.image) + if is_blank: + analyzed_pages.append( + { + "is_blank": True, + "page_angle": 0, + "size_analyzed": leaf.image.size, + "sharpness": None, + "text_margin_px": None, + } ) - ) - sharpness = analyze_sharpness(im_cropped) - - # OCR is computationally expensive, so we try to take advantage of - # the Tesseract data already parsed by the Internet Archive and - # embedded in the PDF, when possible. If there is not sufficient - # text in the PDF to be confident that the Archive's OCR - # postprocessing captured it all, then OCR is recomputed locally. - # - # In some instances, the Archive's OCR detects rotated text but - # parses it as gibberish. To partially mitigate this, we ignore all - # precomputed text blocks with a "portrait" aspect ratio. This will - # not necessarily help with text that is rotated 180 degrees, but in - # practice that case is rarely encountered. This will also not work - # well with non-latin scripts that are intended to be oriented - # vertically. - OCR_RECOMPUTE_THRESHOLD_WORDS = 30 - if ( - sum( - ( - len(block.text.split()) - for block in leaf.text_blocks - if block.x1 - block.x0 > block.y1 - block.y0 + else: + sharpness = analyze_sharpness( + # Exclude edges, which typically include the page border. 
+ im_normalized.crop( + ( + im_normalized.size[0] * 0.15, + im_normalized.size[1] * 0.15, + im_normalized.size[0] * 0.85, + im_normalized.size[1] * 0.85, + ) ) ) - >= OCR_RECOMPUTE_THRESHOLD_WORDS - ): - ocred_leaf = leaf - page_angle = 0 - else: - OCR_SCALE = 1 - im_scaled = im.resize(np.int_(np.array(im.size) * OCR_SCALE)) - ocr_result = ocr_engine.process(im_scaled) - ocred_leaf = ArchiveLeaf( - image=im, - page_number=leaf.page_number, - text_blocks=[ - TextBlock( - x0=int(block.x0 / OCR_SCALE), - y0=int(block.y0 / OCR_SCALE), - x1=int(block.x1 / OCR_SCALE), - y1=int(block.y1 / OCR_SCALE), - text=block.text, - ) - for block in ocr_result.blocks - ], + + ocr_result, leaf = compute_ocr(leaf, im_normalized, ocr_engine) + page_angle = 0 if ocr_result is None else ocr_result.page_angle + text_margin_px = compute_text_margin_px( + im_normalized.size, leaf.text_blocks ) - page_angle = ocr_result.page_angle - word_margins_all_directions = ( - np.sort( - np.concat( - [ - np.array( - [ - block.x0, - block.y0, - im.size[0] - block.x1, - im.size[1] - block.y1, - ] - ) - for block in ocred_leaf.text_blocks - ] - ).astype(np.int_) + # If OCR turns up issues based on the PDF's original text boxes, + # re-run it ourselves to help weed out false positives. 
+ CLIPPING_THRESHOLD_PX = 30 + ROT_THRESHOLD_DEG = 30 + if ocr_result is None and ( + ROT_THRESHOLD_DEG < page_angle < 360 - ROT_THRESHOLD_DEG + or (text_margin_px is not None and text_margin_px < CLIPPING_THRESHOLD_PX) + ): + ocr_result, leaf = compute_ocr( + leaf, im_normalized, ocr_engine, force_recompute=True + ) + assert ocr_result is not None, ( + "compute_ocr(..., force_recompute=True) should always return an OcrResult" + ) + page_angle = ocr_result.page_angle + text_margin_px = compute_text_margin_px( + im_normalized.size, leaf.text_blocks + ) + + assert page_angle is not None, ( + "OCR engine should be running with page orientation detection" ) - if len(ocred_leaf.text_blocks) > 0 - else np.array([]) - ) - # Skip the n closest words to the edge, to help ignore stray OCR artifacts. - SKIP_WORDS = 2 - text_margin_px = int( - word_margins_all_directions[SKIP_WORDS] - if word_margins_all_directions.shape[0] > SKIP_WORDS - else -1 - ) - # Make sure the OCR engine is running with orientation detection. - assert page_angle is not None - - analyzed_pages.append( - { - "is_blank": is_blank, - "page_angle": page_angle, - "size_analyzed": im.size, - "sharpness": sharpness, - "text_margin_px": text_margin_px, - } - ) + analyzed_pages.append( + { + "is_blank": False, + "page_angle": page_angle, + "size_analyzed": leaf.image.size, + "sharpness": sharpness, + "text_margin_px": text_margin_px, + } + ) return {"pages": analyzed_pages} @@ -154,13 +120,15 @@ def normalize_contrast_for_text(im: Image.Image) -> tuple[Image.Image, bool]: (normalized_image, is_blank) """ + pixel_values = np.asarray( + # Exclude edges, which typically include the page border. im.crop( ( - im.size[0] * 0.1, - im.size[1] * 0.1, - im.size[0] * 0.9, - im.size[1] * 0.9, + im.size[0] * 0.15, + im.size[1] * 0.15, + im.size[0] * 0.85, + im.size[1] * 0.85, ) ) ) @@ -189,4 +157,115 @@ def analyze_sharpness(im: Image.Image) -> float: # computation based on https://stackoverflow.com/a/26014796. 
grad_y, grad_x = np.gradient(np.asarray(im)) return float(np.clip(np.quantile(np.sqrt(grad_x**2 + grad_y**2), 0.99) / 255, 0, 1)) + + +def compute_ocr( + leaf: ArchiveLeaf, + im_normalized: Image.Image, + ocr_engine: OcrEngine, + force_recompute: bool = False, +) -> tuple[Optional[OcrResult], ArchiveLeaf]: """ + OCR is computationally expensive, so we try to take advantage of + the Tesseract data already parsed by the Internet Archive and + embedded in the PDF, when possible. If there is not sufficient + text in the PDF to be confident that the Archive's OCR + postprocessing captured it all, then OCR is recomputed locally. + + In some instances, the Archive's OCR detects rotated text but + parses it as gibberish. To partially mitigate this, we ignore all + precomputed text blocks with a "portrait" aspect ratio. This will + not necessarily help with text that is rotated 180 degrees, and will + not work well with non-latin scripts that are intended to be oriented + vertically. + + Params: + + leaf Information for the document page. + + im_normalized Contrast-normalized image. + + ocr_engine Engine to use as needed for OCR. + + force_recompute If `True`, OCR is re-run even if there is already + text data associated with the leaf. + + Returns: + + Tuple of `OcrResult` and `ArchiveLeaf` if OCR was recomputed; otherwise + tuple of `None` and `ArchiveLeaf` if existing text data was reused. 
+ """ + + if not force_recompute: + PDF_OCR_THRESHOLD_WORDS = 30 + pdf_word_count = sum( + ( + len(block.text.split()) + for block in leaf.text_blocks + if block.x1 - block.x0 > block.y1 - block.y0 + ) + ) + if pdf_word_count >= PDF_OCR_THRESHOLD_WORDS: + return None, leaf + + ocr_result = ocr_engine.process(im_normalized) + return ocr_result, ArchiveLeaf( + image=leaf.image, + page_number=leaf.page_number, + text_blocks=[ + TextBlock( + x0=int(block.x0), + y0=int(block.y0), + x1=int(block.x1), + y1=int(block.y1), + text=block.text, + ) + for block in ocr_result.blocks + ], + ) + + +def compute_text_margin_px( + im_size: tuple[int, int], text_blocks: list[TextBlock] +) -> Optional[int]: + """ + Infer the margins between OCR'ed text blocks and the edge of their page's + bounds. This helps to detect if adjacent content may be cropped out. + + Params: + + im_size Dimensions of the page image, in pixels. + + text_blocks List of text blocks detected by OCR. + + Returns: + + Integer pixel count if text is present; otherwise `None`. + """ + + word_margins_all_directions = ( + np.sort( + np.concat( + [ + np.array( + [ + block.x0, + block.y0, + im_size[0] - block.x1, + im_size[1] - block.y1, + ] + ) + for block in text_blocks + ] + ).astype(np.int_) + ) + if len(text_blocks) > 0 + else np.array([]) + ) + # Skip the n closest words to the edge, to help ignore stray OCR artifacts. + SKIP_WORDS = 2 + return ( + int(word_margins_all_directions[SKIP_WORDS]) + if word_margins_all_directions.shape[0] > SKIP_WORDS + else None + )