from sys import stdout

import numpy as np
from PIL import Image, ImageFilter

from .items import ArchiveDoc, ArchiveLeaf
from .ocr import OcrEngine, TextBlock


def analyze_doc(
    doc: ArchiveDoc,
    ocr_engine: OcrEngine,
    use_cache: bool = False,
    verbose: bool = False,
):
    """
    Analyzes all pages in an ArchiveDoc for useful metrics such as sharpness,
    orientation, presence of text overflows, and so on.

    Args:
        doc: Document whose pages (leaves) are analyzed.
        ocr_engine: Engine used when OCR must be recomputed locally.
        use_cache: Forwarded to ``doc.fetch_leaves``.
        verbose: Print progress information to stdout.

    Returns:
        ``{"pages": [...]}`` where each per-page dict has the keys
        ``blank``, ``page_angle``, ``size_analyzed``, ``sharpness``, and
        ``text_margin_px``.
    """
    if verbose:
        print(f"Loading {doc.name}...")
        stdout.flush()

    all_leaves = doc.fetch_leaves(use_cache=use_cache)

    if verbose:
        print(f"Processing {len(all_leaves)} pages...")
        stdout.flush()

    analyzed_pages = []
    for leaf in all_leaves:
        # Analyze only the central 80% of the page; scan borders often
        # contain clutter unrelated to the page content.
        im_cropped = leaf.image.crop(
            (
                leaf.image.size[0] * 0.1,
                leaf.image.size[1] * 0.1,
                leaf.image.size[0] * 0.9,
                leaf.image.size[1] * 0.9,
            )
        )

        # A page is "blank" when even the darkest central pixel is brighter
        # than 80% of full brightness.
        # NOTE(review): getextrema()[0] is the band minimum only for
        # single-band images; for RGB it would be the first band's
        # (min, max) tuple — confirm leaves are grayscale.
        is_blank = im_cropped.getextrema()[0] > 255 * 0.8

        if is_blank:
            max_sharpness = 1.0
            text_margin_px = -1
            page_angle = 0
        else:
            max_sharpness = _chunked_sharpness(im_cropped)
            ocred_leaf, page_angle = _get_text_blocks(leaf, ocr_engine, verbose)
            text_margin_px = _text_margin_px(ocred_leaf)

        # Make sure the OCR engine is running with orientation detection.
        assert page_angle is not None

        analyzed_pages.append(
            {
                "blank": is_blank,
                "page_angle": page_angle,
                "size_analyzed": leaf.image.size,
                "sharpness": max_sharpness,
                "text_margin_px": text_margin_px,
            }
        )

    return {"pages": analyzed_pages}


def _chunked_sharpness(im_cropped: Image.Image) -> float:
    """
    Returns the maximum per-chunk sharpness of an image.

    Sharpness is determined by percentile of pixels that match some
    criteria, so it may vary significantly depending on which portion of the
    image is analyzed. In an effort to identify the sharpest edges, we split
    up the image into chunks and assume that the highest sharpness value
    obtained across all chunks is representative of the image as a whole.
    """
    if im_cropped.size[0] < im_cropped.size[1]:
        # Page is in portrait orientation.
        segments_x, segments_y = 2, 3
    else:
        # Page is in landscape orientation.
        segments_x, segments_y = 3, 2

    max_sharpness = 0.0
    for i in range(segments_x):
        for j in range(segments_y):
            chunk = im_cropped.crop(
                (
                    im_cropped.size[0] / segments_x * i,
                    im_cropped.size[1] / segments_y * j,
                    im_cropped.size[0] / segments_x * (i + 1),
                    im_cropped.size[1] / segments_y * (j + 1),
                )
            )
            max_sharpness = max(max_sharpness, analyze_sharpness(chunk))
    return max_sharpness


def _get_text_blocks(leaf: ArchiveLeaf, ocr_engine: OcrEngine, verbose: bool):
    """
    Returns ``(leaf_with_text_blocks, page_angle)`` for a page.

    OCR is computationally expensive, so we try to take advantage of the
    Tesseract data already parsed by the Internet Archive and embedded in
    the PDF, when possible. If there is not sufficient text in the PDF to be
    confident that the Archive's OCR postprocessing captured it all, then
    OCR is recomputed locally.

    In some instances, the Archive's OCR detects rotated text but parses it
    as gibberish. To partially mitigate this, we ignore all precomputed text
    blocks with a "portrait" aspect ratio. This will not necessarily help
    with text that is rotated 180 degrees, but in practice that case is
    rarely encountered. This will also not work well with non-latin scripts
    that are intended to be oriented vertically.
    """
    OCR_RECOMPUTE_THRESHOLD_WORDS = 30
    pdf_word_count = sum(
        len(block.text.split())
        for block in leaf.text_blocks
        if block.x1 - block.x0 > block.y1 - block.y0
    )
    if pdf_word_count >= OCR_RECOMPUTE_THRESHOLD_WORDS:
        if verbose:
            print("Using PDF text.")
        # Precomputed text is assumed to be upright.
        return leaf, 0

    if verbose:
        print("Using OCR.")
    OCR_SCALE = 1
    # Image.resize expects a 2-tuple of ints (the original passed a numpy
    # array, which is not part of Pillow's documented interface).
    im_scaled = leaf.image.resize(
        tuple(int(dim * OCR_SCALE) for dim in leaf.image.size)
    )
    ocr_result = ocr_engine.process(im_scaled)
    ocred_leaf = ArchiveLeaf(
        image=leaf.image,
        page_number=leaf.page_number,
        text_blocks=[
            # Map block coordinates from the scaled image back to the
            # original image's coordinate space.
            TextBlock(
                x0=int(block.x0 / OCR_SCALE),
                y0=int(block.y0 / OCR_SCALE),
                x1=int(block.x1 / OCR_SCALE),
                y1=int(block.y1 / OCR_SCALE),
                text=block.text,
            )
            for block in ocr_result.blocks
        ],
    )
    return ocred_leaf, ocr_result.page_angle


def _text_margin_px(ocred_leaf: ArchiveLeaf) -> int:
    """
    Returns the pixel distance between the detected text and the nearest
    page edge, or -1 when there are too few words to measure reliably.
    """
    # Skip the n closest words to the edge, to help ignore stray OCR
    # artifacts.
    SKIP_WORDS = 2

    # Bug fix: np.concatenate raises ValueError on an empty list; a page
    # with no text blocks has no measurable margin.
    if not ocred_leaf.text_blocks:
        return -1

    # For every block, collect its distance to each of the four page edges.
    # (np.concat was replaced with np.concatenate, which also exists on
    # NumPy < 2.0.)
    word_margins_all_directions = np.sort(
        np.int_(
            np.concatenate(
                [
                    np.array(
                        [
                            block.x0,
                            block.y0,
                            ocred_leaf.image.size[0] - block.x1,
                            ocred_leaf.image.size[1] - block.y1,
                        ]
                    )
                    for block in ocred_leaf.text_blocks
                ]
            )
        )
    )
    if word_margins_all_directions.shape[0] > SKIP_WORDS:
        return int(word_margins_all_directions[SKIP_WORDS])
    return -1


def analyze_sharpness(im: Image.Image) -> float:
    """
    Crudely quantifies the "sharpness" of edges in an image, on a scale of 0
    to 1. The scale is not linear with respect to scan quality: anything
    above 0.1 is usually fine.
    """
    arr = np.asarray(im)

    # Normalize contrast based on brightest and darkest pixels. For example,
    # NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80%
    # fall between 10% brightness and 90% brightness. In practice, a value
    # around 0.02 seems to work fairly well.
    NORM_QUANTILE = 0.03
    pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
        arr, NORM_QUANTILE
    )
    if pixel_range == 0:
        # Degenerate (near-flat) image: fall back to a plain 0-1 rescale.
        # (Bug fix: the raw 0-255 values used to be clipped to [0, 1],
        # which saturated nearly every nonzero pixel to full brightness and
        # manufactured spurious edges.)
        arr_normalized = arr / 255.0
    else:
        arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
        arr_normalized = (
            arr_normalized
            - np.quantile(arr_normalized, NORM_QUANTILE)
            + NORM_QUANTILE
        )
    arr_normalized = (np.clip(arr_normalized, 0, 1) * 255).astype(np.uint8)

    # "Sharpness" is determined by measuring the median intensity of pixels
    # near edges, after an edge detection filter has been applied to the
    # image.
    edges_arr = np.asarray(
        Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
    )
    EDGE_THRESHOLD = 8
    edge_pixels = edges_arr[edges_arr > EDGE_THRESHOLD]
    if edge_pixels.size == 0:
        # Bug fix: np.median on an empty selection returns nan (and emits a
        # RuntimeWarning); an image with no detectable edges has zero
        # sharpness.
        return 0.0
    return float(np.median(edge_pixels)) / 255