"""Read item IDs from stdin and emit per-item analysis (or a QA summary) as JSON lines."""

import json
import re
from argparse import ArgumentParser
from dataclasses import dataclass
from multiprocessing.pool import ThreadPool
from sys import stderr, stdin, stdout

from engine import analyze_item

# Languages handed to the OCR engine (Tesseract-style "+"-joined codes).
OCR_LANGS = "eng+fra"


def main():
    """Parse CLI flags and dispatch each item ID read from stdin to a worker pool."""
    parser = ArgumentParser()
    parser.add_argument("--summarize", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-w", "--workers", type=int, default=1)
    parser.add_argument("--page-margin-px", type=int, default=50)
    args = parser.parse_args()

    # Pick the worker once instead of duplicating the pool.map call per branch.
    worker = _summarize_item_to_stdout if args.summarize else _analyze_item_to_stdout

    # Process stdin line by line, where each line contains one or more item IDs
    # separated by whitespace and/or commas.
    for line in stdin:
        item_ids = [value for value in re.split(r",|\s", line) if value]
        with ThreadPool(args.workers) as pool:
            if args.verbose:
                print(f"Running with {args.workers} workers.", file=stderr)
                stderr.flush()
            pool.map(
                worker,
                [
                    ItemTask(
                        item_id=item_id,
                        page_margin_px=args.page_margin_px,
                        verbose=args.verbose,
                    )
                    for item_id in item_ids
                ],
            )


@dataclass
class ItemTask:
    """Arguments for one worker invocation."""

    item_id: str        # identifier passed through to analyze_item
    page_margin_px: int # minimum acceptable text margin, in pixels
    verbose: bool       # echo progress to stderr when True


def _run_start_pages(flags: list, threshold: int) -> list:
    """Return the 1-indexed start pages of each maximal run of at least
    `threshold` consecutive True values in `flags`.

    Replaces the previous slice-and-shift implementation, which compared each
    window only against its predecessor and therefore silently dropped a
    qualifying run that began on the very first page.
    """
    starts = []
    run_length = 0
    for index, flagged in enumerate(flags):
        run_length = run_length + 1 if flagged else 0
        # A maximal run crosses the threshold exactly once, at its
        # threshold-th element; back up to the run's first page (1-indexed).
        if run_length == threshold:
            starts.append(index - threshold + 2)
    return starts


def _summarize_item_to_stdout(task):
    """Analyze one item and print a one-line JSON QA summary iff any check flags.

    Checks: pages whose OCR orientation disagrees with the stored one, pages
    whose text margin is below the configured minimum, and the start pages of
    runs of blank or blurry pages.
    """
    item_id = task.item_id
    page_margin_px = task.page_margin_px
    verbose = task.verbose

    if verbose:
        print(f"Summarizing item {item_id}...", file=stderr)
        stderr.flush()

    # NOTE(review): parallel=4 here vs parallel=6 in _analyze_item_to_stdout —
    # confirm the difference is intentional.
    analysis = analyze_item(item_id, parallel=4, verbose=verbose)
    pages = analysis["pages"]

    # 3 or more blank pages in a row is a flag.
    CONSECUTIVE_BLANKS_THRESHOLD = 3
    consecutive_blanks = _run_start_pages(
        [page["blank"] for page in pages], CONSECUTIVE_BLANKS_THRESHOLD
    )

    # 3 or more blurry pages in a row is a flag.
    CONSECUTIVE_BLURRY_THRESHOLD = 3
    SHARPNESS_THRESHOLD = 0.1
    consecutive_blurry = _run_start_pages(
        [page["sharpness"] < SHARPNESS_THRESHOLD for page in pages],
        CONSECUTIVE_BLURRY_THRESHOLD,
    )

    # 1-indexed pages whose OCR-detected orientation disagrees with the stored one.
    check_orientation = [
        i + 1 for i, page in enumerate(pages) if not page["ocr_orientation_match"]
    ]
    # 1-indexed pages whose text sits closer to the edge than the allowed margin.
    check_crop = [
        i + 1 for i, page in enumerate(pages) if page["text_margin_px"] < page_margin_px
    ]

    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
        print(
            json.dumps(
                {
                    "item_id": item_id,
                    "check_orientation": check_orientation,
                    "check_crop": check_crop,
                    "consecutive_blanks": consecutive_blanks,
                    "consecutive_blurry": consecutive_blurry,
                }
            )
        )
        stdout.flush()

    if verbose:
        print(f"Done summarizing item {item_id}.", file=stderr)
        stderr.flush()


def _analyze_item_to_stdout(task):
    """Run the full analysis for one item and print the result as a JSON line."""
    item_id = task.item_id
    verbose = task.verbose

    if verbose:
        print(f"Analyzing item {item_id}...", file=stderr)
        stderr.flush()

    print(json.dumps(analyze_item(item_id, parallel=6, verbose=verbose)))
    stdout.flush()

    if verbose:
        print(f"Done analyzing item {item_id}.", file=stderr)
        stderr.flush()


if __name__ == "__main__":
    main()