From d33a7dc515faf76c93c035f387c2c357b168784a Mon Sep 17 00:00:00 2001 From: Brent Schroeter Date: Sun, 10 Aug 2025 22:56:25 -0700 Subject: [PATCH] crop detection tuning --- .gitignore | 1 + main.py | 53 ++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index ea8c4bf..a727c0a 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +/data diff --git a/main.py b/main.py index 33332ea..d2d77ae 100644 --- a/main.py +++ b/main.py @@ -24,6 +24,7 @@ def main(): parser.add_argument("--summarize", action="store_true") parser.add_argument("-v", "--verbose", action="store_true") parser.add_argument("-w", "--workers", type=int, default=1) + parser.add_argument("--page-margin-px", type=int, default=50) args = parser.parse_args() # Process STDIN line by line, where each line contains one or more item IDs @@ -38,7 +39,11 @@ def main(): pool.map( _summarize_item_to_stdout, [ - ItemTask(item_id=item_id, verbose=args.verbose) + ItemTask( + item_id=item_id, + page_margin_px=args.page_margin_px, + verbose=args.verbose, + ) for item_id in item_ids ], ) @@ -46,7 +51,11 @@ def main(): pool.map( _analyze_item_to_stdout, [ - ItemTask(item_id=item_id, verbose=args.verbose) + ItemTask( + item_id=item_id, + page_margin_px=args.page_margin_px, + verbose=args.verbose, + ) for item_id in item_ids ], ) @@ -55,18 +64,22 @@ def main(): @dataclass class ItemTask: item_id: str + page_margin_px: int verbose: bool def _summarize_item_to_stdout(task): item_id = task.item_id + page_margin_px = task.page_margin_px verbose = task.verbose if verbose: print(f"Summarizing item {item_id}...", file=stderr) stderr.flush() - analysis = analyze_item(item_id, True, verbose) + analysis = analyze_item( + item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose + ) # 3 or more blank pages in a row is a flag. CONSECUTIVE_BLANKS_THRESHOLD = 3 @@ -111,8 +124,11 @@ def _summarize_item_to_stdout(task): if not page["ocr_orientation_match"] ] + WORDS_NEAR_EDGE_THRESHOLD = 2 check_crop = [ - i + 1 for i, page in enumerate(analysis["pages"]) if page["words_near_edge"] > 2 + i + 1 + for i, page in enumerate(analysis["pages"]) + if page["words_near_edge"] > WORDS_NEAR_EDGE_THRESHOLD ] if check_orientation or check_crop or consecutive_blanks or consecutive_blurry: @@ -136,13 +152,20 @@ def _summarize_item_to_stdout(task): def _analyze_item_to_stdout(task): item_id = task.item_id + page_margin_px = task.page_margin_px verbose = task.verbose if verbose: print(f"Analyzing item {item_id}...", file=stderr) stderr.flush() - print(json.dumps(analyze_item(item_id, True, verbose))) + print( + json.dumps( + analyze_item( + item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose + ) + ) + ) stdout.flush() if verbose: @@ -154,12 +177,14 @@ def _analyze_item_to_stdout(task): class PageAnalysisTask: im: Image.Image page_index: int + page_margin_px: int file_name: str def _analyze_page(task): im_original = task.im page_index = task.page_index + page_margin_px = task.page_margin_px file_name = task.file_name im_cropped = im_original.crop( @@ -211,7 +236,7 @@ def _analyze_page(task): ocr = pytesseract.image_to_data( im_rotated, lang=OCR_LANGS, - config="--oem 1 --dpi 300 --tessdata-dir ./tessdata_fast-4.1.0", + config="--oem 1 --dpi 300 --tessdata-dir ./data/tessdata_fast-4.1.0", output_type=pytesseract.Output.DATAFRAME, ).fillna({"text": ""}) words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])] @@ -237,17 +262,16 @@ def _analyze_page(task): if best_ocr_orientation % 2 == 0 else (im_original.size[1], im_original.size[0]) ) - EDGE_TOLERANCE = 0.03 words_near_edge = best_ocr_words[ - (best_ocr_words["left"] < best_ocr_dims[0] * EDGE_TOLERANCE) - | (best_ocr_words["top"] < best_ocr_dims[1] * EDGE_TOLERANCE) + (best_ocr_words["left"] < page_margin_px) + | (best_ocr_words["top"] < page_margin_px) | ( best_ocr_words["left"] + best_ocr_words["width"] - > best_ocr_dims[0] * (1 - EDGE_TOLERANCE) + > best_ocr_dims[0] - page_margin_px ) | ( best_ocr_words["top"] + best_ocr_words["height"] - > best_ocr_dims[1] * (1 - EDGE_TOLERANCE) + > best_ocr_dims[1] - page_margin_px ) ] words_near_edge = words_near_edge.shape[0] @@ -263,7 +287,7 @@ def _analyze_page(task): } -def analyze_item(item_id, parallel=False, verbose=False): +def analyze_item(item_id, page_margin_px, parallel=False, verbose=False): escaped_item_id = urllib.parse.quote(item_id, safe="") if verbose: @@ -300,7 +324,10 @@ def analyze_item(item_id, parallel=False, verbose=False): im.thumbnail((3200, 3200)) tasks.append( PageAnalysisTask( - im=im, page_index=page_index, file_name=file_name + im=im, + page_index=page_index, + page_margin_px=page_margin_px, + file_name=file_name, ) )