From a5e3a2a429abaccb0d8ca8e4fdda2b08815131d7 Mon Sep 17 00:00:00 2001 From: Brent Schroeter Date: Sun, 10 Aug 2025 22:10:16 -0700 Subject: [PATCH] add ocr crop warnings --- README.md | 6 +++++ main.py | 73 ++++++++++++++++++++++++++++++++++++++++--------------- mise.toml | 1 - 3 files changed, 60 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index d1cebcf..9fc65bd 100644 --- a/README.md +++ b/README.md @@ -16,3 +16,9 @@ a single line so that items are summarized in parallel): ```sh pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq ``` + +## Test Cases + +- Blurry pages: `micro_IA40244209_0984` +- Contrast, page orientation: `micro_IA40244211_2290` +- Crop, low quality fiche: `micro_IA40386420_0689` diff --git a/main.py b/main.py index 372839d..33332ea 100644 --- a/main.py +++ b/main.py @@ -2,21 +2,23 @@ import json import re import urllib.parse from argparse import ArgumentParser -from csv import QUOTE_NONE from dataclasses import dataclass -from io import BytesIO, StringIO +from io import BytesIO from multiprocessing import Pool from multiprocessing.pool import ThreadPool from sys import stderr, stdin, stdout from zipfile import ZipFile import numpy as np -import pandas as pd import pytesseract import requests from PIL import Image, ImageFilter +OCR_LANGS = "eng+fra" +N_OCR_PROCESSES = 4 + + def main(): parser = ArgumentParser() parser.add_argument("--summarize", action="store_true") @@ -64,7 +66,7 @@ def _summarize_item_to_stdout(task): print(f"Summarizing item {item_id}...", file=stderr) stderr.flush() - analysis = analyze_item(item_id, verbose) + analysis = analyze_item(item_id, True, verbose) # 3 or more blank pages in a row is a flag. CONSECUTIVE_BLANKS_THRESHOLD = 3 @@ -85,7 +87,7 @@ def _summarize_item_to_stdout(task): # 3 or more blank pages in a row is a flag. CONSECUTIVE_BLURRY_THRESHOLD = 3 - SHARPNESS_THRESHOLD = 0.08 + SHARPNESS_THRESHOLD = 0.1 if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD: consecutive_blurry = [ page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"] @@ -109,12 +111,17 @@ def _summarize_item_to_stdout(task): if not page["ocr_orientation_match"] ] - if check_orientation or consecutive_blanks or consecutive_blurry: + check_crop = [ + i + 1 for i, page in enumerate(analysis["pages"]) if page["words_near_edge"] > 2 + ] + + if check_orientation or check_crop or consecutive_blanks or consecutive_blurry: print( json.dumps( { "item_id": item_id, "check_orientation": check_orientation, + "check_crop": check_crop, "consecutive_blanks": consecutive_blanks, "consecutive_blurry": consecutive_blurry, } @@ -135,7 +142,7 @@ def _analyze_item_to_stdout(task): print(f"Analyzing item {item_id}...", file=stderr) stderr.flush() - print(json.dumps(analyze_item(item_id, verbose))) + print(json.dumps(analyze_item(item_id, True, verbose))) stdout.flush() if verbose: @@ -169,6 +176,7 @@ def _analyze_page(task): if is_blank: max_sharpness = 1 ocr_orientation_match = True + words_near_edge = 0 else: max_sharpness = 0.0 if im_cropped.size[0] < im_cropped.size[1]: @@ -195,14 +203,16 @@ def _analyze_page(task): ), ) - n_words = [] + best_ocr_score = -1 + best_ocr_words = None + best_ocr_orientation = -1 for orientation in range(4): - im_rotated = im_original.rotate(90 * orientation) - ocr = pd.read_csv( - StringIO(pytesseract.image_to_data(im_rotated)), - sep="\t", - quoting=QUOTE_NONE, - dtype={"text": str}, + im_rotated = im_original.rotate(90 * orientation, expand=True) + ocr = pytesseract.image_to_data( + im_rotated, + lang=OCR_LANGS, + config="--oem 1 --dpi 300 --tessdata-dir ./tessdata_fast-4.1.0", + output_type=pytesseract.Output.DATAFRAME, ).fillna({"text": ""}) words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])] words = words[ @@ -211,12 +221,36 @@ def _analyze_page(task): axis=1, ) ] - n_words.append(words.shape[0]) - if len(words) > 50: + if words.shape[0] > best_ocr_score: + best_ocr_score = words.shape[0] + best_ocr_orientation = orientation + best_ocr_words = words + if best_ocr_score > 50: # Unlikely that another orientation will have more words, so # stop eating up CPU unnecessarily. break - ocr_orientation_match = np.max(n_words) == n_words[0] + + ocr_orientation_match = best_ocr_orientation == 0 + + best_ocr_dims = ( + im_original.size + if best_ocr_orientation % 2 == 0 + else (im_original.size[1], im_original.size[0]) + ) + EDGE_TOLERANCE = 0.03 + words_near_edge = best_ocr_words[ + (best_ocr_words["left"] < best_ocr_dims[0] * EDGE_TOLERANCE) + | (best_ocr_words["top"] < best_ocr_dims[1] * EDGE_TOLERANCE) + | ( + best_ocr_words["left"] + best_ocr_words["width"] + > best_ocr_dims[0] * (1 - EDGE_TOLERANCE) + ) + | ( + best_ocr_words["top"] + best_ocr_words["height"] + > best_ocr_dims[1] * (1 - EDGE_TOLERANCE) + ) + ] + words_near_edge = words_near_edge.shape[0] return { "blank": is_blank, @@ -225,6 +259,7 @@ def _analyze_page(task): "page_index": page_index, "size": im_original.size, "sharpness": max_sharpness, + "words_near_edge": words_near_edge, } @@ -262,6 +297,7 @@ def analyze_item(item_id, parallel=False, verbose=False): if page_index != -1: with jp_zip.open(file_name) as jp_file: im = Image.open(jp_file).convert("L") + im.thumbnail((3200, 3200)) tasks.append( PageAnalysisTask( im=im, page_index=page_index, file_name=file_name @@ -273,8 +309,7 @@ def analyze_item(item_id, parallel=False, verbose=False): stderr.flush() if parallel: # Parallelize image processing and OCR of pages across up to n cores. - N_PROCESSES = 8 - with Pool(N_PROCESSES) as pool: + with Pool(N_OCR_PROCESSES) as pool: return {"pages": pool.map(_analyze_page, tasks)} return {"pages": [_analyze_page(task) for task in tasks]} diff --git a/mise.toml b/mise.toml index 43f1713..9dd0c33 100644 --- a/mise.toml +++ b/mise.toml @@ -1,5 +1,4 @@ [tools] -jujutsu = "latest" uv = "latest" watchexec = "latest"