add ocr crop warnings

2025-08-10 22:10:16 -07:00 · 2025-08-10 22:10:16 -07:00 · a5e3a2a429
commit a5e3a2a429
parent 8dbcb19b43
3 changed files with 60 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -16,3 +16,9 @@ a single line so that items are summarized in parallel):
 ```sh
 pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
 ```
 ## Test Cases
 - Blurry pages: `micro_IA40244209_0984`
 - Contrast, page orientation: `micro_IA40244211_2290`
 - Crop, low quality fiche: `micro_IA40386420_0689`
--- a/main.py
+++ b/main.py
@ -2,21 +2,23 @@ import json
 import re
 import urllib.parse
 from argparse import ArgumentParser
 from csv import QUOTE_NONE
 from dataclasses import dataclass
-from io import BytesIO, StringIO
+from io import BytesIO
 from multiprocessing import Pool
 from multiprocessing.pool import ThreadPool
 from sys import stderr, stdin, stdout
 from zipfile import ZipFile
 import numpy as np
 import pandas as pd
 import pytesseract
 import requests
 from PIL import Image, ImageFilter
 OCR_LANGS = "eng+fra"
 N_OCR_PROCESSES = 4
 def main():
    parser = ArgumentParser()
    parser.add_argument("--summarize", action="store_true")
@ -64,7 +66,7 @@ def _summarize_item_to_stdout(task):
        print(f"Summarizing item {item_id}...", file=stderr)
        stderr.flush()
-    analysis = analyze_item(item_id, verbose)
+    analysis = analyze_item(item_id, True, verbose)
    # 3 or more blank pages in a row is a flag.
    CONSECUTIVE_BLANKS_THRESHOLD = 3
@ -85,7 +87,7 @@ def _summarize_item_to_stdout(task):
    # 3 or more blurry pages in a row is a flag.
    CONSECUTIVE_BLURRY_THRESHOLD = 3
-    SHARPNESS_THRESHOLD = 0.08
+    SHARPNESS_THRESHOLD = 0.1
    if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
        consecutive_blurry = [
            page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
@ -109,12 +111,17 @@ def _summarize_item_to_stdout(task):
        if not page["ocr_orientation_match"]
    ]
-    if check_orientation or consecutive_blanks or consecutive_blurry:
+    check_crop = [
        i + 1 for i, page in enumerate(analysis["pages"]) if page["words_near_edge"] > 2
    ]
    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
        print(
            json.dumps(
                {
                    "item_id": item_id,
                    "check_orientation": check_orientation,
                    "check_crop": check_crop,
                    "consecutive_blanks": consecutive_blanks,
                    "consecutive_blurry": consecutive_blurry,
                }
@ -135,7 +142,7 @@ def _analyze_item_to_stdout(task):
        print(f"Analyzing item {item_id}...", file=stderr)
        stderr.flush()
-    print(json.dumps(analyze_item(item_id, verbose)))
+    print(json.dumps(analyze_item(item_id, True, verbose)))
    stdout.flush()
    if verbose:
@ -169,6 +176,7 @@ def _analyze_page(task):
    if is_blank:
        max_sharpness = 1
        ocr_orientation_match = True
        words_near_edge = 0
    else:
        max_sharpness = 0.0
        if im_cropped.size[0] < im_cropped.size[1]:
@ -195,14 +203,16 @@ def _analyze_page(task):
                    ),
                )
-        n_words = []
+        best_ocr_score = -1
        best_ocr_words = None
        best_ocr_orientation = -1
        for orientation in range(4):
-            im_rotated = im_original.rotate(90 * orientation)
+            im_rotated = im_original.rotate(90 * orientation, expand=True)
-            ocr = pd.read_csv(
+            ocr = pytesseract.image_to_data(
-                StringIO(pytesseract.image_to_data(im_rotated)),
+                im_rotated,
-                sep="\t",
+                lang=OCR_LANGS,
-                quoting=QUOTE_NONE,
+                config="--oem 1 --dpi 300 --tessdata-dir ./tessdata_fast-4.1.0",
-                dtype={"text": str},
+                output_type=pytesseract.Output.DATAFRAME,
            ).fillna({"text": ""})
            words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
            words = words[
@ -211,12 +221,36 @@ def _analyze_page(task):
                    axis=1,
                )
            ]
-            n_words.append(words.shape[0])
+            if words.shape[0] > best_ocr_score:
-            if len(words) > 50:
+                best_ocr_score = words.shape[0]
                best_ocr_orientation = orientation
                best_ocr_words = words
            if best_ocr_score > 50:
                # Unlikely that another orientation will have more words, so
                # stop eating up CPU unnecessarily.
                break
-        ocr_orientation_match = np.max(n_words) == n_words[0]
+
        ocr_orientation_match = best_ocr_orientation == 0
        best_ocr_dims = (
            im_original.size
            if best_ocr_orientation % 2 == 0
            else (im_original.size[1], im_original.size[0])
        )
        EDGE_TOLERANCE = 0.03
        words_near_edge = best_ocr_words[
            (best_ocr_words["left"] < best_ocr_dims[0] * EDGE_TOLERANCE)
            | (best_ocr_words["top"] < best_ocr_dims[1] * EDGE_TOLERANCE)
            | (
                best_ocr_words["left"] + best_ocr_words["width"]
                > best_ocr_dims[0] * (1 - EDGE_TOLERANCE)
            )
            | (
                best_ocr_words["top"] + best_ocr_words["height"]
                > best_ocr_dims[1] * (1 - EDGE_TOLERANCE)
            )
        ]
        words_near_edge = words_near_edge.shape[0]
    return {
        "blank": is_blank,
@ -225,6 +259,7 @@ def _analyze_page(task):
        "page_index": page_index,
        "size": im_original.size,
        "sharpness": max_sharpness,
        "words_near_edge": words_near_edge,
    }
@ -262,6 +297,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
            if page_index != -1:
                with jp_zip.open(file_name) as jp_file:
                    im = Image.open(jp_file).convert("L")
                    im.thumbnail((3200, 3200))
                    tasks.append(
                        PageAnalysisTask(
                            im=im, page_index=page_index, file_name=file_name
@ -273,8 +309,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
        stderr.flush()
    if parallel:
        # Parallelize image processing and OCR of pages across up to n cores.
-        N_PROCESSES = 8
+        with Pool(N_OCR_PROCESSES) as pool:
        with Pool(N_PROCESSES) as pool:
            return {"pages": pool.map(_analyze_page, tasks)}
    return {"pages": [_analyze_page(task) for task in tasks]}
--- a/mise.toml
+++ b/mise.toml
@ -1,5 +1,4 @@
 [tools]
 jujutsu = "latest"
 uv = "latest"
 watchexec = "latest"