From a5e3a2a429abaccb0d8ca8e4fdda2b08815131d7 Mon Sep 17 00:00:00 2001
From: Brent Schroeter <contact@brentsch.com>
Date: Sun, 10 Aug 2025 22:10:16 -0700
Subject: [PATCH] add ocr crop warnings

---
 README.md |  6 +++++
 main.py   | 73 ++++++++++++++++++++++++++++++++++++++++---------------
 mise.toml |  1 -
 3 files changed, 60 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index d1cebcf..9fc65bd 100644
--- a/README.md
+++ b/README.md
@@ -16,3 +16,9 @@ a single line so that items are summarized in parallel):
 ```sh
 pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
 ```
+
+## Test Cases
+
+- Blurry pages: `micro_IA40244209_0984`
+- Contrast, page orientation: `micro_IA40244211_2290`
+- Crop, low-quality fiche: `micro_IA40386420_0689`
diff --git a/main.py b/main.py
index 372839d..33332ea 100644
--- a/main.py
+++ b/main.py
@@ -2,21 +2,23 @@ import json
 import re
 import urllib.parse
 from argparse import ArgumentParser
-from csv import QUOTE_NONE
 from dataclasses import dataclass
-from io import BytesIO, StringIO
+from io import BytesIO
 from multiprocessing import Pool
 from multiprocessing.pool import ThreadPool
 from sys import stderr, stdin, stdout
 from zipfile import ZipFile
 
 import numpy as np
-import pandas as pd
 import pytesseract
 import requests
 from PIL import Image, ImageFilter
 
 
+OCR_LANGS = "eng+fra"
+N_OCR_PROCESSES = 4
+
+
 def main():
     parser = ArgumentParser()
     parser.add_argument("--summarize", action="store_true")
@@ -64,7 +66,7 @@ def _summarize_item_to_stdout(task):
         print(f"Summarizing item {item_id}...", file=stderr)
         stderr.flush()
 
-    analysis = analyze_item(item_id, verbose)
+    analysis = analyze_item(item_id, True, verbose)
 
     # 3 or more blank pages in a row is a flag.
     CONSECUTIVE_BLANKS_THRESHOLD = 3
@@ -85,7 +87,7 @@ def _summarize_item_to_stdout(task):
 
     # 3 or more blank pages in a row is a flag.
     CONSECUTIVE_BLURRY_THRESHOLD = 3
-    SHARPNESS_THRESHOLD = 0.08
+    SHARPNESS_THRESHOLD = 0.1
     if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
         consecutive_blurry = [
             page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
@@ -109,12 +111,17 @@ def _summarize_item_to_stdout(task):
         if not page["ocr_orientation_match"]
     ]
 
-    if check_orientation or consecutive_blanks or consecutive_blurry:
+    check_crop = [
+        i + 1 for i, page in enumerate(analysis["pages"]) if page["words_near_edge"] > 2
+    ]
+
+    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
         print(
             json.dumps(
                 {
                     "item_id": item_id,
                     "check_orientation": check_orientation,
+                    "check_crop": check_crop,
                     "consecutive_blanks": consecutive_blanks,
                     "consecutive_blurry": consecutive_blurry,
                 }
@@ -135,7 +142,7 @@ def _analyze_item_to_stdout(task):
         print(f"Analyzing item {item_id}...", file=stderr)
         stderr.flush()
 
-    print(json.dumps(analyze_item(item_id, verbose)))
+    print(json.dumps(analyze_item(item_id, True, verbose)))
     stdout.flush()
 
     if verbose:
@@ -169,6 +176,7 @@ def _analyze_page(task):
     if is_blank:
         max_sharpness = 1
         ocr_orientation_match = True
+        words_near_edge = 0
     else:
         max_sharpness = 0.0
         if im_cropped.size[0] < im_cropped.size[1]:
@@ -195,14 +203,16 @@ def _analyze_page(task):
                     ),
                 )
 
-        n_words = []
+        best_ocr_score = -1
+        best_ocr_words = None
+        best_ocr_orientation = -1
         for orientation in range(4):
-            im_rotated = im_original.rotate(90 * orientation)
-            ocr = pd.read_csv(
-                StringIO(pytesseract.image_to_data(im_rotated)),
-                sep="\t",
-                quoting=QUOTE_NONE,
-                dtype={"text": str},
+            im_rotated = im_original.rotate(90 * orientation, expand=True)
+            ocr = pytesseract.image_to_data(
+                im_rotated,
+                lang=OCR_LANGS,
+                config="--oem 1 --dpi 300 --tessdata-dir ./tessdata_fast-4.1.0",
+                output_type=pytesseract.Output.DATAFRAME,
             ).fillna({"text": ""})
             words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
             words = words[
@@ -211,12 +221,36 @@ def _analyze_page(task):
                     axis=1,
                 )
             ]
-            n_words.append(words.shape[0])
-            if len(words) > 50:
+            if words.shape[0] > best_ocr_score:
+                best_ocr_score = words.shape[0]
+                best_ocr_orientation = orientation
+                best_ocr_words = words
+            if best_ocr_score > 50:
                 # Unlikely that another orientation will have more words, so
                 # stop eating up CPU unnecessarily.
                 break
-        ocr_orientation_match = np.max(n_words) == n_words[0]
+
+        ocr_orientation_match = best_ocr_orientation == 0
+
+        best_ocr_dims = (
+            im_original.size
+            if best_ocr_orientation % 2 == 0
+            else (im_original.size[1], im_original.size[0])
+        )
+        EDGE_TOLERANCE = 0.03
+        words_near_edge = best_ocr_words[
+            (best_ocr_words["left"] < best_ocr_dims[0] * EDGE_TOLERANCE)
+            | (best_ocr_words["top"] < best_ocr_dims[1] * EDGE_TOLERANCE)
+            | (
+                best_ocr_words["left"] + best_ocr_words["width"]
+                > best_ocr_dims[0] * (1 - EDGE_TOLERANCE)
+            )
+            | (
+                best_ocr_words["top"] + best_ocr_words["height"]
+                > best_ocr_dims[1] * (1 - EDGE_TOLERANCE)
+            )
+        ]
+        words_near_edge = words_near_edge.shape[0]
 
     return {
         "blank": is_blank,
@@ -225,6 +259,7 @@ def _analyze_page(task):
         "page_index": page_index,
         "size": im_original.size,
         "sharpness": max_sharpness,
+        "words_near_edge": words_near_edge,
     }
 
 
@@ -262,6 +297,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
             if page_index != -1:
                 with jp_zip.open(file_name) as jp_file:
                     im = Image.open(jp_file).convert("L")
+                    im.thumbnail((3200, 3200))
                     tasks.append(
                         PageAnalysisTask(
                             im=im, page_index=page_index, file_name=file_name
@@ -273,8 +309,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
         stderr.flush()
     if parallel:
         # Parallelize image processing and OCR of pages across up to n cores.
-        N_PROCESSES = 8
-        with Pool(N_PROCESSES) as pool:
+        with Pool(N_OCR_PROCESSES) as pool:
             return {"pages": pool.map(_analyze_page, tasks)}
     return {"pages": [_analyze_page(task) for task in tasks]}
 
diff --git a/mise.toml b/mise.toml
index 43f1713..9dd0c33 100644
--- a/mise.toml
+++ b/mise.toml
@@ -1,5 +1,4 @@
 [tools]
-jujutsu = "latest"
 uv = "latest"
 watchexec = "latest"