add ocr crop warnings

This commit is contained in:
Brent Schroeter 2025-08-10 22:10:16 -07:00
parent 8dbcb19b43
commit a5e3a2a429
3 changed files with 60 additions and 20 deletions

View file

@@ -16,3 +16,9 @@ a single line so that items are summarized in parallel):
```sh
pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
```
## Test Cases
- Blurry pages: `micro_IA40244209_0984`
- Contrast, page orientation: `micro_IA40244211_2290`
- Crop, low quality fiche: `micro_IA40386420_0689`

73
main.py
View file

@@ -2,21 +2,23 @@ import json
import re
import urllib.parse
from argparse import ArgumentParser
from csv import QUOTE_NONE
from dataclasses import dataclass
from io import BytesIO, StringIO
from io import BytesIO
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from sys import stderr, stdin, stdout
from zipfile import ZipFile
import numpy as np
import pandas as pd
import pytesseract
import requests
from PIL import Image, ImageFilter
OCR_LANGS = "eng+fra"
N_OCR_PROCESSES = 4
def main():
parser = ArgumentParser()
parser.add_argument("--summarize", action="store_true")
@@ -64,7 +66,7 @@ def _summarize_item_to_stdout(task):
print(f"Summarizing item {item_id}...", file=stderr)
stderr.flush()
analysis = analyze_item(item_id, verbose)
analysis = analyze_item(item_id, True, verbose)
# 3 or more blank pages in a row is a flag.
CONSECUTIVE_BLANKS_THRESHOLD = 3
@@ -85,7 +87,7 @@ def _summarize_item_to_stdout(task):
# 3 or more blurry pages in a row is a flag.
CONSECUTIVE_BLURRY_THRESHOLD = 3
SHARPNESS_THRESHOLD = 0.08
SHARPNESS_THRESHOLD = 0.1
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
consecutive_blurry = [
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
@@ -109,12 +111,17 @@ def _summarize_item_to_stdout(task):
if not page["ocr_orientation_match"]
]
if check_orientation or consecutive_blanks or consecutive_blurry:
check_crop = [
i + 1 for i, page in enumerate(analysis["pages"]) if page["words_near_edge"] > 2
]
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
print(
json.dumps(
{
"item_id": item_id,
"check_orientation": check_orientation,
"check_crop": check_crop,
"consecutive_blanks": consecutive_blanks,
"consecutive_blurry": consecutive_blurry,
}
@@ -135,7 +142,7 @@ def _analyze_item_to_stdout(task):
print(f"Analyzing item {item_id}...", file=stderr)
stderr.flush()
print(json.dumps(analyze_item(item_id, verbose)))
print(json.dumps(analyze_item(item_id, True, verbose)))
stdout.flush()
if verbose:
@@ -169,6 +176,7 @@ def _analyze_page(task):
if is_blank:
max_sharpness = 1
ocr_orientation_match = True
words_near_edge = 0
else:
max_sharpness = 0.0
if im_cropped.size[0] < im_cropped.size[1]:
@@ -195,14 +203,16 @@ def _analyze_page(task):
),
)
n_words = []
best_ocr_score = -1
best_ocr_words = None
best_ocr_orientation = -1
for orientation in range(4):
im_rotated = im_original.rotate(90 * orientation)
ocr = pd.read_csv(
StringIO(pytesseract.image_to_data(im_rotated)),
sep="\t",
quoting=QUOTE_NONE,
dtype={"text": str},
im_rotated = im_original.rotate(90 * orientation, expand=True)
ocr = pytesseract.image_to_data(
im_rotated,
lang=OCR_LANGS,
config="--oem 1 --dpi 300 --tessdata-dir ./tessdata_fast-4.1.0",
output_type=pytesseract.Output.DATAFRAME,
).fillna({"text": ""})
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
words = words[
@@ -211,12 +221,36 @@ def _analyze_page(task):
axis=1,
)
]
n_words.append(words.shape[0])
if len(words) > 50:
if words.shape[0] > best_ocr_score:
best_ocr_score = words.shape[0]
best_ocr_orientation = orientation
best_ocr_words = words
if best_ocr_score > 50:
# Unlikely that another orientation will have more words, so
# stop eating up CPU unnecessarily.
break
ocr_orientation_match = np.max(n_words) == n_words[0]
ocr_orientation_match = best_ocr_orientation == 0
best_ocr_dims = (
im_original.size
if best_ocr_orientation % 2 == 0
else (im_original.size[1], im_original.size[0])
)
EDGE_TOLERANCE = 0.03
words_near_edge = best_ocr_words[
(best_ocr_words["left"] < best_ocr_dims[0] * EDGE_TOLERANCE)
| (best_ocr_words["top"] < best_ocr_dims[1] * EDGE_TOLERANCE)
| (
best_ocr_words["left"] + best_ocr_words["width"]
> best_ocr_dims[0] * (1 - EDGE_TOLERANCE)
)
| (
best_ocr_words["top"] + best_ocr_words["height"]
> best_ocr_dims[1] * (1 - EDGE_TOLERANCE)
)
]
words_near_edge = words_near_edge.shape[0]
return {
"blank": is_blank,
@@ -225,6 +259,7 @@ def _analyze_page(task):
"page_index": page_index,
"size": im_original.size,
"sharpness": max_sharpness,
"words_near_edge": words_near_edge,
}
@@ -262,6 +297,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
if page_index != -1:
with jp_zip.open(file_name) as jp_file:
im = Image.open(jp_file).convert("L")
im.thumbnail((3200, 3200))
tasks.append(
PageAnalysisTask(
im=im, page_index=page_index, file_name=file_name
@@ -273,8 +309,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
stderr.flush()
if parallel:
# Parallelize image processing and OCR of pages across up to n cores.
N_PROCESSES = 8
with Pool(N_PROCESSES) as pool:
with Pool(N_OCR_PROCESSES) as pool:
return {"pages": pool.map(_analyze_page, tasks)}
return {"pages": [_analyze_page(task) for task in tasks]}

View file

@@ -1,5 +1,4 @@
[tools]
jujutsu = "latest"
uv = "latest"
watchexec = "latest"