add ocr crop warnings

This commit is contained in:
Brent Schroeter 2025-08-10 22:10:16 -07:00
parent 8dbcb19b43
commit a5e3a2a429
3 changed files with 60 additions and 20 deletions

View file

@ -16,3 +16,9 @@ a single line so that items are summarized in parallel):
```sh ```sh
pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
``` ```
## Test Cases
- Blurry pages: `micro_IA40244209_0984`
- Contrast, page orientation: `micro_IA40244211_2290`
- Crop, low quality fiche: `micro_IA40386420_0689`

73
main.py
View file

@ -2,21 +2,23 @@ import json
import re import re
import urllib.parse import urllib.parse
from argparse import ArgumentParser from argparse import ArgumentParser
from csv import QUOTE_NONE
from dataclasses import dataclass from dataclasses import dataclass
from io import BytesIO, StringIO from io import BytesIO
from multiprocessing import Pool from multiprocessing import Pool
from multiprocessing.pool import ThreadPool from multiprocessing.pool import ThreadPool
from sys import stderr, stdin, stdout from sys import stderr, stdin, stdout
from zipfile import ZipFile from zipfile import ZipFile
import numpy as np import numpy as np
import pandas as pd
import pytesseract import pytesseract
import requests import requests
from PIL import Image, ImageFilter from PIL import Image, ImageFilter
OCR_LANGS = "eng+fra"
N_OCR_PROCESSES = 4
def main(): def main():
parser = ArgumentParser() parser = ArgumentParser()
parser.add_argument("--summarize", action="store_true") parser.add_argument("--summarize", action="store_true")
@ -64,7 +66,7 @@ def _summarize_item_to_stdout(task):
print(f"Summarizing item {item_id}...", file=stderr) print(f"Summarizing item {item_id}...", file=stderr)
stderr.flush() stderr.flush()
analysis = analyze_item(item_id, verbose) analysis = analyze_item(item_id, True, verbose)
# 3 or more blank pages in a row is a flag. # 3 or more blank pages in a row is a flag.
CONSECUTIVE_BLANKS_THRESHOLD = 3 CONSECUTIVE_BLANKS_THRESHOLD = 3
@ -85,7 +87,7 @@ def _summarize_item_to_stdout(task):
# 3 or more blurry pages in a row is a flag. # 3 or more blurry pages in a row is a flag.
CONSECUTIVE_BLURRY_THRESHOLD = 3 CONSECUTIVE_BLURRY_THRESHOLD = 3
SHARPNESS_THRESHOLD = 0.08 SHARPNESS_THRESHOLD = 0.1
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD: if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
consecutive_blurry = [ consecutive_blurry = [
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"] page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
@ -109,12 +111,17 @@ def _summarize_item_to_stdout(task):
if not page["ocr_orientation_match"] if not page["ocr_orientation_match"]
] ]
if check_orientation or consecutive_blanks or consecutive_blurry: check_crop = [
i + 1 for i, page in enumerate(analysis["pages"]) if page["words_near_edge"] > 2
]
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
print( print(
json.dumps( json.dumps(
{ {
"item_id": item_id, "item_id": item_id,
"check_orientation": check_orientation, "check_orientation": check_orientation,
"check_crop": check_crop,
"consecutive_blanks": consecutive_blanks, "consecutive_blanks": consecutive_blanks,
"consecutive_blurry": consecutive_blurry, "consecutive_blurry": consecutive_blurry,
} }
@ -135,7 +142,7 @@ def _analyze_item_to_stdout(task):
print(f"Analyzing item {item_id}...", file=stderr) print(f"Analyzing item {item_id}...", file=stderr)
stderr.flush() stderr.flush()
print(json.dumps(analyze_item(item_id, verbose))) print(json.dumps(analyze_item(item_id, True, verbose)))
stdout.flush() stdout.flush()
if verbose: if verbose:
@ -169,6 +176,7 @@ def _analyze_page(task):
if is_blank: if is_blank:
max_sharpness = 1 max_sharpness = 1
ocr_orientation_match = True ocr_orientation_match = True
words_near_edge = 0
else: else:
max_sharpness = 0.0 max_sharpness = 0.0
if im_cropped.size[0] < im_cropped.size[1]: if im_cropped.size[0] < im_cropped.size[1]:
@ -195,14 +203,16 @@ def _analyze_page(task):
), ),
) )
n_words = [] best_ocr_score = -1
best_ocr_words = None
best_ocr_orientation = -1
for orientation in range(4): for orientation in range(4):
im_rotated = im_original.rotate(90 * orientation) im_rotated = im_original.rotate(90 * orientation, expand=True)
ocr = pd.read_csv( ocr = pytesseract.image_to_data(
StringIO(pytesseract.image_to_data(im_rotated)), im_rotated,
sep="\t", lang=OCR_LANGS,
quoting=QUOTE_NONE, config="--oem 1 --dpi 300 --tessdata-dir ./tessdata_fast-4.1.0",
dtype={"text": str}, output_type=pytesseract.Output.DATAFRAME,
).fillna({"text": ""}) ).fillna({"text": ""})
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])] words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
words = words[ words = words[
@ -211,12 +221,36 @@ def _analyze_page(task):
axis=1, axis=1,
) )
] ]
n_words.append(words.shape[0]) if words.shape[0] > best_ocr_score:
if len(words) > 50: best_ocr_score = words.shape[0]
best_ocr_orientation = orientation
best_ocr_words = words
if best_ocr_score > 50:
# Unlikely that another orientation will have more words, so # Unlikely that another orientation will have more words, so
# stop eating up CPU unnecessarily. # stop eating up CPU unnecessarily.
break break
ocr_orientation_match = np.max(n_words) == n_words[0]
ocr_orientation_match = best_ocr_orientation == 0
best_ocr_dims = (
im_original.size
if best_ocr_orientation % 2 == 0
else (im_original.size[1], im_original.size[0])
)
EDGE_TOLERANCE = 0.03
words_near_edge = best_ocr_words[
(best_ocr_words["left"] < best_ocr_dims[0] * EDGE_TOLERANCE)
| (best_ocr_words["top"] < best_ocr_dims[1] * EDGE_TOLERANCE)
| (
best_ocr_words["left"] + best_ocr_words["width"]
> best_ocr_dims[0] * (1 - EDGE_TOLERANCE)
)
| (
best_ocr_words["top"] + best_ocr_words["height"]
> best_ocr_dims[1] * (1 - EDGE_TOLERANCE)
)
]
words_near_edge = words_near_edge.shape[0]
return { return {
"blank": is_blank, "blank": is_blank,
@ -225,6 +259,7 @@ def _analyze_page(task):
"page_index": page_index, "page_index": page_index,
"size": im_original.size, "size": im_original.size,
"sharpness": max_sharpness, "sharpness": max_sharpness,
"words_near_edge": words_near_edge,
} }
@ -262,6 +297,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
if page_index != -1: if page_index != -1:
with jp_zip.open(file_name) as jp_file: with jp_zip.open(file_name) as jp_file:
im = Image.open(jp_file).convert("L") im = Image.open(jp_file).convert("L")
im.thumbnail((3200, 3200))
tasks.append( tasks.append(
PageAnalysisTask( PageAnalysisTask(
im=im, page_index=page_index, file_name=file_name im=im, page_index=page_index, file_name=file_name
@ -273,8 +309,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
stderr.flush() stderr.flush()
if parallel: if parallel:
# Parallelize image processing and OCR of pages across up to n cores. # Parallelize image processing and OCR of pages across up to n cores.
N_PROCESSES = 8 with Pool(N_OCR_PROCESSES) as pool:
with Pool(N_PROCESSES) as pool:
return {"pages": pool.map(_analyze_page, tasks)} return {"pages": pool.map(_analyze_page, tasks)}
return {"pages": [_analyze_page(task) for task in tasks]} return {"pages": [_analyze_page(task) for task in tasks]}

View file

@ -1,5 +1,4 @@
[tools] [tools]
jujutsu = "latest"
uv = "latest" uv = "latest"
watchexec = "latest" watchexec = "latest"