add ocr crop warnings

This commit is contained in:
Brent Schroeter 2025-08-10 22:10:16 -07:00
parent 8dbcb19b43
commit a5e3a2a429
3 changed files with 60 additions and 20 deletions

View file

@@ -16,3 +16,9 @@ a single line so that items are summarized in parallel):
```sh
pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
```
## Test Cases
- Blurry pages: `micro_IA40244209_0984`
- Contrast, page orientation: `micro_IA40244211_2290`
- Crop, low quality fiche: `micro_IA40386420_0689`

73
main.py
View file

@@ -2,21 +2,23 @@ import json
import re
import urllib.parse
from argparse import ArgumentParser
from csv import QUOTE_NONE
from dataclasses import dataclass
from io import BytesIO, StringIO
from io import BytesIO
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from sys import stderr, stdin, stdout
from zipfile import ZipFile
import numpy as np
import pandas as pd
import pytesseract
import requests
from PIL import Image, ImageFilter
OCR_LANGS = "eng+fra"
N_OCR_PROCESSES = 4
def main():
parser = ArgumentParser()
parser.add_argument("--summarize", action="store_true")
@@ -64,7 +66,7 @@ def _summarize_item_to_stdout(task):
print(f"Summarizing item {item_id}...", file=stderr)
stderr.flush()
analysis = analyze_item(item_id, verbose)
analysis = analyze_item(item_id, True, verbose)
# 3 or more blank pages in a row is a flag.
CONSECUTIVE_BLANKS_THRESHOLD = 3
@@ -85,7 +87,7 @@ def _summarize_item_to_stdout(task):
# 3 or more blurry pages in a row is a flag.
CONSECUTIVE_BLURRY_THRESHOLD = 3
SHARPNESS_THRESHOLD = 0.08
SHARPNESS_THRESHOLD = 0.1
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
consecutive_blurry = [
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
@@ -109,12 +111,17 @@ def _summarize_item_to_stdout(task):
if not page["ocr_orientation_match"]
]
if check_orientation or consecutive_blanks or consecutive_blurry:
check_crop = [
i + 1 for i, page in enumerate(analysis["pages"]) if page["words_near_edge"] > 2
]
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
print(
json.dumps(
{
"item_id": item_id,
"check_orientation": check_orientation,
"check_crop": check_crop,
"consecutive_blanks": consecutive_blanks,
"consecutive_blurry": consecutive_blurry,
}
@@ -135,7 +142,7 @@ def _analyze_item_to_stdout(task):
print(f"Analyzing item {item_id}...", file=stderr)
stderr.flush()
print(json.dumps(analyze_item(item_id, verbose)))
print(json.dumps(analyze_item(item_id, True, verbose)))
stdout.flush()
if verbose:
@@ -169,6 +176,7 @@ def _analyze_page(task):
if is_blank:
max_sharpness = 1
ocr_orientation_match = True
words_near_edge = 0
else:
max_sharpness = 0.0
if im_cropped.size[0] < im_cropped.size[1]:
@@ -195,14 +203,16 @@ def _analyze_page(task):
),
)
n_words = []
best_ocr_score = -1
best_ocr_words = None
best_ocr_orientation = -1
for orientation in range(4):
im_rotated = im_original.rotate(90 * orientation)
ocr = pd.read_csv(
StringIO(pytesseract.image_to_data(im_rotated)),
sep="\t",
quoting=QUOTE_NONE,
dtype={"text": str},
im_rotated = im_original.rotate(90 * orientation, expand=True)
ocr = pytesseract.image_to_data(
im_rotated,
lang=OCR_LANGS,
config="--oem 1 --dpi 300 --tessdata-dir ./tessdata_fast-4.1.0",
output_type=pytesseract.Output.DATAFRAME,
).fillna({"text": ""})
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
words = words[
@@ -211,12 +221,36 @@ def _analyze_page(task):
axis=1,
)
]
n_words.append(words.shape[0])
if len(words) > 50:
if words.shape[0] > best_ocr_score:
best_ocr_score = words.shape[0]
best_ocr_orientation = orientation
best_ocr_words = words
if best_ocr_score > 50:
# Unlikely that another orientation will have more words, so
# stop eating up CPU unnecessarily.
break
ocr_orientation_match = np.max(n_words) == n_words[0]
ocr_orientation_match = best_ocr_orientation == 0
best_ocr_dims = (
im_original.size
if best_ocr_orientation % 2 == 0
else (im_original.size[1], im_original.size[0])
)
EDGE_TOLERANCE = 0.03
words_near_edge = best_ocr_words[
(best_ocr_words["left"] < best_ocr_dims[0] * EDGE_TOLERANCE)
| (best_ocr_words["top"] < best_ocr_dims[1] * EDGE_TOLERANCE)
| (
best_ocr_words["left"] + best_ocr_words["width"]
> best_ocr_dims[0] * (1 - EDGE_TOLERANCE)
)
| (
best_ocr_words["top"] + best_ocr_words["height"]
> best_ocr_dims[1] * (1 - EDGE_TOLERANCE)
)
]
words_near_edge = words_near_edge.shape[0]
return {
"blank": is_blank,
@@ -225,6 +259,7 @@ def _analyze_page(task):
"page_index": page_index,
"size": im_original.size,
"sharpness": max_sharpness,
"words_near_edge": words_near_edge,
}
@@ -262,6 +297,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
if page_index != -1:
with jp_zip.open(file_name) as jp_file:
im = Image.open(jp_file).convert("L")
im.thumbnail((3200, 3200))
tasks.append(
PageAnalysisTask(
im=im, page_index=page_index, file_name=file_name
@@ -273,8 +309,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
stderr.flush()
if parallel:
# Parallelize image processing and OCR of pages across up to n cores.
N_PROCESSES = 8
with Pool(N_PROCESSES) as pool:
with Pool(N_OCR_PROCESSES) as pool:
return {"pages": pool.map(_analyze_page, tasks)}
return {"pages": [_analyze_page(task) for task in tasks]}

View file

@@ -1,5 +1,4 @@
[tools]
jujutsu = "latest"
uv = "latest"
watchexec = "latest"