add ocr crop warnings
This commit is contained in:
parent
8dbcb19b43
commit
a5e3a2a429
3 changed files with 60 additions and 20 deletions
|
|
@ -16,3 +16,9 @@ a single line so that items are summarized in parallel):
|
|||
```sh
|
||||
pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
|
||||
```
|
||||
|
||||
## Test Cases
|
||||
|
||||
- Blurry pages: `micro_IA40244209_0984`
|
||||
- Contrast, page orientation: `micro_IA40244211_2290`
|
||||
- Crop, low-quality fiche: `micro_IA40386420_0689`
|
||||
|
|
|
|||
73
main.py
73
main.py
|
|
@ -2,21 +2,23 @@ import json
|
|||
import re
|
||||
import urllib.parse
|
||||
from argparse import ArgumentParser
|
||||
from csv import QUOTE_NONE
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO, StringIO
|
||||
from io import BytesIO
|
||||
from multiprocessing import Pool
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from sys import stderr, stdin, stdout
|
||||
from zipfile import ZipFile
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytesseract
|
||||
import requests
|
||||
from PIL import Image, ImageFilter
|
||||
|
||||
|
||||
OCR_LANGS = "eng+fra"
|
||||
N_OCR_PROCESSES = 4
|
||||
|
||||
|
||||
def main():
|
||||
parser = ArgumentParser()
|
||||
parser.add_argument("--summarize", action="store_true")
|
||||
|
|
@ -64,7 +66,7 @@ def _summarize_item_to_stdout(task):
|
|||
print(f"Summarizing item {item_id}...", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
analysis = analyze_item(item_id, verbose)
|
||||
analysis = analyze_item(item_id, True, verbose)
|
||||
|
||||
# 3 or more blank pages in a row is a flag.
|
||||
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
||||
|
|
@ -85,7 +87,7 @@ def _summarize_item_to_stdout(task):
|
|||
|
||||
# 3 or more blurry pages in a row is a flag.
|
||||
CONSECUTIVE_BLURRY_THRESHOLD = 3
|
||||
SHARPNESS_THRESHOLD = 0.08
|
||||
SHARPNESS_THRESHOLD = 0.1
|
||||
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
|
||||
consecutive_blurry = [
|
||||
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
|
||||
|
|
@ -109,12 +111,17 @@ def _summarize_item_to_stdout(task):
|
|||
if not page["ocr_orientation_match"]
|
||||
]
|
||||
|
||||
if check_orientation or consecutive_blanks or consecutive_blurry:
|
||||
check_crop = [
|
||||
i + 1 for i, page in enumerate(analysis["pages"]) if page["words_near_edge"] > 2
|
||||
]
|
||||
|
||||
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
||||
print(
|
||||
json.dumps(
|
||||
{
|
||||
"item_id": item_id,
|
||||
"check_orientation": check_orientation,
|
||||
"check_crop": check_crop,
|
||||
"consecutive_blanks": consecutive_blanks,
|
||||
"consecutive_blurry": consecutive_blurry,
|
||||
}
|
||||
|
|
@ -135,7 +142,7 @@ def _analyze_item_to_stdout(task):
|
|||
print(f"Analyzing item {item_id}...", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
print(json.dumps(analyze_item(item_id, verbose)))
|
||||
print(json.dumps(analyze_item(item_id, True, verbose)))
|
||||
stdout.flush()
|
||||
|
||||
if verbose:
|
||||
|
|
@ -169,6 +176,7 @@ def _analyze_page(task):
|
|||
if is_blank:
|
||||
max_sharpness = 1
|
||||
ocr_orientation_match = True
|
||||
words_near_edge = 0
|
||||
else:
|
||||
max_sharpness = 0.0
|
||||
if im_cropped.size[0] < im_cropped.size[1]:
|
||||
|
|
@ -195,14 +203,16 @@ def _analyze_page(task):
|
|||
),
|
||||
)
|
||||
|
||||
n_words = []
|
||||
best_ocr_score = -1
|
||||
best_ocr_words = None
|
||||
best_ocr_orientation = -1
|
||||
for orientation in range(4):
|
||||
im_rotated = im_original.rotate(90 * orientation)
|
||||
ocr = pd.read_csv(
|
||||
StringIO(pytesseract.image_to_data(im_rotated)),
|
||||
sep="\t",
|
||||
quoting=QUOTE_NONE,
|
||||
dtype={"text": str},
|
||||
im_rotated = im_original.rotate(90 * orientation, expand=True)
|
||||
ocr = pytesseract.image_to_data(
|
||||
im_rotated,
|
||||
lang=OCR_LANGS,
|
||||
config="--oem 1 --dpi 300 --tessdata-dir ./tessdata_fast-4.1.0",
|
||||
output_type=pytesseract.Output.DATAFRAME,
|
||||
).fillna({"text": ""})
|
||||
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
|
||||
words = words[
|
||||
|
|
@ -211,12 +221,36 @@ def _analyze_page(task):
|
|||
axis=1,
|
||||
)
|
||||
]
|
||||
n_words.append(words.shape[0])
|
||||
if len(words) > 50:
|
||||
if words.shape[0] > best_ocr_score:
|
||||
best_ocr_score = words.shape[0]
|
||||
best_ocr_orientation = orientation
|
||||
best_ocr_words = words
|
||||
if best_ocr_score > 50:
|
||||
# Unlikely that another orientation will have more words, so
|
||||
# stop eating up CPU unnecessarily.
|
||||
break
|
||||
ocr_orientation_match = np.max(n_words) == n_words[0]
|
||||
|
||||
ocr_orientation_match = best_ocr_orientation == 0
|
||||
|
||||
best_ocr_dims = (
|
||||
im_original.size
|
||||
if best_ocr_orientation % 2 == 0
|
||||
else (im_original.size[1], im_original.size[0])
|
||||
)
|
||||
EDGE_TOLERANCE = 0.03
|
||||
words_near_edge = best_ocr_words[
|
||||
(best_ocr_words["left"] < best_ocr_dims[0] * EDGE_TOLERANCE)
|
||||
| (best_ocr_words["top"] < best_ocr_dims[1] * EDGE_TOLERANCE)
|
||||
| (
|
||||
best_ocr_words["left"] + best_ocr_words["width"]
|
||||
> best_ocr_dims[0] * (1 - EDGE_TOLERANCE)
|
||||
)
|
||||
| (
|
||||
best_ocr_words["top"] + best_ocr_words["height"]
|
||||
> best_ocr_dims[1] * (1 - EDGE_TOLERANCE)
|
||||
)
|
||||
]
|
||||
words_near_edge = words_near_edge.shape[0]
|
||||
|
||||
return {
|
||||
"blank": is_blank,
|
||||
|
|
@ -225,6 +259,7 @@ def _analyze_page(task):
|
|||
"page_index": page_index,
|
||||
"size": im_original.size,
|
||||
"sharpness": max_sharpness,
|
||||
"words_near_edge": words_near_edge,
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -262,6 +297,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
|
|||
if page_index != -1:
|
||||
with jp_zip.open(file_name) as jp_file:
|
||||
im = Image.open(jp_file).convert("L")
|
||||
im.thumbnail((3200, 3200))
|
||||
tasks.append(
|
||||
PageAnalysisTask(
|
||||
im=im, page_index=page_index, file_name=file_name
|
||||
|
|
@ -273,8 +309,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
|
|||
stderr.flush()
|
||||
if parallel:
|
||||
# Parallelize image processing and OCR of pages across up to n cores.
|
||||
N_PROCESSES = 8
|
||||
with Pool(N_PROCESSES) as pool:
|
||||
with Pool(N_OCR_PROCESSES) as pool:
|
||||
return {"pages": pool.map(_analyze_page, tasks)}
|
||||
return {"pages": [_analyze_page(task) for task in tasks]}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
[tools]
|
||||
jujutsu = "latest"
|
||||
uv = "latest"
|
||||
watchexec = "latest"
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue