add ocr crop warnings
This commit is contained in:
parent
8dbcb19b43
commit
a5e3a2a429
3 changed files with 60 additions and 20 deletions
|
|
@ -16,3 +16,9 @@ a single line so that items are summarized in parallel):
|
||||||
```sh
|
```sh
|
||||||
pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
|
pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Test Cases
|
||||||
|
|
||||||
|
- Blurry pages: `micro_IA40244209_0984`
|
||||||
|
- Contrast, page orientation: `micro_IA40244211_2290`
|
||||||
|
- Crop, low quality fiche: `micro_IA40386420_0689`
|
||||||
|
|
|
||||||
73
main.py
73
main.py
|
|
@ -2,21 +2,23 @@ import json
|
||||||
import re
|
import re
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
from csv import QUOTE_NONE
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from io import BytesIO, StringIO
|
from io import BytesIO
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool
|
||||||
from multiprocessing.pool import ThreadPool
|
from multiprocessing.pool import ThreadPool
|
||||||
from sys import stderr, stdin, stdout
|
from sys import stderr, stdin, stdout
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
|
||||||
import pytesseract
|
import pytesseract
|
||||||
import requests
|
import requests
|
||||||
from PIL import Image, ImageFilter
|
from PIL import Image, ImageFilter
|
||||||
|
|
||||||
|
|
||||||
|
OCR_LANGS = "eng+fra"
|
||||||
|
N_OCR_PROCESSES = 4
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = ArgumentParser()
|
parser = ArgumentParser()
|
||||||
parser.add_argument("--summarize", action="store_true")
|
parser.add_argument("--summarize", action="store_true")
|
||||||
|
|
@ -64,7 +66,7 @@ def _summarize_item_to_stdout(task):
|
||||||
print(f"Summarizing item {item_id}...", file=stderr)
|
print(f"Summarizing item {item_id}...", file=stderr)
|
||||||
stderr.flush()
|
stderr.flush()
|
||||||
|
|
||||||
analysis = analyze_item(item_id, verbose)
|
analysis = analyze_item(item_id, True, verbose)
|
||||||
|
|
||||||
# 3 or more blank pages in a row is a flag.
|
# 3 or more blank pages in a row is a flag.
|
||||||
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
||||||
|
|
@ -85,7 +87,7 @@ def _summarize_item_to_stdout(task):
|
||||||
|
|
||||||
# 3 or more blurry pages in a row is a flag.
|
# 3 or more blurry pages in a row is a flag.
|
||||||
CONSECUTIVE_BLURRY_THRESHOLD = 3
|
CONSECUTIVE_BLURRY_THRESHOLD = 3
|
||||||
SHARPNESS_THRESHOLD = 0.08
|
SHARPNESS_THRESHOLD = 0.1
|
||||||
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
|
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
|
||||||
consecutive_blurry = [
|
consecutive_blurry = [
|
||||||
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
|
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
|
||||||
|
|
@ -109,12 +111,17 @@ def _summarize_item_to_stdout(task):
|
||||||
if not page["ocr_orientation_match"]
|
if not page["ocr_orientation_match"]
|
||||||
]
|
]
|
||||||
|
|
||||||
if check_orientation or consecutive_blanks or consecutive_blurry:
|
check_crop = [
|
||||||
|
i + 1 for i, page in enumerate(analysis["pages"]) if page["words_near_edge"] > 2
|
||||||
|
]
|
||||||
|
|
||||||
|
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
||||||
print(
|
print(
|
||||||
json.dumps(
|
json.dumps(
|
||||||
{
|
{
|
||||||
"item_id": item_id,
|
"item_id": item_id,
|
||||||
"check_orientation": check_orientation,
|
"check_orientation": check_orientation,
|
||||||
|
"check_crop": check_crop,
|
||||||
"consecutive_blanks": consecutive_blanks,
|
"consecutive_blanks": consecutive_blanks,
|
||||||
"consecutive_blurry": consecutive_blurry,
|
"consecutive_blurry": consecutive_blurry,
|
||||||
}
|
}
|
||||||
|
|
@ -135,7 +142,7 @@ def _analyze_item_to_stdout(task):
|
||||||
print(f"Analyzing item {item_id}...", file=stderr)
|
print(f"Analyzing item {item_id}...", file=stderr)
|
||||||
stderr.flush()
|
stderr.flush()
|
||||||
|
|
||||||
print(json.dumps(analyze_item(item_id, verbose)))
|
print(json.dumps(analyze_item(item_id, True, verbose)))
|
||||||
stdout.flush()
|
stdout.flush()
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
|
|
@ -169,6 +176,7 @@ def _analyze_page(task):
|
||||||
if is_blank:
|
if is_blank:
|
||||||
max_sharpness = 1
|
max_sharpness = 1
|
||||||
ocr_orientation_match = True
|
ocr_orientation_match = True
|
||||||
|
words_near_edge = 0
|
||||||
else:
|
else:
|
||||||
max_sharpness = 0.0
|
max_sharpness = 0.0
|
||||||
if im_cropped.size[0] < im_cropped.size[1]:
|
if im_cropped.size[0] < im_cropped.size[1]:
|
||||||
|
|
@ -195,14 +203,16 @@ def _analyze_page(task):
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
n_words = []
|
best_ocr_score = -1
|
||||||
|
best_ocr_words = None
|
||||||
|
best_ocr_orientation = -1
|
||||||
for orientation in range(4):
|
for orientation in range(4):
|
||||||
im_rotated = im_original.rotate(90 * orientation)
|
im_rotated = im_original.rotate(90 * orientation, expand=True)
|
||||||
ocr = pd.read_csv(
|
ocr = pytesseract.image_to_data(
|
||||||
StringIO(pytesseract.image_to_data(im_rotated)),
|
im_rotated,
|
||||||
sep="\t",
|
lang=OCR_LANGS,
|
||||||
quoting=QUOTE_NONE,
|
config="--oem 1 --dpi 300 --tessdata-dir ./tessdata_fast-4.1.0",
|
||||||
dtype={"text": str},
|
output_type=pytesseract.Output.DATAFRAME,
|
||||||
).fillna({"text": ""})
|
).fillna({"text": ""})
|
||||||
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
|
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
|
||||||
words = words[
|
words = words[
|
||||||
|
|
@ -211,12 +221,36 @@ def _analyze_page(task):
|
||||||
axis=1,
|
axis=1,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
n_words.append(words.shape[0])
|
if words.shape[0] > best_ocr_score:
|
||||||
if len(words) > 50:
|
best_ocr_score = words.shape[0]
|
||||||
|
best_ocr_orientation = orientation
|
||||||
|
best_ocr_words = words
|
||||||
|
if best_ocr_score > 50:
|
||||||
# Unlikely that another orientation will have more words, so
|
# Unlikely that another orientation will have more words, so
|
||||||
# stop eating up CPU unnecessarily.
|
# stop eating up CPU unnecessarily.
|
||||||
break
|
break
|
||||||
ocr_orientation_match = np.max(n_words) == n_words[0]
|
|
||||||
|
ocr_orientation_match = best_ocr_orientation == 0
|
||||||
|
|
||||||
|
best_ocr_dims = (
|
||||||
|
im_original.size
|
||||||
|
if best_ocr_orientation % 2 == 0
|
||||||
|
else (im_original.size[1], im_original.size[0])
|
||||||
|
)
|
||||||
|
EDGE_TOLERANCE = 0.03
|
||||||
|
words_near_edge = best_ocr_words[
|
||||||
|
(best_ocr_words["left"] < best_ocr_dims[0] * EDGE_TOLERANCE)
|
||||||
|
| (best_ocr_words["top"] < best_ocr_dims[1] * EDGE_TOLERANCE)
|
||||||
|
| (
|
||||||
|
best_ocr_words["left"] + best_ocr_words["width"]
|
||||||
|
> best_ocr_dims[0] * (1 - EDGE_TOLERANCE)
|
||||||
|
)
|
||||||
|
| (
|
||||||
|
best_ocr_words["top"] + best_ocr_words["height"]
|
||||||
|
> best_ocr_dims[1] * (1 - EDGE_TOLERANCE)
|
||||||
|
)
|
||||||
|
]
|
||||||
|
words_near_edge = words_near_edge.shape[0]
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"blank": is_blank,
|
"blank": is_blank,
|
||||||
|
|
@ -225,6 +259,7 @@ def _analyze_page(task):
|
||||||
"page_index": page_index,
|
"page_index": page_index,
|
||||||
"size": im_original.size,
|
"size": im_original.size,
|
||||||
"sharpness": max_sharpness,
|
"sharpness": max_sharpness,
|
||||||
|
"words_near_edge": words_near_edge,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -262,6 +297,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
|
||||||
if page_index != -1:
|
if page_index != -1:
|
||||||
with jp_zip.open(file_name) as jp_file:
|
with jp_zip.open(file_name) as jp_file:
|
||||||
im = Image.open(jp_file).convert("L")
|
im = Image.open(jp_file).convert("L")
|
||||||
|
im.thumbnail((3200, 3200))
|
||||||
tasks.append(
|
tasks.append(
|
||||||
PageAnalysisTask(
|
PageAnalysisTask(
|
||||||
im=im, page_index=page_index, file_name=file_name
|
im=im, page_index=page_index, file_name=file_name
|
||||||
|
|
@ -273,8 +309,7 @@ def analyze_item(item_id, parallel=False, verbose=False):
|
||||||
stderr.flush()
|
stderr.flush()
|
||||||
if parallel:
|
if parallel:
|
||||||
# Parallelize image processing and OCR of pages across up to n cores.
|
# Parallelize image processing and OCR of pages across up to n cores.
|
||||||
N_PROCESSES = 8
|
with Pool(N_OCR_PROCESSES) as pool:
|
||||||
with Pool(N_PROCESSES) as pool:
|
|
||||||
return {"pages": pool.map(_analyze_page, tasks)}
|
return {"pages": pool.map(_analyze_page, tasks)}
|
||||||
return {"pages": [_analyze_page(task) for task in tasks]}
|
return {"pages": [_analyze_page(task) for task in tasks]}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,4 @@
|
||||||
[tools]
|
[tools]
|
||||||
jujutsu = "latest"
|
|
||||||
uv = "latest"
|
uv = "latest"
|
||||||
watchexec = "latest"
|
watchexec = "latest"
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue