160 lines
4.9 KiB
Python
160 lines
4.9 KiB
Python
|
|
import json
|
||
|
|
import re
|
||
|
|
from argparse import ArgumentParser
|
||
|
|
from dataclasses import dataclass
|
||
|
|
from multiprocessing.pool import ThreadPool
|
||
|
|
from sys import stderr, stdin, stdout
|
||
|
|
|
||
|
|
from engine import analyze_item
|
||
|
|
|
||
|
|
|
||
|
|
# Tesseract-style OCR language specifier (English + French).  Not referenced
# anywhere in this chunk — presumably consumed by the engine module; TODO confirm.
OCR_LANGS = "eng+fra"
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Read item IDs from stdin and analyze (or summarize) each one.

    Each stdin line may contain one or more item IDs separated by commas
    and/or whitespace.  Items on the same line are processed concurrently
    by a thread pool of ``--workers`` threads; lines are processed in order.

    Flags:
        --summarize: emit only a JSON quality summary per flagged item
            instead of the full analysis.
        -v/--verbose: log progress to stderr.
        -w/--workers: thread-pool size (default 1).
        --page-margin-px: minimum acceptable text margin in pixels
            (default 50); tighter pages are flagged for crop review.
    """
    parser = ArgumentParser()
    parser.add_argument("--summarize", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-w", "--workers", type=int, default=1)
    parser.add_argument("--page-margin-px", type=int, default=50)
    args = parser.parse_args()

    # The two modes differ only in which worker function is mapped over the
    # tasks, so pick it once instead of duplicating the task-list construction
    # in each branch.
    worker = _summarize_item_to_stdout if args.summarize else _analyze_item_to_stdout

    # Process STDIN line by line, where each line contains one or more item IDs
    # separated by commas and/or whitespace.
    for line in stdin:
        item_ids = [value for value in re.split(r",|\s", line) if value]
        with ThreadPool(args.workers) as pool:
            if args.verbose:
                print(f"Running with {args.workers} workers.", file=stderr)
                stderr.flush()
            pool.map(
                worker,
                [
                    ItemTask(
                        item_id=item_id,
                        page_margin_px=args.page_margin_px,
                        verbose=args.verbose,
                    )
                    for item_id in item_ids
                ],
            )
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class ItemTask:
    """Arguments for one per-item worker call, bundled into a single value
    so it can be passed through ThreadPool.map."""

    # Identifier handed to engine.analyze_item.
    item_id: str
    # Minimum acceptable text margin in pixels; pages with a tighter
    # text_margin_px are flagged for crop review by the summarize worker.
    page_margin_px: int
    # When True, the worker logs progress messages to stderr.
    verbose: bool
|
||
|
|
|
||
|
|
|
||
|
|
def _summarize_item_to_stdout(task):
    """Analyze one item and print a one-line JSON quality summary to stdout.

    Nothing is printed when the item raises no flags.  All page numbers in
    the output are 1-indexed.

    Flags reported:
        check_orientation: pages where OCR orientation does not match.
        check_crop: pages whose text margin is below task.page_margin_px.
        consecutive_blanks / consecutive_blurry: start pages of runs of
            3 or more blank / blurry pages.

    Args:
        task: an ItemTask carrying item_id, page_margin_px, and verbose.
    """
    item_id = task.item_id
    page_margin_px = task.page_margin_px
    verbose = task.verbose

    if verbose:
        print(f"Summarizing item {item_id}...", file=stderr)
        stderr.flush()

    analysis = analyze_item(item_id, parallel=4, verbose=verbose)
    pages = analysis["pages"]

    # 3 or more blank pages in a row is a flag.
    CONSECUTIVE_BLANKS_THRESHOLD = 3
    consecutive_blanks = _run_starts(
        [page["blank"] for page in pages],
        CONSECUTIVE_BLANKS_THRESHOLD,
    )

    # 3 or more blurry pages in a row is a flag.
    CONSECUTIVE_BLURRY_THRESHOLD = 3
    SHARPNESS_THRESHOLD = 0.1
    consecutive_blurry = _run_starts(
        [page["sharpness"] < SHARPNESS_THRESHOLD for page in pages],
        CONSECUTIVE_BLURRY_THRESHOLD,
    )

    check_orientation = [
        i + 1
        for i, page in enumerate(pages)
        if not page["ocr_orientation_match"]
    ]

    check_crop = [
        i + 1
        for i, page in enumerate(pages)
        if page["text_margin_px"] < page_margin_px
    ]

    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
        print(
            json.dumps(
                {
                    "item_id": item_id,
                    "check_orientation": check_orientation,
                    "check_crop": check_crop,
                    "consecutive_blanks": consecutive_blanks,
                    "consecutive_blurry": consecutive_blurry,
                }
            )
        )
        stdout.flush()

    if verbose:
        print(f"Done summarizing item {item_id}.", file=stderr)
        stderr.flush()


def _run_starts(flags, threshold):
    """Return the 1-indexed start position of every run of at least
    *threshold* consecutive True values in *flags*.

    NOTE(review): replaces the previous rolling-AND fold, which had an
    off-by-one — its final pass enumerated the folded list from index 1,
    so a qualifying run beginning on the very first page was never
    reported.  This single-pass scan reports every run exactly once,
    including one starting at page 1.
    """
    starts = []
    run_length = 0
    for i, flag in enumerate(flags):
        run_length = run_length + 1 if flag else 0
        if run_length == threshold:
            # The run began threshold - 1 positions earlier; +1 to 1-index.
            starts.append(i - threshold + 2)
    return starts
|
||
|
|
|
||
|
|
|
||
|
|
def _analyze_item_to_stdout(task):
    """Run the full analysis for one item and write it to stdout as JSON.

    Args:
        task: an ItemTask carrying item_id and verbose (page_margin_px is
            not used by this worker).
    """
    if task.verbose:
        print(f"Analyzing item {task.item_id}...", file=stderr)
        stderr.flush()

    analysis = analyze_item(task.item_id, parallel=6, verbose=task.verbose)
    print(json.dumps(analysis))
    stdout.flush()

    if task.verbose:
        print(f"Done analyzing item {task.item_id}.", file=stderr)
        stderr.flush()
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|