MicroQA/one_off.py
2025-10-04 15:10:10 -07:00

159 lines
4.9 KiB
Python

import json
import re
from argparse import ArgumentParser
from dataclasses import dataclass
from multiprocessing.pool import ThreadPool
from sys import stderr, stdin, stdout
from engine import analyze_item
# Tesseract-style OCR language spec (English + French).
# NOTE(review): not referenced anywhere in the code visible here — presumably
# consumed by the engine or another part of the file; confirm before removing.
OCR_LANGS = "eng+fra"
def main():
    """CLI entry point: read item IDs from STDIN and process each one.

    Each STDIN line contains one or more item IDs separated by commas or
    whitespace. For every ID, either a JSON summary (--summarize) or the
    full analysis is written to STDOUT as one JSON line; progress messages
    go to STDERR when --verbose is set.
    """
    parser = ArgumentParser()
    parser.add_argument("--summarize", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-w", "--workers", type=int, default=1)
    parser.add_argument("--page-margin-px", type=int, default=50)
    args = parser.parse_args()

    # Choose the per-item worker once, instead of duplicating the whole
    # pool.map(...) call in two branches.
    worker = _summarize_item_to_stdout if args.summarize else _analyze_item_to_stdout

    if args.verbose:
        # Print the banner once, not once per input line as before.
        print(f"Running with {args.workers} workers.", file=stderr)
        stderr.flush()

    # Reuse a single pool for the whole run; the original created and tore
    # down a fresh ThreadPool for every STDIN line.
    with ThreadPool(args.workers) as pool:
        # Process STDIN line by line, where each line contains one or more
        # item IDs separated by whitespace or commas.
        for line in stdin:
            item_ids = [value for value in re.split(r",|\s", line) if value]
            pool.map(
                worker,
                [
                    ItemTask(
                        item_id=item_id,
                        page_margin_px=args.page_margin_px,
                        verbose=args.verbose,
                    )
                    for item_id in item_ids
                ],
            )
@dataclass
class ItemTask:
    """Argument bundle for one per-item worker call.

    ThreadPool.map passes a single argument to the worker, so the CLI
    options each worker needs are packed into this record.
    """

    # Identifier of the item to analyze (as read from STDIN).
    item_id: str
    # Minimum acceptable text margin in pixels; pages with a tighter
    # margin are flagged for a crop check.
    page_margin_px: int
    # When True, workers print progress messages to STDERR.
    verbose: bool
def _summarize_item_to_stdout(task):
    """Analyze one item and print a one-line JSON summary of flagged pages.

    Prints nothing when no page is flagged. All page numbers in the output
    are 1-indexed. Flags:
      - consecutive_blanks: start pages of runs of >= 3 blank pages
      - consecutive_blurry: start pages of runs of >= 3 low-sharpness pages
      - check_orientation: pages whose OCR orientation does not match
      - check_crop: pages whose text margin is below the --page-margin-px

    Parameters:
        task: ItemTask with item_id, page_margin_px, and verbose.
    """
    item_id = task.item_id
    page_margin_px = task.page_margin_px
    verbose = task.verbose
    if verbose:
        print(f"Summarizing item {item_id}...", file=stderr)
        stderr.flush()

    analysis = analyze_item(item_id, parallel=4, verbose=verbose)
    pages = analysis["pages"]

    # 3 or more blank pages in a row is a flag.
    CONSECUTIVE_BLANKS_THRESHOLD = 3
    consecutive_blanks = _run_starts(
        [page["blank"] for page in pages],
        CONSECUTIVE_BLANKS_THRESHOLD,
    )

    # 3 or more blurry pages in a row is a flag.
    CONSECUTIVE_BLURRY_THRESHOLD = 3
    SHARPNESS_THRESHOLD = 0.1
    consecutive_blurry = _run_starts(
        [page["sharpness"] < SHARPNESS_THRESHOLD for page in pages],
        CONSECUTIVE_BLURRY_THRESHOLD,
    )

    check_orientation = [
        i + 1
        for i, page in enumerate(pages)
        if not page["ocr_orientation_match"]
    ]
    check_crop = [
        i + 1
        for i, page in enumerate(pages)
        if page["text_margin_px"] < page_margin_px
    ]

    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
        print(
            json.dumps(
                {
                    "item_id": item_id,
                    "check_orientation": check_orientation,
                    "check_crop": check_crop,
                    "consecutive_blanks": consecutive_blanks,
                    "consecutive_blurry": consecutive_blurry,
                }
            )
        )
        stdout.flush()
    if verbose:
        print(f"Done summarizing item {item_id}.", file=stderr)
        stderr.flush()


def _run_starts(flags, threshold):
    """Return 1-indexed start positions of each run of >= threshold
    consecutive True values in flags.

    Replaces the duplicated sliding-AND logic, which had an off-by-one:
    a qualifying run beginning on the very first page was never reported,
    because the first element of the AND-reduced list was only ever used
    as the "previous" value and never examined as a run start.
    """
    starts = []
    run = 0
    for i, flag in enumerate(flags):
        run = run + 1 if flag else 0
        if run == threshold:
            # i is the 0-indexed end of the first qualifying window, so the
            # run starts at i - threshold + 1 (0-indexed), +1 to 1-index.
            starts.append(i - threshold + 2)
    return starts
def _analyze_item_to_stdout(task):
    """Run the full analysis for one item and write it to STDOUT as a JSON line.

    Parameters:
        task: ItemTask with item_id and verbose (page_margin_px is unused here).
    """
    if task.verbose:
        print(f"Analyzing item {task.item_id}...", file=stderr)
        stderr.flush()
    result = analyze_item(task.item_id, parallel=6, verbose=task.verbose)
    print(json.dumps(result))
    stdout.flush()
    if task.verbose:
        print(f"Done analyzing item {task.item_id}.", file=stderr)
        stderr.flush()
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()