MicroQA/one_off.py

160 lines
4.9 KiB
Python
Raw Normal View History

2025-10-04 15:09:16 -07:00
import json
import re
from argparse import ArgumentParser
from dataclasses import dataclass
from multiprocessing.pool import ThreadPool
from sys import stderr, stdin, stdout
from engine import analyze_item
# Tesseract-style language spec (English + French).
# NOTE(review): not referenced anywhere in this file's visible code — confirm
# external users (e.g. the engine module) before removing.
OCR_LANGS = "eng+fra"
def main():
    """Read item IDs from stdin and analyze or summarize each to stdout.

    Each stdin line holds one or more item IDs separated by commas and/or
    whitespace.  The IDs of a line are dispatched as one batch to a thread
    pool of ``--workers`` threads.  With ``--summarize`` only a compact JSON
    flag summary is printed per item; otherwise the full analysis is printed.
    """
    parser = ArgumentParser()
    parser.add_argument("--summarize", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-w", "--workers", type=int, default=1)
    parser.add_argument("--page-margin-px", type=int, default=50)
    args = parser.parse_args()
    # Both branches of the original differed only in the worker function;
    # select it once instead of duplicating the pool.map call.
    worker = _summarize_item_to_stdout if args.summarize else _analyze_item_to_stdout
    if args.verbose:
        # Printed once per run (the original re-printed this for every
        # stdin line because it sat inside the loop).
        print(f"Running with {args.workers} workers.", file=stderr)
        stderr.flush()
    # Create the pool once for the whole run; the original rebuilt (and tore
    # down) a ThreadPool for every line of input.
    with ThreadPool(args.workers) as pool:
        # Each line contains one or more item IDs separated by commas
        # and/or whitespace; empty fields are discarded.
        for line in stdin:
            item_ids = [value for value in re.split(r",|\s", line) if value]
            pool.map(
                worker,
                [
                    ItemTask(
                        item_id=item_id,
                        page_margin_px=args.page_margin_px,
                        verbose=args.verbose,
                    )
                    for item_id in item_ids
                ],
            )
@dataclass
class ItemTask:
    """Work unit handed to the per-item worker functions via pool.map."""
    # Item identifier, forwarded to engine.analyze_item.
    item_id: str
    # Minimum acceptable text margin in pixels; summarized pages below
    # this are flagged under "check_crop".
    page_margin_px: int
    # When True, the workers write progress messages to stderr.
    verbose: bool
def _find_run_starts(flags, min_run):
    """Return 1-indexed start positions of runs of >= min_run consecutive Trues.

    Each qualifying run is reported exactly once, at the page number where it
    begins.  Fixes a bug in the original sliding-AND implementation, which
    could never report a run starting at page 1 (index 0 of the ANDed list
    was unreachable by its ``enumerate(xs[1:])`` start-detection filter).
    """
    starts = []
    run = 0
    for i, flag in enumerate(flags):
        run = run + 1 if flag else 0
        if run == min_run:
            # i is the 0-indexed end of the first full window of this run;
            # back up to the run's start and convert to 1-indexed pages.
            starts.append(i - min_run + 2)
    return starts


def _summarize_item_to_stdout(task):
    """Analyze one item and print a one-line JSON flag summary if needed.

    Flags raised: runs of >= 3 consecutive blank pages, runs of >= 3
    consecutive blurry pages (sharpness below 0.1), pages whose OCR
    orientation disagrees with the stored orientation, and pages whose text
    margin is tighter than task.page_margin_px.  Prints nothing for a clean
    item.
    """
    item_id = task.item_id
    page_margin_px = task.page_margin_px
    verbose = task.verbose
    if verbose:
        print(f"Summarizing item {item_id}...", file=stderr)
        stderr.flush()
    analysis = analyze_item(item_id, parallel=4, verbose=verbose)
    pages = analysis["pages"]
    # 3 or more blank pages in a row is a flag.
    CONSECUTIVE_BLANKS_THRESHOLD = 3
    consecutive_blanks = _find_run_starts(
        [page["blank"] for page in pages],
        CONSECUTIVE_BLANKS_THRESHOLD,
    )
    # 3 or more blurry pages in a row is a flag (the original comment here
    # said "blank" — copy/paste error).
    CONSECUTIVE_BLURRY_THRESHOLD = 3
    SHARPNESS_THRESHOLD = 0.1
    consecutive_blurry = _find_run_starts(
        [page["sharpness"] < SHARPNESS_THRESHOLD for page in pages],
        CONSECUTIVE_BLURRY_THRESHOLD,
    )
    # 1-indexed pages where OCR-detected orientation disagrees with stored.
    check_orientation = [
        i + 1
        for i, page in enumerate(pages)
        if not page["ocr_orientation_match"]
    ]
    # 1-indexed pages cropped too tightly (text margin under the minimum).
    check_crop = [
        i + 1
        for i, page in enumerate(pages)
        if page["text_margin_px"] < page_margin_px
    ]
    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
        print(
            json.dumps(
                {
                    "item_id": item_id,
                    "check_orientation": check_orientation,
                    "check_crop": check_crop,
                    "consecutive_blanks": consecutive_blanks,
                    "consecutive_blurry": consecutive_blurry,
                }
            )
        )
        stdout.flush()
    if verbose:
        print(f"Done summarizing item {item_id}.", file=stderr)
        stderr.flush()
def _analyze_item_to_stdout(task):
    """Run the full analysis for one item and emit it as a JSON line on stdout.

    Progress messages go to stderr when task.verbose is set.
    """
    if task.verbose:
        print(f"Analyzing item {task.item_id}...", file=stderr)
        stderr.flush()
    result = analyze_item(task.item_id, parallel=6, verbose=task.verbose)
    print(json.dumps(result))
    stdout.flush()
    if task.verbose:
        print(f"Done analyzing item {task.item_id}.", file=stderr)
        stderr.flush()
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()