import json
from argparse import ArgumentParser
from time import time

import numpy as np

from microqa.items import cache_item, fetch_item
from microqa.engine import analyze_doc


def _summarize_sharpness(values: list) -> dict:
    """Return the max, median and min of *values* as JSON-ready entries.

    Args:
        values: Flat list of per-page sharpness scores; may be empty.

    Returns:
        A dict with the keys ``sharpness_max``, ``sharpness_median`` and
        ``sharpness_min``. All three are ``None`` when *values* is empty.
    """
    if not values:
        # np.median([]) yields NaN (not valid strict JSON) and max()/min()
        # raise on an empty sequence, so report "no data" explicitly.
        return {
            "sharpness_max": None,
            "sharpness_median": None,
            "sharpness_min": None,
        }
    return {
        # Pass the list itself: max(*values) breaks when there is exactly one
        # value, because max(x) with a single argument treats x as an iterable.
        "sharpness_max": max(values),
        "sharpness_median": np.median(values).tolist(),
        "sharpness_min": min(values),
    }


def main() -> None:
    """Analyze the docs of one item and print a JSON report to stdout.

    Command-line arguments:
        --item-id: identifier passed to ``cache_item`` and ``fetch_item``.
        --cpus: forwarded to ``analyze_doc`` as ``parallel`` (default 4).

    The report contains the raw analyses, the wall-clock duration of the
    analysis step, the indices of pages whose ``ocr_orientation_match`` is
    falsy (one list per doc), and max/median/min page sharpness.
    """
    parser = ArgumentParser()
    parser.add_argument("--item-id")
    parser.add_argument("--cpus", type=int, default=4)
    args = parser.parse_args()

    cache_item(
        args.item_id,
        # Will not refetch if value is already cached.
        overwrite=False,
    )
    item = fetch_item(args.item_id, use_cache=True)

    t_start = time()

    # When an item has several docs, skip the ones with an empty name;
    # a lone doc is always kept, whatever its name.
    # NOTE(review): presumably the unnamed doc is an aggregate of the
    # others -- confirm against microqa.items.
    minimal_docs = (
        [doc for doc in item.docs if doc.name != ""]
        if len(item.docs) > 1
        else item.docs
    )
    analyses = [
        analyze_doc(doc, parallel=args.cpus, use_cache=True)
        for doc in minimal_docs
    ]

    t_end = time()

    # Collect the sharpness scores once instead of rebuilding the list for
    # each statistic.
    sharpness_values = [
        page["sharpness"] for doc in analyses for page in doc["pages"]
    ]

    print(
        json.dumps(
            {
                "analyses": analyses,
                "duration_secs": t_end - t_start,
                "disoriented_pages": [
                    [
                        i
                        for i, page in enumerate(doc["pages"])
                        if not page["ocr_orientation_match"]
                    ]
                    for doc in analyses
                ],
                **_summarize_sharpness(sharpness_values),
            }
        )
    )


if __name__ == "__main__":
    main()