import json
from argparse import ArgumentParser
from time import time

import numpy as np

from microqa.items import cache_item, fetch_item
from microqa.engine import analyze_doc


def main():
    """Analyze every document of one archived item and print a JSON report.

    The report contains the raw per-document analyses, the total analysis
    duration, per-document indices of rotated ("disoriented") pages, and
    min/median/max page sharpness across all analyzed pages.
    """
    parser = ArgumentParser(description="Analyze the documents of an archived item.")
    # `required=True`: without an item id the downstream cache/fetch calls
    # would receive None and fail far from the real cause.
    parser.add_argument("--item-id", required=True, help="identifier of the item to analyze")
    parser.add_argument(
        "--ocr-backend",
        # `choices` rejects unknown backends up front; previously an unknown
        # value left `ocr_engine` unbound and caused a NameError much later.
        choices=["tesseract", "paddleocr"],
        default="tesseract",
        help="which local OCR backend to use when available text in archived PDF files is insufficient; one of 'tesseract' or 'paddleocr'",
    )
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    # Import OCR engine modules only as needed, to avoid unnecessary slow
    # startups and/or missing dependency errors.
    if args.ocr_backend == "tesseract":
        from microqa.ocr.tesseract import TesseractOcrEngine

        ocr_engine = TesseractOcrEngine(languages=["eng", "fra"])
    else:  # "paddleocr" — argparse `choices` guarantees one of the two.
        from microqa.ocr.paddleocr import PaddleOcrEngine

        ocr_engine = PaddleOcrEngine(languages=["eng", "fra"])

    cache_item(
        args.item_id,
        # Will not refetch if value is already cached.
        overwrite=False,
    )
    item = fetch_item(args.item_id, use_cache=True)

    t_start = time()
    # Presumably drops placeholder docs whose display name is just their
    # identifier, keeping at least one doc — TODO(review): confirm intent.
    minimal_docs = (
        [doc for doc in item.docs if doc.name != doc.identifier]
        if len(item.docs) > 1
        else item.docs
    )
    analyses = [
        analyze_doc(
            doc=doc, ocr_engine=ocr_engine, use_cache=True, verbose=args.verbose
        )
        for doc in minimal_docs
    ]
    t_end = time()

    # Flatten once; this list was previously rebuilt three times below.
    sharpness_values = [
        page["sharpness"] for doc in analyses for page in doc["pages"]
    ]
    print(
        json.dumps(
            {
                "analyses": analyses,
                "duration_secs": t_end - t_start,
                "disoriented_pages": [
                    [
                        i
                        for i, page in enumerate(doc["pages"])
                        if 45 < page["page_angle"] < 315
                    ]
                    for doc in analyses
                ],
                # Pass the list itself — the previous `max(*values)` raised
                # TypeError for a single page (max(float) is not iterable)
                # and for zero pages (max() with no arguments). With no
                # pages at all we report None instead of crashing.
                "sharpness_max": max(sharpness_values) if sharpness_values else None,
                "sharpness_median": (
                    np.median(sharpness_values).tolist() if sharpness_values else None
                ),
                "sharpness_min": min(sharpness_values) if sharpness_values else None,
            }
        )
    )


if __name__ == "__main__":
    main()