import json
from argparse import ArgumentParser
from time import time

import numpy as np

from microqa.items import cache_item, fetch_item
from microqa.engine import analyze_doc
def main():
    """CLI entry point: fetch one item, analyze each of its docs, and print a
    JSON summary (per-doc analyses, timing, page-orientation and sharpness
    statistics) to stdout."""
    parser = ArgumentParser()
    # required=True fails fast with a usage error instead of passing None into
    # cache_item/fetch_item and crashing there.
    parser.add_argument("--item-id", required=True)
    parser.add_argument(
        "--ocr-backend",
        # `choices` makes argparse reject unknown backends up front; previously
        # any other value left `ocr_engine` unbound and the script crashed later
        # with NameError.
        choices=["tesseract", "paddleocr"],
        help="which local OCR backend to use when available text in archived PDF files is insufficient; one of 'tesseract' or 'paddleocr'",
        default="tesseract",
    )
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    # Import OCR engine modules only as needed, to avoid unnecessary slow
    # startups and/or missing dependency errors.
    if args.ocr_backend == "tesseract":
        from microqa.ocr.tesseract import TesseractOcrEngine

        ocr_engine = TesseractOcrEngine(languages=["eng", "fra"])
    else:  # "paddleocr" — argparse `choices` guarantees one of the two values
        from microqa.ocr.paddleocr import PaddleOcrEngine

        ocr_engine = PaddleOcrEngine(languages=["eng", "fra"])

    cache_item(
        args.item_id,
        # Will not refetch if value is already cached.
        overwrite=False,
    )
    item = fetch_item(args.item_id, use_cache=True)

    t_start = time()
    # When an item has several docs, skip the unnamed ones; a lone doc is
    # always kept even if unnamed.
    minimal_docs = (
        [doc for doc in item.docs if doc.name != ""]
        if len(item.docs) > 1
        else item.docs
    )
    analyses = [
        analyze_doc(
            doc=doc, ocr_engine=ocr_engine, use_cache=True, verbose=args.verbose
        )
        for doc in minimal_docs
    ]
    t_end = time()

    # Flatten per-page sharpness once. The original rebuilt this list three
    # times and called `max(*[...])` / `min(*[...])`: star-unpacking means a
    # single-page item became `max(<float>)` (TypeError) and an empty list
    # crashed outright. Pass the list directly and emit None when empty.
    sharpness = [page["sharpness"] for doc in analyses for page in doc["pages"]]

    print(
        json.dumps(
            {
                "analyses": analyses,
                "duration_secs": t_end - t_start,
                # Per doc, indices of pages rotated more than 45 degrees away
                # from upright (assumes page_angle is in degrees in [0, 360) —
                # TODO confirm the engine's angle convention).
                "disoriented_pages": [
                    [
                        i
                        for i, page in enumerate(doc["pages"])
                        if 45 < page["page_angle"] < 315
                    ]
                    for doc in analyses
                ],
                "sharpness_max": max(sharpness) if sharpness else None,
                # .tolist() converts the numpy scalar to a plain float so
                # json.dumps can serialize it.
                "sharpness_median": (
                    np.median(sharpness).tolist() if sharpness else None
                ),
                "sharpness_min": min(sharpness) if sharpness else None,
            }
        )
    )
|
# Standard script guard: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()