MicroQA/diagnostics.py

100 lines
2.9 KiB
Python
Raw Normal View History

import json
from argparse import ArgumentParser
from time import time
import numpy as np
2025-11-07 05:41:18 +00:00
from microqa.items import cache_item, fetch_item
from microqa.engine import analyze_doc
def disoriented_page_indices(analyses):
    """Return, per analyzed document, the indices of its disoriented pages.

    A page counts as disoriented when its ``page_angle`` lies strictly
    between 30 and 330.

    Args:
        analyses: Iterable of mappings, each with a ``"pages"`` list whose
            entries carry a ``"page_angle"`` value.

    Returns:
        A list holding one list of zero-based page indices per document.
    """
    return [
        [
            index
            for index, page in enumerate(doc["pages"])
            if 30 < page["page_angle"] < 330
        ]
        for doc in analyses
    ]


def sharpness_summary(analyses):
    """Return the max, median and min page sharpness across all documents.

    Pages whose ``sharpness`` is ``None`` are ignored. When no page has a
    sharpness value, every statistic is reported as ``None`` instead of
    raising (``max``/``min`` on an empty list) or yielding NaN (median of an
    empty list), so the result always serializes to valid JSON.

    Args:
        analyses: Iterable of mappings, each with a ``"pages"`` list whose
            entries carry a ``"sharpness"`` value or ``None``.

    Returns:
        A dict with the keys ``"sharpness_max"``, ``"sharpness_median"`` and
        ``"sharpness_min"``.
    """
    # Collect the values once; all three statistics share the same input.
    values = [
        page["sharpness"]
        for doc in analyses
        for page in doc["pages"]
        if page["sharpness"] is not None
    ]
    if not values:
        return {
            "sharpness_max": None,
            "sharpness_median": None,
            "sharpness_min": None,
        }
    return {
        "sharpness_max": max(values),
        # .tolist() converts the NumPy scalar into a plain Python float that
        # json.dumps can serialize.
        "sharpness_median": np.median(values).tolist(),
        "sharpness_min": min(values),
    }


def main():
    """Analyze one item's documents and print a JSON diagnostics report.

    Parses ``--item-id``, ``--ocr-backend`` and ``--verbose`` from the command
    line, caches and fetches the item, runs ``analyze_doc`` on its documents,
    and prints a JSON object containing the raw analyses, the elapsed
    analysis time in seconds, the disoriented page indices per document and
    summary sharpness statistics.
    """
    parser = ArgumentParser()
    parser.add_argument("--item-id")
    parser.add_argument(
        "--ocr-backend",
        help="which local OCR backend to use when available text in archived PDF files is insufficient; one of 'tesseract' or 'paddleocr'",
        # Reject unknown backends at parse time; otherwise no engine would be
        # constructed and the analysis would fail later with a NameError.
        choices=["tesseract", "paddleocr"],
        default="tesseract",
    )
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    # Import OCR engine modules only as needed, to avoid unnecessary slow
    # startups and/or missing dependency errors.
    if args.ocr_backend == "tesseract":
        from microqa.ocr.tesseract import TesseractOcrEngine

        ocr_engine = TesseractOcrEngine(languages=["eng", "fra"])
    else:
        # `choices` above guarantees the only other value is "paddleocr".
        from microqa.ocr.paddleocr import PaddleOcrEngine

        ocr_engine = PaddleOcrEngine(languages=["eng", "fra"])

    cache_item(
        args.item_id,
        # Will not refetch if value is already cached.
        overwrite=False,
    )
    item = fetch_item(args.item_id, use_cache=True)

    t_start = time()

    # When an item has several docs, skip any whose name equals its
    # identifier; a lone doc is always analyzed.
    minimal_docs = (
        [doc for doc in item.docs if doc.name != doc.identifier]
        if len(item.docs) > 1
        else item.docs
    )
    analyses = [
        analyze_doc(
            doc=doc, ocr_engine=ocr_engine, use_cache=True, verbose=args.verbose
        )
        for doc in minimal_docs
    ]

    t_end = time()

    print(
        json.dumps(
            {
                "analyses": analyses,
                "duration_secs": t_end - t_start,
                "disoriented_pages": disoriented_page_indices(analyses),
                **sharpness_summary(analyses),
            }
        )
    )
# Run the diagnostics only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()