# MicroQA/diagnostics.py

import json
from argparse import ArgumentParser
from time import time
import numpy as np
from archive_item import cache_item, fetch_item
from engine import analyze_doc
def _sharpness_summary(analyses):
    """Return the max, median and min page sharpness across all analyses.

    Args:
        analyses: Sequence of per-document analysis dicts. Each one is read
            as ``doc["pages"]``, a sequence of dicts with a ``"sharpness"``
            entry.

    Returns:
        A dict with the keys ``"sharpness_max"``, ``"sharpness_median"`` and
        ``"sharpness_min"``. All three values are ``None`` when no page was
        analysed at all.
    """
    values = [page["sharpness"] for doc in analyses for page in doc["pages"]]
    if not values:
        # max()/min() raise on an empty sequence and np.median([]) is NaN,
        # which json.dumps would emit as the non-standard token ``NaN``.
        return {
            "sharpness_max": None,
            "sharpness_median": None,
            "sharpness_min": None,
        }
    return {
        # Pass the list itself: star-unpacking a one-element list turns the
        # call into max(x) on a scalar, which raises TypeError.
        "sharpness_max": max(values),
        # .tolist() converts the NumPy scalar into a plain Python number so
        # that json.dumps can serialise it.
        "sharpness_median": np.median(values).tolist(),
        "sharpness_min": min(values),
    }


def main():
    """Analyse one archive item and print a JSON diagnostics report.

    Command-line options:
        --item-id: Identifier passed to ``cache_item`` and ``fetch_item``.
        --cpus: Integer passed to ``analyze_doc`` as ``parallel`` (default 4).

    The report, written to stdout as a single JSON object, contains:
        analyses: The raw ``analyze_doc`` result for every analysed doc.
        duration_secs: Wall-clock time spent in the analysis loop.
        disoriented_pages: For each doc, the indices of the pages whose
            ``"ocr_orientation_match"`` entry is falsy.
        sharpness_max / sharpness_median / sharpness_min: Statistics over the
            ``"sharpness"`` of every page of every doc, or ``null`` when no
            page was analysed.
    """
    parser = ArgumentParser()
    parser.add_argument("--item-id")
    parser.add_argument("--cpus", type=int, default=4)
    args = parser.parse_args()

    cache_item(
        args.item_id,
        # Will not refetch if value is already cached.
        overwrite=False,
    )
    item = fetch_item(args.item_id, use_cache=True)

    # Only the analysis itself is timed; caching and fetching are excluded.
    t_start = time()

    # When an item carries several docs, the ones with an empty name are
    # skipped; a single-doc item is always analysed as-is.
    # NOTE(review): presumably the unnamed doc is a placeholder/aggregate
    # entry - confirm against archive_item.
    minimal_docs = (
        [doc for doc in item.docs if doc.name != ""]
        if len(item.docs) > 1
        else item.docs
    )
    analyses = [
        analyze_doc(doc, parallel=args.cpus, use_cache=True) for doc in minimal_docs
    ]
    t_end = time()

    report = {
        "analyses": analyses,
        "duration_secs": t_end - t_start,
        "disoriented_pages": [
            [
                i
                for i, page in enumerate(doc["pages"])
                if not page["ocr_orientation_match"]
            ]
            for doc in analyses
        ],
        # Adds sharpness_max, sharpness_median, sharpness_min (in that order).
        **_sharpness_summary(analyses),
    }
    print(json.dumps(report))
# Run the diagnostics only when executed as a script, not when imported.
if __name__ == "__main__":
    main()