MicroQA/main.py
2025-12-20 02:17:03 +00:00

190 lines
6.5 KiB
Python

import re
import sqlite3
import traceback
from argparse import ArgumentParser
from datetime import datetime, timezone
from sys import stderr
from time import sleep

import requests

from microqa.engine import analyze_doc
from microqa.items import fetch_item
def main():
    """Poll archive.org for reviewed microfiche items and analyze them.

    Runs forever: once an hour, pulls newly reviewed item IDs into the
    sqlite database, then analyzes every not-yet-analyzed item, oldest
    review date first.  Per-document and per-page analysis results are
    written to the `docs` and `pages` tables.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--database",
        help="path to sqlite database for analysis output",
        default="./microqa.db",
    )
    # NOTE(review): --cpus is parsed but never referenced in this file;
    # presumably consumed elsewhere — confirm before removing.
    parser.add_argument(
        "--cpus",
        type=int,
        help="number of concurrent subprocesses to use; higher is generally faster but consumes more resources",
        default=6,
    )
    parser.add_argument(
        "--earliest-review-date",
        help="script will attempt to analyze all items with a review date greater than or equal to this value (YYYYMMDD)",
        default="20250701",
    )
    parser.add_argument(
        "--ocr-backend",
        # Restrict to the supported backends so a typo fails at argument
        # parsing instead of leaving ocr_engine unbound (NameError later).
        choices=("tesseract", "paddleocr"),
        help="which local OCR backend to use when available text in archived PDF files is insufficient; one of 'tesseract' or 'paddleocr'",
        default="tesseract",
    )
    args = parser.parse_args()

    ocr_engine = _make_ocr_engine(args.ocr_backend)

    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        _init_schema(conn, cur)
        while True:
            print("Pulling item IDs")
            pull_new_item_ids(conn, args.earliest_review_date)
            print("Done.")
            # Oldest reviews first, so any backlog drains in review order.
            res = cur.execute("""
                select id
                from items
                where analyzed_date is null
                and skip_analysis = false
                order by review_date
            """)
            for (item_id,) in res.fetchall():
                _analyze_item(conn, cur, item_id, ocr_engine)
            sleep(3600)


def _make_ocr_engine(backend):
    """Instantiate the requested OCR backend.

    Backend modules are imported lazily to avoid unnecessary slow startups
    and/or missing dependency errors for the engine that is not in use.
    """
    if backend == "tesseract":
        from microqa.ocr.tesseract import TesseractOcrEngine

        return TesseractOcrEngine(languages=["eng", "fra"])
    if backend == "paddleocr":
        from microqa.ocr.paddleocr import PaddleOcrEngine

        return PaddleOcrEngine(languages=["eng", "fra"])
    # Unreachable from main() (argparse restricts choices), but keeps the
    # helper safe for any other caller.
    raise ValueError(f"unsupported OCR backend: {backend!r}")


def _init_schema(conn, cur):
    """Create the items/docs/pages tables and their indexes if missing."""
    cur.execute("""
        create table if not exists items (
            id text primary key not null,
            review_date text not null,
            skip_analysis bool not null,
            analyzed_date text
        )""")
    cur.execute("""
        create table if not exists docs (
            name text primary key not null,
            item text not null
        )""")
    # "integer primary key" (not "int") makes `id` an alias for sqlite's
    # rowid, so inserts that omit it get an auto-assigned unique value;
    # with "int primary key" omitted ids would be stored as NULL.
    cur.execute("""
        create table if not exists pages (
            id integer primary key,
            doc text not null,
            page int not null,
            page_angle float not null,
            sharpness real not null,
            is_blank boolean not null,
            text_margin_px int not null
        )""")
    cur.execute("create index if not exists review_date_idx on items (review_date)")
    cur.execute(
        "create index if not exists analyzed_date_idx on items (analyzed_date)"
    )
    cur.execute("create index if not exists item_idx on docs (item)")
    cur.execute("create index if not exists doc_idx on pages (doc)")
    cur.execute(
        "create unique index if not exists doc_page_idx on pages (doc, page)"
    )
    conn.commit()


def _analyze_item(conn, cur, item_id, ocr_engine):
    """Analyze one item, retrying on failure; commit results on success.

    Retries up to N_ATTEMPTS times.  Any partially written rows are rolled
    back between attempts so a retry starts from a clean transaction
    (otherwise re-inserting a page would trip the unique doc_page_idx
    index).  If every attempt fails, the item is left un-analyzed and will
    be picked up again on the next hourly pass.
    """
    N_ATTEMPTS = 3
    for _ in range(N_ATTEMPTS):
        try:
            print(f"Processing {item_id}")
            item = fetch_item(item_id)
            # When an item has several docs, drop the unnamed ones; a lone
            # doc is kept even if unnamed.
            minimal_docs = (
                [doc for doc in item.docs if doc.name != ""]
                if len(item.docs) > 1
                else item.docs
            )
            for doc in minimal_docs:
                cur.execute(
                    "insert into docs (name, item) values (?, ?) on conflict do nothing",
                    [doc.name, item_id],
                )
                analysis = analyze_doc(
                    doc=doc, ocr_engine=ocr_engine, verbose=True
                )
                for i, page in enumerate(analysis["pages"]):
                    cur.execute(
                        """
                        insert into pages (
                            doc,
                            page,
                            page_angle,
                            sharpness,
                            is_blank,
                            text_margin_px
                        ) values (?, ?, ?, ?, ?, ?)""",
                        [
                            doc.name,
                            i + 1,  # pages are stored 1-indexed
                            page["page_angle"],
                            page["sharpness"],
                            page["blank"],
                            page["text_margin_px"],
                        ],
                    )
            cur.execute(
                "update items set analyzed_date = ? where id = ?",
                [datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"), item_id],
            )
            conn.commit()
            print("Done")
            return
        except Exception as err:
            print(err, file=stderr)
            traceback.print_tb(err.__traceback__, file=stderr)
            # Discard partial writes so the next attempt starts fresh, then
            # back off briefly before retrying.
            conn.rollback()
            sleep(15)
def pull_new_item_ids(conn, earliest_review_date):
    """Fetch reviewed microfiche item IDs from archive.org into `items`.

    Resumes from the most recent review_date already present in the
    database (falling back to `earliest_review_date` for an empty table).
    The range query is inclusive, so boundary items come back again; the
    "on conflict do nothing" insert makes that harmless.
    """
    cur = conn.cursor()
    res = cur.execute("select review_date from items order by review_date desc limit 1")
    (latest_review_date,) = res.fetchone() or (earliest_review_date,)
    print(latest_review_date)
    query = f"""
    collection:(microfiche)
    AND contributor:(Internet Archive)
    AND micro_review:(done)
    AND review_date:[{latest_review_date} TO null]
    """
    # NOTE(review): sort field is "reviewdate" (no underscore) while the
    # query field is "review_date" — looks intentional for the archive.org
    # search schema, but confirm.
    sort = "reviewdate asc"
    # Format for API.
    query = re.sub(r"\s+", "+", query.strip())
    sort = re.sub(r"\s+", "+", sort.strip())
    # Archive.org has a paginated scraping API, but the query feature seems to
    # be broken in mysterious ways and more or less impossible to use for our
    # purposes.
    resp = requests.get(
        f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=250000&output=json",
        # Without a timeout, requests can block forever and stall the hourly
        # polling loop; this is a large response, so allow plenty of time.
        timeout=300,
    )
    resp.raise_for_status()
    try:
        body = resp.json()
    except Exception:
        # Dump the body to aid debugging, then re-raise with the original
        # traceback intact.
        print("Body:", resp.text, file=stderr)
        raise
    for doc in body["response"]["docs"]:
        cur.execute(
            "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
            (doc["identifier"], doc["review_date"]),
        )
    conn.commit()
# Only start the polling loop when run as a script, not on import.
if __name__ == "__main__":
    main()