# MicroQA/main.py
import re
import sqlite3
import traceback
from argparse import ArgumentParser
from datetime import datetime, timezone
from sys import stderr
from time import sleep

import requests

from microqa.engine import analyze_doc
from microqa.items import fetch_item

def main():
    """Continuously analyze reviewed microfiche items and record the results.

    Polls archive.org (via pull_new_item_ids) for items needing analysis,
    runs per-document page analysis with an optional local OCR fallback, and
    writes per-page metrics (skew angle, sharpness, blankness, text margins)
    to a local SQLite database. Loops forever, sleeping an hour per pass.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--database",
        help="path to sqlite database for analysis output",
        default="./microqa.db",
    )
    parser.add_argument(
        "--cpus",
        type=int,
        help="number of concurrent subprocesses to use; higher is generally faster but consumes more resources",
        default=6,
    )
    parser.add_argument(
        "--earliest-review-date",
        help="script will attempt to analyze all items with a review date greater than or equal to this value (YYYYMMDD)",
        default="20250701",
    )
    parser.add_argument(
        "--ocr-backend",
        # Enforce valid values up front: previously a typo here left
        # `ocr_engine` undefined and caused a NameError deep in the loop.
        choices=["tesseract", "paddleocr"],
        help="which local OCR backend to use when available text in archived PDF files is insufficient; one of 'tesseract' or 'paddleocr'",
        default="tesseract",
    )
    args = parser.parse_args()

    # Import OCR engine modules only as needed, to avoid unnecessary slow
    # startups and/or missing dependency errors.
    if args.ocr_backend == "tesseract":
        from microqa.ocr.tesseract import TesseractOcrEngine

        ocr_engine = TesseractOcrEngine(languages=["eng", "fra"])
    elif args.ocr_backend == "paddleocr":
        from microqa.ocr.paddleocr import PaddleOcrEngine

        ocr_engine = PaddleOcrEngine(languages=["eng", "fra"])
    else:
        # Defensive: unreachable while argparse `choices` is enforced, but
        # guarantees ocr_engine is never silently undefined.
        raise ValueError(f"unknown OCR backend: {args.ocr_backend}")

    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        cur.execute("""
            create table if not exists items (
                id text primary key not null,
                review_date text not null,
                skip_analysis bool not null,
                analyzed_date text
            )""")
        cur.execute("""
            create table if not exists docs (
                name text primary key not null,
                item text not null
            )""")
        cur.execute("""
            create table if not exists pages (
                id int primary key,
                doc text not null,
                page int not null,
                page_angle float not null,
                sharpness real not null,
                is_blank boolean not null,
                text_margin_px int not null
            )""")
        cur.execute("create index if not exists review_date_idx on items (review_date)")
        cur.execute(
            "create index if not exists analyzed_date_idx on items (analyzed_date)"
        )
        cur.execute("create index if not exists item_idx on docs (item)")
        cur.execute("create index if not exists doc_idx on pages (doc)")
        cur.execute(
            "create unique index if not exists doc_page_idx on pages (doc, page)"
        )
        conn.commit()

        while True:
            print("Pulling item IDs")
            pull_new_item_ids(conn, args.earliest_review_date)
            print("Done.")
            res = cur.execute("""
                select id
                from items
                where analyzed_date is null
                and skip_analysis = false
                order by review_date
                """)
            for (item_id,) in res.fetchall():
                N_ATTEMPTS = 3
                for _ in range(N_ATTEMPTS):
                    try:
                        print(f"Processing {item_id}")
                        item = fetch_item(item_id)
                        # Multi-doc items include a catch-all unnamed doc;
                        # skip it when named docs exist so pages aren't
                        # analyzed twice.
                        minimal_docs = (
                            [doc for doc in item.docs if doc.name != ""]
                            if len(item.docs) > 1
                            else item.docs
                        )
                        for doc in minimal_docs:
                            cur.execute(
                                "insert into docs (name, item) values (?, ?) on conflict do nothing",
                                [doc.name, item_id],
                            )
                            analysis = analyze_doc(
                                doc=doc, ocr_engine=ocr_engine, verbose=True
                            )
                            for i, page in enumerate(analysis["pages"]):
                                cur.execute(
                                    """
                                    insert into pages (
                                        doc,
                                        page,
                                        page_angle,
                                        sharpness,
                                        is_blank,
                                        text_margin_px
                                    ) values (?, ?, ?, ?, ?, ?)""",
                                    [
                                        doc.name,
                                        i + 1,
                                        page["page_angle"],
                                        page["sharpness"],
                                        page["blank"],
                                        page["text_margin_px"],
                                    ],
                                )
                        cur.execute(
                            "update items set analyzed_date = ? where id = ?",
                            # datetime.utcnow() is deprecated; an aware UTC
                            # timestamp formats identically here.
                            [
                                datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"),
                                item_id,
                            ],
                        )
                        conn.commit()
                        print("Done")
                        break
                    except Exception as err:
                        print(err, file=stderr)
                        traceback.print_tb(err.__traceback__, file=stderr)
                        # Discard this item's partial, uncommitted inserts;
                        # otherwise a retry would collide with them on the
                        # unique (doc, page) index.
                        conn.rollback()
                        # Back off briefly, then let the loop retry. After the
                        # final failed attempt the item stays un-analyzed and
                        # is picked up again on a later hourly pass.
                        sleep(15)
            sleep(3600)
def pull_new_item_ids(conn, earliest_review_date):
    """Fetch reviewed-item IDs from archive.org and upsert them into `items`.

    Resumes from the newest review_date already stored, falling back to
    `earliest_review_date` when the table is empty. Duplicate IDs are ignored
    via the insert's conflict clause.
    """
    cur = conn.cursor()
    row = cur.execute(
        "select review_date from items order by review_date desc limit 1"
    ).fetchone()
    latest_review_date = row[0] if row is not None else earliest_review_date
    print(latest_review_date)

    query = f"""
    collection:(microfiche)
    AND contributor:(Internet Archive)
    AND micro_review:(done)
    AND review_date:[{latest_review_date} TO null]
    """
    sort = "reviewdate asc"
    # Format for API: collapse every whitespace run into a "+" separator.
    query = "+".join(query.split())
    sort = "+".join(sort.split())

    # Archive.org has a paginated scraping API, but the query feature seems to
    # be broken in mysterious ways and more or less impossible to use for our
    # purposes.
    url = (
        f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}"
        "&fl[]=identifier&fl[]=review_date&rows=250000&output=json"
    )
    resp = requests.get(url)
    resp.raise_for_status()
    try:
        body = resp.json()
    except Exception:
        # Dump the raw body so malformed responses can be diagnosed.
        print("Body:", resp.text, file=stderr)
        raise
    for doc in body["response"]["docs"]:
        cur.execute(
            "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
            (doc["identifier"], doc["review_date"]),
        )
    conn.commit()
# Script entry point: run the analysis loop when executed directly.
if __name__ == "__main__":
    main()