"""Polling daemon that analyzes reviewed microfiche items from archive.org.

Each pass pulls newly reviewed item IDs via the advancedsearch API, then runs
every un-analyzed item's documents through ``engine.analyze_doc`` and stores
per-page quality metrics (orientation, sharpness, blankness, text margin) in a
local SQLite database.
"""

import re
import sqlite3
import traceback
from argparse import ArgumentParser
from datetime import datetime, timezone
from sys import stderr
from time import sleep

import requests

from archive_item import fetch_item
from engine import analyze_doc

# Retry policy for a single item: transient network/decode failures are common
# when fetching from archive.org, so try a few times with a short pause.
N_ATTEMPTS = 3
RETRY_DELAY_S = 15
# Seconds to wait between full scan passes.
PASS_DELAY_S = 3600


def _init_schema(cur):
    """Create the output tables and indexes if they do not already exist."""
    cur.execute("""
        create table if not exists items (
            id text primary key not null,
            review_date text not null,
            skip_analysis bool not null,
            analyzed_date text
        )""")
    cur.execute("""
        create table if not exists docs (
            name text primary key not null,
            item text not null
        )""")
    cur.execute("""
        create table if not exists pages (
            id int primary key,
            doc text not null,
            page int not null,
            orientation_match boolean not null,
            sharpness real not null,
            is_blank boolean not null,
            text_margin_px int not null
        )""")
    cur.execute("create index if not exists review_date_idx on items (review_date)")
    cur.execute("create index if not exists analyzed_date_idx on items (analyzed_date)")
    cur.execute("create index if not exists item_idx on docs (item)")
    cur.execute("create index if not exists doc_idx on pages (doc)")
    cur.execute("create unique index if not exists doc_page_idx on pages (doc, page)")


def _analyze_item(conn, cur, item_id, cpus):
    """Fetch one item, analyze all of its docs, and mark it analyzed.

    Commits on success; raises whatever ``fetch_item``/``analyze_doc`` raise
    so the caller can retry. Page inserts are idempotent (``on conflict do
    nothing`` against the unique (doc, page) index), so a partially analyzed
    item can safely be re-run.
    """
    item = fetch_item(item_id)
    # When an item has multiple docs, drop the unnamed one — presumably a
    # container/derivative rather than real content (TODO confirm against
    # archive_item.fetch_item).
    if len(item.docs) > 1:
        docs_to_analyze = [doc for doc in item.docs if doc.name != ""]
    else:
        docs_to_analyze = item.docs

    for doc in docs_to_analyze:
        cur.execute(
            "insert into docs (name, item) values (?, ?) on conflict do nothing",
            [doc.name, item_id],
        )
        analysis = analyze_doc(doc, parallel=cpus, verbose=True)
        for i, page in enumerate(analysis["pages"]):
            cur.execute(
                """
                insert into pages (
                    doc, page, orientation_match, sharpness, is_blank,
                    text_margin_px
                ) values (?, ?, ?, ?, ?, ?)
                on conflict (doc, page) do nothing""",
                [
                    doc.name,
                    i + 1,  # pages are stored 1-indexed
                    page["ocr_orientation_match"],
                    page["sharpness"],
                    page["blank"],
                    page["text_margin_px"],
                ],
            )

    cur.execute(
        "update items set analyzed_date = ? where id = ?",
        # Same string utcnow() produced, without the deprecated naive API.
        [datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"), item_id],
    )
    conn.commit()


def main():
    """Parse arguments, ensure the schema, then scan and analyze forever."""
    parser = ArgumentParser()
    parser.add_argument(
        "--database",
        help="path to sqlite database for analysis output",
        default="./microqa.db",
    )
    parser.add_argument(
        "--cpus",
        type=int,
        help="number of concurrent subprocesses to use; higher is generally faster but consumes more resources",
        default=2,
    )
    parser.add_argument(
        "--earliest-review-date",
        help="script will attempt to analyze all items with a review date greater than or equal to this value (YYYYMMDD)",
        default="20250701",
    )
    args = parser.parse_args()

    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        _init_schema(cur)
        conn.commit()

        while True:
            print("Pulling item IDs")
            pull_new_item_ids(conn, args.earliest_review_date)
            print("Done.")

            res = cur.execute("""
                select id from items
                where analyzed_date is null and skip_analysis = false
                order by review_date
            """)
            for (item_id,) in res.fetchall():
                for attempt in range(N_ATTEMPTS):
                    try:
                        print(f"Processing {item_id}")
                        _analyze_item(conn, cur, item_id, args.cpus)
                        print("Done")
                        break
                    except Exception as err:
                        # Log and retry this item; a failure on one item must
                        # not abort the rest of the batch.
                        print(err, file=stderr)
                        traceback.print_tb(err.__traceback__, file=stderr)
                        sleep(RETRY_DELAY_S)
            sleep(PASS_DELAY_S)


def pull_new_item_ids(conn, earliest_review_date):
    """Insert into ``items`` every reviewed item at or after the cutoff date.

    Resumes from the newest ``review_date`` already present in the database,
    falling back to ``earliest_review_date`` on a fresh database. Existing
    rows are left untouched (``on conflict do nothing``).
    """
    cur = conn.cursor()
    res = cur.execute("select review_date from items order by review_date desc limit 1")
    (latest_review_date,) = res.fetchone() or (earliest_review_date,)
    print(latest_review_date)

    query = f"""
        collection:(microfiche)
        AND contributor:(Internet Archive)
        AND micro_review:(done)
        AND review_date:[{latest_review_date} TO null]
    """
    # NOTE(review): the sort field is "reviewdate" (no underscore) while the
    # query filters on "review_date" — looks intentional for the archive.org
    # index schema, but worth confirming.
    sort = "reviewdate asc"

    # Format for API: collapse whitespace to '+' separators.
    query = re.sub(r"\s+", "+", query.strip())
    sort = re.sub(r"\s+", "+", sort.strip())

    # Archive.org has a paginated scraping API, but the query feature seems to
    # be broken in mysterious ways and more or less impossible to use for our
    # purposes.
    resp = requests.get(
        f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=250000&output=json",
    )
    resp.raise_for_status()
    try:
        body = resp.json()
    except Exception:
        # Dump the unparseable body to aid debugging, then re-raise with the
        # original traceback intact.
        print("Body:", resp.text, file=stderr)
        raise

    for doc in body["response"]["docs"]:
        cur.execute(
            "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
            (doc["identifier"], doc["review_date"]),
        )
    conn.commit()


if __name__ == "__main__":
    main()