import re
import sqlite3
import traceback
from argparse import ArgumentParser
from datetime import datetime, timezone
from time import sleep

import requests

from main import analyze_item

# How many times to attempt analyzing one item before giving up for this
# cycle (a failed item keeps analyzed_date NULL and is retried next cycle).
N_ATTEMPTS = 3


def main():
    """Poll archive.org for reviewed microfiche items and analyze their pages.

    Runs forever: each cycle pulls newly-reviewed item IDs into the `items`
    table, analyzes every item without an `analyzed_date`, stores per-page
    quality metrics in `pages`, then sleeps an hour.
    """
    parser = ArgumentParser()
    parser.add_argument("--database", default="./microqa.db")
    parser.add_argument("--cpus", type=int, default=2)
    parser.add_argument("--earliest-review-date", default="20250701")
    args = parser.parse_args()

    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        cur.execute("""
            create table if not exists items (
                id text primary key not null,
                review_date text not null,
                analyzed_date text
            )""")
        cur.execute("""
            create table if not exists pages (
                id int primary key,
                item text not null,
                page int not null,
                orientation_match boolean not null,
                sharpness real not null,
                is_blank boolean not null,
                text_margin_px int not null
            )""")
        conn.commit()

        while True:
            print("Pulling item IDs")
            pull_new_item_ids(conn, args.earliest_review_date)
            print("Done.")

            res = cur.execute(
                "select id from items where analyzed_date is null order by review_date"
            )
            for (item_id,) in res.fetchall():
                # BUG FIX: the original had an unconditional `break` at the
                # bottom of this loop, so a failed item was never actually
                # retried despite N_ATTEMPTS = 3.
                for _ in range(N_ATTEMPTS):
                    try:
                        print(f"Processing {item_id}")
                        analysis = analyze_item(
                            item_id, parallel=args.cpus, verbose=True
                        )
                        for i, page in enumerate(analysis["pages"]):
                            cur.execute(
                                """
                                insert into pages (
                                    item, page, orientation_match, sharpness,
                                    is_blank, text_margin_px
                                ) values ( ?, ?, ?, ?, ?, ? )""",
                                [
                                    item_id,
                                    i + 1,  # pages are stored 1-indexed
                                    page["ocr_orientation_match"],
                                    page["sharpness"],
                                    page["blank"],
                                    page["text_margin_px"],
                                ],
                            )
                        cur.execute(
                            "update items set analyzed_date = ? where id = ?",
                            [
                                # datetime.utcnow() is deprecated; this
                                # produces the same "%Y%m%d%H%M%S" string.
                                datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"),
                                item_id,
                            ],
                        )
                        conn.commit()
                        print("Done")
                        break
                    except Exception as err:
                        print(err)
                        traceback.print_tb(err.__traceback__)
                        # Discard any partially-inserted pages so a retry
                        # does not produce duplicate rows for this item
                        # (pages has no uniqueness constraint on item/page).
                        conn.rollback()
                        sleep(15)
            sleep(3600)  # poll for newly-reviewed items once an hour


def pull_new_item_ids(conn, earliest_review_date):
    """Fetch newly-reviewed microfiche item IDs into the `items` table.

    Resumes from the most recent review_date already stored (falling back to
    *earliest_review_date*), pages through the archive.org advanced-search
    API, and inserts (id, review_date) rows, ignoring duplicates.
    """
    cur = conn.cursor()
    res = cur.execute("select review_date from items order by review_date desc limit 1")
    (latest_review_date,) = res.fetchone() or (earliest_review_date,)
    print(latest_review_date)

    # The date range is inclusive, so the newest already-stored item is
    # fetched again each cycle; "on conflict do nothing" makes that harmless.
    query = f"""
        collection:(microfiche) AND
        contributor:(Internet Archive) AND
        micro_review:(done) AND
        review_date:[{latest_review_date} TO null]
    """
    # NOTE(review): sort key "reviewdate" differs from the field name
    # "review_date" used above — presumably the API's sort-index name, but
    # confirm against the advancedsearch API.
    sort = "reviewdate asc"

    # Format for API: collapse all whitespace runs into "+" separators.
    query = re.sub(r"\s+", "+", query.strip())
    sort = re.sub(r"\s+", "+", sort.strip())

    for i in range(1, 999):  # hard upper bound on pagination as a safety stop
        resp = requests.get(
            f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=100&page={i}&output=json",
        )
        resp.raise_for_status()
        body = resp.json()
        if len(body["response"]["docs"]) == 0:
            break  # past the last page of results
        cur.executemany(
            "insert into items (id, review_date) values (?, ?) on conflict do nothing",
            [
                (doc["identifier"], doc["review_date"])
                for doc in body["response"]["docs"]
            ],
        )
        conn.commit()


if __name__ == "__main__":
    main()