"""Daemon that polls archive.org for reviewed microfiche items and runs
page-level image analysis on each, recording results in a local SQLite DB."""

import re
import sqlite3
import traceback
from argparse import ArgumentParser
from datetime import datetime, timezone
from sys import stderr
from time import sleep

import requests

from engine import analyze_item

# How many times to attempt analysis of a single item before giving up
# until the next hourly pass.
N_ATTEMPTS = 3


def main():
    """Run forever: pull new item IDs hourly, analyze any unanalyzed items."""
    parser = ArgumentParser()
    parser.add_argument("--database", default="./microqa.db")
    parser.add_argument("--cpus", type=int, default=2)
    parser.add_argument("--earliest-review-date", default="20250701")
    args = parser.parse_args()

    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        # Schema: one row per item, one row per analyzed page.
        cur.execute("""
            create table if not exists items (
                id text primary key not null,
                review_date text not null,
                skip_analysis bool not null,
                analyzed_date text
            )""")
        cur.execute("""
            create table if not exists pages (
                id int primary key,
                item text not null,
                page int not null,
                orientation_match boolean not null,
                sharpness real not null,
                is_blank boolean not null,
                text_margin_px int not null
            )""")
        cur.execute("create index if not exists review_date_idx on items (review_date)")
        cur.execute(
            "create index if not exists analyzed_date_idx on items (analyzed_date)"
        )
        cur.execute("create index if not exists item_idx on pages (item)")
        cur.execute(
            "create unique index if not exists item_page_idx on pages (item, page)"
        )
        conn.commit()

        while True:
            print("Pulling item IDs")
            pull_new_item_ids(conn, args.earliest_review_date)
            print("Done.")

            res = cur.execute("""
                select id from items
                where analyzed_date is null and skip_analysis = false
                order by review_date
            """)
            for (item_id,) in res.fetchall():
                for _ in range(N_ATTEMPTS):
                    try:
                        print(f"Processing {item_id}")
                        analysis = analyze_item(
                            item_id, parallel=args.cpus, verbose=True
                        )
                        # Pages are 1-indexed in the DB.
                        for i, page in enumerate(analysis["pages"]):
                            cur.execute(
                                """
                                insert into pages (
                                    item, page, orientation_match, sharpness,
                                    is_blank, text_margin_px
                                ) values (
                                    ?, ?, ?, ?, ?, ?
                                )""",
                                [
                                    item_id,
                                    i + 1,
                                    page["ocr_orientation_match"],
                                    page["sharpness"],
                                    page["blank"],
                                    page["text_margin_px"],
                                ],
                            )
                        cur.execute(
                            "update items set analyzed_date = ? where id = ?",
                            [
                                datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"),
                                item_id,
                            ],
                        )
                        # Pages and the item's analyzed_date commit atomically.
                        conn.commit()
                        print("Done")
                        break
                    except Exception as err:
                        print(err, file=stderr)
                        traceback.print_tb(err.__traceback__, file=stderr)
                        # Discard any partially inserted pages; otherwise the
                        # retry would collide with the unique (item, page)
                        # index and could never succeed.
                        conn.rollback()
                        sleep(15)
            sleep(3600)


def pull_new_item_ids(conn, earliest_review_date):
    """Fetch identifiers of newly reviewed items from archive.org and insert
    them into ``items`` (ignoring duplicates).

    Resumes from the latest review_date already stored, falling back to
    ``earliest_review_date`` for an empty database.
    """
    cur = conn.cursor()
    res = cur.execute(
        "select review_date from items order by review_date desc limit 1"
    )
    (latest_review_date,) = res.fetchone() or (earliest_review_date,)
    print(latest_review_date)

    query = f"""
        collection:(microfiche) AND
        contributor:(Internet Archive) AND
        micro_review:(done) AND
        review_date:[{latest_review_date} TO null]
    """
    # TODO(review): the sort key is "reviewdate" while the queried field is
    # "review_date" — confirm against the advancedsearch API that this is not
    # a typo.
    sort = "reviewdate asc"

    # Format for the API: collapse all whitespace runs to '+'.
    query = re.sub(r"\s+", "+", query.strip())
    sort = re.sub(r"\s+", "+", sort.strip())

    # NOTE: an earlier implementation paged through the
    # /services/search/v1/scrape endpoint; it was abandoned in favor of a
    # single large advancedsearch request.
    resp = requests.get(
        f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=250000&output=json",
        timeout=600,  # don't let a hung request stall the daemon forever
    )
    resp.raise_for_status()
    try:
        body = resp.json()
    except Exception:
        # Surface the raw body for debugging malformed responses.
        print("Body:", resp.text, file=stderr)
        raise
    for doc in body["response"]["docs"]:
        cur.execute(
            "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
            (doc["identifier"], doc["review_date"]),
        )
    conn.commit()


if __name__ == "__main__":
    main()