MicroQA/cache.py

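"""Cache builder for MicroQA.

Maintains a local SQLite cache of reviewed microfiche items pulled from the
archive.org advanced-search API, runs analyze_item (from main.py) over every
item that has not been analyzed yet, and stores per-page quality metrics.
Runs forever, re-checking for newly reviewed items every hour.

Usage:
    python cache.py --database ./microqa.db --cpus 2 --earliest-review-date 20250701
"""
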
import re
import sqlite3
import traceback
from argparse import ArgumentParser
from datetime import datetime, timezone
from time import sleep

import requests

from main import analyze_item


def main():
    parser = ArgumentParser()
    parser.add_argument("--database", default="./microqa.db")
    parser.add_argument("--cpus", type=int, default=2)
    parser.add_argument("--earliest-review-date", default="20250701")
    args = parser.parse_args()
    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        # items tracks each reviewed item and when (if ever) it was analyzed.
        cur.execute("""
            create table if not exists items (
                id text primary key not null,
                review_date text not null,
                analyzed_date text
            )""")
        # pages holds one row of quality metrics per analyzed page.
        # "integer primary key" (not "int") is required for SQLite to treat
        # the column as a rowid alias and auto-assign ids on insert.
        cur.execute("""
            create table if not exists pages (
                id integer primary key,
                item text not null,
                page int not null,
                orientation_match boolean not null,
                sharpness real not null,
                is_blank boolean not null,
                text_margin_px int not null
            )""")
        conn.commit()
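        # Poll forever: refresh the ID cache, analyze anything new, sleep an hour.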
        while True:
            print("Pulling item IDs")
            pull_new_item_ids(conn, args.earliest_review_date)
            print("Done.")
            res = cur.execute(
                "select id from items where analyzed_date is null order by review_date"
            )
            for (item_id,) in res.fetchall():
                # Retry transient failures a few times; if every attempt fails,
                # move on and pick the item up again on the next hourly pass.
                N_ATTEMPTS = 3
                for _ in range(N_ATTEMPTS):
                    try:
                        print(f"Processing {item_id}")
                        analysis = analyze_item(
                            item_id, parallel=args.cpus, verbose=True
                        )
                        for i, page in enumerate(analysis["pages"]):
                            cur.execute(
                                """
                                insert into pages (
                                    item,
                                    page,
                                    orientation_match,
                                    sharpness,
                                    is_blank,
                                    text_margin_px
                                ) values (?, ?, ?, ?, ?, ?)""",
                                [
                                    item_id,
                                    i + 1,
                                    page["ocr_orientation_match"],
                                    page["sharpness"],
                                    page["blank"],
                                    page["text_margin_px"],
                                ],
                            )
                        cur.execute(
                            "update items set analyzed_date = ? where id = ?",
                            [
                                datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"),
                                item_id,
                            ],
                        )
                        conn.commit()
                        print("Done")
                        break
                    except Exception as err:
                        print(err)
                        traceback.print_tb(err.__traceback__)
                        sleep(15)
            sleep(3600)


def pull_new_item_ids(conn, earliest_review_date):
    cur = conn.cursor()
    # Resume from the newest review_date already cached, falling back to the
    # configured earliest date on a fresh database.
    res = cur.execute("select review_date from items order by review_date desc limit 1")
    (latest_review_date,) = res.fetchone() or (earliest_review_date,)
    print(f"Fetching items reviewed since {latest_review_date}")
    query = f"""
        collection:(microfiche)
        AND contributor:(Internet Archive)
        AND micro_review:(done)
        AND review_date:[{latest_review_date} TO null]
    """
    sort = "reviewdate asc"
    # Collapse whitespace to "+" so the query can be embedded in the URL.
    query = re.sub(r"\s+", "+", query.strip())
    sort = re.sub(r"\s+", "+", sort.strip())
    for i in range(1, 999):  # hard upper bound on pagination as a safety stop
        resp = requests.get(
            f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=100&page={i}&output=json",
            timeout=60,  # don't hang forever on a stalled connection
        )
        resp.raise_for_status()
        body = resp.json()
        if len(body["response"]["docs"]) == 0:
            break
        cur.executemany(
            "insert into items (id, review_date) values (?, ?) on conflict do nothing",
            [
                (doc["identifier"], doc["review_date"])
                for doc in body["response"]["docs"]
            ],
        )
        conn.commit()


if __name__ == "__main__":
    main()