# MicroQA/main.py

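"""Long-running QA worker: polls archive.org for microfiche items whose
micro-review is done, runs page-level analysis on each via
engine.analyze_item, and records the per-page metrics in a local SQLite
database."""
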
import re
import sqlite3
import traceback
from argparse import ArgumentParser
from datetime import datetime, timezone
from sys import stderr
from time import sleep

import requests

from engine import analyze_item


def main():
    parser = ArgumentParser()
    parser.add_argument("--database", default="./microqa.db")
    parser.add_argument("--cpus", type=int, default=2)
    parser.add_argument("--earliest-review-date", default="20250701")
    args = parser.parse_args()
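    # Schema: one row per item plus one row per analyzed page; the indexes
    # back the resume lookup (review_date) and the work queue (analyzed_date).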
    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        cur.execute("""
            create table if not exists items (
                id text primary key not null,
                review_date text not null,
                skip_analysis boolean not null,
                analyzed_date text
            )""")
        cur.execute("""
            create table if not exists pages (
                -- "integer primary key" (not "int") so the column aliases
                -- SQLite's rowid and ids are auto-assigned on insert
                id integer primary key,
                item text not null,
                page int not null,
                orientation_match boolean not null,
                sharpness real not null,
                is_blank boolean not null,
                text_margin_px int not null
            )""")
        cur.execute("create index if not exists review_date_idx on items (review_date)")
        cur.execute(
            "create index if not exists analyzed_date_idx on items (analyzed_date)"
        )
        cur.execute("create index if not exists item_idx on pages (item)")
        cur.execute(
            "create unique index if not exists item_page_idx on pages (item, page)"
        )
        conn.commit()
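        # Work loop: refresh the queue from archive.org, analyze unprocessed
        # items oldest review first, then sleep an hour and repeat.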
        while True:
            print("Pulling item IDs")
            pull_new_item_ids(conn, args.earliest_review_date)
            print("Done.")
            res = cur.execute("""
                select id
                from items
                where analyzed_date is null
                and skip_analysis = false
                order by review_date
            """)
            N_ATTEMPTS = 3
            for (item_id,) in res.fetchall():
                # Retry each item up to N_ATTEMPTS times, pausing between
                # attempts, before moving on to the next one.
                for _ in range(N_ATTEMPTS):
                    try:
                        print(f"Processing {item_id}")
                        analysis = analyze_item(
                            item_id, parallel=args.cpus, verbose=True
                        )
                        # Page numbers are stored 1-based.
                        for i, page in enumerate(analysis["pages"]):
                            cur.execute(
                                """
                                insert into pages (
                                    item, page, orientation_match,
                                    sharpness, is_blank, text_margin_px
                                ) values (?, ?, ?, ?, ?, ?)
                                """,
                                [
                                    item_id,
                                    i + 1,
                                    page["ocr_orientation_match"],
                                    page["sharpness"],
                                    page["blank"],
                                    page["text_margin_px"],
                                ],
                            )
                        cur.execute(
                            "update items set analyzed_date = ? where id = ?",
                            [datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"), item_id],
                        )
                        conn.commit()
                        print("Done")
                        break
                    except Exception as err:
                        print(err, file=stderr)
                        traceback.print_tb(err.__traceback__, file=stderr)
                        sleep(15)
            sleep(3600)

def pull_new_item_ids(conn, earliest_review_date):
    """Pull IDs of newly reviewed items from archive.org into the items
    table, resuming from the most recent review_date already stored (or from
    earliest_review_date on a fresh database)."""
    cur = conn.cursor()
    res = cur.execute("select review_date from items order by review_date desc limit 1")
    (latest_review_date,) = res.fetchone() or (earliest_review_date,)
    print(f"Fetching items reviewed since {latest_review_date}")
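    # The range below is inclusive of latest_review_date, so the newest
    # already-stored items are re-fetched; the insert's on-conflict clause
    # makes those duplicates harmless.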
query = f"""
collection:(microfiche)
AND contributor:(Internet Archive)
AND micro_review:(done)
AND review_date:[{latest_review_date} TO null]
"""
sort = "reviewdate asc"
# Format for API.
query = re.sub(r"\s+", "+", query.strip())
sort = re.sub(r"\s+", "+", sort.strip())
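    # Alternative approach via the scrape API with cursor-based pagination: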
    # params = {
    #     "q": query,
    #     "count": 100,
    #     "fields": "identifier,review_date",
    #     "sorts": sort,
    # }
    # for i in range(1, 999):
    #     resp = requests.get(
    #         "https://archive.org/services/search/v1/scrape",
    #         params=params,
    #     )
    #     resp.raise_for_status()
    #     print(resp.text)
    #     try:
    #         body = resp.json()
    #     except Exception as err:
    #         print("Body:", resp.text, file=stderr)
    #         raise err
    #     for doc in body["items"]:
    #         cur.execute(
    #             "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
    #             (doc["identifier"], doc["review_date"]),
    #         )
    #     conn.commit()
    #     cursor = body.get("cursor", None)
    #     if cursor is None:
    #         break
    #     params = params.copy()
    #     params["cursor"] = cursor
    resp = requests.get(
        f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=250000&output=json",
        timeout=60,  # don't let a hung connection stall the worker forever
    )
    resp.raise_for_status()
    try:
        body = resp.json()
    except Exception:
        # Dump the raw body so malformed (non-JSON) responses can be debugged.
        print("Body:", resp.text, file=stderr)
        raise
    for doc in body["response"]["docs"]:
        cur.execute(
            "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
            (doc["identifier"], doc["review_date"]),
        )
    conn.commit()


if __name__ == "__main__":
    main()
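
# Typical invocation (values shown are the defaults):
#   python main.py --database ./microqa.db --cpus 2 --earliest-review-date 20250701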