2025-08-10 12:27:39 -07:00
|
|
|
import re
import sqlite3
import traceback
from argparse import ArgumentParser
from datetime import datetime
from sys import stderr
from time import sleep

import requests

from engine import analyze_item
|
2025-08-10 22:10:16 -07:00
|
|
|
|
|
|
|
|
|
2025-08-10 12:27:39 -07:00
|
|
|
def main() -> None:
    """Long-running worker: ingest reviewed item IDs and analyze their pages.

    Creates the SQLite schema if needed, then loops forever: pull newly
    reviewed item IDs into ``items``, analyze pending items with
    ``engine.analyze_item`` (per-page metrics go into ``pages``), stamp
    ``analyzed_date``, and sleep an hour before the next cycle.
    """
    parser = ArgumentParser()
    # Path to the SQLite database file (created on first run).
    parser.add_argument("--database", default="./microqa.db")
    # Degree of parallelism passed through to analyze_item().
    parser.add_argument("--cpus", type=int, default=2)
    # Floor for review_date when the items table is empty (YYYYMMDD).
    parser.add_argument("--earliest-review-date", default="20250701")
    args = parser.parse_args()

    # NOTE: the connection context manager commits/rolls back transactions;
    # it does not close the connection (acceptable here — main never returns).
    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        # One row per item; analyzed_date stays NULL until processed.
        cur.execute("""
            create table if not exists items (
                id text primary key not null,
                review_date text not null,
                skip_analysis bool not null,
                analyzed_date text
            )""")
        # One row per (item, page) with the per-page quality metrics.
        cur.execute("""
            create table if not exists pages (
                id int primary key,
                item text not null,
                page int not null,
                orientation_match boolean not null,
                sharpness real not null,
                is_blank boolean not null,
                text_margin_px int not null
            )""")
        cur.execute("create index if not exists review_date_idx on items (review_date)")
        cur.execute(
            "create index if not exists analyzed_date_idx on items (analyzed_date)"
        )
        cur.execute("create index if not exists item_idx on pages (item)")
        # Unique index also guards against double-inserting a page on retry.
        cur.execute(
            "create unique index if not exists item_page_idx on pages (item, page)"
        )
        conn.commit()

        while True:
            print("Pulling item IDs")
            pull_new_item_ids(conn, args.earliest_review_date)
            print("Done.")
            # Pending work: never-analyzed items, oldest reviews first.
            res = cur.execute("""
                select id
                from items
                where analyzed_date is null
                and skip_analysis = false
                order by review_date
                """)
            for (item_id,) in res.fetchall():
                N_ATTEMPTS = 3
                for _ in range(N_ATTEMPTS):
                    try:
                        print(f"Processing {item_id}")
                        analysis = analyze_item(
                            item_id, parallel=args.cpus, verbose=True
                        )
                        # Persist one row per page; pages are 1-indexed.
                        for i, page in enumerate(analysis["pages"]):
                            cur.execute(
                                """
                                insert into pages (
                                    item,
                                    page,
                                    orientation_match,
                                    sharpness,
                                    is_blank,
                                    text_margin_px
                                ) values (
                                    ?,
                                    ?,
                                    ?,
                                    ?,
                                    ?,
                                    ?
                                )""",
                                [
                                    item_id,
                                    i + 1,
                                    page["ocr_orientation_match"],
                                    page["sharpness"],
                                    page["blank"],
                                    page["text_margin_px"],
                                ],
                            )
                        # Mark the item done only after all pages inserted;
                        # commit makes pages + stamp atomic per item.
                        # NOTE(review): datetime.utcnow() is deprecated (naive
                        # datetime) — consider datetime.now(timezone.utc).
                        cur.execute(
                            "update items set analyzed_date = ? where id = ?",
                            [datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id],
                        )
                        conn.commit()
                        print("Done")
                        break  # success — stop retrying this item
                    except Exception as err:
                        # Log and back off before the next attempt; a partially
                        # inserted item is retried (unique page index prevents
                        # duplicates, though the insert itself may then fail).
                        print(err, file=stderr)
                        traceback.print_tb(err.__traceback__, file=stderr)
                        sleep(15)
                # NOTE(review): this break exits the item loop after a single
                # item, so at most one item is processed per hourly cycle —
                # looks like a leftover from debugging; confirm intent.
                break
            sleep(3600)  # wake up hourly to pull and process new items
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def pull_new_item_ids(conn, earliest_review_date):
    """Fetch newly reviewed item IDs from archive.org into the items table.

    Resumes from the newest ``review_date`` already stored (falling back to
    *earliest_review_date* when the table is empty), queries the advanced
    search API for reviewed microfiche items at or after that date, and
    inserts each (identifier, review_date) pair, ignoring duplicates.

    Args:
        conn: Open ``sqlite3`` connection with an ``items`` table.
        earliest_review_date: YYYYMMDD floor used on the first run.

    Raises:
        requests.HTTPError: on a non-2xx API response.
        requests.Timeout: if the API does not respond in time.
    """
    cur = conn.cursor()
    # Resume point: the most recent review_date we have already ingested.
    # The range query below is inclusive, so the newest known date is
    # re-fetched; the on-conflict clause makes that harmless.
    res = cur.execute("select review_date from items order by review_date desc limit 1")
    (latest_review_date,) = res.fetchone() or (earliest_review_date,)
    print(latest_review_date)

    query = f"""
    collection:(microfiche)
    AND contributor:(Internet Archive)
    AND micro_review:(done)
    AND review_date:[{latest_review_date} TO null]
    """
    sort = "reviewdate asc"

    # Format for API: collapse all whitespace runs into '+' separators,
    # which is how the advancedsearch endpoint expects spaces encoded.
    query = re.sub(r"\s+", "+", query.strip())
    sort = re.sub(r"\s+", "+", sort.strip())

    resp = requests.get(
        f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=250000&output=json",
        # Without a timeout a stalled connection would hang the worker
        # forever; 5 minutes is generous for a large result set.
        timeout=300,
    )
    resp.raise_for_status()
    try:
        body = resp.json()
    except Exception:
        # Dump the unparseable payload before propagating, so the log
        # shows what the API actually returned.
        print("Body:", resp.text, file=stderr)
        raise
    for doc in body["response"]["docs"]:
        cur.execute(
            "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
            (doc["identifier"], doc["review_date"]),
        )
    conn.commit()
|
2025-08-10 12:27:39 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the worker loop only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|