diff --git a/README.md b/README.md index 9fc65bd..4f8bea2 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,22 @@ a single line so that items are summarized in parallel): pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq ``` +Query a pre-populated database for suspect pages: + +```sql +select 'https://archive.org/details/' || items.id, + pages.page, + pages.orientation_match, + pages.sharpness, + pages.text_margin_px +from items + join pages on pages.item = items.id +where pages.orientation_match = 0 + or pages.sharpness < 0.07 + or (pages.text_margin_px > -1 and pages.text_margin_px < 50) +order by items.id; +``` + ## Test Cases - Blurry pages: `micro_IA40244209_0984` diff --git a/cache.py b/cache.py new file mode 100644 index 0000000..49a2b32 --- /dev/null +++ b/cache.py @@ -0,0 +1,134 @@ +import re +import sqlite3 +import traceback +from argparse import ArgumentParser +from datetime import datetime +from time import sleep + +import requests + +from main import analyze_item + + +def main(): + parser = ArgumentParser() + parser.add_argument("--database", default="./microqa.db") + parser.add_argument("--cpus", type=int, default=2) + parser.add_argument("--earliest-review-date", default="20250701") + args = parser.parse_args() + + with sqlite3.connect(args.database) as conn: + cur = conn.cursor() + cur.execute(""" +create table if not exists items ( + id text primary key not null, + review_date text not null, + analyzed_date text +)""") + cur.execute(""" +create table if not exists pages ( + id int primary key, + item text not null, + page int not null, + orientation_match boolean not null, + sharpness real not null, + is_blank boolean not null, + text_margin_px int not null +)""") + conn.commit() + + while True: + print("Pulling item IDs") + pull_new_item_ids(conn, args.earliest_review_date) + print("Done.") + res = cur.execute( + "select id from items where analyzed_date is null order by review_date" + ) + for (item_id,) in res.fetchall(): + 
N_ATTEMPTS = 3 + for _ in range(N_ATTEMPTS): + try: + print(f"Processing {item_id}") + analysis = analyze_item( + item_id, parallel=args.cpus, verbose=True + ) + for i, page in enumerate(analysis["pages"]): + cur.execute( + """ +insert into pages ( + item, + page, + orientation_match, + sharpness, + is_blank, + text_margin_px +) values ( + ?, + ?, + ?, + ?, + ?, + ? + )""", + [ + item_id, + i + 1, + page["ocr_orientation_match"], + page["sharpness"], + page["blank"], + page["text_margin_px"], + ], + ) + cur.execute( + "update items set analyzed_date = ? where id = ?", + [datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id], + ) + conn.commit() + print("Done") + break + except Exception as err: + print(err) + traceback.print_tb(err.__traceback__) + sleep(15) + break + sleep(3600) + + +def pull_new_item_ids(conn, earliest_review_date): + cur = conn.cursor() + res = cur.execute("select review_date from items order by review_date desc limit 1") + (latest_review_date,) = res.fetchone() or (earliest_review_date,) + print(latest_review_date) + + query = f""" + collection:(microfiche) + AND contributor:(Internet Archive) + AND micro_review:(done) + AND review_date:[{latest_review_date} TO null] + """ + sort = "reviewdate asc" + + # Format for API. + query = re.sub(r"\s+", "+", query.strip()) + sort = re.sub(r"\s+", "+", sort.strip()) + + for i in range(1, 999): + resp = requests.get( + f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=100&page={i}&output=json", + ) + resp.raise_for_status() + body = resp.json() + if len(body["response"]["docs"]) == 0: + break + cur.executemany( + "insert into items (id, review_date) values (?, ?) 
on conflict do nothing", + [ + (doc["identifier"], doc["review_date"]) + for doc in body["response"]["docs"] + ], + ) + conn.commit() + + +if __name__ == "__main__": + main() diff --git a/main.py b/main.py index d2d77ae..85178a5 100644 --- a/main.py +++ b/main.py @@ -16,7 +16,6 @@ from PIL import Image, ImageFilter OCR_LANGS = "eng+fra" -N_OCR_PROCESSES = 4 def main(): @@ -77,9 +76,7 @@ def _summarize_item_to_stdout(task): print(f"Summarizing item {item_id}...", file=stderr) stderr.flush() - analysis = analyze_item( - item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose - ) + analysis = analyze_item(item_id, parallel=4, verbose=verbose) # 3 or more blank pages in a row is a flag. CONSECUTIVE_BLANKS_THRESHOLD = 3 @@ -124,11 +121,10 @@ def _summarize_item_to_stdout(task): if not page["ocr_orientation_match"] ] - WORDS_NEAR_EDGE_THRESHOLD = 2 check_crop = [ i + 1 for i, page in enumerate(analysis["pages"]) - if page["words_near_edge"] > WORDS_NEAR_EDGE_THRESHOLD + if page["text_margin_px"] < page_margin_px ] if check_orientation or check_crop or consecutive_blanks or consecutive_blurry: @@ -152,20 +148,13 @@ def _summarize_item_to_stdout(task): def _analyze_item_to_stdout(task): item_id = task.item_id - page_margin_px = task.page_margin_px verbose = task.verbose if verbose: print(f"Analyzing item {item_id}...", file=stderr) stderr.flush() - print( - json.dumps( - analyze_item( - item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose - ) - ) - ) + print(json.dumps(analyze_item(item_id, parallel=4, verbose=verbose))) stdout.flush() if verbose: @@ -177,14 +166,12 @@ def _analyze_item_to_stdout(task): class PageAnalysisTask: im: Image.Image page_index: int - page_margin_px: int file_name: str def _analyze_page(task): im_original = task.im page_index = task.page_index - page_margin_px = task.page_margin_px file_name = task.file_name im_cropped = im_original.crop( @@ -201,7 +188,7 @@ def _analyze_page(task): if is_blank: max_sharpness = 1 
ocr_orientation_match = True - words_near_edge = 0 + text_margin_px = -1 else: max_sharpness = 0.0 if im_cropped.size[0] < im_cropped.size[1]: @@ -262,19 +249,26 @@ def _analyze_page(task): if best_ocr_orientation % 2 == 0 else (im_original.size[1], im_original.size[0]) ) - words_near_edge = best_ocr_words[ - (best_ocr_words["left"] < page_margin_px) - | (best_ocr_words["top"] < page_margin_px) - | ( - best_ocr_words["left"] + best_ocr_words["width"] - > best_ocr_dims[0] - page_margin_px + + word_margins_all_directions = np.sort( + np.concatenate( + ( + best_ocr_words["left"].to_numpy(), + best_ocr_words["top"].to_numpy(), + best_ocr_dims[0] + - (best_ocr_words["left"] + best_ocr_words["width"]).to_numpy(), + best_ocr_dims[1] + - (best_ocr_words["top"] + best_ocr_words["height"]).to_numpy(), + ) ) - | ( - best_ocr_words["top"] + best_ocr_words["height"] - > best_ocr_dims[1] - page_margin_px - ) - ] - words_near_edge = words_near_edge.shape[0] + ) + # Skip the n closest words to the edge, to help ignore stray OCR artifacts. 
+ SKIP_WORDS = 2 + text_margin_px = ( + int(word_margins_all_directions[SKIP_WORDS]) + if word_margins_all_directions.shape[0] > SKIP_WORDS + else -1 + ) return { "blank": is_blank, @@ -283,11 +277,11 @@ def _analyze_page(task): "page_index": page_index, "size": im_original.size, "sharpness": max_sharpness, - "words_near_edge": words_near_edge, + "text_margin_px": text_margin_px, } -def analyze_item(item_id, page_margin_px, parallel=False, verbose=False): +def analyze_item(item_id, parallel=1, verbose=False): escaped_item_id = urllib.parse.quote(item_id, safe="") if verbose: @@ -326,7 +320,6 @@ def analyze_item(item_id, page_margin_px, parallel=False, verbose=False): PageAnalysisTask( im=im, page_index=page_index, - page_margin_px=page_margin_px, file_name=file_name, ) ) @@ -334,9 +327,9 @@ def analyze_item(item_id, page_margin_px, parallel=False, verbose=False): if verbose: print(f"Processing {len(page_nums)} pages...", file=stderr) stderr.flush() - if parallel: + if parallel > 1: # Parallelize image processing and OCR of pages across up to n cores. - with Pool(N_OCR_PROCESSES) as pool: + with Pool(parallel) as pool: return {"pages": pool.map(_analyze_page, tasks)} return {"pages": [_analyze_page(task) for task in tasks]}