From 4d9161b04301e9d0332ec1b4327c656ba38d3ec1 Mon Sep 17 00:00:00 2001 From: Brent Schroeter Date: Sat, 4 Oct 2025 15:09:16 -0700 Subject: [PATCH] rewrite to engine.py --- .gitignore | 2 + README.md | 1 + cache.py | 134 --------------- engine.py | 239 ++++++++++++++++++++++++++ main.py | 494 ++++++++++++++++------------------------------------- one_off.py | 159 +++++++++++++++++ 6 files changed, 550 insertions(+), 479 deletions(-) delete mode 100644 cache.py create mode 100644 engine.py create mode 100644 one_off.py diff --git a/.gitignore b/.gitignore index a727c0a..327445f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ /target /data +*.db +__pycache__ diff --git a/README.md b/README.md index 4f8bea2..ccb4295 100644 --- a/README.md +++ b/README.md @@ -38,3 +38,4 @@ order by items.id; - Blurry pages: `micro_IA40244209_0984` - Contrast, page orientation: `micro_IA40244211_2290` - Crop, low quality fiche: `micro_IA40386420_0689` +- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages: `micro_IA40386007_0012` diff --git a/cache.py b/cache.py deleted file mode 100644 index 49a2b32..0000000 --- a/cache.py +++ /dev/null @@ -1,134 +0,0 @@ -import re -import sqlite3 -import traceback -from argparse import ArgumentParser -from datetime import datetime -from time import sleep - -import requests - -from main import analyze_item - - -def main(): - parser = ArgumentParser() - parser.add_argument("--database", default="./microqa.db") - parser.add_argument("--cpus", type=int, default=2) - parser.add_argument("--earliest-review-date", default="20250701") - args = parser.parse_args() - - with sqlite3.connect(args.database) as conn: - cur = conn.cursor() - cur.execute(""" -create table if not exists items ( - id text primary key not null, - review_date text not null, - analyzed_date text -)""") - cur.execute(""" -create table if not exists pages ( - id int primary key, - item text not null, - page int not null, - orientation_match boolean not null, - 
sharpness real not null, - is_blank boolean not null, - text_margin_px int not null -)""") - conn.commit() - - while True: - print("Pulling item IDs") - pull_new_item_ids(conn, args.earliest_review_date) - print("Done.") - res = cur.execute( - "select id from items where analyzed_date is null order by review_date" - ) - for (item_id,) in res.fetchall(): - N_ATTEMPTS = 3 - for _ in range(N_ATTEMPTS): - try: - print(f"Processing {item_id}") - analysis = analyze_item( - item_id, parallel=args.cpus, verbose=True - ) - for i, page in enumerate(analysis["pages"]): - cur.execute( - """ -insert into pages ( - item, - page, - orientation_match, - sharpness, - is_blank, - text_margin_px -) values ( - ?, - ?, - ?, - ?, - ?, - ? - )""", - [ - item_id, - i + 1, - page["ocr_orientation_match"], - page["sharpness"], - page["blank"], - page["text_margin_px"], - ], - ) - cur.execute( - "update items set analyzed_date = ? where id = ?", - [datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id], - ) - conn.commit() - print("Done") - break - except Exception as err: - print(err) - traceback.print_tb(err.__traceback__) - sleep(15) - break - sleep(3600) - - -def pull_new_item_ids(conn, earliest_review_date): - cur = conn.cursor() - res = cur.execute("select review_date from items order by review_date desc limit 1") - (latest_review_date,) = res.fetchone() or (earliest_review_date,) - print(latest_review_date) - - query = f""" - collection:(microfiche) - AND contributor:(Internet Archive) - AND micro_review:(done) - AND review_date:[{latest_review_date} TO null] - """ - sort = "reviewdate asc" - - # Format for API. 
- query = re.sub(r"\s+", "+", query.strip()) - sort = re.sub(r"\s+", "+", sort.strip()) - - for i in range(1, 999): - resp = requests.get( - f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=100&page={i}&output=json", - ) - resp.raise_for_status() - body = resp.json() - if len(body["response"]["docs"]) == 0: - break - cur.executemany( - "insert into items (id, review_date) values (?, ?) on conflict do nothing", - [ - (doc["identifier"], doc["review_date"]) - for doc in body["response"]["docs"] - ], - ) - conn.commit() - - -if __name__ == "__main__": - main() diff --git a/engine.py b/engine.py new file mode 100644 index 0000000..c6f91d2 --- /dev/null +++ b/engine.py @@ -0,0 +1,239 @@ +import urllib.parse +import re +from dataclasses import dataclass +from io import BytesIO +from multiprocessing import Pool +from sys import stderr +from zipfile import ZipFile + +import numpy as np +import pytesseract +import requests +from PIL import Image, ImageFilter + + +def analyze_item(item_id, ocr_langs="eng+fra", parallel=1, verbose=False): + escaped_item_id = urllib.parse.quote(item_id, safe="") + + if verbose: + print("Downloading...", file=stderr) + stderr.flush() + page_nums_resp = requests.get( + f"https://archive.org/metadata/{escaped_item_id}/page_numbers/pages" + ) + page_nums_resp.raise_for_status() + page_nums = page_nums_resp.json()["result"] + + zip_resp = requests.get( + f"https://archive.org/download/{escaped_item_id}/{escaped_item_id}_jp2.zip" + ) + zip_resp.raise_for_status() + + if verbose: + print("Decompressing...", file=stderr) + stderr.flush() + tasks = [] + with ZipFile(BytesIO(zip_resp.content)) as jp_zip: + for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())): + for page_index, page_num_info in enumerate(page_nums): + if page_num_info["leafNum"] == leaf_num: + # Stop iterating and keep page_index set to the current item. 
+ break + else: + # Set to -1 to indicate that leaf was not found in page_num list. + page_index = -1 + + if page_index != -1: + with jp_zip.open(file_name) as jp_file: + im = Image.open(jp_file).convert("L") + im.thumbnail((3200, 3200)) + tasks.append( + PageAnalysisTask( + im=im, + ocr_langs=ocr_langs, + ) + ) + + if verbose: + print(f"Processing {len(page_nums)} pages...", file=stderr) + stderr.flush() + + if parallel > 1: + # Parallelize image processing and OCR of pages across up to n cores. + with Pool(parallel) as pool: + return {"pages": pool.map(analyze_page, tasks)} + + return {"pages": [analyze_page(task) for task in tasks]} + + +@dataclass +class PageAnalysisTask: + """ + Attributes: + im PIL Image, pre-scaled using .thumbnail() to fit the long + edge to 3200 px. + ocr_langs Tesseract language codes (3 letters each, in a "+"-separated + list). + """ + + im: Image.Image + ocr_langs: str = "eng+fra" + + +def analyze_page(task): + im_cropped = task.im.crop( + ( + task.im.size[0] * 0.1, + task.im.size[1] * 0.1, + task.im.size[0] * 0.9, + task.im.size[1] * 0.9, + ) + ) + + is_blank = im_cropped.getextrema()[0] > 255 * 0.8 + + if is_blank: + max_sharpness = 1 + ocr_orientation_match = True + text_margin_px = -1 + else: + max_sharpness = 0.0 + if im_cropped.size[0] < im_cropped.size[1]: + # Page is in portrait orientation. + segments_x = 2 + segments_y = 3 + else: + # Page is in landscape orientation. 
+ segments_x = 3 + segments_y = 2 + for i in range(segments_x): + for j in range(segments_y): + max_sharpness = max( + max_sharpness, + analyze_sharpness( + im_cropped.crop( + ( + im_cropped.size[0] / segments_x * i, + im_cropped.size[1] / segments_y * j, + im_cropped.size[0] / segments_x * (i + 1), + im_cropped.size[1] / segments_y * (j + 1), + ) + ) + ), + ) + + OCR_SCALE = 1 + best_ocr_score = -1 + best_ocr_words = None + best_ocr_orientation = -1 + for orientation in range(4): + im_rotated = task.im.resize( + np.int_(np.array(task.im.size) * OCR_SCALE) + ).rotate(90 * orientation, expand=True) + ocr = pytesseract.image_to_data( + im_rotated, + lang=task.ocr_langs, + config=f"--oem 1 --dpi {int(300 * OCR_SCALE)} --tessdata-dir ./data/tessdata_fast-4.1.0", + output_type=pytesseract.Output.DATAFRAME, + ).fillna({"text": ""}) + # Keep only words that Tesseract is confident in, and which are + # oriented horizontally. + words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])] + # Keep only alphabetical words of 4 or more characters. + words = words[ + words.apply( + lambda row: re.fullmatch(r"[a-zA-Z]{4,}", str(row["text"])) + is not None, + axis=1, + ) + ] + if words.shape[0] > best_ocr_score: + best_ocr_score = words.shape[0] + best_ocr_orientation = orientation + best_ocr_words = words + if best_ocr_score > 50: + # Unlikely that another orientation will have more words, so + # stop eating up CPU. 
+ break + + if best_ocr_words.empty: + ocr_orientation_match = True + text_margin_px = -1 + else: + ocr_orientation_match = best_ocr_orientation == 0 + + best_ocr_dims = OCR_SCALE * np.array( + task.im.size + if best_ocr_orientation % 2 == 0 + else (task.im.size[1], task.im.size[0]) + ) + + word_margins_all_directions = np.sort( + np.int_( + np.concat( + ( + best_ocr_words["left"].to_numpy(), + best_ocr_words["top"].to_numpy(), + best_ocr_dims[0] + - ( + best_ocr_words["left"] + best_ocr_words["width"] + ).to_numpy(), + best_ocr_dims[1] + - ( + best_ocr_words["top"] + best_ocr_words["height"] + ).to_numpy(), + ) + ) + # Transform back into original image pixel density + / OCR_SCALE + ) + ) + # Skip the n closest words to the edge, to help ignore stray OCR artifacts. + SKIP_WORDS = 2 + text_margin_px = int( + word_margins_all_directions[SKIP_WORDS] + if word_margins_all_directions.shape[0] > SKIP_WORDS + else -1 + ) + + return { + "blank": is_blank, + "ocr_orientation_match": ocr_orientation_match, + "size_analyzed": task.im.size, + "sharpness": max_sharpness, + "text_margin_px": text_margin_px, + } + + +def analyze_sharpness(im): + """ + Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to + 1. The scale is not linear with respect to scan quality: anything above 0.1 + is usually fine. + """ + arr = np.asarray(im) + + # Normalize contrast based on brightest and darkest pixels. For example, + # NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80% fall + # between 10% brightness and 90% brightness. In practice, a value around + # 0.02 seems to work fairly well. 
+ NORM_QUANTILE = 0.03 + pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile( + arr, NORM_QUANTILE + ) + if pixel_range == 0: + arr_normalized = arr + else: + arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range + arr_normalized = ( + arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE + ) + arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255) + + # "Sharpness" is determined by measuring the median intensity of pixels + # near edges, after an edge detection filter has been applied to the image. + edges_arr = np.asarray( + Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES) + ) + EDGE_THRESHOLD = 8 + return np.median(edges_arr[edges_arr > EDGE_THRESHOLD]) / 255 diff --git a/main.py b/main.py index 85178a5..95e19f0 100644 --- a/main.py +++ b/main.py @@ -1,371 +1,175 @@ -import json import re -import urllib.parse +import sqlite3 +import traceback from argparse import ArgumentParser -from dataclasses import dataclass -from io import BytesIO -from multiprocessing import Pool -from multiprocessing.pool import ThreadPool -from sys import stderr, stdin, stdout -from zipfile import ZipFile +from datetime import datetime +from sys import stderr +from time import sleep -import numpy as np -import pytesseract import requests -from PIL import Image, ImageFilter - -OCR_LANGS = "eng+fra" +from engine import analyze_item def main(): parser = ArgumentParser() - parser.add_argument("--summarize", action="store_true") - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-w", "--workers", type=int, default=1) - parser.add_argument("--page-margin-px", type=int, default=50) + parser.add_argument("--database", default="./microqa.db") + parser.add_argument("--cpus", type=int, default=2) + parser.add_argument("--earliest-review-date", default="20250701") args = parser.parse_args() - # Process STDIN line by line, where each line contains one or more item IDs - # separated by whitespace. 
- for line in stdin: - item_ids = [value for value in re.split(r",|\s", line) if value] - with ThreadPool(args.workers) as pool: - if args.verbose: - print(f"Running with {args.workers} workers.", file=stderr) - stderr.flush() - if args.summarize: - pool.map( - _summarize_item_to_stdout, - [ - ItemTask( - item_id=item_id, - page_margin_px=args.page_margin_px, - verbose=args.verbose, - ) - for item_id in item_ids - ], - ) - else: - pool.map( - _analyze_item_to_stdout, - [ - ItemTask( - item_id=item_id, - page_margin_px=args.page_margin_px, - verbose=args.verbose, - ) - for item_id in item_ids - ], - ) - - -@dataclass -class ItemTask: - item_id: str - page_margin_px: int - verbose: bool - - -def _summarize_item_to_stdout(task): - item_id = task.item_id - page_margin_px = task.page_margin_px - verbose = task.verbose - - if verbose: - print(f"Summarizing item {item_id}...", file=stderr) - stderr.flush() - - analysis = analyze_item(item_id, parallel=4, verbose=verbose) - - # 3 or more blank pages in a row is a flag. - CONSECUTIVE_BLANKS_THRESHOLD = 3 - if len(analysis["pages"]) >= CONSECUTIVE_BLANKS_THRESHOLD: - consecutive_blanks = [page["blank"] for page in analysis["pages"]] - for _ in range(1, CONSECUTIVE_BLANKS_THRESHOLD): - consecutive_blanks = [ - value and consecutive_blanks[i] - for i, value in enumerate(consecutive_blanks[1:]) - ] - consecutive_blanks = [ - i + 2 # +1 to account for enumeration offset, and +1 to 1-index - for i, value in enumerate(consecutive_blanks[1:]) - if value and not consecutive_blanks[i] - ] - else: - consecutive_blanks = [] - - # 3 or more blank pages in a row is a flag. 
- CONSECUTIVE_BLURRY_THRESHOLD = 3 - SHARPNESS_THRESHOLD = 0.1 - if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD: - consecutive_blurry = [ - page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"] - ] - for _ in range(1, CONSECUTIVE_BLURRY_THRESHOLD): - consecutive_blurry = [ - value and consecutive_blurry[i] - for i, value in enumerate(consecutive_blurry[1:]) - ] - consecutive_blurry = [ - i + 2 # +1 to account for enumeration offset, and +1 to 1-index - for i, value in enumerate(consecutive_blurry[1:]) - if value and not consecutive_blurry[i] - ] - else: - consecutive_blurry = [] - - check_orientation = [ - i + 1 - for i, page in enumerate(analysis["pages"]) - if not page["ocr_orientation_match"] - ] - - check_crop = [ - i + 1 - for i, page in enumerate(analysis["pages"]) - if page["text_margin_px"] < page_margin_px - ] - - if check_orientation or check_crop or consecutive_blanks or consecutive_blurry: - print( - json.dumps( - { - "item_id": item_id, - "check_orientation": check_orientation, - "check_crop": check_crop, - "consecutive_blanks": consecutive_blanks, - "consecutive_blurry": consecutive_blurry, - } - ) + with sqlite3.connect(args.database) as conn: + cur = conn.cursor() + cur.execute(""" +create table if not exists items ( + id text primary key not null, + review_date text not null, + skip_analysis bool not null, + analyzed_date text +)""") + cur.execute(""" +create table if not exists pages ( + id int primary key, + item text not null, + page int not null, + orientation_match boolean not null, + sharpness real not null, + is_blank boolean not null, + text_margin_px int not null +)""") + cur.execute("create index if not exists review_date_idx on items (review_date)") + cur.execute( + "create index if not exists analyzed_date_idx on items (analyzed_date)" ) - stdout.flush() - - if verbose: - print(f"Done summarizing item {item_id}.", file=stderr) - stderr.flush() - - -def _analyze_item_to_stdout(task): - item_id = task.item_id - 
verbose = task.verbose - - if verbose: - print(f"Analyzing item {item_id}...", file=stderr) - stderr.flush() - - print(json.dumps(analyze_item(item_id, parallel=4, verbose=verbose))) - stdout.flush() - - if verbose: - print(f"Done analyzing item {item_id}.", file=stderr) - stderr.flush() - - -@dataclass -class PageAnalysisTask: - im: Image.Image - page_index: int - file_name: str - - -def _analyze_page(task): - im_original = task.im - page_index = task.page_index - file_name = task.file_name - - im_cropped = im_original.crop( - ( - im_original.size[0] * 0.1, - im_original.size[1] * 0.1, - im_original.size[0] * 0.9, - im_original.size[1] * 0.9, + cur.execute("create index if not exists item_idx on pages (item)") + cur.execute( + "create unique index if not exists item_page_idx on pages (item, page)" ) - ) + conn.commit() - is_blank = im_cropped.getextrema()[0] > 255 * 0.8 - - if is_blank: - max_sharpness = 1 - ocr_orientation_match = True - text_margin_px = -1 - else: - max_sharpness = 0.0 - if im_cropped.size[0] < im_cropped.size[1]: - # Page is in portrait orientation. - segments_x = 2 - segments_y = 3 - else: - # Page is in landscape orientation. 
- segments_x = 3 - segments_y = 2 - for i in range(segments_x): - for j in range(segments_y): - max_sharpness = max( - max_sharpness, - analyze_sharpness( - im_cropped.crop( - ( - im_cropped.size[0] / segments_x * i, - im_cropped.size[1] / segments_y * j, - im_cropped.size[0] / segments_x * (i + 1), - im_cropped.size[1] / segments_y * (j + 1), + while True: + print("Pulling item IDs") + pull_new_item_ids(conn, args.earliest_review_date) + print("Done.") + res = cur.execute(""" +select id +from items +where analyzed_date is null + and skip_analysis = false +order by review_date +""") + for (item_id,) in res.fetchall(): + N_ATTEMPTS = 3 + for _ in range(N_ATTEMPTS): + try: + print(f"Processing {item_id}") + analysis = analyze_item( + item_id, parallel=args.cpus, verbose=True + ) + for i, page in enumerate(analysis["pages"]): + cur.execute( + """ +insert into pages ( + item, + page, + orientation_match, + sharpness, + is_blank, + text_margin_px +) values ( + ?, + ?, + ?, + ?, + ?, + ? + )""", + [ + item_id, + i + 1, + page["ocr_orientation_match"], + page["sharpness"], + page["blank"], + page["text_margin_px"], + ], ) + cur.execute( + "update items set analyzed_date = ? 
where id = ?", + [datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id], ) - ), - ) - - best_ocr_score = -1 - best_ocr_words = None - best_ocr_orientation = -1 - for orientation in range(4): - im_rotated = im_original.rotate(90 * orientation, expand=True) - ocr = pytesseract.image_to_data( - im_rotated, - lang=OCR_LANGS, - config="--oem 1 --dpi 300 --tessdata-dir ./data/tessdata_fast-4.1.0", - output_type=pytesseract.Output.DATAFRAME, - ).fillna({"text": ""}) - words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])] - words = words[ - words.apply( - lambda row: re.fullmatch(r"[a-zA-Z]{4,}", row["text"]) is not None, - axis=1, - ) - ] - if words.shape[0] > best_ocr_score: - best_ocr_score = words.shape[0] - best_ocr_orientation = orientation - best_ocr_words = words - if best_ocr_score > 50: - # Unlikely that another orientation will have more words, so - # stop eating up CPU unnecessarily. - break - - ocr_orientation_match = best_ocr_orientation == 0 - - best_ocr_dims = ( - im_original.size - if best_ocr_orientation % 2 == 0 - else (im_original.size[1], im_original.size[0]) - ) - - word_margins_all_directions = np.sort( - np.concat( - ( - best_ocr_words["left"].to_numpy(), - best_ocr_words["top"].to_numpy(), - best_ocr_dims[0] - - (best_ocr_words["left"] + best_ocr_words["width"]).to_numpy(), - best_ocr_dims[1] - - (best_ocr_words["top"] + best_ocr_words["height"]).to_numpy(), - ) - ) - ) - # Skip the n closest words to the edge, to help ignore stray OCR artifacts. 
- SKIP_WORDS = 2 - text_margin_px = ( - int(word_margins_all_directions[SKIP_WORDS]) - if word_margins_all_directions.shape[0] > SKIP_WORDS - else -1 - ) - - return { - "blank": is_blank, - "file_name": file_name, - "ocr_orientation_match": ocr_orientation_match, - "page_index": page_index, - "size": im_original.size, - "sharpness": max_sharpness, - "text_margin_px": text_margin_px, - } + conn.commit() + print("Done") + break + except Exception as err: + print(err, file=stderr) + traceback.print_tb(err.__traceback__, file=stderr) + sleep(15) + break + sleep(3600) -def analyze_item(item_id, parallel=1, verbose=False): - escaped_item_id = urllib.parse.quote(item_id, safe="") +def pull_new_item_ids(conn, earliest_review_date): + cur = conn.cursor() + res = cur.execute("select review_date from items order by review_date desc limit 1") + (latest_review_date,) = res.fetchone() or (earliest_review_date,) + print(latest_review_date) - if verbose: - print("Downloading...", file=stderr) - stderr.flush() - page_nums_resp = requests.get( - f"https://archive.org/metadata/{escaped_item_id}/page_numbers/pages" - ) - page_nums_resp.raise_for_status() - page_nums = page_nums_resp.json()["result"] - - zip_resp = requests.get( - f"https://archive.org/download/{escaped_item_id}/{escaped_item_id}_jp2.zip" - ) - zip_resp.raise_for_status() - - if verbose: - print("Decompressing...", file=stderr) - stderr.flush() - tasks = [] - with ZipFile(BytesIO(zip_resp.content)) as jp_zip: - for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())): - for page_index, page_num_info in enumerate(page_nums): - if page_num_info["leafNum"] == leaf_num: - # Stop iterating and keep page_index set to the current item. - break - else: - # Set to -1 to indicate that leaf was not found in page_num list. 
- page_index = -1 - - if page_index != -1: - with jp_zip.open(file_name) as jp_file: - im = Image.open(jp_file).convert("L") - im.thumbnail((3200, 3200)) - tasks.append( - PageAnalysisTask( - im=im, - page_index=page_index, - file_name=file_name, - ) - ) - - if verbose: - print(f"Processing {len(page_nums)} pages...", file=stderr) - stderr.flush() - if parallel > 1: - # Parallelize image processing and OCR of pages across up to n cores. - with Pool(parallel) as pool: - return {"pages": pool.map(_analyze_page, tasks)} - return {"pages": [_analyze_page(task) for task in tasks]} - - -def analyze_sharpness(im): + query = f""" + collection:(microfiche) + AND contributor:(Internet Archive) + AND micro_review:(done) + AND review_date:[{latest_review_date} TO null] """ - Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to - 1. The scale is not linear with respect to scan quality: anything above 0.1 - is usually fine. - """ - arr = np.asarray(im) + sort = "reviewdate asc" - # Normalize contrast based on brightest and darkest pixels. For example, - # NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80% fall - # between 10% brightness and 90% brightness. In practice, a value around - # 0.02 seems to work fairly well. - NORM_QUANTILE = 0.03 - pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile( - arr, NORM_QUANTILE + # Format for API. 
+ query = re.sub(r"\s+", "+", query.strip()) + sort = re.sub(r"\s+", "+", sort.strip()) + + # params = { + # "q": query, + # "count": 100, + # "fields": "identifier,review_date", + # "sorts": sort, + # } + # for i in range(1, 999): + # resp = requests.get( + # "https://archive.org/services/search/v1/scrape", + # params=params, + # ) + # resp.raise_for_status() + # print(resp.text) + # try: + # body = resp.json() + # except Exception as err: + # print("Body:", resp.text, file=stderr) + # raise err + # for doc in body["items"]: + # cur.execute( + # "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing", + # (doc["identifier"], doc["review_date"]), + # ) + # conn.commit() + # cursor = body.get("cursor", None) + # if cursor is None: + # break + # params = params.copy() + # params["cursor"] = cursor + resp = requests.get( + f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=250000&output=json", ) - if pixel_range == 0: - arr_normalized = arr - else: - arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range - arr_normalized = ( - arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE + resp.raise_for_status() + try: + body = resp.json() + except Exception as err: + print("Body:", resp.text, file=stderr) + raise err + for doc in body["response"]["docs"]: + cur.execute( + "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing", + (doc["identifier"], doc["review_date"]), ) - arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255) - - # "Sharpness" is determined by measuring the median intensity of pixels - # near edges, after an edge detection filter has been applied to the image. 
- edges_arr = np.asarray( - Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES) - ) - EDGE_THRESHOLD = 8 - return np.median(edges_arr[edges_arr > EDGE_THRESHOLD]) / 255 + conn.commit() if __name__ == "__main__": diff --git a/one_off.py b/one_off.py new file mode 100644 index 0000000..947b294 --- /dev/null +++ b/one_off.py @@ -0,0 +1,159 @@ +import json +import re +from argparse import ArgumentParser +from dataclasses import dataclass +from multiprocessing.pool import ThreadPool +from sys import stderr, stdin, stdout + +from engine import analyze_item + + +OCR_LANGS = "eng+fra" + + +def main(): + parser = ArgumentParser() + parser.add_argument("--summarize", action="store_true") + parser.add_argument("-v", "--verbose", action="store_true") + parser.add_argument("-w", "--workers", type=int, default=1) + parser.add_argument("--page-margin-px", type=int, default=50) + args = parser.parse_args() + + # Process STDIN line by line, where each line contains one or more item IDs + # separated by whitespace. 
+    for line in stdin:
+        item_ids = [value for value in re.split(r",|\s", line) if value]
+        with ThreadPool(args.workers) as pool:
+            if args.verbose:
+                print(f"Running with {args.workers} workers.", file=stderr)
+                stderr.flush()
+            if args.summarize:
+                pool.map(
+                    _summarize_item_to_stdout,
+                    [
+                        ItemTask(
+                            item_id=item_id,
+                            page_margin_px=args.page_margin_px,
+                            verbose=args.verbose,
+                        )
+                        for item_id in item_ids
+                    ],
+                )
+            else:
+                pool.map(
+                    _analyze_item_to_stdout,
+                    [
+                        ItemTask(
+                            item_id=item_id,
+                            page_margin_px=args.page_margin_px,
+                            verbose=args.verbose,
+                        )
+                        for item_id in item_ids
+                    ],
+                )
+
+
+@dataclass
+class ItemTask:
+    item_id: str
+    page_margin_px: int
+    verbose: bool
+
+
+def _summarize_item_to_stdout(task):
+    item_id = task.item_id
+    page_margin_px = task.page_margin_px
+    verbose = task.verbose
+
+    if verbose:
+        print(f"Summarizing item {item_id}...", file=stderr)
+        stderr.flush()
+
+    analysis = analyze_item(item_id, parallel=4, verbose=verbose)
+
+    # 3 or more blank pages in a row is a flag.
+    CONSECUTIVE_BLANKS_THRESHOLD = 3
+    if len(analysis["pages"]) >= CONSECUTIVE_BLANKS_THRESHOLD:
+        consecutive_blanks = [page["blank"] for page in analysis["pages"]]
+        for _ in range(1, CONSECUTIVE_BLANKS_THRESHOLD):
+            consecutive_blanks = [
+                value and consecutive_blanks[i]
+                for i, value in enumerate(consecutive_blanks[1:])
+            ]
+        consecutive_blanks = [
+            i + 2  # +1 to account for enumeration offset, and +1 to 1-index
+            for i, value in enumerate(consecutive_blanks[1:])
+            if value and not consecutive_blanks[i]
+        ]
+    else:
+        consecutive_blanks = []
+
+    # 3 or more blurry pages in a row is a flag.
+ CONSECUTIVE_BLURRY_THRESHOLD = 3 + SHARPNESS_THRESHOLD = 0.1 + if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD: + consecutive_blurry = [ + page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"] + ] + for _ in range(1, CONSECUTIVE_BLURRY_THRESHOLD): + consecutive_blurry = [ + value and consecutive_blurry[i] + for i, value in enumerate(consecutive_blurry[1:]) + ] + consecutive_blurry = [ + i + 2 # +1 to account for enumeration offset, and +1 to 1-index + for i, value in enumerate(consecutive_blurry[1:]) + if value and not consecutive_blurry[i] + ] + else: + consecutive_blurry = [] + + check_orientation = [ + i + 1 + for i, page in enumerate(analysis["pages"]) + if not page["ocr_orientation_match"] + ] + + check_crop = [ + i + 1 + for i, page in enumerate(analysis["pages"]) + if page["text_margin_px"] < page_margin_px + ] + + if check_orientation or check_crop or consecutive_blanks or consecutive_blurry: + print( + json.dumps( + { + "item_id": item_id, + "check_orientation": check_orientation, + "check_crop": check_crop, + "consecutive_blanks": consecutive_blanks, + "consecutive_blurry": consecutive_blurry, + } + ) + ) + stdout.flush() + + if verbose: + print(f"Done summarizing item {item_id}.", file=stderr) + stderr.flush() + + +def _analyze_item_to_stdout(task): + item_id = task.item_id + verbose = task.verbose + + if verbose: + print(f"Analyzing item {item_id}...", file=stderr) + stderr.flush() + + print(json.dumps(analyze_item(item_id, parallel=6, verbose=verbose))) + stdout.flush() + + if verbose: + print(f"Done analyzing item {item_id}.", file=stderr) + stderr.flush() + + +if __name__ == "__main__": + main()