rewrite to engine.py
This commit is contained in:
parent
815934ad23
commit
4d9161b043
6 changed files with 550 additions and 479 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -1,2 +1,4 @@
|
|||
/target
|
||||
/data
|
||||
*.db
|
||||
__pycache__
|
||||
|
|
|
|||
|
|
@ -38,3 +38,4 @@ order by items.id;
|
|||
- Blurry pages: `micro_IA40244209_0984`
|
||||
- Contrast, page orientation: `micro_IA40244211_2290`
|
||||
- Crop, low quality fiche: `micro_IA40386420_0689`
|
||||
- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages: `micro_IA40386007_0012`
|
||||
|
|
|
|||
134
cache.py
134
cache.py
|
|
@ -1,134 +0,0 @@
|
|||
import re
|
||||
import sqlite3
|
||||
import traceback
|
||||
from argparse import ArgumentParser
|
||||
from datetime import datetime
|
||||
from time import sleep
|
||||
|
||||
import requests
|
||||
|
||||
from main import analyze_item
|
||||
|
||||
|
||||
def main():
    """Poll archive.org for newly reviewed microfiche items and store
    per-page quality metrics in a local SQLite database.

    Runs forever: each cycle pulls new item IDs, analyzes every item that
    has no analyzed_date yet, then sleeps for an hour.
    """
    # Local import keeps the fix self-contained; timezone-aware "now"
    # replaces the deprecated datetime.utcnow().
    from datetime import timezone

    parser = ArgumentParser()
    parser.add_argument("--database", default="./microqa.db")
    parser.add_argument("--cpus", type=int, default=2)
    parser.add_argument("--earliest-review-date", default="20250701")
    args = parser.parse_args()

    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        # Schema is created lazily so the script can bootstrap an empty DB.
        cur.execute("""
            create table if not exists items (
                id text primary key not null,
                review_date text not null,
                analyzed_date text
            )""")
        cur.execute("""
            create table if not exists pages (
                id int primary key,
                item text not null,
                page int not null,
                orientation_match boolean not null,
                sharpness real not null,
                is_blank boolean not null,
                text_margin_px int not null
            )""")
        conn.commit()

        while True:
            print("Pulling item IDs")
            pull_new_item_ids(conn, args.earliest_review_date)
            print("Done.")
            res = cur.execute(
                "select id from items where analyzed_date is null order by review_date"
            )
            for (item_id,) in res.fetchall():
                # Retry transient failures (network hiccups etc.) a few
                # times before moving on to the next item.
                N_ATTEMPTS = 3
                for _ in range(N_ATTEMPTS):
                    try:
                        print(f"Processing {item_id}")
                        analysis = analyze_item(
                            item_id, parallel=args.cpus, verbose=True
                        )
                        for i, page in enumerate(analysis["pages"]):
                            cur.execute(
                                """
                                insert into pages (
                                    item,
                                    page,
                                    orientation_match,
                                    sharpness,
                                    is_blank,
                                    text_margin_px
                                ) values (?, ?, ?, ?, ?, ?)""",
                                [
                                    item_id,
                                    i + 1,
                                    page["ocr_orientation_match"],
                                    page["sharpness"],
                                    page["blank"],
                                    page["text_margin_px"],
                                ],
                            )
                        cur.execute(
                            "update items set analyzed_date = ? where id = ?",
                            [
                                datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"),
                                item_id,
                            ],
                        )
                        conn.commit()
                        print("Done")
                        break
                    except Exception as err:
                        print(err)
                        traceback.print_tb(err.__traceback__)
                        # Back off briefly, then retry. Previously an
                        # unconditional `break` followed this handler,
                        # making the N_ATTEMPTS retry loop dead code.
                        sleep(15)
            sleep(3600)
|
||||
|
||||
|
||||
def pull_new_item_ids(conn, earliest_review_date):
    """Fetch IDs of reviewed microfiche items from the archive.org advanced
    search API and insert any new ones into the items table.

    Resumes from the most recent review_date already stored in the database,
    falling back to earliest_review_date when the table is empty. Duplicate
    IDs are ignored via `on conflict do nothing`.
    """
    cur = conn.cursor()
    res = cur.execute("select review_date from items order by review_date desc limit 1")
    (latest_review_date,) = res.fetchone() or (earliest_review_date,)
    print(latest_review_date)

    query = f"""
        collection:(microfiche)
        AND contributor:(Internet Archive)
        AND micro_review:(done)
        AND review_date:[{latest_review_date} TO null]
    """
    sort = "reviewdate asc"

    # Format for API: collapse all whitespace runs into "+" separators.
    query = re.sub(r"\s+", "+", query.strip())
    sort = re.sub(r"\s+", "+", sort.strip())

    # Page through results 100 at a time; an empty page means we're done.
    for i in range(1, 999):
        resp = requests.get(
            f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=100&page={i}&output=json",
            timeout=60,  # avoid hanging forever on a stalled connection
        )
        resp.raise_for_status()
        body = resp.json()
        if len(body["response"]["docs"]) == 0:
            break
        cur.executemany(
            "insert into items (id, review_date) values (?, ?) on conflict do nothing",
            [
                (doc["identifier"], doc["review_date"])
                for doc in body["response"]["docs"]
            ],
        )
        conn.commit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
239
engine.py
Normal file
239
engine.py
Normal file
|
|
@ -0,0 +1,239 @@
|
|||
import urllib.parse
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from multiprocessing import Pool
|
||||
from sys import stderr
|
||||
from zipfile import ZipFile
|
||||
|
||||
import numpy as np
|
||||
import pytesseract
|
||||
import requests
|
||||
from PIL import Image, ImageFilter
|
||||
|
||||
|
||||
def analyze_item(item_id, ocr_langs="eng+fra", parallel=1, verbose=False):
    """Download an archive.org item's JP2 page scans and analyze each page.

    Parameters:
        item_id    Archive.org item identifier.
        ocr_langs  Tesseract language codes ("+"-separated, 3 letters each).
        parallel   Number of worker processes for per-page analysis.
        verbose    Print progress messages to stderr.

    Returns:
        {"pages": [<analyze_page result>, ...]} in leaf order, covering only
        leaves that appear in the item's page_numbers metadata.
    """
    escaped_item_id = urllib.parse.quote(item_id, safe="")

    if verbose:
        print("Downloading...", file=stderr)
        stderr.flush()
    page_nums_resp = requests.get(
        f"https://archive.org/metadata/{escaped_item_id}/page_numbers/pages",
        timeout=300,  # avoid hanging forever on a stalled connection
    )
    page_nums_resp.raise_for_status()
    page_nums = page_nums_resp.json()["result"]

    # NOTE: the whole zip is buffered in memory; items are assumed to be
    # small enough for that to be acceptable.
    zip_resp = requests.get(
        f"https://archive.org/download/{escaped_item_id}/{escaped_item_id}_jp2.zip",
        timeout=300,
    )
    zip_resp.raise_for_status()

    if verbose:
        print("Decompressing...", file=stderr)
        stderr.flush()
    tasks = []
    with ZipFile(BytesIO(zip_resp.content)) as jp_zip:
        for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
            for page_index, page_num_info in enumerate(page_nums):
                if page_num_info["leafNum"] == leaf_num:
                    # Stop iterating and keep page_index set to the current item.
                    break
            else:
                # Set to -1 to indicate that leaf was not found in page_num list.
                page_index = -1

            if page_index != -1:
                with jp_zip.open(file_name) as jp_file:
                    im = Image.open(jp_file).convert("L")
                    # Bound the long edge to keep memory use and OCR time in
                    # check.
                    im.thumbnail((3200, 3200))
                    tasks.append(
                        PageAnalysisTask(
                            im=im,
                            ocr_langs=ocr_langs,
                        )
                    )

    if verbose:
        print(f"Processing {len(page_nums)} pages...", file=stderr)
        stderr.flush()

    if parallel > 1:
        # Parallelize image processing and OCR of pages across up to n cores.
        with Pool(parallel) as pool:
            return {"pages": pool.map(analyze_page, tasks)}

    return {"pages": [analyze_page(task) for task in tasks]}
|
||||
|
||||
|
||||
@dataclass
class PageAnalysisTask:
    """Work unit handed to analyze_page().

    Attributes:
        im         Grayscale PIL Image, already scaled with .thumbnail() so
                   that its long edge is at most 3200 px.
        ocr_langs  Tesseract language codes, "+"-separated (3 letters each).
    """

    im: Image.Image
    ocr_langs: str = "eng+fra"
|
||||
|
||||
def analyze_page(task):
    """Analyze one page image for blankness, sharpness, orientation and crop.

    Parameters:
        task  PageAnalysisTask carrying the pre-scaled grayscale image and
              the OCR language list.

    Returns a dict with keys:
        blank                  True when the page looks uniformly bright.
        ocr_orientation_match  True when the best OCR orientation is the
                               stored one (also True for blank/no-word pages).
        size_analyzed          (width, height) of the analyzed image.
        sharpness              Best per-segment sharpness, 0..1.
        text_margin_px         Distance in px from detected text to the
                               nearest image edge, or -1 when unknown.
    """
    # Ignore a 10% border on each side so fiche edges and scanner artifacts
    # don't influence the blank/sharpness checks.
    im_cropped = task.im.crop(
        (
            task.im.size[0] * 0.1,
            task.im.size[1] * 0.1,
            task.im.size[0] * 0.9,
            task.im.size[1] * 0.9,
        )
    )

    # If even the darkest pixel is bright, treat the page as blank.
    is_blank = im_cropped.getextrema()[0] > 255 * 0.8

    if is_blank:
        max_sharpness = 1.0  # float, consistent with the non-blank branch
        ocr_orientation_match = True
        text_margin_px = -1
    else:
        # Measure sharpness per segment and keep the best: the page is OK if
        # at least one region is in focus.
        max_sharpness = 0.0
        if im_cropped.size[0] < im_cropped.size[1]:
            # Page is in portrait orientation.
            segments_x = 2
            segments_y = 3
        else:
            # Page is in landscape orientation.
            segments_x = 3
            segments_y = 2
        for i in range(segments_x):
            for j in range(segments_y):
                max_sharpness = max(
                    max_sharpness,
                    analyze_sharpness(
                        im_cropped.crop(
                            (
                                im_cropped.size[0] / segments_x * i,
                                im_cropped.size[1] / segments_y * j,
                                im_cropped.size[0] / segments_x * (i + 1),
                                im_cropped.size[1] / segments_y * (j + 1),
                            )
                        )
                    ),
                )

        # OCR the page in all four 90-degree rotations; the rotation that
        # yields the most confident words is taken as the true orientation.
        OCR_SCALE = 1
        best_ocr_score = -1
        best_ocr_words = None
        best_ocr_orientation = -1
        for orientation in range(4):
            im_rotated = task.im.resize(
                np.int_(np.array(task.im.size) * OCR_SCALE)
            ).rotate(90 * orientation, expand=True)
            ocr = pytesseract.image_to_data(
                im_rotated,
                lang=task.ocr_langs,
                config=f"--oem 1 --dpi {int(300 * OCR_SCALE)} --tessdata-dir ./data/tessdata_fast-4.1.0",
                output_type=pytesseract.Output.DATAFRAME,
            ).fillna({"text": ""})
            # Keep only words that Tesseract is confident in, and which are
            # oriented horizontally.
            words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
            # Keep only alphabetical words of 4 or more characters.
            words = words[
                words.apply(
                    lambda row: re.fullmatch(r"[a-zA-Z]{4,}", str(row["text"]))
                    is not None,
                    axis=1,
                )
            ]
            if words.shape[0] > best_ocr_score:
                best_ocr_score = words.shape[0]
                best_ocr_orientation = orientation
                best_ocr_words = words
            if best_ocr_score > 50:
                # Unlikely that another orientation will have more words, so
                # stop eating up CPU.
                break

        if best_ocr_words.empty:
            # No usable words: cannot judge orientation or text margins.
            ocr_orientation_match = True
            text_margin_px = -1
        else:
            ocr_orientation_match = best_ocr_orientation == 0

            # Dimensions of the rotated image the best OCR pass ran on.
            best_ocr_dims = OCR_SCALE * np.array(
                task.im.size
                if best_ocr_orientation % 2 == 0
                else (task.im.size[1], task.im.size[0])
            )

            # Distance of every word's bounding box to each of the four
            # edges, sorted ascending. np.concatenate instead of np.concat:
            # the latter alias only exists in NumPy >= 2.0.
            word_margins_all_directions = np.sort(
                np.int_(
                    np.concatenate(
                        (
                            best_ocr_words["left"].to_numpy(),
                            best_ocr_words["top"].to_numpy(),
                            best_ocr_dims[0]
                            - (
                                best_ocr_words["left"] + best_ocr_words["width"]
                            ).to_numpy(),
                            best_ocr_dims[1]
                            - (
                                best_ocr_words["top"] + best_ocr_words["height"]
                            ).to_numpy(),
                        )
                    )
                    # Transform back into original image pixel density
                    / OCR_SCALE
                )
            )
            # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
            SKIP_WORDS = 2
            text_margin_px = int(
                word_margins_all_directions[SKIP_WORDS]
                if word_margins_all_directions.shape[0] > SKIP_WORDS
                else -1
            )

    return {
        "blank": is_blank,
        "ocr_orientation_match": ocr_orientation_match,
        "size_analyzed": task.im.size,
        "sharpness": max_sharpness,
        "text_margin_px": text_margin_px,
    }
|
||||
|
||||
|
||||
def analyze_sharpness(im):
    """
    Crudely quantifies the "sharpness" of edges in an image, on a scale of 0
    to 1. The scale is not linear with respect to scan quality: anything
    above 0.1 is usually fine.
    """
    arr = np.asarray(im)

    # Normalize contrast based on brightest and darkest pixels. For example,
    # NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80%
    # fall between 10% brightness and 90% brightness. In practice, a value
    # around 0.02 seems to work fairly well.
    NORM_QUANTILE = 0.03
    pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
        arr, NORM_QUANTILE
    )
    if pixel_range == 0:
        # Degenerate (constant) image; nothing to normalize.
        arr_normalized = arr
    else:
        arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
        arr_normalized = (
            arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE
        )
        arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255)

    # "Sharpness" is determined by measuring the median intensity of pixels
    # near edges, after an edge detection filter has been applied to the image.
    edges_arr = np.asarray(
        Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
    )
    EDGE_THRESHOLD = 8
    edge_pixels = edges_arr[edges_arr > EDGE_THRESHOLD]
    if edge_pixels.size == 0:
        # Featureless crop with no edges at all: previously this took the
        # median of an empty array, which yields NaN (and a warning).
        return 0.0
    return np.median(edge_pixels) / 255
|
||||
494
main.py
494
main.py
|
|
@ -1,371 +1,175 @@
|
|||
import json
|
||||
import re
|
||||
import urllib.parse
|
||||
import sqlite3
|
||||
import traceback
|
||||
from argparse import ArgumentParser
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from multiprocessing import Pool
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from sys import stderr, stdin, stdout
|
||||
from zipfile import ZipFile
|
||||
from datetime import datetime
|
||||
from sys import stderr
|
||||
from time import sleep
|
||||
|
||||
import numpy as np
|
||||
import pytesseract
|
||||
import requests
|
||||
from PIL import Image, ImageFilter
|
||||
|
||||
|
||||
OCR_LANGS = "eng+fra"
|
||||
from engine import analyze_item
|
||||
|
||||
|
||||
def main():
|
||||
parser = ArgumentParser()
|
||||
parser.add_argument("--summarize", action="store_true")
|
||||
parser.add_argument("-v", "--verbose", action="store_true")
|
||||
parser.add_argument("-w", "--workers", type=int, default=1)
|
||||
parser.add_argument("--page-margin-px", type=int, default=50)
|
||||
parser.add_argument("--database", default="./microqa.db")
|
||||
parser.add_argument("--cpus", type=int, default=2)
|
||||
parser.add_argument("--earliest-review-date", default="20250701")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Process STDIN line by line, where each line contains one or more item IDs
|
||||
# separated by whitespace.
|
||||
for line in stdin:
|
||||
item_ids = [value for value in re.split(r",|\s", line) if value]
|
||||
with ThreadPool(args.workers) as pool:
|
||||
if args.verbose:
|
||||
print(f"Running with {args.workers} workers.", file=stderr)
|
||||
stderr.flush()
|
||||
if args.summarize:
|
||||
pool.map(
|
||||
_summarize_item_to_stdout,
|
||||
[
|
||||
ItemTask(
|
||||
item_id=item_id,
|
||||
page_margin_px=args.page_margin_px,
|
||||
verbose=args.verbose,
|
||||
with sqlite3.connect(args.database) as conn:
|
||||
cur = conn.cursor()
|
||||
cur.execute("""
|
||||
create table if not exists items (
|
||||
id text primary key not null,
|
||||
review_date text not null,
|
||||
skip_analysis bool not null,
|
||||
analyzed_date text
|
||||
)""")
|
||||
cur.execute("""
|
||||
create table if not exists pages (
|
||||
id int primary key,
|
||||
item text not null,
|
||||
page int not null,
|
||||
orientation_match boolean not null,
|
||||
sharpness real not null,
|
||||
is_blank boolean not null,
|
||||
text_margin_px int not null
|
||||
)""")
|
||||
cur.execute("create index if not exists review_date_idx on items (review_date)")
|
||||
cur.execute(
|
||||
"create index if not exists analyzed_date_idx on items (analyzed_date)"
|
||||
)
|
||||
for item_id in item_ids
|
||||
cur.execute("create index if not exists item_idx on pages (item)")
|
||||
cur.execute(
|
||||
"create unique index if not exists item_page_idx on pages (item, page)"
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
while True:
|
||||
print("Pulling item IDs")
|
||||
pull_new_item_ids(conn, args.earliest_review_date)
|
||||
print("Done.")
|
||||
res = cur.execute("""
|
||||
select id
|
||||
from items
|
||||
where analyzed_date is null
|
||||
and skip_analysis = false
|
||||
order by review_date
|
||||
""")
|
||||
for (item_id,) in res.fetchall():
|
||||
N_ATTEMPTS = 3
|
||||
for _ in range(N_ATTEMPTS):
|
||||
try:
|
||||
print(f"Processing {item_id}")
|
||||
analysis = analyze_item(
|
||||
item_id, parallel=args.cpus, verbose=True
|
||||
)
|
||||
for i, page in enumerate(analysis["pages"]):
|
||||
cur.execute(
|
||||
"""
|
||||
insert into pages (
|
||||
item,
|
||||
page,
|
||||
orientation_match,
|
||||
sharpness,
|
||||
is_blank,
|
||||
text_margin_px
|
||||
) values (
|
||||
?,
|
||||
?,
|
||||
?,
|
||||
?,
|
||||
?,
|
||||
?
|
||||
)""",
|
||||
[
|
||||
item_id,
|
||||
i + 1,
|
||||
page["ocr_orientation_match"],
|
||||
page["sharpness"],
|
||||
page["blank"],
|
||||
page["text_margin_px"],
|
||||
],
|
||||
)
|
||||
else:
|
||||
pool.map(
|
||||
_analyze_item_to_stdout,
|
||||
[
|
||||
ItemTask(
|
||||
item_id=item_id,
|
||||
page_margin_px=args.page_margin_px,
|
||||
verbose=args.verbose,
|
||||
cur.execute(
|
||||
"update items set analyzed_date = ? where id = ?",
|
||||
[datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id],
|
||||
)
|
||||
for item_id in item_ids
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ItemTask:
|
||||
item_id: str
|
||||
page_margin_px: int
|
||||
verbose: bool
|
||||
|
||||
|
||||
def _summarize_item_to_stdout(task):
|
||||
item_id = task.item_id
|
||||
page_margin_px = task.page_margin_px
|
||||
verbose = task.verbose
|
||||
|
||||
if verbose:
|
||||
print(f"Summarizing item {item_id}...", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
analysis = analyze_item(item_id, parallel=4, verbose=verbose)
|
||||
|
||||
# 3 or more blank pages in a row is a flag.
|
||||
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
||||
if len(analysis["pages"]) >= CONSECUTIVE_BLANKS_THRESHOLD:
|
||||
consecutive_blanks = [page["blank"] for page in analysis["pages"]]
|
||||
for _ in range(1, CONSECUTIVE_BLANKS_THRESHOLD):
|
||||
consecutive_blanks = [
|
||||
value and consecutive_blanks[i]
|
||||
for i, value in enumerate(consecutive_blanks[1:])
|
||||
]
|
||||
consecutive_blanks = [
|
||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
||||
for i, value in enumerate(consecutive_blanks[1:])
|
||||
if value and not consecutive_blanks[i]
|
||||
]
|
||||
else:
|
||||
consecutive_blanks = []
|
||||
|
||||
# 3 or more blurry pages in a row is a flag.
|
||||
CONSECUTIVE_BLURRY_THRESHOLD = 3
|
||||
SHARPNESS_THRESHOLD = 0.1
|
||||
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
|
||||
consecutive_blurry = [
|
||||
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
|
||||
]
|
||||
for _ in range(1, CONSECUTIVE_BLURRY_THRESHOLD):
|
||||
consecutive_blurry = [
|
||||
value and consecutive_blurry[i]
|
||||
for i, value in enumerate(consecutive_blurry[1:])
|
||||
]
|
||||
consecutive_blurry = [
|
||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
||||
for i, value in enumerate(consecutive_blurry[1:])
|
||||
if value and not consecutive_blurry[i]
|
||||
]
|
||||
else:
|
||||
consecutive_blurry = []
|
||||
|
||||
check_orientation = [
|
||||
i + 1
|
||||
for i, page in enumerate(analysis["pages"])
|
||||
if not page["ocr_orientation_match"]
|
||||
]
|
||||
|
||||
check_crop = [
|
||||
i + 1
|
||||
for i, page in enumerate(analysis["pages"])
|
||||
if page["text_margin_px"] < page_margin_px
|
||||
]
|
||||
|
||||
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
||||
print(
|
||||
json.dumps(
|
||||
{
|
||||
"item_id": item_id,
|
||||
"check_orientation": check_orientation,
|
||||
"check_crop": check_crop,
|
||||
"consecutive_blanks": consecutive_blanks,
|
||||
"consecutive_blurry": consecutive_blurry,
|
||||
}
|
||||
)
|
||||
)
|
||||
stdout.flush()
|
||||
|
||||
if verbose:
|
||||
print(f"Done summarizing item {item_id}.", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
|
||||
def _analyze_item_to_stdout(task):
|
||||
item_id = task.item_id
|
||||
verbose = task.verbose
|
||||
|
||||
if verbose:
|
||||
print(f"Analyzing item {item_id}...", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
print(json.dumps(analyze_item(item_id, parallel=4, verbose=verbose)))
|
||||
stdout.flush()
|
||||
|
||||
if verbose:
|
||||
print(f"Done analyzing item {item_id}.", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
|
||||
@dataclass
|
||||
class PageAnalysisTask:
|
||||
im: Image.Image
|
||||
page_index: int
|
||||
file_name: str
|
||||
|
||||
|
||||
def _analyze_page(task):
|
||||
im_original = task.im
|
||||
page_index = task.page_index
|
||||
file_name = task.file_name
|
||||
|
||||
im_cropped = im_original.crop(
|
||||
(
|
||||
im_original.size[0] * 0.1,
|
||||
im_original.size[1] * 0.1,
|
||||
im_original.size[0] * 0.9,
|
||||
im_original.size[1] * 0.9,
|
||||
)
|
||||
)
|
||||
|
||||
is_blank = im_cropped.getextrema()[0] > 255 * 0.8
|
||||
|
||||
if is_blank:
|
||||
max_sharpness = 1
|
||||
ocr_orientation_match = True
|
||||
text_margin_px = -1
|
||||
else:
|
||||
max_sharpness = 0.0
|
||||
if im_cropped.size[0] < im_cropped.size[1]:
|
||||
# Page is in portrait orientation.
|
||||
segments_x = 2
|
||||
segments_y = 3
|
||||
else:
|
||||
# Page is in landscape orientation.
|
||||
segments_x = 3
|
||||
segments_y = 2
|
||||
for i in range(segments_x):
|
||||
for j in range(segments_y):
|
||||
max_sharpness = max(
|
||||
max_sharpness,
|
||||
analyze_sharpness(
|
||||
im_cropped.crop(
|
||||
(
|
||||
im_cropped.size[0] / segments_x * i,
|
||||
im_cropped.size[1] / segments_y * j,
|
||||
im_cropped.size[0] / segments_x * (i + 1),
|
||||
im_cropped.size[1] / segments_y * (j + 1),
|
||||
)
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
best_ocr_score = -1
|
||||
best_ocr_words = None
|
||||
best_ocr_orientation = -1
|
||||
for orientation in range(4):
|
||||
im_rotated = im_original.rotate(90 * orientation, expand=True)
|
||||
ocr = pytesseract.image_to_data(
|
||||
im_rotated,
|
||||
lang=OCR_LANGS,
|
||||
config="--oem 1 --dpi 300 --tessdata-dir ./data/tessdata_fast-4.1.0",
|
||||
output_type=pytesseract.Output.DATAFRAME,
|
||||
).fillna({"text": ""})
|
||||
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
|
||||
words = words[
|
||||
words.apply(
|
||||
lambda row: re.fullmatch(r"[a-zA-Z]{4,}", row["text"]) is not None,
|
||||
axis=1,
|
||||
)
|
||||
]
|
||||
if words.shape[0] > best_ocr_score:
|
||||
best_ocr_score = words.shape[0]
|
||||
best_ocr_orientation = orientation
|
||||
best_ocr_words = words
|
||||
if best_ocr_score > 50:
|
||||
# Unlikely that another orientation will have more words, so
|
||||
# stop eating up CPU unnecessarily.
|
||||
conn.commit()
|
||||
print("Done")
|
||||
break
|
||||
|
||||
ocr_orientation_match = best_ocr_orientation == 0
|
||||
|
||||
best_ocr_dims = (
|
||||
im_original.size
|
||||
if best_ocr_orientation % 2 == 0
|
||||
else (im_original.size[1], im_original.size[0])
|
||||
)
|
||||
|
||||
word_margins_all_directions = np.sort(
|
||||
np.concat(
|
||||
(
|
||||
best_ocr_words["left"].to_numpy(),
|
||||
best_ocr_words["top"].to_numpy(),
|
||||
best_ocr_dims[0]
|
||||
- (best_ocr_words["left"] + best_ocr_words["width"]).to_numpy(),
|
||||
best_ocr_dims[1]
|
||||
- (best_ocr_words["top"] + best_ocr_words["height"]).to_numpy(),
|
||||
)
|
||||
)
|
||||
)
|
||||
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
||||
SKIP_WORDS = 2
|
||||
text_margin_px = (
|
||||
int(word_margins_all_directions[SKIP_WORDS])
|
||||
if word_margins_all_directions.shape[0] > SKIP_WORDS
|
||||
else -1
|
||||
)
|
||||
|
||||
return {
|
||||
"blank": is_blank,
|
||||
"file_name": file_name,
|
||||
"ocr_orientation_match": ocr_orientation_match,
|
||||
"page_index": page_index,
|
||||
"size": im_original.size,
|
||||
"sharpness": max_sharpness,
|
||||
"text_margin_px": text_margin_px,
|
||||
}
|
||||
|
||||
|
||||
def analyze_item(item_id, parallel=1, verbose=False):
|
||||
escaped_item_id = urllib.parse.quote(item_id, safe="")
|
||||
|
||||
if verbose:
|
||||
print("Downloading...", file=stderr)
|
||||
stderr.flush()
|
||||
page_nums_resp = requests.get(
|
||||
f"https://archive.org/metadata/{escaped_item_id}/page_numbers/pages"
|
||||
)
|
||||
page_nums_resp.raise_for_status()
|
||||
page_nums = page_nums_resp.json()["result"]
|
||||
|
||||
zip_resp = requests.get(
|
||||
f"https://archive.org/download/{escaped_item_id}/{escaped_item_id}_jp2.zip"
|
||||
)
|
||||
zip_resp.raise_for_status()
|
||||
|
||||
if verbose:
|
||||
print("Decompressing...", file=stderr)
|
||||
stderr.flush()
|
||||
tasks = []
|
||||
with ZipFile(BytesIO(zip_resp.content)) as jp_zip:
|
||||
for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
|
||||
for page_index, page_num_info in enumerate(page_nums):
|
||||
if page_num_info["leafNum"] == leaf_num:
|
||||
# Stop iterating and keep page_index set to the current item.
|
||||
except Exception as err:
|
||||
print(err, file=stderr)
|
||||
traceback.print_tb(err.__traceback__, file=stderr)
|
||||
sleep(15)
|
||||
break
|
||||
else:
|
||||
# Set to -1 to indicate that leaf was not found in page_num list.
|
||||
page_index = -1
|
||||
|
||||
if page_index != -1:
|
||||
with jp_zip.open(file_name) as jp_file:
|
||||
im = Image.open(jp_file).convert("L")
|
||||
im.thumbnail((3200, 3200))
|
||||
tasks.append(
|
||||
PageAnalysisTask(
|
||||
im=im,
|
||||
page_index=page_index,
|
||||
file_name=file_name,
|
||||
)
|
||||
)
|
||||
|
||||
if verbose:
|
||||
print(f"Processing {len(page_nums)} pages...", file=stderr)
|
||||
stderr.flush()
|
||||
if parallel > 1:
|
||||
# Parallelize image processing and OCR of pages across up to n cores.
|
||||
with Pool(parallel) as pool:
|
||||
return {"pages": pool.map(_analyze_page, tasks)}
|
||||
return {"pages": [_analyze_page(task) for task in tasks]}
|
||||
sleep(3600)
|
||||
|
||||
|
||||
def analyze_sharpness(im):
|
||||
def pull_new_item_ids(conn, earliest_review_date):
|
||||
cur = conn.cursor()
|
||||
res = cur.execute("select review_date from items order by review_date desc limit 1")
|
||||
(latest_review_date,) = res.fetchone() or (earliest_review_date,)
|
||||
print(latest_review_date)
|
||||
|
||||
query = f"""
|
||||
collection:(microfiche)
|
||||
AND contributor:(Internet Archive)
|
||||
AND micro_review:(done)
|
||||
AND review_date:[{latest_review_date} TO null]
|
||||
"""
|
||||
Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
|
||||
1. The scale is not linear with respect to scan quality: anything above 0.1
|
||||
is usually fine.
|
||||
"""
|
||||
arr = np.asarray(im)
|
||||
sort = "reviewdate asc"
|
||||
|
||||
# Normalize contrast based on brightest and darkest pixels. For example,
|
||||
# NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80% fall
|
||||
# between 10% brightness and 90% brightness. In practice, a value around
|
||||
# 0.02 seems to work fairly well.
|
||||
NORM_QUANTILE = 0.03
|
||||
pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
|
||||
arr, NORM_QUANTILE
|
||||
)
|
||||
if pixel_range == 0:
|
||||
arr_normalized = arr
|
||||
else:
|
||||
arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
|
||||
arr_normalized = (
|
||||
arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE
|
||||
)
|
||||
arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255)
|
||||
# Format for API.
|
||||
query = re.sub(r"\s+", "+", query.strip())
|
||||
sort = re.sub(r"\s+", "+", sort.strip())
|
||||
|
||||
# "Sharpness" is determined by measuring the median intensity of pixels
|
||||
# near edges, after an edge detection filter has been applied to the image.
|
||||
edges_arr = np.asarray(
|
||||
Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
|
||||
# params = {
|
||||
# "q": query,
|
||||
# "count": 100,
|
||||
# "fields": "identifier,review_date",
|
||||
# "sorts": sort,
|
||||
# }
|
||||
# for i in range(1, 999):
|
||||
# resp = requests.get(
|
||||
# "https://archive.org/services/search/v1/scrape",
|
||||
# params=params,
|
||||
# )
|
||||
# resp.raise_for_status()
|
||||
# print(resp.text)
|
||||
# try:
|
||||
# body = resp.json()
|
||||
# except Exception as err:
|
||||
# print("Body:", resp.text, file=stderr)
|
||||
# raise err
|
||||
# for doc in body["items"]:
|
||||
# cur.execute(
|
||||
# "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
|
||||
# (doc["identifier"], doc["review_date"]),
|
||||
# )
|
||||
# conn.commit()
|
||||
# cursor = body.get("cursor", None)
|
||||
# if cursor is None:
|
||||
# break
|
||||
# params = params.copy()
|
||||
# params["cursor"] = cursor
|
||||
resp = requests.get(
|
||||
f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=250000&output=json",
|
||||
)
|
||||
EDGE_THRESHOLD = 8
|
||||
return np.median(edges_arr[edges_arr > EDGE_THRESHOLD]) / 255
|
||||
resp.raise_for_status()
|
||||
try:
|
||||
body = resp.json()
|
||||
except Exception as err:
|
||||
print("Body:", resp.text, file=stderr)
|
||||
raise err
|
||||
for doc in body["response"]["docs"]:
|
||||
cur.execute(
|
||||
"insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
|
||||
(doc["identifier"], doc["review_date"]),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
159
one_off.py
Normal file
159
one_off.py
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
import json
|
||||
import re
|
||||
from argparse import ArgumentParser
|
||||
from dataclasses import dataclass
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from sys import stderr, stdin, stdout
|
||||
|
||||
from engine import analyze_item
|
||||
|
||||
|
||||
OCR_LANGS = "eng+fra"
|
||||
|
||||
|
||||
def main():
    """Read item IDs from STDIN and either summarize or fully analyze each
    one, writing JSON lines to STDOUT."""
    parser = ArgumentParser()
    parser.add_argument("--summarize", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-w", "--workers", type=int, default=1)
    parser.add_argument("--page-margin-px", type=int, default=50)
    args = parser.parse_args()

    # Both modes share the same task shape; only the worker differs.
    worker_fn = (
        _summarize_item_to_stdout if args.summarize else _analyze_item_to_stdout
    )

    # Process STDIN line by line, where each line contains one or more item
    # IDs separated by commas or whitespace.
    for line in stdin:
        item_ids = [token for token in re.split(r",|\s", line) if token]
        with ThreadPool(args.workers) as pool:
            if args.verbose:
                print(f"Running with {args.workers} workers.", file=stderr)
                stderr.flush()
            pool.map(
                worker_fn,
                [
                    ItemTask(
                        item_id=item_id,
                        page_margin_px=args.page_margin_px,
                        verbose=args.verbose,
                    )
                    for item_id in item_ids
                ],
            )
|
||||
|
||||
|
||||
@dataclass
class ItemTask:
    """One unit of work for the stdout worker functions.

    Attributes:
        item_id         Archive.org item identifier.
        page_margin_px  Minimum acceptable text-to-edge margin, in pixels.
        verbose         Print progress messages to stderr.
    """

    item_id: str
    page_margin_px: int
    verbose: bool
|
||||
|
||||
|
||||
def find_run_starts(flags, run_length):
    """Return the 1-indexed start positions of every maximal run of at
    least run_length consecutive True values in flags."""
    starts = []
    run = 0
    for i, flag in enumerate(flags):
        if flag:
            run += 1
            if run == run_length:
                # The run just reached the threshold; record where it began,
                # converted to a 1-indexed page number. Longer runs are
                # reported only once, at their start.
                starts.append(i - run_length + 2)
        else:
            run = 0
    return starts


def _summarize_item_to_stdout(task):
    """Analyze one item and print a one-line JSON summary to stdout when any
    quality flag (orientation, crop, blank runs, blurry runs) is raised.

    Prints nothing for items that look fine.
    """
    item_id = task.item_id
    page_margin_px = task.page_margin_px
    verbose = task.verbose

    if verbose:
        print(f"Summarizing item {item_id}...", file=stderr)
        stderr.flush()

    analysis = analyze_item(item_id, parallel=4, verbose=verbose)

    # 3 or more blank pages in a row is a flag. (The previous shift-and-AND
    # implementation silently missed a run that started on the very first
    # page; find_run_starts handles that case.)
    CONSECUTIVE_BLANKS_THRESHOLD = 3
    consecutive_blanks = find_run_starts(
        [page["blank"] for page in analysis["pages"]],
        CONSECUTIVE_BLANKS_THRESHOLD,
    )

    # 3 or more blurry pages in a row is a flag.
    CONSECUTIVE_BLURRY_THRESHOLD = 3
    SHARPNESS_THRESHOLD = 0.1
    consecutive_blurry = find_run_starts(
        [page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]],
        CONSECUTIVE_BLURRY_THRESHOLD,
    )

    # 1-indexed pages whose best OCR orientation disagrees with the scan.
    check_orientation = [
        i + 1
        for i, page in enumerate(analysis["pages"])
        if not page["ocr_orientation_match"]
    ]

    # 1-indexed pages whose text sits too close to an edge (possible crop).
    check_crop = [
        i + 1
        for i, page in enumerate(analysis["pages"])
        if page["text_margin_px"] < page_margin_px
    ]

    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
        print(
            json.dumps(
                {
                    "item_id": item_id,
                    "check_orientation": check_orientation,
                    "check_crop": check_crop,
                    "consecutive_blanks": consecutive_blanks,
                    "consecutive_blurry": consecutive_blurry,
                }
            )
        )
        stdout.flush()

    if verbose:
        print(f"Done summarizing item {item_id}.", file=stderr)
        stderr.flush()
|
||||
|
||||
|
||||
def _analyze_item_to_stdout(task):
    """Run the full page analysis for one item and print the result as a
    single JSON line on stdout."""
    item_id = task.item_id
    verbose = task.verbose

    if verbose:
        print(f"Analyzing item {item_id}...", file=stderr)
        stderr.flush()

    result = analyze_item(item_id, parallel=6, verbose=verbose)
    print(json.dumps(result))
    stdout.flush()

    if verbose:
        print(f"Done analyzing item {item_id}.", file=stderr)
        stderr.flush()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Reference in a new issue