rewrite to engine.py
This commit is contained in:
parent
815934ad23
commit
4d9161b043
6 changed files with 550 additions and 479 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -1,2 +1,4 @@
|
||||||
/target
|
/target
|
||||||
/data
|
/data
|
||||||
|
*.db
|
||||||
|
__pycache__
|
||||||
|
|
|
||||||
|
|
@ -38,3 +38,4 @@ order by items.id;
|
||||||
- Blurry pages: `micro_IA40244209_0984`
|
- Blurry pages: `micro_IA40244209_0984`
|
||||||
- Contrast, page orientation: `micro_IA40244211_2290`
|
- Contrast, page orientation: `micro_IA40244211_2290`
|
||||||
- Crop, low quality fiche: `micro_IA40386420_0689`
|
- Crop, low quality fiche: `micro_IA40386420_0689`
|
||||||
|
- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages: `micro_IA40386007_0012`
|
||||||
|
|
|
||||||
134
cache.py
134
cache.py
|
|
@ -1,134 +0,0 @@
|
||||||
import re
|
|
||||||
import sqlite3
|
|
||||||
import traceback
|
|
||||||
from argparse import ArgumentParser
|
|
||||||
from datetime import datetime
|
|
||||||
from time import sleep
|
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
from main import analyze_item
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = ArgumentParser()
|
|
||||||
parser.add_argument("--database", default="./microqa.db")
|
|
||||||
parser.add_argument("--cpus", type=int, default=2)
|
|
||||||
parser.add_argument("--earliest-review-date", default="20250701")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
with sqlite3.connect(args.database) as conn:
|
|
||||||
cur = conn.cursor()
|
|
||||||
cur.execute("""
|
|
||||||
create table if not exists items (
|
|
||||||
id text primary key not null,
|
|
||||||
review_date text not null,
|
|
||||||
analyzed_date text
|
|
||||||
)""")
|
|
||||||
cur.execute("""
|
|
||||||
create table if not exists pages (
|
|
||||||
id int primary key,
|
|
||||||
item text not null,
|
|
||||||
page int not null,
|
|
||||||
orientation_match boolean not null,
|
|
||||||
sharpness real not null,
|
|
||||||
is_blank boolean not null,
|
|
||||||
text_margin_px int not null
|
|
||||||
)""")
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
while True:
|
|
||||||
print("Pulling item IDs")
|
|
||||||
pull_new_item_ids(conn, args.earliest_review_date)
|
|
||||||
print("Done.")
|
|
||||||
res = cur.execute(
|
|
||||||
"select id from items where analyzed_date is null order by review_date"
|
|
||||||
)
|
|
||||||
for (item_id,) in res.fetchall():
|
|
||||||
N_ATTEMPTS = 3
|
|
||||||
for _ in range(N_ATTEMPTS):
|
|
||||||
try:
|
|
||||||
print(f"Processing {item_id}")
|
|
||||||
analysis = analyze_item(
|
|
||||||
item_id, parallel=args.cpus, verbose=True
|
|
||||||
)
|
|
||||||
for i, page in enumerate(analysis["pages"]):
|
|
||||||
cur.execute(
|
|
||||||
"""
|
|
||||||
insert into pages (
|
|
||||||
item,
|
|
||||||
page,
|
|
||||||
orientation_match,
|
|
||||||
sharpness,
|
|
||||||
is_blank,
|
|
||||||
text_margin_px
|
|
||||||
) values (
|
|
||||||
?,
|
|
||||||
?,
|
|
||||||
?,
|
|
||||||
?,
|
|
||||||
?,
|
|
||||||
?
|
|
||||||
)""",
|
|
||||||
[
|
|
||||||
item_id,
|
|
||||||
i + 1,
|
|
||||||
page["ocr_orientation_match"],
|
|
||||||
page["sharpness"],
|
|
||||||
page["blank"],
|
|
||||||
page["text_margin_px"],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
cur.execute(
|
|
||||||
"update items set analyzed_date = ? where id = ?",
|
|
||||||
[datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id],
|
|
||||||
)
|
|
||||||
conn.commit()
|
|
||||||
print("Done")
|
|
||||||
break
|
|
||||||
except Exception as err:
|
|
||||||
print(err)
|
|
||||||
traceback.print_tb(err.__traceback__)
|
|
||||||
sleep(15)
|
|
||||||
break
|
|
||||||
sleep(3600)
|
|
||||||
|
|
||||||
|
|
||||||
def pull_new_item_ids(conn, earliest_review_date):
|
|
||||||
cur = conn.cursor()
|
|
||||||
res = cur.execute("select review_date from items order by review_date desc limit 1")
|
|
||||||
(latest_review_date,) = res.fetchone() or (earliest_review_date,)
|
|
||||||
print(latest_review_date)
|
|
||||||
|
|
||||||
query = f"""
|
|
||||||
collection:(microfiche)
|
|
||||||
AND contributor:(Internet Archive)
|
|
||||||
AND micro_review:(done)
|
|
||||||
AND review_date:[{latest_review_date} TO null]
|
|
||||||
"""
|
|
||||||
sort = "reviewdate asc"
|
|
||||||
|
|
||||||
# Format for API.
|
|
||||||
query = re.sub(r"\s+", "+", query.strip())
|
|
||||||
sort = re.sub(r"\s+", "+", sort.strip())
|
|
||||||
|
|
||||||
for i in range(1, 999):
|
|
||||||
resp = requests.get(
|
|
||||||
f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=100&page={i}&output=json",
|
|
||||||
)
|
|
||||||
resp.raise_for_status()
|
|
||||||
body = resp.json()
|
|
||||||
if len(body["response"]["docs"]) == 0:
|
|
||||||
break
|
|
||||||
cur.executemany(
|
|
||||||
"insert into items (id, review_date) values (?, ?) on conflict do nothing",
|
|
||||||
[
|
|
||||||
(doc["identifier"], doc["review_date"])
|
|
||||||
for doc in body["response"]["docs"]
|
|
||||||
],
|
|
||||||
)
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
239
engine.py
Normal file
239
engine.py
Normal file
|
|
@ -0,0 +1,239 @@
|
||||||
|
import urllib.parse
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from io import BytesIO
|
||||||
|
from multiprocessing import Pool
|
||||||
|
from sys import stderr
|
||||||
|
from zipfile import ZipFile
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytesseract
|
||||||
|
import requests
|
||||||
|
from PIL import Image, ImageFilter
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_item(item_id, ocr_langs="eng+fra", parallel=1, verbose=False):
|
||||||
|
escaped_item_id = urllib.parse.quote(item_id, safe="")
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Downloading...", file=stderr)
|
||||||
|
stderr.flush()
|
||||||
|
page_nums_resp = requests.get(
|
||||||
|
f"https://archive.org/metadata/{escaped_item_id}/page_numbers/pages"
|
||||||
|
)
|
||||||
|
page_nums_resp.raise_for_status()
|
||||||
|
page_nums = page_nums_resp.json()["result"]
|
||||||
|
|
||||||
|
zip_resp = requests.get(
|
||||||
|
f"https://archive.org/download/{escaped_item_id}/{escaped_item_id}_jp2.zip"
|
||||||
|
)
|
||||||
|
zip_resp.raise_for_status()
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Decompressing...", file=stderr)
|
||||||
|
stderr.flush()
|
||||||
|
tasks = []
|
||||||
|
with ZipFile(BytesIO(zip_resp.content)) as jp_zip:
|
||||||
|
for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
|
||||||
|
for page_index, page_num_info in enumerate(page_nums):
|
||||||
|
if page_num_info["leafNum"] == leaf_num:
|
||||||
|
# Stop iterating and keep page_index set to the current item.
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
# Set to -1 to indicate that leaf was not found in page_num list.
|
||||||
|
page_index = -1
|
||||||
|
|
||||||
|
if page_index != -1:
|
||||||
|
with jp_zip.open(file_name) as jp_file:
|
||||||
|
im = Image.open(jp_file).convert("L")
|
||||||
|
im.thumbnail((3200, 3200))
|
||||||
|
tasks.append(
|
||||||
|
PageAnalysisTask(
|
||||||
|
im=im,
|
||||||
|
ocr_langs=ocr_langs,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(f"Processing {len(page_nums)} pages...", file=stderr)
|
||||||
|
stderr.flush()
|
||||||
|
|
||||||
|
if parallel > 1:
|
||||||
|
# Parallelize image processing and OCR of pages across up to n cores.
|
||||||
|
with Pool(parallel) as pool:
|
||||||
|
return {"pages": pool.map(analyze_page, tasks)}
|
||||||
|
|
||||||
|
return {"pages": [analyze_page(task) for task in tasks]}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PageAnalysisTask:
|
||||||
|
"""
|
||||||
|
Attributes:
|
||||||
|
im PIL Image, pre-scaled using .thumbnail() to fit the long
|
||||||
|
edge to 3200 px.
|
||||||
|
ocr_langs Tesseract language codes (3 letters each, in a "+"-separated
|
||||||
|
list).
|
||||||
|
"""
|
||||||
|
|
||||||
|
im: Image.Image
|
||||||
|
ocr_langs: str = "eng+fra"
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_page(task):
|
||||||
|
im_cropped = task.im.crop(
|
||||||
|
(
|
||||||
|
task.im.size[0] * 0.1,
|
||||||
|
task.im.size[1] * 0.1,
|
||||||
|
task.im.size[0] * 0.9,
|
||||||
|
task.im.size[1] * 0.9,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
is_blank = im_cropped.getextrema()[0] > 255 * 0.8
|
||||||
|
|
||||||
|
if is_blank:
|
||||||
|
max_sharpness = 1
|
||||||
|
ocr_orientation_match = True
|
||||||
|
text_margin_px = -1
|
||||||
|
else:
|
||||||
|
max_sharpness = 0.0
|
||||||
|
if im_cropped.size[0] < im_cropped.size[1]:
|
||||||
|
# Page is in portrait orientation.
|
||||||
|
segments_x = 2
|
||||||
|
segments_y = 3
|
||||||
|
else:
|
||||||
|
# Page is in landscape orientation.
|
||||||
|
segments_x = 3
|
||||||
|
segments_y = 2
|
||||||
|
for i in range(segments_x):
|
||||||
|
for j in range(segments_y):
|
||||||
|
max_sharpness = max(
|
||||||
|
max_sharpness,
|
||||||
|
analyze_sharpness(
|
||||||
|
im_cropped.crop(
|
||||||
|
(
|
||||||
|
im_cropped.size[0] / segments_x * i,
|
||||||
|
im_cropped.size[1] / segments_y * j,
|
||||||
|
im_cropped.size[0] / segments_x * (i + 1),
|
||||||
|
im_cropped.size[1] / segments_y * (j + 1),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
OCR_SCALE = 1
|
||||||
|
best_ocr_score = -1
|
||||||
|
best_ocr_words = None
|
||||||
|
best_ocr_orientation = -1
|
||||||
|
for orientation in range(4):
|
||||||
|
im_rotated = task.im.resize(
|
||||||
|
np.int_(np.array(task.im.size) * OCR_SCALE)
|
||||||
|
).rotate(90 * orientation, expand=True)
|
||||||
|
ocr = pytesseract.image_to_data(
|
||||||
|
im_rotated,
|
||||||
|
lang=task.ocr_langs,
|
||||||
|
config=f"--oem 1 --dpi {int(300 * OCR_SCALE)} --tessdata-dir ./data/tessdata_fast-4.1.0",
|
||||||
|
output_type=pytesseract.Output.DATAFRAME,
|
||||||
|
).fillna({"text": ""})
|
||||||
|
# Keep only words that Tesseract is confident in, and which are
|
||||||
|
# oriented horizontally.
|
||||||
|
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
|
||||||
|
# Keep only alphabetical words of 4 or more characters.
|
||||||
|
words = words[
|
||||||
|
words.apply(
|
||||||
|
lambda row: re.fullmatch(r"[a-zA-Z]{4,}", str(row["text"]))
|
||||||
|
is not None,
|
||||||
|
axis=1,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
if words.shape[0] > best_ocr_score:
|
||||||
|
best_ocr_score = words.shape[0]
|
||||||
|
best_ocr_orientation = orientation
|
||||||
|
best_ocr_words = words
|
||||||
|
if best_ocr_score > 50:
|
||||||
|
# Unlikely that another orientation will have more words, so
|
||||||
|
# stop eating up CPU.
|
||||||
|
break
|
||||||
|
|
||||||
|
if best_ocr_words.empty:
|
||||||
|
ocr_orientation_match = True
|
||||||
|
text_margin_px = -1
|
||||||
|
else:
|
||||||
|
ocr_orientation_match = best_ocr_orientation == 0
|
||||||
|
|
||||||
|
best_ocr_dims = OCR_SCALE * np.array(
|
||||||
|
task.im.size
|
||||||
|
if best_ocr_orientation % 2 == 0
|
||||||
|
else (task.im.size[1], task.im.size[0])
|
||||||
|
)
|
||||||
|
|
||||||
|
word_margins_all_directions = np.sort(
|
||||||
|
np.int_(
|
||||||
|
np.concat(
|
||||||
|
(
|
||||||
|
best_ocr_words["left"].to_numpy(),
|
||||||
|
best_ocr_words["top"].to_numpy(),
|
||||||
|
best_ocr_dims[0]
|
||||||
|
- (
|
||||||
|
best_ocr_words["left"] + best_ocr_words["width"]
|
||||||
|
).to_numpy(),
|
||||||
|
best_ocr_dims[1]
|
||||||
|
- (
|
||||||
|
best_ocr_words["top"] + best_ocr_words["height"]
|
||||||
|
).to_numpy(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# Transform back into original image pixel density
|
||||||
|
/ OCR_SCALE
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
||||||
|
SKIP_WORDS = 2
|
||||||
|
text_margin_px = int(
|
||||||
|
word_margins_all_directions[SKIP_WORDS]
|
||||||
|
if word_margins_all_directions.shape[0] > SKIP_WORDS
|
||||||
|
else -1
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"blank": is_blank,
|
||||||
|
"ocr_orientation_match": ocr_orientation_match,
|
||||||
|
"size_analyzed": task.im.size,
|
||||||
|
"sharpness": max_sharpness,
|
||||||
|
"text_margin_px": text_margin_px,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_sharpness(im):
|
||||||
|
"""
|
||||||
|
Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
|
||||||
|
1. The scale is not linear with respect to scan quality: anything above 0.1
|
||||||
|
is usually fine.
|
||||||
|
"""
|
||||||
|
arr = np.asarray(im)
|
||||||
|
|
||||||
|
# Normalize contrast based on brightest and darkest pixels. For example,
|
||||||
|
# NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80% fall
|
||||||
|
# between 10% brightness and 90% brightness. In practice, a value around
|
||||||
|
# 0.02 seems to work fairly well.
|
||||||
|
NORM_QUANTILE = 0.03
|
||||||
|
pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
|
||||||
|
arr, NORM_QUANTILE
|
||||||
|
)
|
||||||
|
if pixel_range == 0:
|
||||||
|
arr_normalized = arr
|
||||||
|
else:
|
||||||
|
arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
|
||||||
|
arr_normalized = (
|
||||||
|
arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE
|
||||||
|
)
|
||||||
|
arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255)
|
||||||
|
|
||||||
|
# "Sharpness" is determined by measuring the median intensity of pixels
|
||||||
|
# near edges, after an edge detection filter has been applied to the image.
|
||||||
|
edges_arr = np.asarray(
|
||||||
|
Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
|
||||||
|
)
|
||||||
|
EDGE_THRESHOLD = 8
|
||||||
|
return np.median(edges_arr[edges_arr > EDGE_THRESHOLD]) / 255
|
||||||
494
main.py
494
main.py
|
|
@ -1,371 +1,175 @@
|
||||||
import json
|
|
||||||
import re
|
import re
|
||||||
import urllib.parse
|
import sqlite3
|
||||||
|
import traceback
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
from dataclasses import dataclass
|
from datetime import datetime
|
||||||
from io import BytesIO
|
from sys import stderr
|
||||||
from multiprocessing import Pool
|
from time import sleep
|
||||||
from multiprocessing.pool import ThreadPool
|
|
||||||
from sys import stderr, stdin, stdout
|
|
||||||
from zipfile import ZipFile
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pytesseract
|
|
||||||
import requests
|
import requests
|
||||||
from PIL import Image, ImageFilter
|
|
||||||
|
|
||||||
|
from engine import analyze_item
|
||||||
OCR_LANGS = "eng+fra"
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = ArgumentParser()
|
parser = ArgumentParser()
|
||||||
parser.add_argument("--summarize", action="store_true")
|
parser.add_argument("--database", default="./microqa.db")
|
||||||
parser.add_argument("-v", "--verbose", action="store_true")
|
parser.add_argument("--cpus", type=int, default=2)
|
||||||
parser.add_argument("-w", "--workers", type=int, default=1)
|
parser.add_argument("--earliest-review-date", default="20250701")
|
||||||
parser.add_argument("--page-margin-px", type=int, default=50)
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Process STDIN line by line, where each line contains one or more item IDs
|
with sqlite3.connect(args.database) as conn:
|
||||||
# separated by whitespace.
|
cur = conn.cursor()
|
||||||
for line in stdin:
|
cur.execute("""
|
||||||
item_ids = [value for value in re.split(r",|\s", line) if value]
|
create table if not exists items (
|
||||||
with ThreadPool(args.workers) as pool:
|
id text primary key not null,
|
||||||
if args.verbose:
|
review_date text not null,
|
||||||
print(f"Running with {args.workers} workers.", file=stderr)
|
skip_analysis bool not null,
|
||||||
stderr.flush()
|
analyzed_date text
|
||||||
if args.summarize:
|
)""")
|
||||||
pool.map(
|
cur.execute("""
|
||||||
_summarize_item_to_stdout,
|
create table if not exists pages (
|
||||||
[
|
id int primary key,
|
||||||
ItemTask(
|
item text not null,
|
||||||
item_id=item_id,
|
page int not null,
|
||||||
page_margin_px=args.page_margin_px,
|
orientation_match boolean not null,
|
||||||
verbose=args.verbose,
|
sharpness real not null,
|
||||||
|
is_blank boolean not null,
|
||||||
|
text_margin_px int not null
|
||||||
|
)""")
|
||||||
|
cur.execute("create index if not exists review_date_idx on items (review_date)")
|
||||||
|
cur.execute(
|
||||||
|
"create index if not exists analyzed_date_idx on items (analyzed_date)"
|
||||||
)
|
)
|
||||||
for item_id in item_ids
|
cur.execute("create index if not exists item_idx on pages (item)")
|
||||||
|
cur.execute(
|
||||||
|
"create unique index if not exists item_page_idx on pages (item, page)"
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
while True:
|
||||||
|
print("Pulling item IDs")
|
||||||
|
pull_new_item_ids(conn, args.earliest_review_date)
|
||||||
|
print("Done.")
|
||||||
|
res = cur.execute("""
|
||||||
|
select id
|
||||||
|
from items
|
||||||
|
where analyzed_date is null
|
||||||
|
and skip_analysis = false
|
||||||
|
order by review_date
|
||||||
|
""")
|
||||||
|
for (item_id,) in res.fetchall():
|
||||||
|
N_ATTEMPTS = 3
|
||||||
|
for _ in range(N_ATTEMPTS):
|
||||||
|
try:
|
||||||
|
print(f"Processing {item_id}")
|
||||||
|
analysis = analyze_item(
|
||||||
|
item_id, parallel=args.cpus, verbose=True
|
||||||
|
)
|
||||||
|
for i, page in enumerate(analysis["pages"]):
|
||||||
|
cur.execute(
|
||||||
|
"""
|
||||||
|
insert into pages (
|
||||||
|
item,
|
||||||
|
page,
|
||||||
|
orientation_match,
|
||||||
|
sharpness,
|
||||||
|
is_blank,
|
||||||
|
text_margin_px
|
||||||
|
) values (
|
||||||
|
?,
|
||||||
|
?,
|
||||||
|
?,
|
||||||
|
?,
|
||||||
|
?,
|
||||||
|
?
|
||||||
|
)""",
|
||||||
|
[
|
||||||
|
item_id,
|
||||||
|
i + 1,
|
||||||
|
page["ocr_orientation_match"],
|
||||||
|
page["sharpness"],
|
||||||
|
page["blank"],
|
||||||
|
page["text_margin_px"],
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
else:
|
cur.execute(
|
||||||
pool.map(
|
"update items set analyzed_date = ? where id = ?",
|
||||||
_analyze_item_to_stdout,
|
[datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id],
|
||||||
[
|
|
||||||
ItemTask(
|
|
||||||
item_id=item_id,
|
|
||||||
page_margin_px=args.page_margin_px,
|
|
||||||
verbose=args.verbose,
|
|
||||||
)
|
)
|
||||||
for item_id in item_ids
|
conn.commit()
|
||||||
],
|
print("Done")
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class ItemTask:
|
|
||||||
item_id: str
|
|
||||||
page_margin_px: int
|
|
||||||
verbose: bool
|
|
||||||
|
|
||||||
|
|
||||||
def _summarize_item_to_stdout(task):
|
|
||||||
item_id = task.item_id
|
|
||||||
page_margin_px = task.page_margin_px
|
|
||||||
verbose = task.verbose
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Summarizing item {item_id}...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
|
|
||||||
analysis = analyze_item(item_id, parallel=4, verbose=verbose)
|
|
||||||
|
|
||||||
# 3 or more blank pages in a row is a flag.
|
|
||||||
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
|
||||||
if len(analysis["pages"]) >= CONSECUTIVE_BLANKS_THRESHOLD:
|
|
||||||
consecutive_blanks = [page["blank"] for page in analysis["pages"]]
|
|
||||||
for _ in range(1, CONSECUTIVE_BLANKS_THRESHOLD):
|
|
||||||
consecutive_blanks = [
|
|
||||||
value and consecutive_blanks[i]
|
|
||||||
for i, value in enumerate(consecutive_blanks[1:])
|
|
||||||
]
|
|
||||||
consecutive_blanks = [
|
|
||||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
|
||||||
for i, value in enumerate(consecutive_blanks[1:])
|
|
||||||
if value and not consecutive_blanks[i]
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
consecutive_blanks = []
|
|
||||||
|
|
||||||
# 3 or more blank pages in a row is a flag.
|
|
||||||
CONSECUTIVE_BLURRY_THRESHOLD = 3
|
|
||||||
SHARPNESS_THRESHOLD = 0.1
|
|
||||||
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
|
|
||||||
consecutive_blurry = [
|
|
||||||
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
|
|
||||||
]
|
|
||||||
for _ in range(1, CONSECUTIVE_BLURRY_THRESHOLD):
|
|
||||||
consecutive_blurry = [
|
|
||||||
value and consecutive_blurry[i]
|
|
||||||
for i, value in enumerate(consecutive_blurry[1:])
|
|
||||||
]
|
|
||||||
consecutive_blurry = [
|
|
||||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
|
||||||
for i, value in enumerate(consecutive_blurry[1:])
|
|
||||||
if value and not consecutive_blurry[i]
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
consecutive_blurry = []
|
|
||||||
|
|
||||||
check_orientation = [
|
|
||||||
i + 1
|
|
||||||
for i, page in enumerate(analysis["pages"])
|
|
||||||
if not page["ocr_orientation_match"]
|
|
||||||
]
|
|
||||||
|
|
||||||
check_crop = [
|
|
||||||
i + 1
|
|
||||||
for i, page in enumerate(analysis["pages"])
|
|
||||||
if page["text_margin_px"] < page_margin_px
|
|
||||||
]
|
|
||||||
|
|
||||||
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
|
||||||
print(
|
|
||||||
json.dumps(
|
|
||||||
{
|
|
||||||
"item_id": item_id,
|
|
||||||
"check_orientation": check_orientation,
|
|
||||||
"check_crop": check_crop,
|
|
||||||
"consecutive_blanks": consecutive_blanks,
|
|
||||||
"consecutive_blurry": consecutive_blurry,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
)
|
|
||||||
stdout.flush()
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Done summarizing item {item_id}.", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
|
|
||||||
|
|
||||||
def _analyze_item_to_stdout(task):
|
|
||||||
item_id = task.item_id
|
|
||||||
verbose = task.verbose
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Analyzing item {item_id}...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
|
|
||||||
print(json.dumps(analyze_item(item_id, parallel=4, verbose=verbose)))
|
|
||||||
stdout.flush()
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Done analyzing item {item_id}.", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class PageAnalysisTask:
|
|
||||||
im: Image.Image
|
|
||||||
page_index: int
|
|
||||||
file_name: str
|
|
||||||
|
|
||||||
|
|
||||||
def _analyze_page(task):
|
|
||||||
im_original = task.im
|
|
||||||
page_index = task.page_index
|
|
||||||
file_name = task.file_name
|
|
||||||
|
|
||||||
im_cropped = im_original.crop(
|
|
||||||
(
|
|
||||||
im_original.size[0] * 0.1,
|
|
||||||
im_original.size[1] * 0.1,
|
|
||||||
im_original.size[0] * 0.9,
|
|
||||||
im_original.size[1] * 0.9,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
is_blank = im_cropped.getextrema()[0] > 255 * 0.8
|
|
||||||
|
|
||||||
if is_blank:
|
|
||||||
max_sharpness = 1
|
|
||||||
ocr_orientation_match = True
|
|
||||||
text_margin_px = -1
|
|
||||||
else:
|
|
||||||
max_sharpness = 0.0
|
|
||||||
if im_cropped.size[0] < im_cropped.size[1]:
|
|
||||||
# Page is in portrait orientation.
|
|
||||||
segments_x = 2
|
|
||||||
segments_y = 3
|
|
||||||
else:
|
|
||||||
# Page is in landscape orientation.
|
|
||||||
segments_x = 3
|
|
||||||
segments_y = 2
|
|
||||||
for i in range(segments_x):
|
|
||||||
for j in range(segments_y):
|
|
||||||
max_sharpness = max(
|
|
||||||
max_sharpness,
|
|
||||||
analyze_sharpness(
|
|
||||||
im_cropped.crop(
|
|
||||||
(
|
|
||||||
im_cropped.size[0] / segments_x * i,
|
|
||||||
im_cropped.size[1] / segments_y * j,
|
|
||||||
im_cropped.size[0] / segments_x * (i + 1),
|
|
||||||
im_cropped.size[1] / segments_y * (j + 1),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
best_ocr_score = -1
|
|
||||||
best_ocr_words = None
|
|
||||||
best_ocr_orientation = -1
|
|
||||||
for orientation in range(4):
|
|
||||||
im_rotated = im_original.rotate(90 * orientation, expand=True)
|
|
||||||
ocr = pytesseract.image_to_data(
|
|
||||||
im_rotated,
|
|
||||||
lang=OCR_LANGS,
|
|
||||||
config="--oem 1 --dpi 300 --tessdata-dir ./data/tessdata_fast-4.1.0",
|
|
||||||
output_type=pytesseract.Output.DATAFRAME,
|
|
||||||
).fillna({"text": ""})
|
|
||||||
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
|
|
||||||
words = words[
|
|
||||||
words.apply(
|
|
||||||
lambda row: re.fullmatch(r"[a-zA-Z]{4,}", row["text"]) is not None,
|
|
||||||
axis=1,
|
|
||||||
)
|
|
||||||
]
|
|
||||||
if words.shape[0] > best_ocr_score:
|
|
||||||
best_ocr_score = words.shape[0]
|
|
||||||
best_ocr_orientation = orientation
|
|
||||||
best_ocr_words = words
|
|
||||||
if best_ocr_score > 50:
|
|
||||||
# Unlikely that another orientation will have more words, so
|
|
||||||
# stop eating up CPU unnecessarily.
|
|
||||||
break
|
break
|
||||||
|
except Exception as err:
|
||||||
ocr_orientation_match = best_ocr_orientation == 0
|
print(err, file=stderr)
|
||||||
|
traceback.print_tb(err.__traceback__, file=stderr)
|
||||||
best_ocr_dims = (
|
sleep(15)
|
||||||
im_original.size
|
|
||||||
if best_ocr_orientation % 2 == 0
|
|
||||||
else (im_original.size[1], im_original.size[0])
|
|
||||||
)
|
|
||||||
|
|
||||||
word_margins_all_directions = np.sort(
|
|
||||||
np.concat(
|
|
||||||
(
|
|
||||||
best_ocr_words["left"].to_numpy(),
|
|
||||||
best_ocr_words["top"].to_numpy(),
|
|
||||||
best_ocr_dims[0]
|
|
||||||
- (best_ocr_words["left"] + best_ocr_words["width"]).to_numpy(),
|
|
||||||
best_ocr_dims[1]
|
|
||||||
- (best_ocr_words["top"] + best_ocr_words["height"]).to_numpy(),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
|
||||||
SKIP_WORDS = 2
|
|
||||||
text_margin_px = (
|
|
||||||
int(word_margins_all_directions[SKIP_WORDS])
|
|
||||||
if word_margins_all_directions.shape[0] > SKIP_WORDS
|
|
||||||
else -1
|
|
||||||
)
|
|
||||||
|
|
||||||
return {
|
|
||||||
"blank": is_blank,
|
|
||||||
"file_name": file_name,
|
|
||||||
"ocr_orientation_match": ocr_orientation_match,
|
|
||||||
"page_index": page_index,
|
|
||||||
"size": im_original.size,
|
|
||||||
"sharpness": max_sharpness,
|
|
||||||
"text_margin_px": text_margin_px,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def analyze_item(item_id, parallel=1, verbose=False):
|
|
||||||
escaped_item_id = urllib.parse.quote(item_id, safe="")
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print("Downloading...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
page_nums_resp = requests.get(
|
|
||||||
f"https://archive.org/metadata/{escaped_item_id}/page_numbers/pages"
|
|
||||||
)
|
|
||||||
page_nums_resp.raise_for_status()
|
|
||||||
page_nums = page_nums_resp.json()["result"]
|
|
||||||
|
|
||||||
zip_resp = requests.get(
|
|
||||||
f"https://archive.org/download/{escaped_item_id}/{escaped_item_id}_jp2.zip"
|
|
||||||
)
|
|
||||||
zip_resp.raise_for_status()
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print("Decompressing...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
tasks = []
|
|
||||||
with ZipFile(BytesIO(zip_resp.content)) as jp_zip:
|
|
||||||
for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
|
|
||||||
for page_index, page_num_info in enumerate(page_nums):
|
|
||||||
if page_num_info["leafNum"] == leaf_num:
|
|
||||||
# Stop iterating and keep page_index set to the current item.
|
|
||||||
break
|
break
|
||||||
else:
|
sleep(3600)
|
||||||
# Set to -1 to indicate that leaf was not found in page_num list.
|
|
||||||
page_index = -1
|
|
||||||
|
|
||||||
if page_index != -1:
|
|
||||||
with jp_zip.open(file_name) as jp_file:
|
|
||||||
im = Image.open(jp_file).convert("L")
|
|
||||||
im.thumbnail((3200, 3200))
|
|
||||||
tasks.append(
|
|
||||||
PageAnalysisTask(
|
|
||||||
im=im,
|
|
||||||
page_index=page_index,
|
|
||||||
file_name=file_name,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Processing {len(page_nums)} pages...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
if parallel > 1:
|
|
||||||
# Parallelize image processing and OCR of pages across up to n cores.
|
|
||||||
with Pool(parallel) as pool:
|
|
||||||
return {"pages": pool.map(_analyze_page, tasks)}
|
|
||||||
return {"pages": [_analyze_page(task) for task in tasks]}
|
|
||||||
|
|
||||||
|
|
||||||
def analyze_sharpness(im):
|
def pull_new_item_ids(conn, earliest_review_date):
|
||||||
|
cur = conn.cursor()
|
||||||
|
res = cur.execute("select review_date from items order by review_date desc limit 1")
|
||||||
|
(latest_review_date,) = res.fetchone() or (earliest_review_date,)
|
||||||
|
print(latest_review_date)
|
||||||
|
|
||||||
|
query = f"""
|
||||||
|
collection:(microfiche)
|
||||||
|
AND contributor:(Internet Archive)
|
||||||
|
AND micro_review:(done)
|
||||||
|
AND review_date:[{latest_review_date} TO null]
|
||||||
"""
|
"""
|
||||||
Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
|
sort = "reviewdate asc"
|
||||||
1. The scale is not linear with respect to scan quality: anything above 0.1
|
|
||||||
is usually fine.
|
|
||||||
"""
|
|
||||||
arr = np.asarray(im)
|
|
||||||
|
|
||||||
# Normalize contrast based on brightest and darkest pixels. For example,
|
# Format for API.
|
||||||
# NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80% fall
|
query = re.sub(r"\s+", "+", query.strip())
|
||||||
# between 10% brightness and 90% brightness. In practice, a value around
|
sort = re.sub(r"\s+", "+", sort.strip())
|
||||||
# 0.02 seems to work fairly well.
|
|
||||||
NORM_QUANTILE = 0.03
|
|
||||||
pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
|
|
||||||
arr, NORM_QUANTILE
|
|
||||||
)
|
|
||||||
if pixel_range == 0:
|
|
||||||
arr_normalized = arr
|
|
||||||
else:
|
|
||||||
arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
|
|
||||||
arr_normalized = (
|
|
||||||
arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE
|
|
||||||
)
|
|
||||||
arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255)
|
|
||||||
|
|
||||||
# "Sharpness" is determined by measuring the median intensity of pixels
|
# params = {
|
||||||
# near edges, after an edge detection filter has been applied to the image.
|
# "q": query,
|
||||||
edges_arr = np.asarray(
|
# "count": 100,
|
||||||
Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
|
# "fields": "identifier,review_date",
|
||||||
|
# "sorts": sort,
|
||||||
|
# }
|
||||||
|
# for i in range(1, 999):
|
||||||
|
# resp = requests.get(
|
||||||
|
# "https://archive.org/services/search/v1/scrape",
|
||||||
|
# params=params,
|
||||||
|
# )
|
||||||
|
# resp.raise_for_status()
|
||||||
|
# print(resp.text)
|
||||||
|
# try:
|
||||||
|
# body = resp.json()
|
||||||
|
# except Exception as err:
|
||||||
|
# print("Body:", resp.text, file=stderr)
|
||||||
|
# raise err
|
||||||
|
# for doc in body["items"]:
|
||||||
|
# cur.execute(
|
||||||
|
# "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
|
||||||
|
# (doc["identifier"], doc["review_date"]),
|
||||||
|
# )
|
||||||
|
# conn.commit()
|
||||||
|
# cursor = body.get("cursor", None)
|
||||||
|
# if cursor is None:
|
||||||
|
# break
|
||||||
|
# params = params.copy()
|
||||||
|
# params["cursor"] = cursor
|
||||||
|
resp = requests.get(
|
||||||
|
f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=250000&output=json",
|
||||||
)
|
)
|
||||||
EDGE_THRESHOLD = 8
|
resp.raise_for_status()
|
||||||
return np.median(edges_arr[edges_arr > EDGE_THRESHOLD]) / 255
|
try:
|
||||||
|
body = resp.json()
|
||||||
|
except Exception as err:
|
||||||
|
print("Body:", resp.text, file=stderr)
|
||||||
|
raise err
|
||||||
|
for doc in body["response"]["docs"]:
|
||||||
|
cur.execute(
|
||||||
|
"insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
|
||||||
|
(doc["identifier"], doc["review_date"]),
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
159
one_off.py
Normal file
159
one_off.py
Normal file
|
|
@ -0,0 +1,159 @@
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from multiprocessing.pool import ThreadPool
|
||||||
|
from sys import stderr, stdin, stdout
|
||||||
|
|
||||||
|
from engine import analyze_item
|
||||||
|
|
||||||
|
|
||||||
|
# OCR language selection — presumably a Tesseract `lang` string (English +
# French). Not referenced elsewhere in this module's visible code;
# TODO(review): confirm it is consumed by engine.py.
OCR_LANGS = "eng+fra"
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Read item IDs from STDIN and analyze (or summarize) each one.

    Each input line may contain one or more item IDs separated by commas
    and/or whitespace. The IDs on a line are dispatched to a thread pool
    of ``--workers`` threads; results are written to STDOUT as JSON lines
    by the worker functions.
    """
    parser = ArgumentParser()
    parser.add_argument("--summarize", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-w", "--workers", type=int, default=1)
    parser.add_argument("--page-margin-px", type=int, default=50)
    args = parser.parse_args()

    # The two modes differ only in the per-item worker function, so pick
    # it once here rather than duplicating the whole pool.map call.
    worker = _summarize_item_to_stdout if args.summarize else _analyze_item_to_stdout

    # Process STDIN line by line, where each line contains one or more item IDs
    # separated by whitespace.
    for line in stdin:
        item_ids = [value for value in re.split(r",|\s", line) if value]
        with ThreadPool(args.workers) as pool:
            if args.verbose:
                print(f"Running with {args.workers} workers.", file=stderr)
                stderr.flush()
            pool.map(
                worker,
                [
                    ItemTask(
                        item_id=item_id,
                        page_margin_px=args.page_margin_px,
                        verbose=args.verbose,
                    )
                    for item_id in item_ids
                ],
            )
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ItemTask:
    """Bundle of per-item arguments for one worker invocation.

    ``ThreadPool.map`` passes a single argument to each worker, so the
    CLI options relevant to one item are packed into this record.
    """

    # Internet Archive identifier of the item to process.
    item_id: str
    # Minimum acceptable text margin in pixels (used by summarize mode's
    # crop check).
    page_margin_px: int
    # When true, workers report progress on stderr.
    verbose: bool
|
||||||
|
|
||||||
|
|
||||||
|
def _summarize_item_to_stdout(task):
    """Analyze one item and print a one-line JSON summary when it needs review.

    Nothing is printed for items that pass every check. The summary flags:
      * ``consecutive_blanks`` — 1-indexed pages starting a run of >= 3
        blank pages,
      * ``consecutive_blurry`` — 1-indexed pages starting a run of >= 3
        pages whose sharpness is below ``SHARPNESS_THRESHOLD``,
      * ``check_orientation`` — pages whose OCR orientation did not match,
      * ``check_crop`` — pages whose text margin is below
        ``task.page_margin_px``.
    """
    item_id = task.item_id
    page_margin_px = task.page_margin_px
    verbose = task.verbose

    if verbose:
        print(f"Summarizing item {item_id}...", file=stderr)
        stderr.flush()

    analysis = analyze_item(item_id, parallel=4, verbose=verbose)

    # 3 or more blank pages in a row is a flag.
    CONSECUTIVE_BLANKS_THRESHOLD = 3
    consecutive_blanks = _run_start_pages(
        [page["blank"] for page in analysis["pages"]],
        CONSECUTIVE_BLANKS_THRESHOLD,
    )

    # 3 or more blurry pages in a row is a flag.
    CONSECUTIVE_BLURRY_THRESHOLD = 3
    SHARPNESS_THRESHOLD = 0.1
    consecutive_blurry = _run_start_pages(
        [page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]],
        CONSECUTIVE_BLURRY_THRESHOLD,
    )

    check_orientation = [
        i + 1
        for i, page in enumerate(analysis["pages"])
        if not page["ocr_orientation_match"]
    ]

    check_crop = [
        i + 1
        for i, page in enumerate(analysis["pages"])
        if page["text_margin_px"] < page_margin_px
    ]

    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
        print(
            json.dumps(
                {
                    "item_id": item_id,
                    "check_orientation": check_orientation,
                    "check_crop": check_crop,
                    "consecutive_blanks": consecutive_blanks,
                    "consecutive_blurry": consecutive_blurry,
                }
            )
        )
        stdout.flush()

    if verbose:
        print(f"Done summarizing item {item_id}.", file=stderr)
        stderr.flush()


def _run_start_pages(flags, threshold):
    """Return 1-indexed page numbers that begin a run of >= threshold flagged pages.

    Fixes an off-by-one in the original reduction: the start-of-run scan
    iterated ``enumerate(x[1:])`` and therefore could never report a run
    beginning on the very first page.
    """
    if len(flags) < threshold:
        return []
    # After threshold-1 passes, runs[i] is True exactly when pages
    # i..i+threshold-1 are all flagged.
    runs = list(flags)
    for _ in range(1, threshold):
        runs = [runs[i] and runs[i + 1] for i in range(len(runs) - 1)]
    # A run starts where runs flips False -> True, or at index 0.
    return [
        i + 1  # convert 0-based index to 1-based page number
        for i, value in enumerate(runs)
        if value and (i == 0 or not runs[i - 1])
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def _analyze_item_to_stdout(task):
    """Run the full analysis for one item and emit the result as a JSON line."""
    if task.verbose:
        print(f"Analyzing item {task.item_id}...", file=stderr)
        stderr.flush()

    report = analyze_item(task.item_id, parallel=6, verbose=task.verbose)
    print(json.dumps(report))
    stdout.flush()

    if task.verbose:
        print(f"Done analyzing item {task.item_id}.", file=stderr)
        stderr.flush()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: read item IDs from STDIN and emit JSON to STDOUT.
if __name__ == "__main__":
    main()
|
||||||
Loading…
Add table
Reference in a new issue