diff --git a/README.md b/README.md index 9fc65bd..4f8bea2 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,22 @@ a single line so that items are summarized in parallel): pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq ``` +Query a pre-populated database for suspect pages: + +```sql +select 'https://archive.org/details/' || items.id, + pages.page, + pages.orientation_match, + pages.sharpness, + pages.text_margin_px +from items + join pages on pages.item = items.id +where pages.orientation_match = 0 + or pages.sharpness < 0.07 + or (pages.text_margin_px > -1 and pages.text_margin_px < 50) +order by items.id; +``` + ## Test Cases - Blurry pages: `micro_IA40244209_0984` diff --git a/cache.py b/cache.py new file mode 100644 index 0000000..49a2b32 --- /dev/null +++ b/cache.py @@ -0,0 +1,134 @@ +import re +import sqlite3 +import traceback +from argparse import ArgumentParser +from datetime import datetime +from time import sleep + +import requests + +from main import analyze_item + + +def main(): + parser = ArgumentParser() + parser.add_argument("--database", default="./microqa.db") + parser.add_argument("--cpus", type=int, default=2) + parser.add_argument("--earliest-review-date", default="20250701") + args = parser.parse_args() + + with sqlite3.connect(args.database) as conn: + cur = conn.cursor() + cur.execute(""" +create table if not exists items ( + id text primary key not null, + review_date text not null, + analyzed_date text +)""") + cur.execute(""" +create table if not exists pages ( + id int primary key, + item text not null, + page int not null, + orientation_match boolean not null, + sharpness real not null, + is_blank boolean not null, + text_margin_px int not null +)""") + conn.commit() + + while True: + print("Pulling item IDs") + pull_new_item_ids(conn, args.earliest_review_date) + print("Done.") + res = cur.execute( + "select id from items where analyzed_date is null order by review_date" + ) + for (item_id,) in res.fetchall(): + 
N_ATTEMPTS = 3 + for _ in range(N_ATTEMPTS): + try: + print(f"Processing {item_id}") + analysis = analyze_item( + item_id, parallel=args.cpus, verbose=True + ) + for i, page in enumerate(analysis["pages"]): + cur.execute( + """ +insert into pages ( + item, + page, + orientation_match, + sharpness, + is_blank, + text_margin_px +) values ( + ?, + ?, + ?, + ?, + ?, + ? + )""", + [ + item_id, + i + 1, + page["ocr_orientation_match"], + page["sharpness"], + page["blank"], + page["text_margin_px"], + ], + ) + cur.execute( + "update items set analyzed_date = ? where id = ?", + [datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id], + ) + conn.commit() + print("Done") + break + except Exception as err: + print(err) + traceback.print_tb(err.__traceback__) + sleep(15) + break + sleep(3600) + + +def pull_new_item_ids(conn, earliest_review_date): + cur = conn.cursor() + res = cur.execute("select review_date from items order by review_date desc limit 1") + (latest_review_date,) = res.fetchone() or (earliest_review_date,) + print(latest_review_date) + + query = f""" + collection:(microfiche) + AND contributor:(Internet Archive) + AND micro_review:(done) + AND review_date:[{latest_review_date} TO null] + """ + sort = "reviewdate asc" + + # Format for API. + query = re.sub(r"\s+", "+", query.strip()) + sort = re.sub(r"\s+", "+", sort.strip()) + + for i in range(1, 999): + resp = requests.get( + f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=100&page={i}&output=json", + ) + resp.raise_for_status() + body = resp.json() + if len(body["response"]["docs"]) == 0: + break + cur.executemany( + "insert into items (id, review_date) values (?, ?) 
on conflict do nothing", + [ + (doc["identifier"], doc["review_date"]) + for doc in body["response"]["docs"] + ], + ) + conn.commit() + + +if __name__ == "__main__": + main() diff --git a/main.py b/main.py index d2d77ae..85178a5 100644 --- a/main.py +++ b/main.py @@ -16,7 +16,6 @@ from PIL import Image, ImageFilter OCR_LANGS = "eng+fra" -N_OCR_PROCESSES = 4 def main(): @@ -77,9 +76,7 @@ def _summarize_item_to_stdout(task): print(f"Summarizing item {item_id}...", file=stderr) stderr.flush() - analysis = analyze_item( - item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose - ) + analysis = analyze_item(item_id, parallel=4, verbose=verbose) # 3 or more blank pages in a row is a flag. CONSECUTIVE_BLANKS_THRESHOLD = 3 @@ -124,11 +121,10 @@ def _summarize_item_to_stdout(task): if not page["ocr_orientation_match"] ] - WORDS_NEAR_EDGE_THRESHOLD = 2 check_crop = [ i + 1 for i, page in enumerate(analysis["pages"]) - if page["words_near_edge"] > WORDS_NEAR_EDGE_THRESHOLD + if page["text_margin_px"] < page_margin_px ] if check_orientation or check_crop or consecutive_blanks or consecutive_blurry: @@ -152,20 +148,13 @@ def _summarize_item_to_stdout(task): def _analyze_item_to_stdout(task): item_id = task.item_id - page_margin_px = task.page_margin_px verbose = task.verbose if verbose: print(f"Analyzing item {item_id}...", file=stderr) stderr.flush() - print( - json.dumps( - analyze_item( - item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose - ) - ) - ) + print(json.dumps(analyze_item(item_id, parallel=4, verbose=verbose))) stdout.flush() if verbose: @@ -177,14 +166,12 @@ def _analyze_item_to_stdout(task): class PageAnalysisTask: im: Image.Image page_index: int - page_margin_px: int file_name: str def _analyze_page(task): im_original = task.im page_index = task.page_index - page_margin_px = task.page_margin_px file_name = task.file_name im_cropped = im_original.crop( @@ -201,7 +188,7 @@ def _analyze_page(task): if is_blank: max_sharpness = 1 
ocr_orientation_match = True - words_near_edge = 0 + text_margin_px = -1 else: max_sharpness = 0.0 if im_cropped.size[0] < im_cropped.size[1]: @@ -262,19 +249,26 @@ def _analyze_page(task): if best_ocr_orientation % 2 == 0 else (im_original.size[1], im_original.size[0]) ) - words_near_edge = best_ocr_words[ - (best_ocr_words["left"] < page_margin_px) - | (best_ocr_words["top"] < page_margin_px) - | ( - best_ocr_words["left"] + best_ocr_words["width"] - > best_ocr_dims[0] - page_margin_px + + word_margins_all_directions = np.sort( + np.concatenate( + ( + best_ocr_words["left"].to_numpy(), + best_ocr_words["top"].to_numpy(), + best_ocr_dims[0] + - (best_ocr_words["left"] + best_ocr_words["width"]).to_numpy(), + best_ocr_dims[1] + - (best_ocr_words["top"] + best_ocr_words["height"]).to_numpy(), + ) ) - | ( - best_ocr_words["top"] + best_ocr_words["height"] - > best_ocr_dims[1] - page_margin_px - ) - ] - words_near_edge = words_near_edge.shape[0] + ) + # Skip the n closest words to the edge, to help ignore stray OCR artifacts. 
+ SKIP_WORDS = 2 + text_margin_px = ( + int(word_margins_all_directions[SKIP_WORDS]) + if word_margins_all_directions.shape[0] > SKIP_WORDS + else -1 + ) return { "blank": is_blank, @@ -283,11 +277,11 @@ def _analyze_page(task): "page_index": page_index, "size": im_original.size, "sharpness": max_sharpness, - "words_near_edge": words_near_edge, + "text_margin_px": text_margin_px, } -def analyze_item(item_id, page_margin_px, parallel=False, verbose=False): +def analyze_item(item_id, parallel=1, verbose=False): escaped_item_id = urllib.parse.quote(item_id, safe="") if verbose: @@ -326,7 +320,6 @@ def analyze_item(item_id, page_margin_px, parallel=False, verbose=False): PageAnalysisTask( im=im, page_index=page_index, - page_margin_px=page_margin_px, file_name=file_name, ) ) @@ -334,9 +327,9 @@ def analyze_item(item_id, page_margin_px, parallel=False, verbose=False): if verbose: print(f"Processing {len(page_nums)} pages...", file=stderr) stderr.flush() - if parallel: + if parallel > 1: # Parallelize image processing and OCR of pages across up to n cores. - with Pool(N_OCR_PROCESSES) as pool: + with Pool(parallel) as pool: return {"pages": pool.map(_analyze_page, tasks)} return {"pages": [_analyze_page(task) for task in tasks]}