store results to sqlite

2025-08-18 20:31:55 -07:00 · 2025-08-18 20:31:55 -07:00 · 815934ad23
commit 815934ad23
parent d33a7dc515
3 changed files with 177 additions and 34 deletions
--- a/README.md
+++ b/README.md
@ -17,6 +17,22 @@ a single line so that items are summarized in parallel):
 pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
 ```

+Query a pre-populated database for suspect pages:
+
+```sql
+select   'https://archive.org/details/' || items.id,
+         pages.page,
+         pages.orientation_match,
+         pages.sharpness,
+         pages.text_margin_px
+from     items
+         join pages on pages.item = items.id
+where    pages.orientation_match = 0
+         or pages.sharpness < 0.07
+         or (pages.text_margin_px > -1 and pages.text_margin_px < 50)
+order by items.id;
+```
+
 ## Test Cases

 - Blurry pages: `micro_IA40244209_0984`
--- a/cache.py
+++ b/cache.py
@ -0,0 +1,134 @@
+import re
+import sqlite3
+import traceback
+from argparse import ArgumentParser
+from datetime import datetime
+from time import sleep
+
+import requests
+
+from main import analyze_item
+
+
+def _store_analysis(cur, item_id, analysis):
+    """Queue the rows for one analyzed item on `cur` without committing.
+
+    Inserts one `pages` row per entry of `analysis["pages"]` (page numbers
+    are 1-based, in list order) and stamps the item's `analyzed_date` with
+    the current UTC time formatted as YYYYmmddHHMMSS. The caller owns the
+    transaction and must commit or roll back.
+    """
+    from datetime import timezone
+
+    cur.executemany(
+        """
+insert into pages (
+    item,
+    page,
+    orientation_match,
+    sharpness,
+    is_blank,
+    text_margin_px
+) values (?, ?, ?, ?, ?, ?)""",
+        [
+            (
+                item_id,
+                page_number,
+                page["ocr_orientation_match"],
+                page["sharpness"],
+                page["blank"],
+                page["text_margin_px"],
+            )
+            for page_number, page in enumerate(analysis["pages"], start=1)
+        ],
+    )
+    cur.execute(
+        "update items set analyzed_date = ? where id = ?",
+        # Timezone-aware replacement for the deprecated datetime.utcnow();
+        # the formatted string is the same.
+        [datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"), item_id],
+    )
+
+
+def main():
+    """Analyze every not-yet-analyzed item and cache the results in SQLite.
+
+    Command-line options:
+      --database              path of the SQLite file (default ./microqa.db)
+      --cpus                  value passed to analyze_item as `parallel`
+      --earliest-review-date  review date to start pulling from when the
+                              items table is empty
+
+    Creates the `items` and `pages` tables if they are missing, pulls new
+    item IDs via pull_new_item_ids, then analyzes each item whose
+    `analyzed_date` is null, oldest review date first. Each item is tried up
+    to N_ATTEMPTS times; failures are printed and the item is left
+    unanalyzed.
+    """
+    parser = ArgumentParser()
+    parser.add_argument("--database", default="./microqa.db")
+    parser.add_argument("--cpus", type=int, default=2)
+    parser.add_argument("--earliest-review-date", default="20250701")
+    args = parser.parse_args()
+
+    # Number of times a single item is tried before it is skipped.
+    N_ATTEMPTS = 3
+
+    with sqlite3.connect(args.database) as conn:
+        cur = conn.cursor()
+        cur.execute("""
+create table if not exists items (
+    id text primary key not null,
+    review_date text not null,
+    analyzed_date text
+)""")
+        # `integer primary key` (not `int primary key`) is what makes the
+        # column an alias for SQLite's rowid, so an id is assigned
+        # automatically; the inserts never supply one. A table that already
+        # exists is left untouched by `if not exists`.
+        cur.execute("""
+create table if not exists pages (
+    id integer primary key,
+    item text not null,
+    page int not null,
+    orientation_match boolean not null,
+    sharpness real not null,
+    is_blank boolean not null,
+    text_margin_px int not null
+)""")
+        conn.commit()
+
+        while True:
+            print("Pulling item IDs")
+            pull_new_item_ids(conn, args.earliest_review_date)
+            print("Done.")
+            res = cur.execute(
+                "select id from items where analyzed_date is null order by review_date"
+            )
+            for (item_id,) in res.fetchall():
+                for _ in range(N_ATTEMPTS):
+                    try:
+                        print(f"Processing {item_id}")
+                        analysis = analyze_item(
+                            item_id, parallel=args.cpus, verbose=True
+                        )
+                        _store_analysis(cur, item_id, analysis)
+                        conn.commit()
+                        print("Done")
+                        break
+                    except Exception as err:
+                        # Drop any page rows queued by the failed attempt so a
+                        # retry (or the next item's commit) cannot persist
+                        # partial or duplicate rows.
+                        conn.rollback()
+                        print(err)
+                        traceback.print_tb(err.__traceback__)
+                        sleep(15)
+                else:
+                    print(f"Giving up on {item_id} after {N_ATTEMPTS} attempts")
+            # NOTE(review): this unconditional break ends the loop after one
+            # pass, so the hourly sleep below is never reached. Remove the
+            # break if continuous polling is intended - confirm.
+            break
+            sleep(3600)
+
+
+def pull_new_item_ids(conn, earliest_review_date):
+    """Fetch reviewed item IDs from archive.org and record them in `items`.
+
+    Searches from the newest `review_date` already stored, or from
+    `earliest_review_date` when the table is empty, and inserts each returned
+    (identifier, review_date) pair. IDs already present are skipped by
+    `on conflict do nothing`. Results are requested 100 per page and each
+    page is committed as it arrives; paging stops at the first empty page.
+
+    Raises whatever `requests` raises for a failed, timed-out or non-2xx
+    request.
+    """
+    cur = conn.cursor()
+    res = cur.execute("select review_date from items order by review_date desc limit 1")
+    (latest_review_date,) = res.fetchone() or (earliest_review_date,)
+    print(latest_review_date)
+
+    query = f"""
+        collection:(microfiche)
+        AND contributor:(Internet Archive)
+        AND micro_review:(done)
+        AND review_date:[{latest_review_date} TO null]
+    """
+    # NOTE(review): the sort key is spelled "reviewdate" while the query and
+    # the returned field use "review_date" - confirm this is the intended
+    # search field.
+    sort = "reviewdate asc"
+
+    # Format for API.
+    query = re.sub(r"\s+", "+", query.strip())
+    sort = re.sub(r"\s+", "+", sort.strip())
+
+    for i in range(1, 999):
+        resp = requests.get(
+            f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=100&page={i}&output=json",
+            # requests waits indefinitely by default; bound the wait so a
+            # stalled connection cannot hang the whole run.
+            timeout=60,
+        )
+        resp.raise_for_status()
+        docs = resp.json()["response"]["docs"]
+        if not docs:
+            break
+        cur.executemany(
+            "insert into items (id, review_date) values (?, ?) on conflict do nothing",
+            [(doc["identifier"], doc["review_date"]) for doc in docs],
+        )
+        conn.commit()
+
+
+if __name__ == "__main__":
+    main()
--- a/main.py
+++ b/main.py
@ -16,7 +16,6 @@ from PIL import Image, ImageFilter


 OCR_LANGS = "eng+fra"
-N_OCR_PROCESSES = 4


 def main():
@ -77,9 +76,7 @@ def _summarize_item_to_stdout(task):
        print(f"Summarizing item {item_id}...", file=stderr)
        stderr.flush()

-    analysis = analyze_item(
-        item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose
-    )
+    analysis = analyze_item(item_id, parallel=4, verbose=verbose)

    # 3 or more blank pages in a row is a flag.
    CONSECUTIVE_BLANKS_THRESHOLD = 3
@ -124,11 +121,10 @@ def _summarize_item_to_stdout(task):
        if not page["ocr_orientation_match"]
    ]

-    WORDS_NEAR_EDGE_THRESHOLD = 2
    check_crop = [
        i + 1
        for i, page in enumerate(analysis["pages"])
-        if page["words_near_edge"] > WORDS_NEAR_EDGE_THRESHOLD
+        if page["text_margin_px"] < page_margin_px
    ]

    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
@ -152,20 +148,13 @@ def _summarize_item_to_stdout(task):

 def _analyze_item_to_stdout(task):
    item_id = task.item_id
-    page_margin_px = task.page_margin_px
    verbose = task.verbose

    if verbose:
        print(f"Analyzing item {item_id}...", file=stderr)
        stderr.flush()

-    print(
-        json.dumps(
-            analyze_item(
-                item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose
-            )
-        )
-    )
+    print(json.dumps(analyze_item(item_id, parallel=4, verbose=verbose)))
    stdout.flush()

    if verbose:
@ -177,14 +166,12 @@ def _analyze_item_to_stdout(task):
 class PageAnalysisTask:
    im: Image.Image
    page_index: int
-    page_margin_px: int
    file_name: str


 def _analyze_page(task):
    im_original = task.im
    page_index = task.page_index
-    page_margin_px = task.page_margin_px
    file_name = task.file_name

    im_cropped = im_original.crop(
@ -201,7 +188,7 @@ def _analyze_page(task):
    if is_blank:
        max_sharpness = 1
        ocr_orientation_match = True
-        words_near_edge = 0
+        text_margin_px = -1
    else:
        max_sharpness = 0.0
        if im_cropped.size[0] < im_cropped.size[1]:
@ -262,19 +249,26 @@ def _analyze_page(task):
            if best_ocr_orientation % 2 == 0
            else (im_original.size[1], im_original.size[0])
        )
-        words_near_edge = best_ocr_words[
-            (best_ocr_words["left"] < page_margin_px)
-            | (best_ocr_words["top"] < page_margin_px)
-            | (
-                best_ocr_words["left"] + best_ocr_words["width"]
-                > best_ocr_dims[0] - page_margin_px
+
+        word_margins_all_directions = np.sort(
+            np.concat(
+                (
+                    best_ocr_words["left"].to_numpy(),
+                    best_ocr_words["top"].to_numpy(),
+                    best_ocr_dims[0]
+                    - (best_ocr_words["left"] + best_ocr_words["width"]).to_numpy(),
+                    best_ocr_dims[1]
+                    - (best_ocr_words["top"] + best_ocr_words["height"]).to_numpy(),
+                )
            )
-            | (
-                best_ocr_words["top"] + best_ocr_words["height"]
-                > best_ocr_dims[1] - page_margin_px
-            )
-        ]
-        words_near_edge = words_near_edge.shape[0]
+        )
+        # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
+        SKIP_WORDS = 2
+        text_margin_px = (
+            int(word_margins_all_directions[SKIP_WORDS])
+            if word_margins_all_directions.shape[0] > SKIP_WORDS
+            else -1
+        )

    return {
        "blank": is_blank,
@ -283,11 +277,11 @@ def _analyze_page(task):
        "page_index": page_index,
        "size": im_original.size,
        "sharpness": max_sharpness,
-        "words_near_edge": words_near_edge,
+        "text_margin_px": text_margin_px,
    }


-def analyze_item(item_id, page_margin_px, parallel=False, verbose=False):
+def analyze_item(item_id, parallel=1, verbose=False):
    escaped_item_id = urllib.parse.quote(item_id, safe="")

    if verbose:
@ -326,7 +320,6 @@ def analyze_item(item_id, page_margin_px, parallel=False, verbose=False):
                        PageAnalysisTask(
                            im=im,
                            page_index=page_index,
-                            page_margin_px=page_margin_px,
                            file_name=file_name,
                        )
                    )
@ -334,9 +327,9 @@ def analyze_item(item_id, page_margin_px, parallel=False, verbose=False):
    if verbose:
        print(f"Processing {len(page_nums)} pages...", file=stderr)
        stderr.flush()
-    if parallel:
+    if parallel > 1:
        # Parallelize image processing and OCR of pages across up to n cores.
-        with Pool(N_OCR_PROCESSES) as pool:
+        with Pool(parallel) as pool:
            return {"pages": pool.map(_analyze_page, tasks)}
    return {"pages": [_analyze_page(task) for task in tasks]}