Store results to SQLite
This commit is contained in:
parent
d33a7dc515
commit
815934ad23
3 changed files with 177 additions and 34 deletions
16
README.md
16
README.md
|
|
@ -17,6 +17,22 @@ a single line so that items are summarized in parallel):
|
||||||
pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
|
pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Query a pre-populated database for suspect pages:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
select 'https://archive.org/details/' || items.id,
|
||||||
|
pages.page,
|
||||||
|
pages.orientation_match,
|
||||||
|
pages.sharpness,
|
||||||
|
pages.text_margin_px
|
||||||
|
from items
|
||||||
|
join pages on pages.item = items.id
|
||||||
|
where pages.orientation_match = 0
|
||||||
|
or pages.sharpness < 0.07
|
||||||
|
or (pages.text_margin_px > -1 and pages.text_margin_px < 50)
|
||||||
|
order by items.id;
|
||||||
|
```
|
||||||
|
|
||||||
## Test Cases
|
## Test Cases
|
||||||
|
|
||||||
- Blurry pages: `micro_IA40244209_0984`
|
- Blurry pages: `micro_IA40244209_0984`
|
||||||
|
|
|
||||||
134
cache.py
Normal file
134
cache.py
Normal file
|
|
@ -0,0 +1,134 @@
|
||||||
|
import re
import sqlite3
import traceback
from argparse import ArgumentParser
from datetime import datetime, timezone
from time import sleep

import requests

from main import analyze_item
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Continuously cache per-page QA analysis of reviewed microfiche items.

    Runs as a daemon: every hour it pulls newly reviewed item IDs from the
    archive.org search API into the ``items`` table, then analyzes every
    item that has no ``analyzed_date`` yet, storing one row per page in
    the ``pages`` table. Each item gets up to ``N_ATTEMPTS`` tries.
    """
    parser = ArgumentParser()
    parser.add_argument("--database", default="./microqa.db")
    parser.add_argument("--cpus", type=int, default=2)
    parser.add_argument("--earliest-review-date", default="20250701")
    args = parser.parse_args()

    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        cur.execute("""
            create table if not exists items (
                id text primary key not null,
                review_date text not null,
                analyzed_date text
            )""")
        cur.execute("""
            create table if not exists pages (
                id int primary key,
                item text not null,
                page int not null,
                orientation_match boolean not null,
                sharpness real not null,
                is_blank boolean not null,
                text_margin_px int not null
            )""")
        conn.commit()

        N_ATTEMPTS = 3
        while True:
            print("Pulling item IDs")
            pull_new_item_ids(conn, args.earliest_review_date)
            print("Done.")
            res = cur.execute(
                "select id from items where analyzed_date is null order by review_date"
            )
            for (item_id,) in res.fetchall():
                for _ in range(N_ATTEMPTS):
                    try:
                        print(f"Processing {item_id}")
                        analysis = analyze_item(
                            item_id, parallel=args.cpus, verbose=True
                        )
                        for i, page in enumerate(analysis["pages"]):
                            cur.execute(
                                """
                                insert into pages (
                                    item,
                                    page,
                                    orientation_match,
                                    sharpness,
                                    is_blank,
                                    text_margin_px
                                ) values (
                                    ?, ?, ?, ?, ?, ?
                                )""",
                                [
                                    item_id,
                                    i + 1,
                                    page["ocr_orientation_match"],
                                    page["sharpness"],
                                    page["blank"],
                                    page["text_margin_px"],
                                ],
                            )
                        cur.execute(
                            "update items set analyzed_date = ? where id = ?",
                            # timezone-aware now(); utcnow() is deprecated.
                            [datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"), item_id],
                        )
                        # Commit the item's pages and its analyzed_date atomically,
                        # only after the whole item succeeded.
                        conn.commit()
                        print("Done")
                        break
                    except Exception as err:
                        print(err)
                        traceback.print_tb(err.__traceback__)
                        # Discard any partially-inserted pages so a retry does
                        # not leave duplicate/orphan rows pending on the
                        # connection (commits only happen on success above).
                        conn.rollback()
                        sleep(15)
                        # BUG FIX: previously this handler ended with `break`,
                        # which made N_ATTEMPTS a no-op (every item got exactly
                        # one try). Falling through lets the loop retry.
            sleep(3600)
|
||||||
|
|
||||||
|
|
||||||
|
def pull_new_item_ids(conn, earliest_review_date):
    """Fetch IDs of newly reviewed microfiche items into the ``items`` table.

    Resumes from the most recent ``review_date`` already stored, falling
    back to ``earliest_review_date`` when the table is empty. Pages through
    the archive.org advanced-search API 100 rows at a time; rows already
    present are left untouched (``on conflict do nothing``).

    :param conn: open ``sqlite3`` connection with an ``items`` table.
    :param earliest_review_date: ``YYYYMMDD`` string lower bound for the
        first run.
    :raises requests.HTTPError: on a non-2xx API response.
    """
    cur = conn.cursor()
    res = cur.execute("select review_date from items order by review_date desc limit 1")
    (latest_review_date,) = res.fetchone() or (earliest_review_date,)
    print(latest_review_date)

    query = f"""
    collection:(microfiche)
    AND contributor:(Internet Archive)
    AND micro_review:(done)
    AND review_date:[{latest_review_date} TO null]
    """
    sort = "reviewdate asc"

    # Format for API: collapse whitespace runs into `+` separators.
    query = re.sub(r"\s+", "+", query.strip())
    sort = re.sub(r"\s+", "+", sort.strip())

    for i in range(1, 999):
        resp = requests.get(
            f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=100&page={i}&output=json",
            # BUG FIX: requests has no default timeout; without one a stalled
            # connection would hang the polling daemon forever.
            timeout=60,
        )
        resp.raise_for_status()
        docs = resp.json()["response"]["docs"]
        if not docs:
            # Past the last page of results.
            break
        cur.executemany(
            "insert into items (id, review_date) values (?, ?) on conflict do nothing",
            [(doc["identifier"], doc["review_date"]) for doc in docs],
        )
        conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
# Run the caching daemon when executed as a script.
if __name__ == "__main__":
    main()
|
||||||
61
main.py
61
main.py
|
|
@ -16,7 +16,6 @@ from PIL import Image, ImageFilter
|
||||||
|
|
||||||
|
|
||||||
OCR_LANGS = "eng+fra"
|
OCR_LANGS = "eng+fra"
|
||||||
N_OCR_PROCESSES = 4
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
@ -77,9 +76,7 @@ def _summarize_item_to_stdout(task):
|
||||||
print(f"Summarizing item {item_id}...", file=stderr)
|
print(f"Summarizing item {item_id}...", file=stderr)
|
||||||
stderr.flush()
|
stderr.flush()
|
||||||
|
|
||||||
analysis = analyze_item(
|
analysis = analyze_item(item_id, parallel=4, verbose=verbose)
|
||||||
item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose
|
|
||||||
)
|
|
||||||
|
|
||||||
# 3 or more blank pages in a row is a flag.
|
# 3 or more blank pages in a row is a flag.
|
||||||
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
||||||
|
|
@ -124,11 +121,10 @@ def _summarize_item_to_stdout(task):
|
||||||
if not page["ocr_orientation_match"]
|
if not page["ocr_orientation_match"]
|
||||||
]
|
]
|
||||||
|
|
||||||
WORDS_NEAR_EDGE_THRESHOLD = 2
|
|
||||||
check_crop = [
|
check_crop = [
|
||||||
i + 1
|
i + 1
|
||||||
for i, page in enumerate(analysis["pages"])
|
for i, page in enumerate(analysis["pages"])
|
||||||
if page["words_near_edge"] > WORDS_NEAR_EDGE_THRESHOLD
|
if page["text_margin_px"] < page_margin_px
|
||||||
]
|
]
|
||||||
|
|
||||||
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
||||||
|
|
@ -152,20 +148,13 @@ def _summarize_item_to_stdout(task):
|
||||||
|
|
||||||
def _analyze_item_to_stdout(task):
|
def _analyze_item_to_stdout(task):
|
||||||
item_id = task.item_id
|
item_id = task.item_id
|
||||||
page_margin_px = task.page_margin_px
|
|
||||||
verbose = task.verbose
|
verbose = task.verbose
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"Analyzing item {item_id}...", file=stderr)
|
print(f"Analyzing item {item_id}...", file=stderr)
|
||||||
stderr.flush()
|
stderr.flush()
|
||||||
|
|
||||||
print(
|
print(json.dumps(analyze_item(item_id, parallel=4, verbose=verbose)))
|
||||||
json.dumps(
|
|
||||||
analyze_item(
|
|
||||||
item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
stdout.flush()
|
stdout.flush()
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
|
|
@ -177,14 +166,12 @@ def _analyze_item_to_stdout(task):
|
||||||
class PageAnalysisTask:
|
class PageAnalysisTask:
|
||||||
im: Image.Image
|
im: Image.Image
|
||||||
page_index: int
|
page_index: int
|
||||||
page_margin_px: int
|
|
||||||
file_name: str
|
file_name: str
|
||||||
|
|
||||||
|
|
||||||
def _analyze_page(task):
|
def _analyze_page(task):
|
||||||
im_original = task.im
|
im_original = task.im
|
||||||
page_index = task.page_index
|
page_index = task.page_index
|
||||||
page_margin_px = task.page_margin_px
|
|
||||||
file_name = task.file_name
|
file_name = task.file_name
|
||||||
|
|
||||||
im_cropped = im_original.crop(
|
im_cropped = im_original.crop(
|
||||||
|
|
@ -201,7 +188,7 @@ def _analyze_page(task):
|
||||||
if is_blank:
|
if is_blank:
|
||||||
max_sharpness = 1
|
max_sharpness = 1
|
||||||
ocr_orientation_match = True
|
ocr_orientation_match = True
|
||||||
words_near_edge = 0
|
text_margin_px = -1
|
||||||
else:
|
else:
|
||||||
max_sharpness = 0.0
|
max_sharpness = 0.0
|
||||||
if im_cropped.size[0] < im_cropped.size[1]:
|
if im_cropped.size[0] < im_cropped.size[1]:
|
||||||
|
|
@ -262,19 +249,26 @@ def _analyze_page(task):
|
||||||
if best_ocr_orientation % 2 == 0
|
if best_ocr_orientation % 2 == 0
|
||||||
else (im_original.size[1], im_original.size[0])
|
else (im_original.size[1], im_original.size[0])
|
||||||
)
|
)
|
||||||
words_near_edge = best_ocr_words[
|
|
||||||
(best_ocr_words["left"] < page_margin_px)
|
word_margins_all_directions = np.sort(
|
||||||
| (best_ocr_words["top"] < page_margin_px)
|
np.concat(
|
||||||
| (
|
(
|
||||||
best_ocr_words["left"] + best_ocr_words["width"]
|
best_ocr_words["left"].to_numpy(),
|
||||||
> best_ocr_dims[0] - page_margin_px
|
best_ocr_words["top"].to_numpy(),
|
||||||
|
best_ocr_dims[0]
|
||||||
|
- (best_ocr_words["left"] + best_ocr_words["width"]).to_numpy(),
|
||||||
|
best_ocr_dims[1]
|
||||||
|
- (best_ocr_words["top"] + best_ocr_words["height"]).to_numpy(),
|
||||||
|
)
|
||||||
)
|
)
|
||||||
| (
|
)
|
||||||
best_ocr_words["top"] + best_ocr_words["height"]
|
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
||||||
> best_ocr_dims[1] - page_margin_px
|
SKIP_WORDS = 2
|
||||||
)
|
text_margin_px = (
|
||||||
]
|
int(word_margins_all_directions[SKIP_WORDS])
|
||||||
words_near_edge = words_near_edge.shape[0]
|
if word_margins_all_directions.shape[0] > SKIP_WORDS
|
||||||
|
else -1
|
||||||
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"blank": is_blank,
|
"blank": is_blank,
|
||||||
|
|
@ -283,11 +277,11 @@ def _analyze_page(task):
|
||||||
"page_index": page_index,
|
"page_index": page_index,
|
||||||
"size": im_original.size,
|
"size": im_original.size,
|
||||||
"sharpness": max_sharpness,
|
"sharpness": max_sharpness,
|
||||||
"words_near_edge": words_near_edge,
|
"text_margin_px": text_margin_px,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def analyze_item(item_id, page_margin_px, parallel=False, verbose=False):
|
def analyze_item(item_id, parallel=1, verbose=False):
|
||||||
escaped_item_id = urllib.parse.quote(item_id, safe="")
|
escaped_item_id = urllib.parse.quote(item_id, safe="")
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
|
|
@ -326,7 +320,6 @@ def analyze_item(item_id, page_margin_px, parallel=False, verbose=False):
|
||||||
PageAnalysisTask(
|
PageAnalysisTask(
|
||||||
im=im,
|
im=im,
|
||||||
page_index=page_index,
|
page_index=page_index,
|
||||||
page_margin_px=page_margin_px,
|
|
||||||
file_name=file_name,
|
file_name=file_name,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
@ -334,9 +327,9 @@ def analyze_item(item_id, page_margin_px, parallel=False, verbose=False):
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"Processing {len(page_nums)} pages...", file=stderr)
|
print(f"Processing {len(page_nums)} pages...", file=stderr)
|
||||||
stderr.flush()
|
stderr.flush()
|
||||||
if parallel:
|
if parallel > 1:
|
||||||
# Parallelize image processing and OCR of pages across up to n cores.
|
# Parallelize image processing and OCR of pages across up to n cores.
|
||||||
with Pool(N_OCR_PROCESSES) as pool:
|
with Pool(parallel) as pool:
|
||||||
return {"pages": pool.map(_analyze_page, tasks)}
|
return {"pages": pool.map(_analyze_page, tasks)}
|
||||||
return {"pages": [_analyze_page(task) for task in tasks]}
|
return {"pages": [_analyze_page(task) for task in tasks]}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue