clean up outdated files
This commit is contained in:
parent
b19b0bfb07
commit
35f750c28d
4 changed files with 73 additions and 183 deletions
94
README.md
94
README.md
|
|
@ -1,36 +1,83 @@
|
|||
# MicroQA
|
||||
|
||||
QA assistant for the Internet Archive's microfiche scanning team.
|
||||
MicroQA assists with quality assurance for the Internet Archive's microfiche
|
||||
scanning team by screening for indicators of common scanning and post-processing
|
||||
mishaps including missed focus, disorientation, and over-cropping. As the
|
||||
Democracy's Library project expands to archive millions of pages of official
|
||||
documents, automated screening with OCR and conventional image processing
|
||||
multiplies the human QA team's capacity to conduct targeted spot checks,
|
||||
re-reviews, and manual re-scans.
|
||||
|
||||
## Usage
|
||||
Two pluggable OCR back-ends are available:
|
||||
|
||||
Analyze page statistics for item:
|
||||
- [Tesseract](https://github.com/tesseract-ocr/tesseract) -- CPU based,
|
||||
reliable, fast multi-lingual support
|
||||
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) -- 1-shot orientation
|
||||
detection, offers GPU or NPU acceleration
|
||||
|
||||
## Quickstart
|
||||
|
||||
[Mise-en-place](https://mise.jdx.dev/) manages top-level dev dependencies,
|
||||
except for OCR back-ends. If you already have a recent version of `uv`
|
||||
installed, you may skip `mise install` and use the existing binary instead.
|
||||
|
||||
```sh
|
||||
echo 'micro_IA04244212_1665' | uv run main.py | jq
|
||||
mise install
|
||||
|
||||
uv sync
|
||||
|
||||
sudo apt install tesseract-ocr # Debian, Ubuntu
|
||||
sudo dnf install tesseract # Fedora
|
||||
brew install tesseract # macOS
|
||||
|
||||
# Run page diagnostics on a single Archive item:
|
||||
uv run diagnostics.py micro_IA04244212_1665 --verbose
|
||||
```
|
||||
|
||||
Paste item IDs from clipboard and summarize all (`tr` command collapses input to
|
||||
a single line so that items are summarized in parallel):
|
||||
## Production Usage
|
||||
|
||||
```sh
|
||||
pbpaste | tr '\n' ',' | uv run main.py --summarize --workers 4 -v | jq
|
||||
```
|
||||
|
||||
Query a pre-populated database for suspect pages:
|
||||
MicroQA integrates with [Phonograph](https://www.phono.dev/) to coordinate
|
||||
between concurrent workers, store data, and make analysis results navigable at
|
||||
scale. Phonograph exposes a PostgreSQL interface, so MicroQA can run
|
||||
independently if supplied a PostgreSQL database with a schema equivalent to:
|
||||
|
||||
```sql
|
||||
select 'https://archive.org/details/' || items.id,
|
||||
pages.page,
|
||||
pages.orientation_match,
|
||||
pages.sharpness,
|
||||
pages.text_margin_px
|
||||
from items
|
||||
join pages on pages.item = items.id
|
||||
where pages.orientation_match = 0
|
||||
or pages.sharpness < 0.07
|
||||
or (pages.text_margin_px > -1 and pages.text_margin_px < 50)
|
||||
order by items.id;
|
||||
create schema phono;
|
||||
|
||||
create table phono.items (
|
||||
_id uuid primary key not null default gen_random_uuid(),
|
||||
ia_id text,
|
||||
oai_updatedate timestamptz,
|
||||
started_date timestamptz,
|
||||
completed_date timestamptz,
|
||||
review_date text,
|
||||
url text,
|
||||
docs_link text
|
||||
);
|
||||
|
||||
create table phono.docs (
|
||||
_id uuid primary key not null default gen_random_uuid(),
|
||||
name text,
|
||||
item uuid,
|
||||
pages_link text
|
||||
);
|
||||
|
||||
create table phono.pages (
|
||||
_id uuid primary key not null default gen_random_uuid(),
|
||||
doc uuid,
|
||||
page numeric,
|
||||
page_angle numeric,
|
||||
sharpness numeric,
|
||||
is_blank text,
|
||||
text_margin_px numeric,
|
||||
url text
|
||||
);
|
||||
```
|
||||
|
||||
You can then start a worker with:
|
||||
|
||||
```sh
|
||||
uv run main.py --database <DATABASE URL>
|
||||
```
|
||||
|
||||
## Test Cases
|
||||
|
|
@ -38,4 +85,5 @@ order by items.id;
|
|||
- Blurry pages: `micro_IA40244209_0984`
|
||||
- Contrast, page orientation: `micro_IA40244211_2290`
|
||||
- Crop, low quality fiche: `micro_IA40386420_0689`
|
||||
- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages: `micro_IA40386007_0012`
|
||||
- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages:
|
||||
`micro_IA40386007_0012`
|
||||
|
|
|
|||
2
main.py
2
main.py
|
|
@ -13,6 +13,8 @@ from microqa.items import fetch_item, url_encode
|
|||
from microqa.engine import analyze_doc
|
||||
|
||||
|
||||
# Hard-coded Phonograph URLs are included for convenience and are relevant only
|
||||
# to the official deployment.
|
||||
GUI_DOCS_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16583/p/019b6375173c76139afa91a356f97583"
|
||||
GUI_PAGES_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16604/p/019b6379b1487b1e8791bd6486804452"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
[tools]
|
||||
uv = "latest"
|
||||
watchexec = "latest"
|
||||
|
||||
[env]
|
||||
|
||||
|
|
|
|||
159
one_off.py
159
one_off.py
|
|
@ -1,159 +0,0 @@
|
|||
import json
|
||||
import re
|
||||
from argparse import ArgumentParser
|
||||
from dataclasses import dataclass
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from sys import stderr, stdin, stdout
|
||||
|
||||
from engine import analyze_item
|
||||
|
||||
|
||||
# Tesseract-style language selector (English + French). Not referenced in
# this module's visible code — presumably consumed by the engine; TODO confirm
# before removing.
OCR_LANGS = "eng+fra"
|
||||
|
||||
|
||||
def main():
    """Read Archive item IDs from stdin and analyze or summarize each one.

    Every stdin line may hold one or more item IDs separated by commas or
    whitespace. The IDs on a line are fanned out to a thread pool of
    ``--workers`` threads before the next line is read. Results go to
    stdout as JSON; progress messages (with ``--verbose``) go to stderr.
    """
    parser = ArgumentParser()
    parser.add_argument("--summarize", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-w", "--workers", type=int, default=1)
    parser.add_argument("--page-margin-px", type=int, default=50)
    args = parser.parse_args()

    # Both modes build identical task lists; only the worker function
    # differs, so pick it once instead of duplicating the pool.map call.
    process = _summarize_item_to_stdout if args.summarize else _analyze_item_to_stdout

    # Process STDIN line by line, where each line contains one or more item IDs
    # separated by whitespace.
    for line in stdin:
        item_ids = [value for value in re.split(r",|\s", line) if value]
        with ThreadPool(args.workers) as pool:
            if args.verbose:
                print(f"Running with {args.workers} workers.", file=stderr)
                stderr.flush()
            tasks = [
                ItemTask(
                    item_id=item_id,
                    page_margin_px=args.page_margin_px,
                    verbose=args.verbose,
                )
                for item_id in item_ids
            ]
            pool.map(process, tasks)
|
||||
|
||||
|
||||
@dataclass
class ItemTask:
    """Per-item unit of work handed to thread-pool workers by ``main``."""

    # Archive item identifier, e.g. "micro_IA40244209_0984".
    item_id: str
    # Minimum acceptable text margin in pixels; pages below it get a crop flag.
    page_margin_px: int
    # When true, the worker emits progress messages on stderr.
    verbose: bool
|
||||
|
||||
|
||||
def _summarize_item_to_stdout(task):
|
||||
item_id = task.item_id
|
||||
page_margin_px = task.page_margin_px
|
||||
verbose = task.verbose
|
||||
|
||||
if verbose:
|
||||
print(f"Summarizing item {item_id}...", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
analysis = analyze_item(item_id, parallel=4, verbose=verbose)
|
||||
|
||||
# 3 or more blank pages in a row is a flag.
|
||||
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
||||
if len(analysis["pages"]) >= CONSECUTIVE_BLANKS_THRESHOLD:
|
||||
consecutive_blanks = [page["blank"] for page in analysis["pages"]]
|
||||
for _ in range(1, CONSECUTIVE_BLANKS_THRESHOLD):
|
||||
consecutive_blanks = [
|
||||
value and consecutive_blanks[i]
|
||||
for i, value in enumerate(consecutive_blanks[1:])
|
||||
]
|
||||
consecutive_blanks = [
|
||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
||||
for i, value in enumerate(consecutive_blanks[1:])
|
||||
if value and not consecutive_blanks[i]
|
||||
]
|
||||
else:
|
||||
consecutive_blanks = []
|
||||
|
||||
# 3 or more blank pages in a row is a flag.
|
||||
CONSECUTIVE_BLURRY_THRESHOLD = 3
|
||||
SHARPNESS_THRESHOLD = 0.1
|
||||
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
|
||||
consecutive_blurry = [
|
||||
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
|
||||
]
|
||||
for _ in range(1, CONSECUTIVE_BLURRY_THRESHOLD):
|
||||
consecutive_blurry = [
|
||||
value and consecutive_blurry[i]
|
||||
for i, value in enumerate(consecutive_blurry[1:])
|
||||
]
|
||||
consecutive_blurry = [
|
||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
||||
for i, value in enumerate(consecutive_blurry[1:])
|
||||
if value and not consecutive_blurry[i]
|
||||
]
|
||||
else:
|
||||
consecutive_blurry = []
|
||||
|
||||
check_orientation = [
|
||||
i + 1
|
||||
for i, page in enumerate(analysis["pages"])
|
||||
if not page["ocr_orientation_match"]
|
||||
]
|
||||
|
||||
check_crop = [
|
||||
i + 1
|
||||
for i, page in enumerate(analysis["pages"])
|
||||
if page["text_margin_px"] < page_margin_px
|
||||
]
|
||||
|
||||
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
||||
print(
|
||||
json.dumps(
|
||||
{
|
||||
"item_id": item_id,
|
||||
"check_orientation": check_orientation,
|
||||
"check_crop": check_crop,
|
||||
"consecutive_blanks": consecutive_blanks,
|
||||
"consecutive_blurry": consecutive_blurry,
|
||||
}
|
||||
)
|
||||
)
|
||||
stdout.flush()
|
||||
|
||||
if verbose:
|
||||
print(f"Done summarizing item {item_id}.", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
|
||||
def _analyze_item_to_stdout(task):
    """Run the full page analysis for one item and emit its JSON on stdout."""
    item_id = task.item_id
    verbose = task.verbose

    def trace(message):
        # Progress goes to stderr so stdout stays machine-readable JSON.
        print(message, file=stderr)
        stderr.flush()

    if verbose:
        trace(f"Analyzing item {item_id}...")

    result = analyze_item(item_id, parallel=6, verbose=verbose)
    print(json.dumps(result))
    stdout.flush()

    if verbose:
        trace(f"Done analyzing item {item_id}.")
|
||||
|
||||
|
||||
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Reference in a new issue