clean up outdated files

2026-01-16 04:55:50 +00:00 · 2026-01-16 04:55:50 +00:00 · 35f750c28d
commit 35f750c28d
parent b19b0bfb07
4 changed files with 73 additions and 183 deletions
--- a/README.md
+++ b/README.md
@ -1,36 +1,83 @@
 # MicroQA
-QA assistant for the Internet Archive's microfiche scanning team.
+MicroQA assists with quality assurance for the Internet Archive's microfiche
 scanning team by screening for indicators of common scanning and post-processing
 mishaps including missed focus, incorrect orientation, and over-cropping. As the
 Democracy's Library project expands to archive millions of pages of official
 documents, automated screening with OCR and conventional image processing
 multiplies the human QA team's capacity to conduct targeted spot checks,
 re-reviews, and manual re-scans.
-## Usage
+Two pluggable OCR back-ends are available:
-Analyze page statistics for item:
+- [Tesseract](https://github.com/tesseract-ocr/tesseract) -- CPU-based,
  reliable, fast multi-lingual support
 - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) -- 1-shot orientation
  detection, offers GPU or NPU acceleration
 ## Quickstart
 [Mise-en-place](https://mise.jdx.dev/) manages top-level dev dependencies,
 except for OCR back-ends. If you already have a recent version of `uv`
 installed, you may skip `mise install` and use the existing binary instead.
 ```sh
-echo 'micro_IA04244212_1665' | uv run main.py | jq
+mise install
 uv sync
 sudo apt install tesseract-ocr # Debian, Ubuntu
 sudo dnf install tesseract # Fedora
 brew install tesseract # macOS
 # Run page diagnostics on a single Archive item:
 uv run diagnostics.py micro_IA04244212_1665 --verbose
 ```
-Paste item IDs from clipboard and summarize all (`tr` command collapses input to
+## Production Usage
 a single line so that items are summarized in parallel):
-```sh
+MicroQA integrates with [Phonograph](https://www.phono.dev/) to coordinate
-pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
+between concurrent workers, store data, and make analysis results navigable at
-```
+scale. Phonograph exposes a PostgreSQL interface, so MicroQA can run
-
+independently if supplied with a PostgreSQL database whose schema is equivalent to:
 Query a pre-populated database for suspect pages:
 ```sql
-select   'https://archive.org/details/' || items.id,
+create schema phono;
-         pages.page,
+
-         pages.orientation_match,
+create table phono.items (
-         pages.sharpness,
+  _id            uuid primary key not null default gen_random_uuid(),
-         pages.text_margin_px
+  ia_id          text,
-from     items
+  oai_updatedate timestamptz,
-         join pages on pages.item = items.id
+  started_date   timestamptz,
-where    pages.orientation_match = 0
+  completed_date timestamptz,
-         or pages.sharpness < 0.07
+  review_date    text,
-         or (pages.text_margin_px > -1 and pages.text_margin_px < 50)
+  url            text,
-order by items.id;
+  docs_link      text
 );
 create table phono.docs (
  _id        uuid primary key not null default gen_random_uuid(),
  name       text,
  item       uuid,
  pages_link text
 );
 create table phono.pages (
  _id            uuid primary key not null default gen_random_uuid(),
  doc            uuid,
  page           numeric,
  page_angle     numeric,
  sharpness      numeric,
  is_blank       text,
  text_margin_px numeric,
  url            text
 );
 ```
 You can then start a worker with:
 ```sh
 uv run main.py --database <DATABASE URL>
 ```
 ## Test Cases
@ -38,4 +85,5 @@ order by items.id;
 - Blurry pages: `micro_IA40244209_0984`
 - Contrast, page orientation: `micro_IA40244211_2290`
 - Crop, low quality fiche: `micro_IA40386420_0689`
- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages: `micro_IA40386007_0012`
+- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages:
  `micro_IA40386007_0012`
--- a/main.py
+++ b/main.py
@ -13,6 +13,8 @@ from microqa.items import fetch_item, url_encode
 from microqa.engine import analyze_doc
 # Hard-coded Phonograph URLs are included for convenience and are relevant only
 # to the official deployment.
 GUI_DOCS_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16583/p/019b6375173c76139afa91a356f97583"
 GUI_PAGES_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16604/p/019b6379b1487b1e8791bd6486804452"
--- a/mise.toml
+++ b/mise.toml
@ -1,6 +1,5 @@
 [tools]
 uv = "latest"
 watchexec = "latest"
 [env]
--- a/one_off.py
+++ b/one_off.py
@ -1,159 +0,0 @@
 import json
 import re
 from argparse import ArgumentParser
 from dataclasses import dataclass
 from multiprocessing.pool import ThreadPool
 from sys import stderr, stdin, stdout
 from engine import analyze_item
 # Language codes joined with "+", presumably for the OCR engine. Not referenced
 # anywhere else in this module -- TODO confirm it is still needed.
 OCR_LANGS = "eng+fra"
 def main():
    """Parse CLI options, then analyze or summarize item IDs read from STDIN.

    Each STDIN line is handled as one batch: its item IDs are distributed over
    a thread pool of `--workers` threads, and every item goes to the
    summarizing handler when `--summarize` is given, or to the full-analysis
    handler otherwise.
    """
    cli = ArgumentParser()
    cli.add_argument("--summarize", action="store_true")
    cli.add_argument("-v", "--verbose", action="store_true")
    cli.add_argument("-w", "--workers", type=int, default=1)
    cli.add_argument("--page-margin-px", type=int, default=50)
    options = cli.parse_args()
    # Both modes consume identical tasks; only the per-item handler differs.
    handler = (
        _summarize_item_to_stdout if options.summarize else _analyze_item_to_stdout
    )
    # A line may carry several item IDs delimited by commas and/or whitespace;
    # empty fragments left behind by adjacent delimiters are discarded.
    for raw_line in stdin:
        item_ids = list(filter(None, re.split(r",|\s", raw_line)))
        with ThreadPool(options.workers) as pool:
            if options.verbose:
                print(
                    f"Running with {options.workers} workers.",
                    file=stderr,
                    flush=True,
                )
            tasks = [
                ItemTask(
                    item_id=item_id,
                    page_margin_px=options.page_margin_px,
                    verbose=options.verbose,
                )
                for item_id in item_ids
            ]
            pool.map(handler, tasks)
@dataclass
 class ItemTask:
    """Parameters for processing one item, as handed to the pool workers."""
    # Identifier of the item to process; forwarded to `analyze_item`.
    item_id: str
    # Text-margin threshold in pixels: the summarizer flags pages whose
    # `text_margin_px` is below it. Not read by `_analyze_item_to_stdout`.
    page_margin_px: int
    # When true, the worker functions print progress messages to stderr.
    verbose: bool
 def _find_consecutive_run_starts(flags, min_run_length):
    """Return 1-indexed positions where qualifying runs of truthy flags begin.

    A run qualifies when it contains at least `min_run_length` consecutive
    truthy values. Each qualifying run is reported exactly once, including a
    run that begins at the very first position.
    """
    run_starts = []
    run_length = 0
    for index, flag in enumerate(flags):
        if not flag:
            run_length = 0
            continue
        run_length += 1
        # Record the run the moment it reaches the threshold, so that longer
        # runs are still reported only once.
        if run_length == min_run_length:
            # `index` is the 0-based end of the qualifying window; step back to
            # the window's first element and add 1 to make it 1-indexed.
            run_starts.append(index - min_run_length + 2)
    return run_starts
 def _summarize_item_to_stdout(task):
    """Analyze one item and print a JSON summary of pages that need review.

    `task` supplies `item_id`, `page_margin_px` and `verbose`. A single JSON
    object is written to stdout only when at least one check flags something;
    it holds 1-indexed page numbers under `check_orientation`, `check_crop`,
    `consecutive_blanks` and `consecutive_blurry` (the last two list the first
    page of each flagged run). Progress messages go to stderr when `verbose`
    is set.
    """
    item_id = task.item_id
    page_margin_px = task.page_margin_px
    verbose = task.verbose
    if verbose:
        print(f"Summarizing item {item_id}...", file=stderr)
        stderr.flush()
    analysis = analyze_item(item_id, parallel=4, verbose=verbose)
    pages = analysis["pages"]
    # 3 or more blank pages in a row is a flag.
    CONSECUTIVE_BLANKS_THRESHOLD = 3
    consecutive_blanks = _find_consecutive_run_starts(
        (page["blank"] for page in pages),
        CONSECUTIVE_BLANKS_THRESHOLD,
    )
    # 3 or more blurry pages in a row is a flag.
    CONSECUTIVE_BLURRY_THRESHOLD = 3
    SHARPNESS_THRESHOLD = 0.1
    consecutive_blurry = _find_consecutive_run_starts(
        (page["sharpness"] < SHARPNESS_THRESHOLD for page in pages),
        CONSECUTIVE_BLURRY_THRESHOLD,
    )
    check_orientation = [
        page_number
        for page_number, page in enumerate(pages, start=1)
        if not page["ocr_orientation_match"]
    ]
    # NOTE(review): a negative text_margin_px may be a sentinel (the project's
    # SQL query filters on `> -1`) -- confirm whether such pages should be
    # skipped here instead of being flagged.
    check_crop = [
        page_number
        for page_number, page in enumerate(pages, start=1)
        if page["text_margin_px"] < page_margin_px
    ]
    # Stay silent for clean items so stdout only lists items needing attention.
    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
        print(
            json.dumps(
                {
                    "item_id": item_id,
                    "check_orientation": check_orientation,
                    "check_crop": check_crop,
                    "consecutive_blanks": consecutive_blanks,
                    "consecutive_blurry": consecutive_blurry,
                }
            )
        )
        stdout.flush()
    if verbose:
        print(f"Done summarizing item {item_id}.", file=stderr)
        stderr.flush()
 def _analyze_item_to_stdout(task):
    """Run the full analysis for one item and print the result as JSON.

    Reads `item_id` and `verbose` from `task`, writes the JSON-encoded return
    value of `analyze_item` to stdout, and brackets the work with progress
    messages on stderr when `verbose` is set.
    """
    item_id, verbose = task.item_id, task.verbose
    if verbose:
        print(f"Analyzing item {item_id}...", file=stderr, flush=True)
    report = json.dumps(analyze_item(item_id, parallel=6, verbose=verbose))
    print(report)
    # Flush right away so the result is emitted as soon as the item finishes.
    stdout.flush()
    if verbose:
        print(f"Done analyzing item {item_id}.", file=stderr, flush=True)
 # Run the CLI only when this file is executed as a script, not when imported.
 if __name__ == "__main__":
    main()