diff --git a/README.md b/README.md index ccb4295..8781731 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,83 @@ # MicroQA -QA assistant for the Internet Archive's microfiche scanning team. +MicroQA assists with quality assurance for the Internet Archive's microfiche +scanning team by screening for indicators of common scanning and post-processing +mishaps including missed focus, disorientation, and over-cropping. As the +Democracy's Library project expands to archive millions of pages of official +documents, automated screening with OCR and conventional image processing +multiplies the human QA team's capacity to conduct targeted spot checks, +re-reviews, and manual re-scans. -## Usage +Two pluggable OCR back-ends are available: -Analyze page statistics for item: +- [Tesseract](https://github.com/tesseract-ocr/tesseract) -- CPU based, + reliable, fast multi-lingual support +- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) -- 1-shot orientation + detection, offers GPU or NPU acceleration + +## Quickstart + +[Mise-en-place](https://mise.jdx.dev/) manages top-level dev dependencies, +except for OCR back-ends. If you already have a recent version of `uv` +installed, you may skip `mise install` and use the existing binary instead. ```sh -echo 'micro_IA04244212_1665' | uv run main.py | jq +mise install + +uv sync + +sudo apt install tesseract-ocr # Debian, Ubuntu +sudo dnf install tesseract # Fedora +brew install tesseract # macOS + +# Run page diagnostics on a single Archive item: +uv run diagnostics.py micro_IA04244212_1665 --verbose ``` -Paste item IDs from clipboard and summarize all (`tr` command collapses input to -a single line so that items are summarized in parallel): +## Production Usage -```sh -pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq -``` - -Query a pre-populated database for suspect pages: +MicroQA integrates with [Phonograph](https://www.phono.dev/) to coordinate +between concurrent workers, store data, and make analysis results navigable at +scale. Phonograph exposes a PostgreSQL interface, so MicroQA can run +independently if supplied a PostgreSQL database with a schema equivalent to: ```sql -select 'https://archive.org/details/' || items.id, - pages.page, - pages.orientation_match, - pages.sharpness, - pages.text_margin_px -from items - join pages on pages.item = items.id -where pages.orientation_match = 0 - or pages.sharpness < 0.07 - or (pages.text_margin_px > -1 and pages.text_margin_px < 50) -order by items.id; +create schema phono; + +create table phono.items ( + _id uuid primary key not null default gen_random_uuid(), + ia_id text, + oai_updatedate timestamptz, + started_date timestamptz, + completed_date timestamptz, + review_date text, + url text, + docs_link text +); + +create table phono.docs ( + _id uuid primary key not null default gen_random_uuid(), + name text, + item uuid, + pages_link text +); + +create table phono.pages ( + _id uuid primary key not null default gen_random_uuid(), + doc uuid, + page numeric, + page_angle numeric, + sharpness numeric, + is_blank text, + text_margin_px numeric, + url text +); +``` + +You can then start a worker with: + +```sh +uv run main.py --database ``` ## Test Cases @@ -38,4 +85,5 @@ order by items.id; - Blurry pages: `micro_IA40244209_0984` - Contrast, page orientation: `micro_IA40244211_2290` - Crop, low quality fiche: `micro_IA40386420_0689` -- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages: `micro_IA40386007_0012` +- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages: + `micro_IA40386007_0012` diff --git a/main.py b/main.py index 3d8f1f7..d981a68 100644 --- a/main.py +++ b/main.py @@ -13,6 +13,8 @@ from microqa.items import fetch_item, url_encode from microqa.engine import analyze_doc +# Hard-coded Phonograph URLs are included for convenience and are relevant only +# to the official deployment. GUI_DOCS_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16583/p/019b6375173c76139afa91a356f97583" GUI_PAGES_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16604/p/019b6379b1487b1e8791bd6486804452" diff --git a/mise.toml b/mise.toml index 1f2a0d9..9386ab6 100644 --- a/mise.toml +++ b/mise.toml @@ -1,6 +1,5 @@ [tools] uv = "latest" -watchexec = "latest" [env] diff --git a/one_off.py b/one_off.py deleted file mode 100644 index 947b294..0000000 --- a/one_off.py +++ /dev/null @@ -1,159 +0,0 @@ -import json -import re -from argparse import ArgumentParser -from dataclasses import dataclass -from multiprocessing.pool import ThreadPool -from sys import stderr, stdin, stdout - -from engine import analyze_item - - -OCR_LANGS = "eng+fra" - - -def main(): - parser = ArgumentParser() - parser.add_argument("--summarize", action="store_true") - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-w", "--workers", type=int, default=1) - parser.add_argument("--page-margin-px", type=int, default=50) - args = parser.parse_args() - - # Process STDIN line by line, where each line contains one or more item IDs - # separated by whitespace. - for line in stdin: - item_ids = [value for value in re.split(r",|\s", line) if value] - with ThreadPool(args.workers) as pool: - if args.verbose: - print(f"Running with {args.workers} workers.", file=stderr) - stderr.flush() - if args.summarize: - pool.map( - _summarize_item_to_stdout, - [ - ItemTask( - item_id=item_id, - page_margin_px=args.page_margin_px, - verbose=args.verbose, - ) - for item_id in item_ids - ], - ) - else: - pool.map( - _analyze_item_to_stdout, - [ - ItemTask( - item_id=item_id, - page_margin_px=args.page_margin_px, - verbose=args.verbose, - ) - for item_id in item_ids - ], - ) - - -@dataclass -class ItemTask: - item_id: str - page_margin_px: int - verbose: bool - - -def _summarize_item_to_stdout(task): - item_id = task.item_id - page_margin_px = task.page_margin_px - verbose = task.verbose - - if verbose: - print(f"Summarizing item {item_id}...", file=stderr) - stderr.flush() - - analysis = analyze_item(item_id, parallel=4, verbose=verbose) - - # 3 or more blank pages in a row is a flag. - CONSECUTIVE_BLANKS_THRESHOLD = 3 - if len(analysis["pages"]) >= CONSECUTIVE_BLANKS_THRESHOLD: - consecutive_blanks = [page["blank"] for page in analysis["pages"]] - for _ in range(1, CONSECUTIVE_BLANKS_THRESHOLD): - consecutive_blanks = [ - value and consecutive_blanks[i] - for i, value in enumerate(consecutive_blanks[1:]) - ] - consecutive_blanks = [ - i + 2 # +1 to account for enumeration offset, and +1 to 1-index - for i, value in enumerate(consecutive_blanks[1:]) - if value and not consecutive_blanks[i] - ] - else: - consecutive_blanks = [] - - # 3 or more blank pages in a row is a flag. - CONSECUTIVE_BLURRY_THRESHOLD = 3 - SHARPNESS_THRESHOLD = 0.1 - if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD: - consecutive_blurry = [ - page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"] - ] - for _ in range(1, CONSECUTIVE_BLURRY_THRESHOLD): - consecutive_blurry = [ - value and consecutive_blurry[i] - for i, value in enumerate(consecutive_blurry[1:]) - ] - consecutive_blurry = [ - i + 2 # +1 to account for enumeration offset, and +1 to 1-index - for i, value in enumerate(consecutive_blurry[1:]) - if value and not consecutive_blurry[i] - ] - else: - consecutive_blurry = [] - - check_orientation = [ - i + 1 - for i, page in enumerate(analysis["pages"]) - if not page["ocr_orientation_match"] - ] - - check_crop = [ - i + 1 - for i, page in enumerate(analysis["pages"]) - if page["text_margin_px"] < page_margin_px - ] - - if check_orientation or check_crop or consecutive_blanks or consecutive_blurry: - print( - json.dumps( - { - "item_id": item_id, - "check_orientation": check_orientation, - "check_crop": check_crop, - "consecutive_blanks": consecutive_blanks, - "consecutive_blurry": consecutive_blurry, - } - ) - ) - stdout.flush() - - if verbose: - print(f"Done summarizing item {item_id}.", file=stderr) - stderr.flush() - - -def _analyze_item_to_stdout(task): - item_id = task.item_id - verbose = task.verbose - - if verbose: - print(f"Analyzing item {item_id}...", file=stderr) - stderr.flush() - - print(json.dumps(analyze_item(item_id, parallel=6, verbose=verbose))) - stdout.flush() - - if verbose: - print(f"Done analyzing item {item_id}.", file=stderr) - stderr.flush() - - -if __name__ == "__main__": - main()