clean up outdated files

This commit is contained in:
Brent Schroeter 2026-01-16 04:55:50 +00:00
parent b19b0bfb07
commit 35f750c28d
4 changed files with 73 additions and 183 deletions

View file

@ -1,36 +1,83 @@
# MicroQA # MicroQA
QA assistant for the Internet Archive's microfiche scanning team. MicroQA assists with quality assurance for the Internet Archive's microfiche
scanning team by screening for indicators of common scanning and post-processing
mishaps including missed focus, disorientation, and over-cropping. As the
Democracy's Library project expands to archive millions of pages of official
documents, automated screening with OCR and conventional image processing
multiplies the human QA team's capacity to conduct targeted spot checks,
re-reviews, and manual re-scans.
## Usage Two pluggable OCR back-ends are available:
Analyze page statistics for item: - [Tesseract](https://github.com/tesseract-ocr/tesseract) -- CPU based,
reliable, fast multi-lingual support
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) -- 1-shot orientation
detection, offers GPU or NPU acceleration
## Quickstart
[Mise-en-place](https://mise.jdx.dev/) manages top-level dev dependencies,
except for OCR back-ends. If you already have a recent version of `uv`
installed, you may skip `mise install` and use the existing binary instead.
```sh ```sh
echo 'micro_IA04244212_1665' | uv run main.py | jq mise install
uv sync
sudo apt install tesseract-ocr # Debian, Ubuntu
sudo dnf install tesseract # Fedora
brew install tesseract # macOS
# Run page diagnostics on a single Archive item:
uv run diagnostics.py micro_IA04244212_1665 --verbose
``` ```
Paste item IDs from clipboard and summarize all (`tr` command collapses input to ## Production Usage
a single line so that items are summarized in parallel):
```sh MicroQA integrates with [Phonograph](https://www.phono.dev/) to coordinate
pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq between concurrent workers, store data, and make analysis results navigable at
``` scale. Phonograph exposes a PostgreSQL interface, so MicroQA can run
independently if supplied a PostgreSQL database with a schema equivalent to:
Query a pre-populated database for suspect pages:
```sql ```sql
select 'https://archive.org/details/' || items.id, create schema phono;
pages.page,
pages.orientation_match, create table phono.items (
pages.sharpness, _id uuid primary key not null default gen_random_uuid(),
pages.text_margin_px ia_id text,
from items oai_updatedate timestamptz,
join pages on pages.item = items.id started_date timestamptz,
where pages.orientation_match = 0 completed_date timestamptz,
or pages.sharpness < 0.07 review_date text,
or (pages.text_margin_px > -1 and pages.text_margin_px < 50) url text,
order by items.id; docs_link text
);
create table phono.docs (
_id uuid primary key not null default gen_random_uuid(),
name text,
item uuid,
pages_link text
);
create table phono.pages (
_id uuid primary key not null default gen_random_uuid(),
doc uuid,
page numeric,
page_angle numeric,
sharpness numeric,
is_blank text,
text_margin_px numeric,
url text
);
```
You can then start a worker with:
```sh
uv run main.py --database <DATABASE URL>
``` ```
## Test Cases ## Test Cases
@ -38,4 +85,5 @@ order by items.id;
- Blurry pages: `micro_IA40244209_0984` - Blurry pages: `micro_IA40244209_0984`
- Contrast, page orientation: `micro_IA40244211_2290` - Contrast, page orientation: `micro_IA40244211_2290`
- Crop, low quality fiche: `micro_IA40386420_0689` - Crop, low quality fiche: `micro_IA40386420_0689`
- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages: `micro_IA40386007_0012` - "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages:
`micro_IA40386007_0012`

View file

@ -13,6 +13,8 @@ from microqa.items import fetch_item, url_encode
from microqa.engine import analyze_doc from microqa.engine import analyze_doc
# Hard-coded Phonograph URLs are included for convenience and are relevant only
# to the official deployment.
GUI_DOCS_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16583/p/019b6375173c76139afa91a356f97583" GUI_DOCS_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16583/p/019b6375173c76139afa91a356f97583"
GUI_PAGES_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16604/p/019b6379b1487b1e8791bd6486804452" GUI_PAGES_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16604/p/019b6379b1487b1e8791bd6486804452"

View file

@ -1,6 +1,5 @@
[tools] [tools]
uv = "latest" uv = "latest"
watchexec = "latest"
[env] [env]

View file

@ -1,159 +0,0 @@
import json
import re
from argparse import ArgumentParser
from dataclasses import dataclass
from multiprocessing.pool import ThreadPool
from sys import stderr, stdin, stdout
from engine import analyze_item
# Tesseract-style language selection (English + French).
# NOTE(review): not referenced anywhere in this module — presumably consumed
# by the engine; confirm before removing.
OCR_LANGS = "eng+fra"
def main():
    """Read item IDs from STDIN and analyze or summarize them in parallel.

    Each input line may contain one or more item IDs separated by commas or
    whitespace.  JSON results go to STDOUT; progress messages go to STDERR.
    """
    parser = ArgumentParser()
    parser.add_argument("--summarize", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-w", "--workers", type=int, default=1)
    parser.add_argument("--page-margin-px", type=int, default=50)
    args = parser.parse_args()

    # Choose the per-item worker once, instead of duplicating the whole
    # pool.map(...) call in both branches as before.
    handler = _summarize_item_to_stdout if args.summarize else _analyze_item_to_stdout

    # Process STDIN line by line, where each line contains one or more item IDs
    # separated by commas or whitespace.
    for line in stdin:
        item_ids = [value for value in re.split(r",|\s", line) if value]
        if not item_ids:
            # Skip blank lines rather than spinning up an idle thread pool.
            continue
        with ThreadPool(args.workers) as pool:
            if args.verbose:
                print(f"Running with {args.workers} workers.", file=stderr)
                stderr.flush()
            pool.map(
                handler,
                [
                    ItemTask(
                        item_id=item_id,
                        page_margin_px=args.page_margin_px,
                        verbose=args.verbose,
                    )
                    for item_id in item_ids
                ],
            )
@dataclass
class ItemTask:
    """One unit of work for the worker pool: an item ID plus analysis options."""

    # Archive.org item identifier (e.g. "micro_IA40244209_0984").
    item_id: str
    # Pages whose text margin is below this many pixels are flagged for a
    # crop check.
    page_margin_px: int
    # When True, progress messages are printed to STDERR.
    verbose: bool
def _summarize_item_to_stdout(task):
    """Analyze one item and, if any QA flags trip, print a JSON summary line.

    Flags raised (all page numbers are 1-indexed):
      - check_orientation: pages whose OCR orientation check failed
      - check_crop: pages whose text margin is below task.page_margin_px
      - consecutive_blanks: start pages of runs of >= 3 blank pages
      - consecutive_blurry: start pages of runs of >= 3 blurry pages

    Items with no flags produce no output, so STDOUT contains only suspects.
    """
    item_id = task.item_id
    page_margin_px = task.page_margin_px
    verbose = task.verbose
    if verbose:
        print(f"Summarizing item {item_id}...", file=stderr)
        stderr.flush()
    analysis = analyze_item(item_id, parallel=4, verbose=verbose)

    # 3 or more blank pages in a row is a flag.
    CONSECUTIVE_BLANKS_THRESHOLD = 3
    consecutive_blanks = _run_starts(
        [page["blank"] for page in analysis["pages"]],
        CONSECUTIVE_BLANKS_THRESHOLD,
    )

    # 3 or more blurry pages in a row is a flag.
    CONSECUTIVE_BLURRY_THRESHOLD = 3
    SHARPNESS_THRESHOLD = 0.1
    consecutive_blurry = _run_starts(
        [page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]],
        CONSECUTIVE_BLURRY_THRESHOLD,
    )

    check_orientation = [
        i + 1
        for i, page in enumerate(analysis["pages"])
        if not page["ocr_orientation_match"]
    ]
    # NOTE(review): the README's SQL additionally requires
    # text_margin_px > -1 (-1 presumably meaning "no text detected");
    # confirm whether such pages should be excluded here too.
    check_crop = [
        i + 1
        for i, page in enumerate(analysis["pages"])
        if page["text_margin_px"] < page_margin_px
    ]
    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
        print(
            json.dumps(
                {
                    "item_id": item_id,
                    "check_orientation": check_orientation,
                    "check_crop": check_crop,
                    "consecutive_blanks": consecutive_blanks,
                    "consecutive_blurry": consecutive_blurry,
                }
            )
        )
        stdout.flush()
    if verbose:
        print(f"Done summarizing item {item_id}.", file=stderr)
        stderr.flush()


def _run_starts(flags, run_length):
    """Return 1-indexed start positions of runs of >= run_length True values.

    Replaces the previous rolling-AND implementation, which iterated the
    window list starting at index 1 and therefore never flagged a run that
    began on page 1 (off-by-one).  Returns [] when len(flags) < run_length.
    """
    if len(flags) < run_length:
        return []
    # windows[i] is True iff flags[i : i + run_length] are all True.
    windows = [
        all(flags[i : i + run_length])
        for i in range(len(flags) - run_length + 1)
    ]
    # A run starts where a window is True and the previous window is not.
    return [
        i + 1  # convert window index to a 1-indexed page number
        for i, value in enumerate(windows)
        if value and (i == 0 or not windows[i - 1])
    ]
def _analyze_item_to_stdout(task):
    """Run the full page analysis for one item and emit it as a JSON line."""

    def _log(message):
        # Progress goes to STDERR so STDOUT stays machine-readable.
        if task.verbose:
            print(message, file=stderr)
            stderr.flush()

    _log(f"Analyzing item {task.item_id}...")
    result = analyze_item(task.item_id, parallel=6, verbose=task.verbose)
    print(json.dumps(result))
    stdout.flush()
    _log(f"Done analyzing item {task.item_id}.")
# Script entry point (e.g. `uv run main.py`).
if __name__ == "__main__":
    main()