clean up outdated files
This commit is contained in:
parent
b19b0bfb07
commit
35f750c28d
4 changed files with 73 additions and 183 deletions
94
README.md
94
README.md
|
|
@ -1,36 +1,83 @@
|
||||||
# MicroQA
|
# MicroQA
|
||||||
|
|
||||||
QA assistant for the Internet Archive's microfiche scanning team.
|
MicroQA assists with quality assurance for the Internet Archive's microfiche
|
||||||
|
scanning team by screening for indicators of common scanning and post-processing
|
||||||
|
mishaps including missed focus, disorientation, and over-cropping. As the
|
||||||
|
Democracy's Library project expands to archive millions of pages of official
|
||||||
|
documents, automated screening with OCR and conventional image processing
|
||||||
|
multiplies the human QA team's capacity to conduct targeted spot checks,
|
||||||
|
re-reviews, and manual re-scans.
|
||||||
|
|
||||||
## Usage
|
Two pluggable OCR back-ends are available:
|
||||||
|
|
||||||
Analyze page statistics for item:
|
- [Tesseract](https://github.com/tesseract-ocr/tesseract) -- CPU-based,
|
||||||
|
reliable, fast multi-lingual support
|
||||||
|
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) -- 1-shot orientation
|
||||||
|
detection, offers GPU or NPU acceleration
|
||||||
|
|
||||||
|
## Quickstart
|
||||||
|
|
||||||
|
[Mise-en-place](https://mise.jdx.dev/) manages top-level dev dependencies,
|
||||||
|
except for OCR back-ends. If you already have a recent version of `uv`
|
||||||
|
installed, you may skip `mise install` and use the existing binary instead.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
echo 'micro_IA04244212_1665' | uv run main.py | jq
|
mise install
|
||||||
|
|
||||||
|
uv sync
|
||||||
|
|
||||||
|
sudo apt install tesseract-ocr # Debian, Ubuntu
|
||||||
|
sudo dnf install tesseract # Fedora
|
||||||
|
brew install tesseract # macOS
|
||||||
|
|
||||||
|
# Run page diagnostics on a single Archive item:
|
||||||
|
uv run diagnostics.py micro_IA04244212_1665 --verbose
|
||||||
```
|
```
|
||||||
|
|
||||||
Paste item IDs from clipboard and summarize all (`tr` command collapses input to
|
## Production Usage
|
||||||
a single line so that items are summarized in parallel):
|
|
||||||
|
|
||||||
```sh
|
MicroQA integrates with [Phonograph](https://www.phono.dev/) to coordinate
|
||||||
pbpaste | tr '\n' ',' | uv run main.py --summarize --workers 4 -v | jq
|
between concurrent workers, store data, and make analysis results navigable at
|
||||||
```
|
scale. Phonograph exposes a PostgreSQL interface, so MicroQA can run
|
||||||
|
independently if supplied with a PostgreSQL database with a schema equivalent to:
|
||||||
Query a pre-populated database for suspect pages:
|
|
||||||
|
|
||||||
```sql
|
```sql
|
||||||
select 'https://archive.org/details/' || items.id,
|
create schema phono;
|
||||||
pages.page,
|
|
||||||
pages.orientation_match,
|
create table phono.items (
|
||||||
pages.sharpness,
|
_id uuid primary key not null default gen_random_uuid(),
|
||||||
pages.text_margin_px
|
ia_id text,
|
||||||
from items
|
oai_updatedate timestamptz,
|
||||||
join pages on pages.item = items.id
|
started_date timestamptz,
|
||||||
where pages.orientation_match = 0
|
completed_date timestamptz,
|
||||||
or pages.sharpness < 0.07
|
review_date text,
|
||||||
or (pages.text_margin_px > -1 and pages.text_margin_px < 50)
|
url text,
|
||||||
order by items.id;
|
docs_link text
|
||||||
|
);
|
||||||
|
|
||||||
|
create table phono.docs (
|
||||||
|
_id uuid primary key not null default gen_random_uuid(),
|
||||||
|
name text,
|
||||||
|
item uuid,
|
||||||
|
pages_link text
|
||||||
|
);
|
||||||
|
|
||||||
|
create table phono.pages (
|
||||||
|
_id uuid primary key not null default gen_random_uuid(),
|
||||||
|
doc uuid,
|
||||||
|
page numeric,
|
||||||
|
page_angle numeric,
|
||||||
|
sharpness numeric,
|
||||||
|
is_blank text,
|
||||||
|
text_margin_px numeric,
|
||||||
|
url text
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
You can then start a worker with:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
uv run main.py --database <DATABASE URL>
|
||||||
```
|
```
|
||||||
|
|
||||||
## Test Cases
|
## Test Cases
|
||||||
|
|
@ -38,4 +85,5 @@ order by items.id;
|
||||||
- Blurry pages: `micro_IA40244209_0984`
|
- Blurry pages: `micro_IA40244209_0984`
|
||||||
- Contrast, page orientation: `micro_IA40244211_2290`
|
- Contrast, page orientation: `micro_IA40244211_2290`
|
||||||
- Crop, low quality fiche: `micro_IA40386420_0689`
|
- Crop, low quality fiche: `micro_IA40386420_0689`
|
||||||
- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages: `micro_IA40386007_0012`
|
- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages:
|
||||||
|
`micro_IA40386007_0012`
|
||||||
|
|
|
||||||
2
main.py
2
main.py
|
|
@ -13,6 +13,8 @@ from microqa.items import fetch_item, url_encode
|
||||||
from microqa.engine import analyze_doc
|
from microqa.engine import analyze_doc
|
||||||
|
|
||||||
|
|
||||||
|
# Hard-coded Phonograph URLs are included for convenience and are relevant only
|
||||||
|
# to the official deployment.
|
||||||
GUI_DOCS_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16583/p/019b6375173c76139afa91a356f97583"
|
GUI_DOCS_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16583/p/019b6375173c76139afa91a356f97583"
|
||||||
GUI_PAGES_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16604/p/019b6379b1487b1e8791bd6486804452"
|
GUI_PAGES_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16604/p/019b6379b1487b1e8791bd6486804452"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
[tools]
|
[tools]
|
||||||
uv = "latest"
|
uv = "latest"
|
||||||
watchexec = "latest"
|
|
||||||
|
|
||||||
[env]
|
[env]
|
||||||
|
|
||||||
|
|
|
||||||
159
one_off.py
159
one_off.py
|
|
@ -1,159 +0,0 @@
|
||||||
import json
|
|
||||||
import re
|
|
||||||
from argparse import ArgumentParser
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from multiprocessing.pool import ThreadPool
|
|
||||||
from sys import stderr, stdin, stdout
|
|
||||||
|
|
||||||
from engine import analyze_item
|
|
||||||
|
|
||||||
|
|
||||||
OCR_LANGS = "eng+fra"
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = ArgumentParser()
|
|
||||||
parser.add_argument("--summarize", action="store_true")
|
|
||||||
parser.add_argument("-v", "--verbose", action="store_true")
|
|
||||||
parser.add_argument("-w", "--workers", type=int, default=1)
|
|
||||||
parser.add_argument("--page-margin-px", type=int, default=50)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
# Process STDIN line by line, where each line contains one or more item IDs
|
|
||||||
# separated by commas or whitespace.
|
|
||||||
for line in stdin:
|
|
||||||
item_ids = [value for value in re.split(r",|\s", line) if value]
|
|
||||||
with ThreadPool(args.workers) as pool:
|
|
||||||
if args.verbose:
|
|
||||||
print(f"Running with {args.workers} workers.", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
if args.summarize:
|
|
||||||
pool.map(
|
|
||||||
_summarize_item_to_stdout,
|
|
||||||
[
|
|
||||||
ItemTask(
|
|
||||||
item_id=item_id,
|
|
||||||
page_margin_px=args.page_margin_px,
|
|
||||||
verbose=args.verbose,
|
|
||||||
)
|
|
||||||
for item_id in item_ids
|
|
||||||
],
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
pool.map(
|
|
||||||
_analyze_item_to_stdout,
|
|
||||||
[
|
|
||||||
ItemTask(
|
|
||||||
item_id=item_id,
|
|
||||||
page_margin_px=args.page_margin_px,
|
|
||||||
verbose=args.verbose,
|
|
||||||
)
|
|
||||||
for item_id in item_ids
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class ItemTask:
|
|
||||||
item_id: str
|
|
||||||
page_margin_px: int
|
|
||||||
verbose: bool
|
|
||||||
|
|
||||||
|
|
||||||
def _summarize_item_to_stdout(task):
|
|
||||||
item_id = task.item_id
|
|
||||||
page_margin_px = task.page_margin_px
|
|
||||||
verbose = task.verbose
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Summarizing item {item_id}...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
|
|
||||||
analysis = analyze_item(item_id, parallel=4, verbose=verbose)
|
|
||||||
|
|
||||||
# 3 or more blank pages in a row is a flag.
|
|
||||||
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
|
||||||
if len(analysis["pages"]) >= CONSECUTIVE_BLANKS_THRESHOLD:
|
|
||||||
consecutive_blanks = [page["blank"] for page in analysis["pages"]]
|
|
||||||
for _ in range(1, CONSECUTIVE_BLANKS_THRESHOLD):
|
|
||||||
consecutive_blanks = [
|
|
||||||
value and consecutive_blanks[i]
|
|
||||||
for i, value in enumerate(consecutive_blanks[1:])
|
|
||||||
]
|
|
||||||
consecutive_blanks = [
|
|
||||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
|
||||||
for i, value in enumerate(consecutive_blanks[1:])
|
|
||||||
if value and not consecutive_blanks[i]
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
consecutive_blanks = []
|
|
||||||
|
|
||||||
# 3 or more blurry pages in a row is a flag.
|
|
||||||
CONSECUTIVE_BLURRY_THRESHOLD = 3
|
|
||||||
SHARPNESS_THRESHOLD = 0.1
|
|
||||||
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
|
|
||||||
consecutive_blurry = [
|
|
||||||
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
|
|
||||||
]
|
|
||||||
for _ in range(1, CONSECUTIVE_BLURRY_THRESHOLD):
|
|
||||||
consecutive_blurry = [
|
|
||||||
value and consecutive_blurry[i]
|
|
||||||
for i, value in enumerate(consecutive_blurry[1:])
|
|
||||||
]
|
|
||||||
consecutive_blurry = [
|
|
||||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
|
||||||
for i, value in enumerate(consecutive_blurry[1:])
|
|
||||||
if value and not consecutive_blurry[i]
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
consecutive_blurry = []
|
|
||||||
|
|
||||||
check_orientation = [
|
|
||||||
i + 1
|
|
||||||
for i, page in enumerate(analysis["pages"])
|
|
||||||
if not page["ocr_orientation_match"]
|
|
||||||
]
|
|
||||||
|
|
||||||
check_crop = [
|
|
||||||
i + 1
|
|
||||||
for i, page in enumerate(analysis["pages"])
|
|
||||||
if page["text_margin_px"] < page_margin_px
|
|
||||||
]
|
|
||||||
|
|
||||||
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
|
||||||
print(
|
|
||||||
json.dumps(
|
|
||||||
{
|
|
||||||
"item_id": item_id,
|
|
||||||
"check_orientation": check_orientation,
|
|
||||||
"check_crop": check_crop,
|
|
||||||
"consecutive_blanks": consecutive_blanks,
|
|
||||||
"consecutive_blurry": consecutive_blurry,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
)
|
|
||||||
stdout.flush()
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Done summarizing item {item_id}.", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
|
|
||||||
|
|
||||||
def _analyze_item_to_stdout(task):
|
|
||||||
item_id = task.item_id
|
|
||||||
verbose = task.verbose
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Analyzing item {item_id}...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
|
|
||||||
print(json.dumps(analyze_item(item_id, parallel=6, verbose=verbose)))
|
|
||||||
stdout.flush()
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Done analyzing item {item_id}.", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
Loading…
Add table
Reference in a new issue