clean up outdated files
This commit is contained in:
parent
b19b0bfb07
commit
35f750c28d
4 changed files with 73 additions and 183 deletions
94
README.md
94
README.md
|
|
@ -1,36 +1,83 @@
|
|||
# MicroQA
|
||||
|
||||
QA assistant for the Internet Archive's microfiche scanning team.
|
||||
MicroQA assists with quality assurance for the Internet Archive's microfiche
|
||||
scanning team by screening for indicators of common scanning and post-processing
|
||||
mishaps including missed focus, disorientation, and over-cropping. As the
|
||||
Democracy's Library project expands to archive millions of pages of official
|
||||
documents, automated screening with OCR and conventional image processing
|
||||
multiplies the human QA team's capacity to conduct targeted spot checks,
|
||||
re-reviews, and manual re-scans.
|
||||
|
||||
## Usage
|
||||
Two pluggable OCR back-ends are available:
|
||||
|
||||
Analyze page statistics for item:
|
||||
- [Tesseract](https://github.com/tesseract-ocr/tesseract) -- CPU based,
|
||||
reliable, fast multi-lingual support
|
||||
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) -- 1-shot orientation
|
||||
detection, offers GPU or NPU acceleration
|
||||
|
||||
## Quickstart
|
||||
|
||||
[Mise-en-place](https://mise.jdx.dev/) manages top-level dev dependencies,
|
||||
except for OCR back-ends. If you already have a recent version of `uv`
|
||||
installed, you may skip `mise install` and use the existing binary instead.
|
||||
|
||||
```sh
|
||||
echo 'micro_IA04244212_1665' | uv run main.py | jq
|
||||
mise install
|
||||
|
||||
uv sync
|
||||
|
||||
sudo apt install tesseract-ocr # Debian, Ubuntu
|
||||
sudo dnf install tesseract # Fedora
|
||||
brew install tesseract # macOS
|
||||
|
||||
# Run page diagnostics on a single Archive item:
|
||||
uv run diagnostics.py micro_IA04244212_1665 --verbose
|
||||
```
|
||||
|
||||
Paste item IDs from clipboard and summarize all (`tr` command collapses input to
|
||||
a single line so that items are summarized in parallel):
|
||||
## Production Usage
|
||||
|
||||
```sh
|
||||
pbpaste | tr '\n' ',' | uv run main.py --summarize --workers 4 -v | jq
|
||||
```
|
||||
|
||||
Query a pre-populated database for suspect pages:
|
||||
MicroQA integrates with [Phonograph](https://www.phono.dev/) to coordinate
|
||||
between concurrent workers, store data, and make analysis results navigable at
|
||||
scale. Phonograph exposes a PostgreSQL interface, so MicroQA can run
|
||||
independently if supplied a PostgreSQL database with a schema equivalent to:
|
||||
|
||||
```sql
|
||||
select 'https://archive.org/details/' || items.id,
|
||||
pages.page,
|
||||
pages.orientation_match,
|
||||
pages.sharpness,
|
||||
pages.text_margin_px
|
||||
from items
|
||||
join pages on pages.item = items.id
|
||||
where pages.orientation_match = 0
|
||||
or pages.sharpness < 0.07
|
||||
or (pages.text_margin_px > -1 and pages.text_margin_px < 50)
|
||||
order by items.id;
|
||||
create schema phono;
|
||||
|
||||
create table phono.items (
|
||||
_id uuid primary key not null default gen_random_uuid(),
|
||||
ia_id text,
|
||||
oai_updatedate timestamptz,
|
||||
started_date timestamptz,
|
||||
completed_date timestamptz,
|
||||
review_date text,
|
||||
url text,
|
||||
docs_link text
|
||||
);
|
||||
|
||||
create table phono.docs (
|
||||
_id uuid primary key not null default gen_random_uuid(),
|
||||
name text,
|
||||
item uuid,
|
||||
pages_link text
|
||||
);
|
||||
|
||||
create table phono.pages (
|
||||
_id uuid primary key not null default gen_random_uuid(),
|
||||
doc uuid,
|
||||
page numeric,
|
||||
page_angle numeric,
|
||||
sharpness numeric,
|
||||
is_blank text,
|
||||
text_margin_px numeric,
|
||||
url text
|
||||
);
|
||||
```
|
||||
|
||||
You can then start a worker with:
|
||||
|
||||
```sh
|
||||
uv run main.py --database <DATABASE URL>
|
||||
```
|
||||
|
||||
## Test Cases
|
||||
|
|
@ -38,4 +85,5 @@ order by items.id;
|
|||
- Blurry pages: `micro_IA40244209_0984`
|
||||
- Contrast, page orientation: `micro_IA40244211_2290`
|
||||
- Crop, low quality fiche: `micro_IA40386420_0689`
|
||||
- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages: `micro_IA40386007_0012`
|
||||
- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages:
|
||||
`micro_IA40386007_0012`
|
||||
|
|
|
|||
2
main.py
2
main.py
|
|
@ -13,6 +13,8 @@ from microqa.items import fetch_item, url_encode
|
|||
from microqa.engine import analyze_doc
|
||||
|
||||
|
||||
# Hard-coded Phonograph URLs are included for convenience and are relevant only
|
||||
# to the official deployment.
|
||||
GUI_DOCS_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16583/p/019b6375173c76139afa91a356f97583"
|
||||
GUI_PAGES_PORTAL_URL = "https://app.phono.dev/w/019b0a7dd865788e83b8cde7fcc99c9e/r/16604/p/019b6379b1487b1e8791bd6486804452"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
[tools]
|
||||
uv = "latest"
|
||||
watchexec = "latest"
|
||||
|
||||
[env]
|
||||
|
||||
|
|
|
|||
159
one_off.py
159
one_off.py
|
|
@ -1,159 +0,0 @@
|
|||
import json
|
||||
import re
|
||||
from argparse import ArgumentParser
|
||||
from dataclasses import dataclass
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from sys import stderr, stdin, stdout
|
||||
|
||||
from engine import analyze_item
|
||||
|
||||
|
||||
# Tesseract-style language selector (English + French). Not referenced in
# this module's visible code — presumably consumed by the engine; TODO confirm
# before removing.
OCR_LANGS = "eng+fra"
|
||||
|
||||
|
||||
def main():
    """Read Archive item IDs from stdin and analyze or summarize each one.

    Every stdin line may hold one or more item IDs separated by commas or
    whitespace. The IDs on a line are fanned out to a thread pool of
    ``--workers`` threads before the next line is read. Results go to
    stdout as JSON; progress messages (with ``--verbose``) go to stderr.
    """
    parser = ArgumentParser()
    parser.add_argument("--summarize", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-w", "--workers", type=int, default=1)
    parser.add_argument("--page-margin-px", type=int, default=50)
    args = parser.parse_args()

    # Both modes build identical task lists; only the worker function
    # differs, so pick it once instead of duplicating the pool.map call.
    process = _summarize_item_to_stdout if args.summarize else _analyze_item_to_stdout

    # Process STDIN line by line, where each line contains one or more item IDs
    # separated by whitespace.
    for line in stdin:
        item_ids = [value for value in re.split(r",|\s", line) if value]
        with ThreadPool(args.workers) as pool:
            if args.verbose:
                print(f"Running with {args.workers} workers.", file=stderr)
                stderr.flush()
            tasks = [
                ItemTask(
                    item_id=item_id,
                    page_margin_px=args.page_margin_px,
                    verbose=args.verbose,
                )
                for item_id in item_ids
            ]
            pool.map(process, tasks)
|
||||
|
||||
|
||||
@dataclass
class ItemTask:
    """Per-item unit of work handed to thread-pool workers by ``main``."""

    # Archive item identifier, e.g. "micro_IA40244209_0984".
    item_id: str
    # Minimum acceptable text margin in pixels; pages below it get a crop flag.
    page_margin_px: int
    # When true, the worker emits progress messages on stderr.
    verbose: bool
|
||||
|
||||
|
||||
def _summarize_item_to_stdout(task):
|
||||
item_id = task.item_id
|
||||
page_margin_px = task.page_margin_px
|
||||
verbose = task.verbose
|
||||
|
||||
if verbose:
|
||||
print(f"Summarizing item {item_id}...", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
analysis = analyze_item(item_id, parallel=4, verbose=verbose)
|
||||
|
||||
# 3 or more blank pages in a row is a flag.
|
||||
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
||||
if len(analysis["pages"]) >= CONSECUTIVE_BLANKS_THRESHOLD:
|
||||
consecutive_blanks = [page["blank"] for page in analysis["pages"]]
|
||||
for _ in range(1, CONSECUTIVE_BLANKS_THRESHOLD):
|
||||
consecutive_blanks = [
|
||||
value and consecutive_blanks[i]
|
||||
for i, value in enumerate(consecutive_blanks[1:])
|
||||
]
|
||||
consecutive_blanks = [
|
||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
||||
for i, value in enumerate(consecutive_blanks[1:])
|
||||
if value and not consecutive_blanks[i]
|
||||
]
|
||||
else:
|
||||
consecutive_blanks = []
|
||||
|
||||
# 3 or more blank pages in a row is a flag.
|
||||
CONSECUTIVE_BLURRY_THRESHOLD = 3
|
||||
SHARPNESS_THRESHOLD = 0.1
|
||||
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
|
||||
consecutive_blurry = [
|
||||
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
|
||||
]
|
||||
for _ in range(1, CONSECUTIVE_BLURRY_THRESHOLD):
|
||||
consecutive_blurry = [
|
||||
value and consecutive_blurry[i]
|
||||
for i, value in enumerate(consecutive_blurry[1:])
|
||||
]
|
||||
consecutive_blurry = [
|
||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
||||
for i, value in enumerate(consecutive_blurry[1:])
|
||||
if value and not consecutive_blurry[i]
|
||||
]
|
||||
else:
|
||||
consecutive_blurry = []
|
||||
|
||||
check_orientation = [
|
||||
i + 1
|
||||
for i, page in enumerate(analysis["pages"])
|
||||
if not page["ocr_orientation_match"]
|
||||
]
|
||||
|
||||
check_crop = [
|
||||
i + 1
|
||||
for i, page in enumerate(analysis["pages"])
|
||||
if page["text_margin_px"] < page_margin_px
|
||||
]
|
||||
|
||||
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
||||
print(
|
||||
json.dumps(
|
||||
{
|
||||
"item_id": item_id,
|
||||
"check_orientation": check_orientation,
|
||||
"check_crop": check_crop,
|
||||
"consecutive_blanks": consecutive_blanks,
|
||||
"consecutive_blurry": consecutive_blurry,
|
||||
}
|
||||
)
|
||||
)
|
||||
stdout.flush()
|
||||
|
||||
if verbose:
|
||||
print(f"Done summarizing item {item_id}.", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
|
||||
def _analyze_item_to_stdout(task):
    """Run the full page analysis for one item and emit its JSON on stdout."""
    item_id = task.item_id
    verbose = task.verbose

    def trace(message):
        # Progress goes to stderr so stdout stays machine-readable JSON.
        print(message, file=stderr)
        stderr.flush()

    if verbose:
        trace(f"Analyzing item {item_id}...")

    result = analyze_item(item_id, parallel=6, verbose=verbose)
    print(json.dumps(result))
    stdout.flush()

    if verbose:
        trace(f"Done analyzing item {item_id}.")
|
||||
|
||||
|
||||
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Reference in a new issue