rewrite to engine.py
This commit is contained in:
parent
815934ad23
commit
4d9161b043
6 changed files with 550 additions and 479 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -1,2 +1,4 @@
|
|||
/target
|
||||
/data
|
||||
*.db
|
||||
__pycache__
|
||||
|
|
|
|||
|
|
@ -38,3 +38,4 @@ order by items.id;
|
|||
- Blurry pages: `micro_IA40244209_0984`
|
||||
- Contrast, page orientation: `micro_IA40244211_2290`
|
||||
- Crop, low quality fiche: `micro_IA40386420_0689`
|
||||
- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages: `micro_IA40386007_0012`
|
||||
|
|
|
|||
134
cache.py
134
cache.py
|
|
@ -1,134 +0,0 @@
|
|||
import re
|
||||
import sqlite3
|
||||
import traceback
|
||||
from argparse import ArgumentParser
|
||||
from datetime import datetime
|
||||
from time import sleep
|
||||
|
||||
import requests
|
||||
|
||||
from main import analyze_item
|
||||
|
||||
|
||||
def main():
    """Poll archive.org for newly reviewed microfiche items and store
    per-page quality metrics in a local SQLite database.

    Runs forever: each cycle pulls new item IDs, analyzes every item that
    has no analyzed_date yet, then sleeps for an hour.
    """
    # Local import keeps the fix self-contained; timezone-aware "now"
    # replaces the deprecated datetime.utcnow().
    from datetime import timezone

    parser = ArgumentParser()
    parser.add_argument("--database", default="./microqa.db")
    parser.add_argument("--cpus", type=int, default=2)
    parser.add_argument("--earliest-review-date", default="20250701")
    args = parser.parse_args()

    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        # Schema is created lazily so the script can bootstrap an empty DB.
        cur.execute("""
            create table if not exists items (
                id text primary key not null,
                review_date text not null,
                analyzed_date text
            )""")
        cur.execute("""
            create table if not exists pages (
                id int primary key,
                item text not null,
                page int not null,
                orientation_match boolean not null,
                sharpness real not null,
                is_blank boolean not null,
                text_margin_px int not null
            )""")
        conn.commit()

        while True:
            print("Pulling item IDs")
            pull_new_item_ids(conn, args.earliest_review_date)
            print("Done.")
            res = cur.execute(
                "select id from items where analyzed_date is null order by review_date"
            )
            for (item_id,) in res.fetchall():
                # Retry transient failures (network hiccups etc.) a few
                # times before moving on to the next item.
                N_ATTEMPTS = 3
                for _ in range(N_ATTEMPTS):
                    try:
                        print(f"Processing {item_id}")
                        analysis = analyze_item(
                            item_id, parallel=args.cpus, verbose=True
                        )
                        for i, page in enumerate(analysis["pages"]):
                            cur.execute(
                                """
                                insert into pages (
                                    item,
                                    page,
                                    orientation_match,
                                    sharpness,
                                    is_blank,
                                    text_margin_px
                                ) values (?, ?, ?, ?, ?, ?)""",
                                [
                                    item_id,
                                    i + 1,
                                    page["ocr_orientation_match"],
                                    page["sharpness"],
                                    page["blank"],
                                    page["text_margin_px"],
                                ],
                            )
                        cur.execute(
                            "update items set analyzed_date = ? where id = ?",
                            [
                                datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"),
                                item_id,
                            ],
                        )
                        conn.commit()
                        print("Done")
                        break
                    except Exception as err:
                        print(err)
                        traceback.print_tb(err.__traceback__)
                        # Back off briefly, then retry. Previously an
                        # unconditional `break` followed this handler,
                        # making the N_ATTEMPTS retry loop dead code.
                        sleep(15)
            sleep(3600)
|
||||
|
||||
|
||||
def pull_new_item_ids(conn, earliest_review_date):
    """Fetch IDs of reviewed microfiche items from the archive.org advanced
    search API and insert any new ones into the items table.

    Resumes from the most recent review_date already stored in the database,
    falling back to earliest_review_date when the table is empty. Duplicate
    IDs are ignored via `on conflict do nothing`.
    """
    cur = conn.cursor()
    res = cur.execute("select review_date from items order by review_date desc limit 1")
    (latest_review_date,) = res.fetchone() or (earliest_review_date,)
    print(latest_review_date)

    query = f"""
        collection:(microfiche)
        AND contributor:(Internet Archive)
        AND micro_review:(done)
        AND review_date:[{latest_review_date} TO null]
    """
    sort = "reviewdate asc"

    # Format for API: collapse all whitespace runs into "+" separators.
    query = re.sub(r"\s+", "+", query.strip())
    sort = re.sub(r"\s+", "+", sort.strip())

    # Page through results 100 at a time; an empty page means we're done.
    for i in range(1, 999):
        resp = requests.get(
            f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=100&page={i}&output=json",
            timeout=60,  # avoid hanging forever on a stalled connection
        )
        resp.raise_for_status()
        body = resp.json()
        if len(body["response"]["docs"]) == 0:
            break
        cur.executemany(
            "insert into items (id, review_date) values (?, ?) on conflict do nothing",
            [
                (doc["identifier"], doc["review_date"])
                for doc in body["response"]["docs"]
            ],
        )
        conn.commit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
239
engine.py
Normal file
239
engine.py
Normal file
|
|
@ -0,0 +1,239 @@
|
|||
import urllib.parse
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from multiprocessing import Pool
|
||||
from sys import stderr
|
||||
from zipfile import ZipFile
|
||||
|
||||
import numpy as np
|
||||
import pytesseract
|
||||
import requests
|
||||
from PIL import Image, ImageFilter
|
||||
|
||||
|
||||
def analyze_item(item_id, ocr_langs="eng+fra", parallel=1, verbose=False):
    """Download an archive.org item's JP2 page scans and analyze each page.

    Parameters:
        item_id    Archive.org item identifier.
        ocr_langs  Tesseract language codes ("+"-separated, 3 letters each).
        parallel   Number of worker processes for per-page analysis.
        verbose    Print progress messages to stderr.

    Returns:
        {"pages": [<analyze_page result>, ...]} in leaf order, covering only
        leaves that appear in the item's page_numbers metadata.
    """
    escaped_item_id = urllib.parse.quote(item_id, safe="")

    if verbose:
        print("Downloading...", file=stderr)
        stderr.flush()
    page_nums_resp = requests.get(
        f"https://archive.org/metadata/{escaped_item_id}/page_numbers/pages",
        timeout=300,  # avoid hanging forever on a stalled connection
    )
    page_nums_resp.raise_for_status()
    page_nums = page_nums_resp.json()["result"]

    # NOTE: the whole zip is buffered in memory; items are assumed to be
    # small enough for that to be acceptable.
    zip_resp = requests.get(
        f"https://archive.org/download/{escaped_item_id}/{escaped_item_id}_jp2.zip",
        timeout=300,
    )
    zip_resp.raise_for_status()

    if verbose:
        print("Decompressing...", file=stderr)
        stderr.flush()
    tasks = []
    with ZipFile(BytesIO(zip_resp.content)) as jp_zip:
        for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
            for page_index, page_num_info in enumerate(page_nums):
                if page_num_info["leafNum"] == leaf_num:
                    # Stop iterating and keep page_index set to the current item.
                    break
            else:
                # Set to -1 to indicate that leaf was not found in page_num list.
                page_index = -1

            if page_index != -1:
                with jp_zip.open(file_name) as jp_file:
                    im = Image.open(jp_file).convert("L")
                    # Bound the long edge to keep memory use and OCR time in
                    # check.
                    im.thumbnail((3200, 3200))
                    tasks.append(
                        PageAnalysisTask(
                            im=im,
                            ocr_langs=ocr_langs,
                        )
                    )

    if verbose:
        print(f"Processing {len(page_nums)} pages...", file=stderr)
        stderr.flush()

    if parallel > 1:
        # Parallelize image processing and OCR of pages across up to n cores.
        with Pool(parallel) as pool:
            return {"pages": pool.map(analyze_page, tasks)}

    return {"pages": [analyze_page(task) for task in tasks]}
|
||||
|
||||
|
||||
@dataclass
class PageAnalysisTask:
    """Work unit handed to analyze_page().

    Attributes:
        im         Grayscale PIL Image, already scaled with .thumbnail() so
                   that its long edge is at most 3200 px.
        ocr_langs  Tesseract language codes, "+"-separated (3 letters each).
    """

    im: Image.Image
    ocr_langs: str = "eng+fra"
|
||||
|
||||
def analyze_page(task):
    """Analyze one page image for blankness, sharpness, orientation and crop.

    Parameters:
        task  PageAnalysisTask carrying the pre-scaled grayscale image and
              the OCR language list.

    Returns a dict with keys:
        blank                  True when the page looks uniformly bright.
        ocr_orientation_match  True when the best OCR orientation is the
                               stored one (also True for blank/no-word pages).
        size_analyzed          (width, height) of the analyzed image.
        sharpness              Best per-segment sharpness, 0..1.
        text_margin_px         Distance in px from detected text to the
                               nearest image edge, or -1 when unknown.
    """
    # Ignore a 10% border on each side so fiche edges and scanner artifacts
    # don't influence the blank/sharpness checks.
    im_cropped = task.im.crop(
        (
            task.im.size[0] * 0.1,
            task.im.size[1] * 0.1,
            task.im.size[0] * 0.9,
            task.im.size[1] * 0.9,
        )
    )

    # If even the darkest pixel is bright, treat the page as blank.
    is_blank = im_cropped.getextrema()[0] > 255 * 0.8

    if is_blank:
        max_sharpness = 1.0  # float, consistent with the non-blank branch
        ocr_orientation_match = True
        text_margin_px = -1
    else:
        # Measure sharpness per segment and keep the best: the page is OK if
        # at least one region is in focus.
        max_sharpness = 0.0
        if im_cropped.size[0] < im_cropped.size[1]:
            # Page is in portrait orientation.
            segments_x = 2
            segments_y = 3
        else:
            # Page is in landscape orientation.
            segments_x = 3
            segments_y = 2
        for i in range(segments_x):
            for j in range(segments_y):
                max_sharpness = max(
                    max_sharpness,
                    analyze_sharpness(
                        im_cropped.crop(
                            (
                                im_cropped.size[0] / segments_x * i,
                                im_cropped.size[1] / segments_y * j,
                                im_cropped.size[0] / segments_x * (i + 1),
                                im_cropped.size[1] / segments_y * (j + 1),
                            )
                        )
                    ),
                )

        # OCR the page in all four 90-degree rotations; the rotation that
        # yields the most confident words is taken as the true orientation.
        OCR_SCALE = 1
        best_ocr_score = -1
        best_ocr_words = None
        best_ocr_orientation = -1
        for orientation in range(4):
            im_rotated = task.im.resize(
                np.int_(np.array(task.im.size) * OCR_SCALE)
            ).rotate(90 * orientation, expand=True)
            ocr = pytesseract.image_to_data(
                im_rotated,
                lang=task.ocr_langs,
                config=f"--oem 1 --dpi {int(300 * OCR_SCALE)} --tessdata-dir ./data/tessdata_fast-4.1.0",
                output_type=pytesseract.Output.DATAFRAME,
            ).fillna({"text": ""})
            # Keep only words that Tesseract is confident in, and which are
            # oriented horizontally.
            words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
            # Keep only alphabetical words of 4 or more characters.
            words = words[
                words.apply(
                    lambda row: re.fullmatch(r"[a-zA-Z]{4,}", str(row["text"]))
                    is not None,
                    axis=1,
                )
            ]
            if words.shape[0] > best_ocr_score:
                best_ocr_score = words.shape[0]
                best_ocr_orientation = orientation
                best_ocr_words = words
            if best_ocr_score > 50:
                # Unlikely that another orientation will have more words, so
                # stop eating up CPU.
                break

        if best_ocr_words.empty:
            # No usable words: cannot judge orientation or text margins.
            ocr_orientation_match = True
            text_margin_px = -1
        else:
            ocr_orientation_match = best_ocr_orientation == 0

            # Dimensions of the rotated image the best OCR pass ran on.
            best_ocr_dims = OCR_SCALE * np.array(
                task.im.size
                if best_ocr_orientation % 2 == 0
                else (task.im.size[1], task.im.size[0])
            )

            # Distance of every word's bounding box to each of the four
            # edges, sorted ascending. np.concatenate instead of np.concat:
            # the latter alias only exists in NumPy >= 2.0.
            word_margins_all_directions = np.sort(
                np.int_(
                    np.concatenate(
                        (
                            best_ocr_words["left"].to_numpy(),
                            best_ocr_words["top"].to_numpy(),
                            best_ocr_dims[0]
                            - (
                                best_ocr_words["left"] + best_ocr_words["width"]
                            ).to_numpy(),
                            best_ocr_dims[1]
                            - (
                                best_ocr_words["top"] + best_ocr_words["height"]
                            ).to_numpy(),
                        )
                    )
                    # Transform back into original image pixel density
                    / OCR_SCALE
                )
            )
            # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
            SKIP_WORDS = 2
            text_margin_px = int(
                word_margins_all_directions[SKIP_WORDS]
                if word_margins_all_directions.shape[0] > SKIP_WORDS
                else -1
            )

    return {
        "blank": is_blank,
        "ocr_orientation_match": ocr_orientation_match,
        "size_analyzed": task.im.size,
        "sharpness": max_sharpness,
        "text_margin_px": text_margin_px,
    }
|
||||
|
||||
|
||||
def analyze_sharpness(im):
    """
    Crudely quantifies the "sharpness" of edges in an image, on a scale of 0
    to 1. The scale is not linear with respect to scan quality: anything
    above 0.1 is usually fine.
    """
    arr = np.asarray(im)

    # Normalize contrast based on brightest and darkest pixels. For example,
    # NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80%
    # fall between 10% brightness and 90% brightness. In practice, a value
    # around 0.02 seems to work fairly well.
    NORM_QUANTILE = 0.03
    pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
        arr, NORM_QUANTILE
    )
    if pixel_range == 0:
        # Degenerate (constant) image; nothing to normalize.
        arr_normalized = arr
    else:
        arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
        arr_normalized = (
            arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE
        )
        arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255)

    # "Sharpness" is determined by measuring the median intensity of pixels
    # near edges, after an edge detection filter has been applied to the image.
    edges_arr = np.asarray(
        Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
    )
    EDGE_THRESHOLD = 8
    edge_pixels = edges_arr[edges_arr > EDGE_THRESHOLD]
    if edge_pixels.size == 0:
        # Featureless crop with no edges at all: previously this took the
        # median of an empty array, which yields NaN (and a warning).
        return 0.0
    return np.median(edge_pixels) / 255
|
||||
494
main.py
494
main.py
|
|
@ -1,371 +1,175 @@
|
|||
import json
|
||||
import re
|
||||
import urllib.parse
|
||||
import sqlite3
|
||||
import traceback
|
||||
from argparse import ArgumentParser
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from multiprocessing import Pool
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from sys import stderr, stdin, stdout
|
||||
from zipfile import ZipFile
|
||||
from datetime import datetime
|
||||
from sys import stderr
|
||||
from time import sleep
|
||||
|
||||
import numpy as np
|
||||
import pytesseract
|
||||
import requests
|
||||
from PIL import Image, ImageFilter
|
||||
|
||||
|
||||
OCR_LANGS = "eng+fra"
|
||||
from engine import analyze_item
|
||||
|
||||
|
||||
def main():
|
||||
parser = ArgumentParser()
|
||||
parser.add_argument("--summarize", action="store_true")
|
||||
parser.add_argument("-v", "--verbose", action="store_true")
|
||||
parser.add_argument("-w", "--workers", type=int, default=1)
|
||||
parser.add_argument("--page-margin-px", type=int, default=50)
|
||||
parser.add_argument("--database", default="./microqa.db")
|
||||
parser.add_argument("--cpus", type=int, default=2)
|
||||
parser.add_argument("--earliest-review-date", default="20250701")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Process STDIN line by line, where each line contains one or more item IDs
|
||||
# separated by whitespace.
|
||||
for line in stdin:
|
||||
item_ids = [value for value in re.split(r",|\s", line) if value]
|
||||
with ThreadPool(args.workers) as pool:
|
||||
if args.verbose:
|
||||
print(f"Running with {args.workers} workers.", file=stderr)
|
||||
stderr.flush()
|
||||
if args.summarize:
|
||||
pool.map(
|
||||
_summarize_item_to_stdout,
|
||||
[
|
||||
ItemTask(
|
||||
item_id=item_id,
|
||||
page_margin_px=args.page_margin_px,
|
||||
verbose=args.verbose,
|
||||
with sqlite3.connect(args.database) as conn:
|
||||
cur = conn.cursor()
|
||||
cur.execute("""
|
||||
create table if not exists items (
|
||||
id text primary key not null,
|
||||
review_date text not null,
|
||||
skip_analysis bool not null,
|
||||
analyzed_date text
|
||||
)""")
|
||||
cur.execute("""
|
||||
create table if not exists pages (
|
||||
id int primary key,
|
||||
item text not null,
|
||||
page int not null,
|
||||
orientation_match boolean not null,
|
||||
sharpness real not null,
|
||||
is_blank boolean not null,
|
||||
text_margin_px int not null
|
||||
)""")
|
||||
cur.execute("create index if not exists review_date_idx on items (review_date)")
|
||||
cur.execute(
|
||||
"create index if not exists analyzed_date_idx on items (analyzed_date)"
|
||||
)
|
||||
for item_id in item_ids
|
||||
cur.execute("create index if not exists item_idx on pages (item)")
|
||||
cur.execute(
|
||||
"create unique index if not exists item_page_idx on pages (item, page)"
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
while True:
|
||||
print("Pulling item IDs")
|
||||
pull_new_item_ids(conn, args.earliest_review_date)
|
||||
print("Done.")
|
||||
res = cur.execute("""
|
||||
select id
|
||||
from items
|
||||
where analyzed_date is null
|
||||
and skip_analysis = false
|
||||
order by review_date
|
||||
""")
|
||||
for (item_id,) in res.fetchall():
|
||||
N_ATTEMPTS = 3
|
||||
for _ in range(N_ATTEMPTS):
|
||||
try:
|
||||
print(f"Processing {item_id}")
|
||||
analysis = analyze_item(
|
||||
item_id, parallel=args.cpus, verbose=True
|
||||
)
|
||||
for i, page in enumerate(analysis["pages"]):
|
||||
cur.execute(
|
||||
"""
|
||||
insert into pages (
|
||||
item,
|
||||
page,
|
||||
orientation_match,
|
||||
sharpness,
|
||||
is_blank,
|
||||
text_margin_px
|
||||
) values (
|
||||
?,
|
||||
?,
|
||||
?,
|
||||
?,
|
||||
?,
|
||||
?
|
||||
)""",
|
||||
[
|
||||
item_id,
|
||||
i + 1,
|
||||
page["ocr_orientation_match"],
|
||||
page["sharpness"],
|
||||
page["blank"],
|
||||
page["text_margin_px"],
|
||||
],
|
||||
)
|
||||
else:
|
||||
pool.map(
|
||||
_analyze_item_to_stdout,
|
||||
[
|
||||
ItemTask(
|
||||
item_id=item_id,
|
||||
page_margin_px=args.page_margin_px,
|
||||
verbose=args.verbose,
|
||||
cur.execute(
|
||||
"update items set analyzed_date = ? where id = ?",
|
||||
[datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id],
|
||||
)
|
||||
for item_id in item_ids
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ItemTask:
|
||||
item_id: str
|
||||
page_margin_px: int
|
||||
verbose: bool
|
||||
|
||||
|
||||
def _summarize_item_to_stdout(task):
|
||||
item_id = task.item_id
|
||||
page_margin_px = task.page_margin_px
|
||||
verbose = task.verbose
|
||||
|
||||
if verbose:
|
||||
print(f"Summarizing item {item_id}...", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
analysis = analyze_item(item_id, parallel=4, verbose=verbose)
|
||||
|
||||
# 3 or more blank pages in a row is a flag.
|
||||
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
||||
if len(analysis["pages"]) >= CONSECUTIVE_BLANKS_THRESHOLD:
|
||||
consecutive_blanks = [page["blank"] for page in analysis["pages"]]
|
||||
for _ in range(1, CONSECUTIVE_BLANKS_THRESHOLD):
|
||||
consecutive_blanks = [
|
||||
value and consecutive_blanks[i]
|
||||
for i, value in enumerate(consecutive_blanks[1:])
|
||||
]
|
||||
consecutive_blanks = [
|
||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
||||
for i, value in enumerate(consecutive_blanks[1:])
|
||||
if value and not consecutive_blanks[i]
|
||||
]
|
||||
else:
|
||||
consecutive_blanks = []
|
||||
|
||||
# 3 or more blurry pages in a row is a flag.
|
||||
CONSECUTIVE_BLURRY_THRESHOLD = 3
|
||||
SHARPNESS_THRESHOLD = 0.1
|
||||
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
|
||||
consecutive_blurry = [
|
||||
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
|
||||
]
|
||||
for _ in range(1, CONSECUTIVE_BLURRY_THRESHOLD):
|
||||
consecutive_blurry = [
|
||||
value and consecutive_blurry[i]
|
||||
for i, value in enumerate(consecutive_blurry[1:])
|
||||
]
|
||||
consecutive_blurry = [
|
||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
||||
for i, value in enumerate(consecutive_blurry[1:])
|
||||
if value and not consecutive_blurry[i]
|
||||
]
|
||||
else:
|
||||
consecutive_blurry = []
|
||||
|
||||
check_orientation = [
|
||||
i + 1
|
||||
for i, page in enumerate(analysis["pages"])
|
||||
if not page["ocr_orientation_match"]
|
||||
]
|
||||
|
||||
check_crop = [
|
||||
i + 1
|
||||
for i, page in enumerate(analysis["pages"])
|
||||
if page["text_margin_px"] < page_margin_px
|
||||
]
|
||||
|
||||
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
||||
print(
|
||||
json.dumps(
|
||||
{
|
||||
"item_id": item_id,
|
||||
"check_orientation": check_orientation,
|
||||
"check_crop": check_crop,
|
||||
"consecutive_blanks": consecutive_blanks,
|
||||
"consecutive_blurry": consecutive_blurry,
|
||||
}
|
||||
)
|
||||
)
|
||||
stdout.flush()
|
||||
|
||||
if verbose:
|
||||
print(f"Done summarizing item {item_id}.", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
|
||||
def _analyze_item_to_stdout(task):
|
||||
item_id = task.item_id
|
||||
verbose = task.verbose
|
||||
|
||||
if verbose:
|
||||
print(f"Analyzing item {item_id}...", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
print(json.dumps(analyze_item(item_id, parallel=4, verbose=verbose)))
|
||||
stdout.flush()
|
||||
|
||||
if verbose:
|
||||
print(f"Done analyzing item {item_id}.", file=stderr)
|
||||
stderr.flush()
|
||||
|
||||
|
||||
@dataclass
|
||||
class PageAnalysisTask:
|
||||
im: Image.Image
|
||||
page_index: int
|
||||
file_name: str
|
||||
|
||||
|
||||
def _analyze_page(task):
|
||||
im_original = task.im
|
||||
page_index = task.page_index
|
||||
file_name = task.file_name
|
||||
|
||||
im_cropped = im_original.crop(
|
||||
(
|
||||
im_original.size[0] * 0.1,
|
||||
im_original.size[1] * 0.1,
|
||||
im_original.size[0] * 0.9,
|
||||
im_original.size[1] * 0.9,
|
||||
)
|
||||
)
|
||||
|
||||
is_blank = im_cropped.getextrema()[0] > 255 * 0.8
|
||||
|
||||
if is_blank:
|
||||
max_sharpness = 1
|
||||
ocr_orientation_match = True
|
||||
text_margin_px = -1
|
||||
else:
|
||||
max_sharpness = 0.0
|
||||
if im_cropped.size[0] < im_cropped.size[1]:
|
||||
# Page is in portrait orientation.
|
||||
segments_x = 2
|
||||
segments_y = 3
|
||||
else:
|
||||
# Page is in landscape orientation.
|
||||
segments_x = 3
|
||||
segments_y = 2
|
||||
for i in range(segments_x):
|
||||
for j in range(segments_y):
|
||||
max_sharpness = max(
|
||||
max_sharpness,
|
||||
analyze_sharpness(
|
||||
im_cropped.crop(
|
||||
(
|
||||
im_cropped.size[0] / segments_x * i,
|
||||
im_cropped.size[1] / segments_y * j,
|
||||
im_cropped.size[0] / segments_x * (i + 1),
|
||||
im_cropped.size[1] / segments_y * (j + 1),
|
||||
)
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
best_ocr_score = -1
|
||||
best_ocr_words = None
|
||||
best_ocr_orientation = -1
|
||||
for orientation in range(4):
|
||||
im_rotated = im_original.rotate(90 * orientation, expand=True)
|
||||
ocr = pytesseract.image_to_data(
|
||||
im_rotated,
|
||||
lang=OCR_LANGS,
|
||||
config="--oem 1 --dpi 300 --tessdata-dir ./data/tessdata_fast-4.1.0",
|
||||
output_type=pytesseract.Output.DATAFRAME,
|
||||
).fillna({"text": ""})
|
||||
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
|
||||
words = words[
|
||||
words.apply(
|
||||
lambda row: re.fullmatch(r"[a-zA-Z]{4,}", row["text"]) is not None,
|
||||
axis=1,
|
||||
)
|
||||
]
|
||||
if words.shape[0] > best_ocr_score:
|
||||
best_ocr_score = words.shape[0]
|
||||
best_ocr_orientation = orientation
|
||||
best_ocr_words = words
|
||||
if best_ocr_score > 50:
|
||||
# Unlikely that another orientation will have more words, so
|
||||
# stop eating up CPU unnecessarily.
|
||||
conn.commit()
|
||||
print("Done")
|
||||
break
|
||||
|
||||
ocr_orientation_match = best_ocr_orientation == 0
|
||||
|
||||
best_ocr_dims = (
|
||||
im_original.size
|
||||
if best_ocr_orientation % 2 == 0
|
||||
else (im_original.size[1], im_original.size[0])
|
||||
)
|
||||
|
||||
word_margins_all_directions = np.sort(
|
||||
np.concat(
|
||||
(
|
||||
best_ocr_words["left"].to_numpy(),
|
||||
best_ocr_words["top"].to_numpy(),
|
||||
best_ocr_dims[0]
|
||||
- (best_ocr_words["left"] + best_ocr_words["width"]).to_numpy(),
|
||||
best_ocr_dims[1]
|
||||
- (best_ocr_words["top"] + best_ocr_words["height"]).to_numpy(),
|
||||
)
|
||||
)
|
||||
)
|
||||
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
||||
SKIP_WORDS = 2
|
||||
text_margin_px = (
|
||||
int(word_margins_all_directions[SKIP_WORDS])
|
||||
if word_margins_all_directions.shape[0] > SKIP_WORDS
|
||||
else -1
|
||||
)
|
||||
|
||||
return {
|
||||
"blank": is_blank,
|
||||
"file_name": file_name,
|
||||
"ocr_orientation_match": ocr_orientation_match,
|
||||
"page_index": page_index,
|
||||
"size": im_original.size,
|
||||
"sharpness": max_sharpness,
|
||||
"text_margin_px": text_margin_px,
|
||||
}
|
||||
|
||||
|
||||
def analyze_item(item_id, parallel=1, verbose=False):
|
||||
escaped_item_id = urllib.parse.quote(item_id, safe="")
|
||||
|
||||
if verbose:
|
||||
print("Downloading...", file=stderr)
|
||||
stderr.flush()
|
||||
page_nums_resp = requests.get(
|
||||
f"https://archive.org/metadata/{escaped_item_id}/page_numbers/pages"
|
||||
)
|
||||
page_nums_resp.raise_for_status()
|
||||
page_nums = page_nums_resp.json()["result"]
|
||||
|
||||
zip_resp = requests.get(
|
||||
f"https://archive.org/download/{escaped_item_id}/{escaped_item_id}_jp2.zip"
|
||||
)
|
||||
zip_resp.raise_for_status()
|
||||
|
||||
if verbose:
|
||||
print("Decompressing...", file=stderr)
|
||||
stderr.flush()
|
||||
tasks = []
|
||||
with ZipFile(BytesIO(zip_resp.content)) as jp_zip:
|
||||
for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
|
||||
for page_index, page_num_info in enumerate(page_nums):
|
||||
if page_num_info["leafNum"] == leaf_num:
|
||||
# Stop iterating and keep page_index set to the current item.
|
||||
except Exception as err:
|
||||
print(err, file=stderr)
|
||||
traceback.print_tb(err.__traceback__, file=stderr)
|
||||
sleep(15)
|
||||
break
|
||||
else:
|
||||
# Set to -1 to indicate that leaf was not found in page_num list.
|
||||
page_index = -1
|
||||
|
||||
if page_index != -1:
|
||||
with jp_zip.open(file_name) as jp_file:
|
||||
im = Image.open(jp_file).convert("L")
|
||||
im.thumbnail((3200, 3200))
|
||||
tasks.append(
|
||||
PageAnalysisTask(
|
||||
im=im,
|
||||
page_index=page_index,
|
||||
file_name=file_name,
|
||||
)
|
||||
)
|
||||
|
||||
if verbose:
|
||||
print(f"Processing {len(page_nums)} pages...", file=stderr)
|
||||
stderr.flush()
|
||||
if parallel > 1:
|
||||
# Parallelize image processing and OCR of pages across up to n cores.
|
||||
with Pool(parallel) as pool:
|
||||
return {"pages": pool.map(_analyze_page, tasks)}
|
||||
return {"pages": [_analyze_page(task) for task in tasks]}
|
||||
sleep(3600)
|
||||
|
||||
|
||||
def analyze_sharpness(im):
|
||||
def pull_new_item_ids(conn, earliest_review_date):
|
||||
cur = conn.cursor()
|
||||
res = cur.execute("select review_date from items order by review_date desc limit 1")
|
||||
(latest_review_date,) = res.fetchone() or (earliest_review_date,)
|
||||
print(latest_review_date)
|
||||
|
||||
query = f"""
|
||||
collection:(microfiche)
|
||||
AND contributor:(Internet Archive)
|
||||
AND micro_review:(done)
|
||||
AND review_date:[{latest_review_date} TO null]
|
||||
"""
|
||||
Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
|
||||
1. The scale is not linear with respect to scan quality: anything above 0.1
|
||||
is usually fine.
|
||||
"""
|
||||
arr = np.asarray(im)
|
||||
sort = "reviewdate asc"
|
||||
|
||||
# Normalize contrast based on brightest and darkest pixels. For example,
|
||||
# NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80% fall
|
||||
# between 10% brightness and 90% brightness. In practice, a value around
|
||||
# 0.02 seems to work fairly well.
|
||||
NORM_QUANTILE = 0.03
|
||||
pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
|
||||
arr, NORM_QUANTILE
|
||||
)
|
||||
if pixel_range == 0:
|
||||
arr_normalized = arr
|
||||
else:
|
||||
arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
|
||||
arr_normalized = (
|
||||
arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE
|
||||
)
|
||||
arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255)
|
||||
# Format for API.
|
||||
query = re.sub(r"\s+", "+", query.strip())
|
||||
sort = re.sub(r"\s+", "+", sort.strip())
|
||||
|
||||
# "Sharpness" is determined by measuring the median intensity of pixels
|
||||
# near edges, after an edge detection filter has been applied to the image.
|
||||
edges_arr = np.asarray(
|
||||
Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
|
||||
# params = {
|
||||
# "q": query,
|
||||
# "count": 100,
|
||||
# "fields": "identifier,review_date",
|
||||
# "sorts": sort,
|
||||
# }
|
||||
# for i in range(1, 999):
|
||||
# resp = requests.get(
|
||||
# "https://archive.org/services/search/v1/scrape",
|
||||
# params=params,
|
||||
# )
|
||||
# resp.raise_for_status()
|
||||
# print(resp.text)
|
||||
# try:
|
||||
# body = resp.json()
|
||||
# except Exception as err:
|
||||
# print("Body:", resp.text, file=stderr)
|
||||
# raise err
|
||||
# for doc in body["items"]:
|
||||
# cur.execute(
|
||||
# "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
|
||||
# (doc["identifier"], doc["review_date"]),
|
||||
# )
|
||||
# conn.commit()
|
||||
# cursor = body.get("cursor", None)
|
||||
# if cursor is None:
|
||||
# break
|
||||
# params = params.copy()
|
||||
# params["cursor"] = cursor
|
||||
resp = requests.get(
|
||||
f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=250000&output=json",
|
||||
)
|
||||
EDGE_THRESHOLD = 8
|
||||
return np.median(edges_arr[edges_arr > EDGE_THRESHOLD]) / 255
|
||||
resp.raise_for_status()
|
||||
try:
|
||||
body = resp.json()
|
||||
except Exception as err:
|
||||
print("Body:", resp.text, file=stderr)
|
||||
raise err
|
||||
for doc in body["response"]["docs"]:
|
||||
cur.execute(
|
||||
"insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
|
||||
(doc["identifier"], doc["review_date"]),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
159
one_off.py
Normal file
159
one_off.py
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
import json
|
||||
import re
|
||||
from argparse import ArgumentParser
|
||||
from dataclasses import dataclass
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from sys import stderr, stdin, stdout
|
||||
|
||||
from engine import analyze_item
|
||||
|
||||
|
||||
OCR_LANGS = "eng+fra"
|
||||
|
||||
|
||||
def main():
    """Read item IDs from STDIN and either summarize or fully analyze each
    one, writing JSON lines to STDOUT."""
    parser = ArgumentParser()
    parser.add_argument("--summarize", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-w", "--workers", type=int, default=1)
    parser.add_argument("--page-margin-px", type=int, default=50)
    args = parser.parse_args()

    # Both modes share the same task shape; only the worker differs.
    worker_fn = (
        _summarize_item_to_stdout if args.summarize else _analyze_item_to_stdout
    )

    # Process STDIN line by line, where each line contains one or more item
    # IDs separated by commas or whitespace.
    for line in stdin:
        item_ids = [token for token in re.split(r",|\s", line) if token]
        with ThreadPool(args.workers) as pool:
            if args.verbose:
                print(f"Running with {args.workers} workers.", file=stderr)
                stderr.flush()
            pool.map(
                worker_fn,
                [
                    ItemTask(
                        item_id=item_id,
                        page_margin_px=args.page_margin_px,
                        verbose=args.verbose,
                    )
                    for item_id in item_ids
                ],
            )
|
||||
|
||||
|
||||
@dataclass
class ItemTask:
    """One unit of work for the stdout worker functions.

    Attributes:
        item_id         Archive.org item identifier.
        page_margin_px  Minimum acceptable text-to-edge margin, in pixels.
        verbose         Print progress messages to stderr.
    """

    item_id: str
    page_margin_px: int
    verbose: bool
|
||||
|
||||
|
||||
def find_run_starts(flags, run_length):
    """Return the 1-indexed start positions of every maximal run of at
    least run_length consecutive True values in flags."""
    starts = []
    run = 0
    for i, flag in enumerate(flags):
        if flag:
            run += 1
            if run == run_length:
                # The run just reached the threshold; record where it began,
                # converted to a 1-indexed page number. Longer runs are
                # reported only once, at their start.
                starts.append(i - run_length + 2)
        else:
            run = 0
    return starts


def _summarize_item_to_stdout(task):
    """Analyze one item and print a one-line JSON summary to stdout when any
    quality flag (orientation, crop, blank runs, blurry runs) is raised.

    Prints nothing for items that look fine.
    """
    item_id = task.item_id
    page_margin_px = task.page_margin_px
    verbose = task.verbose

    if verbose:
        print(f"Summarizing item {item_id}...", file=stderr)
        stderr.flush()

    analysis = analyze_item(item_id, parallel=4, verbose=verbose)

    # 3 or more blank pages in a row is a flag. (The previous shift-and-AND
    # implementation silently missed a run that started on the very first
    # page; find_run_starts handles that case.)
    CONSECUTIVE_BLANKS_THRESHOLD = 3
    consecutive_blanks = find_run_starts(
        [page["blank"] for page in analysis["pages"]],
        CONSECUTIVE_BLANKS_THRESHOLD,
    )

    # 3 or more blurry pages in a row is a flag.
    CONSECUTIVE_BLURRY_THRESHOLD = 3
    SHARPNESS_THRESHOLD = 0.1
    consecutive_blurry = find_run_starts(
        [page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]],
        CONSECUTIVE_BLURRY_THRESHOLD,
    )

    # 1-indexed pages whose best OCR orientation disagrees with the scan.
    check_orientation = [
        i + 1
        for i, page in enumerate(analysis["pages"])
        if not page["ocr_orientation_match"]
    ]

    # 1-indexed pages whose text sits too close to an edge (possible crop).
    check_crop = [
        i + 1
        for i, page in enumerate(analysis["pages"])
        if page["text_margin_px"] < page_margin_px
    ]

    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
        print(
            json.dumps(
                {
                    "item_id": item_id,
                    "check_orientation": check_orientation,
                    "check_crop": check_crop,
                    "consecutive_blanks": consecutive_blanks,
                    "consecutive_blurry": consecutive_blurry,
                }
            )
        )
        stdout.flush()

    if verbose:
        print(f"Done summarizing item {item_id}.", file=stderr)
        stderr.flush()
|
||||
|
||||
|
||||
def _analyze_item_to_stdout(task):
    """Run the full page analysis for one item and print the result as a
    single JSON line on stdout."""
    item_id = task.item_id
    verbose = task.verbose

    if verbose:
        print(f"Analyzing item {item_id}...", file=stderr)
        stderr.flush()

    result = analyze_item(item_id, parallel=6, verbose=verbose)
    print(json.dumps(result))
    stdout.flush()

    if verbose:
        print(f"Done analyzing item {item_id}.", file=stderr)
        stderr.flush()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Reference in a new issue