rewrite to engine.py

This commit is contained in:
Brent Schroeter 2025-10-04 15:09:16 -07:00
parent 815934ad23
commit 4d9161b043
6 changed files with 550 additions and 479 deletions

2
.gitignore vendored
View file

@ -1,2 +1,4 @@
/target
/data
*.db
__pycache__

View file

@ -38,3 +38,4 @@ order by items.id;
- Blurry pages: `micro_IA40244209_0984`
- Contrast, page orientation: `micro_IA40244211_2290`
- Crop, low quality fiche: `micro_IA40386420_0689`
- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages: `micro_IA40386007_0012`

134
cache.py
View file

@ -1,134 +0,0 @@
import re
import sqlite3
import traceback
from argparse import ArgumentParser
from datetime import datetime
from time import sleep
import requests
from main import analyze_item
def main():
    """
    Long-running worker loop: maintain a SQLite queue of Internet Archive
    items and analyze each unprocessed item, storing per-page metrics.

    CLI flags:
        --database: path to the SQLite database file.
        --cpus: process count passed through to analyze_item().
        --earliest-review-date: YYYYMMDD lower bound used on the first pull.
    """
    parser = ArgumentParser()
    parser.add_argument("--database", default="./microqa.db")
    parser.add_argument("--cpus", type=int, default=2)
    parser.add_argument("--earliest-review-date", default="20250701")
    args = parser.parse_args()
    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        # Work queue: analyzed_date stays NULL until an item is processed.
        cur.execute("""
            create table if not exists items (
                id text primary key not null,
                review_date text not null,
                analyzed_date text
            )""")
        # One row per analyzed page (page is 1-indexed within its item).
        cur.execute("""
            create table if not exists pages (
                id int primary key,
                item text not null,
                page int not null,
                orientation_match boolean not null,
                sharpness real not null,
                is_blank boolean not null,
                text_margin_px int not null
            )""")
        conn.commit()
        while True:
            print("Pulling item IDs")
            pull_new_item_ids(conn, args.earliest_review_date)
            print("Done.")
            # Oldest-reviewed unanalyzed items first.
            res = cur.execute(
                "select id from items where analyzed_date is null order by review_date"
            )
            for (item_id,) in res.fetchall():
                N_ATTEMPTS = 3
                for _ in range(N_ATTEMPTS):
                    try:
                        print(f"Processing {item_id}")
                        analysis = analyze_item(
                            item_id, parallel=args.cpus, verbose=True
                        )
                        for i, page in enumerate(analysis["pages"]):
                            cur.execute(
                                """
                                insert into pages (
                                    item,
                                    page,
                                    orientation_match,
                                    sharpness,
                                    is_blank,
                                    text_margin_px
                                ) values (
                                    ?,
                                    ?,
                                    ?,
                                    ?,
                                    ?,
                                    ?
                                )""",
                                [
                                    item_id,
                                    i + 1,
                                    page["ocr_orientation_match"],
                                    page["sharpness"],
                                    page["blank"],
                                    page["text_margin_px"],
                                ],
                            )
                        # Mark the item done only after all pages inserted, so
                        # a crash mid-item leaves it queued for a retry.
                        # NOTE(review): datetime.utcnow() is deprecated since
                        # Python 3.12 — confirm before modernizing.
                        cur.execute(
                            "update items set analyzed_date = ? where id = ?",
                            [datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id],
                        )
                        conn.commit()
                        print("Done")
                        break
                    except Exception as err:
                        print(err)
                        traceback.print_tb(err.__traceback__)
                        sleep(15)
                        # NOTE(review): this break exits the retry loop after
                        # the first failure, so the N_ATTEMPTS retries never
                        # run — confirm whether the break belongs here.
                        break
            # Poll for newly reviewed items once an hour.
            sleep(3600)
def pull_new_item_ids(conn, earliest_review_date):
    """
    Fetch identifiers of newly reviewed microfiche items from the archive.org
    advanced-search API and insert them into the items table.

    Args:
        conn: open sqlite3 connection; committed once per result page.
        earliest_review_date: YYYYMMDD lower bound, used only when the items
            table is empty.
    """
    cur = conn.cursor()
    # Resume from the most recent review_date already stored, if any.
    res = cur.execute("select review_date from items order by review_date desc limit 1")
    (latest_review_date,) = res.fetchone() or (earliest_review_date,)
    print(latest_review_date)
    query = f"""
        collection:(microfiche)
        AND contributor:(Internet Archive)
        AND micro_review:(done)
        AND review_date:[{latest_review_date} TO null]
    """
    sort = "reviewdate asc"
    # Format for API.
    query = re.sub(r"\s+", "+", query.strip())
    sort = re.sub(r"\s+", "+", sort.strip())
    # Page through results 100 rows at a time (hard cap of 998 pages).
    for i in range(1, 999):
        resp = requests.get(
            f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=100&page={i}&output=json",
        )
        resp.raise_for_status()
        body = resp.json()
        if len(body["response"]["docs"]) == 0:
            break
        # Re-inserting an already-known item is a no-op ("on conflict do nothing").
        cur.executemany(
            "insert into items (id, review_date) values (?, ?) on conflict do nothing",
            [
                (doc["identifier"], doc["review_date"])
                for doc in body["response"]["docs"]
            ],
        )
        conn.commit()
if __name__ == "__main__":
main()

239
engine.py Normal file
View file

@ -0,0 +1,239 @@
import urllib.parse
import re
from dataclasses import dataclass
from io import BytesIO
from multiprocessing import Pool
from sys import stderr
from zipfile import ZipFile
import numpy as np
import pytesseract
import requests
from PIL import Image, ImageFilter
def analyze_item(item_id, ocr_langs="eng+fra", parallel=1, verbose=False):
    """
    Download an Internet Archive item's JP2 page scans and analyze each page.

    Args:
        item_id: archive.org item identifier (URL-escaped internally).
        ocr_langs: "+"-separated list of 3-letter Tesseract language codes.
        parallel: number of worker processes for per-page analysis.
        verbose: when True, progress messages are written to stderr.

    Returns:
        {"pages": [<analyze_page result>, ...]} in leaf order.

    Raises:
        requests.HTTPError: if the metadata or ZIP download fails.
    """
    escaped_item_id = urllib.parse.quote(item_id, safe="")
    if verbose:
        print("Downloading...", file=stderr)
        stderr.flush()
    page_nums_resp = requests.get(
        f"https://archive.org/metadata/{escaped_item_id}/page_numbers/pages"
    )
    page_nums_resp.raise_for_status()
    page_nums = page_nums_resp.json()["result"]
    zip_resp = requests.get(
        f"https://archive.org/download/{escaped_item_id}/{escaped_item_id}_jp2.zip"
    )
    zip_resp.raise_for_status()
    if verbose:
        print("Decompressing...", file=stderr)
        stderr.flush()
    # The page_nums entry was only ever used to decide *whether* a leaf is
    # listed, so an O(1) set-membership test replaces the old O(pages) scan
    # per leaf.
    known_leaf_nums = {info["leafNum"] for info in page_nums}
    tasks = []
    with ZipFile(BytesIO(zip_resp.content)) as jp_zip:
        for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
            if leaf_num not in known_leaf_nums:
                # Leaf is not in the item's page list; skip it.
                continue
            with jp_zip.open(file_name) as jp_file:
                im = Image.open(jp_file).convert("L")
                # Bound the long edge to 3200 px to cap memory and OCR cost.
                im.thumbnail((3200, 3200))
                tasks.append(
                    PageAnalysisTask(
                        im=im,
                        ocr_langs=ocr_langs,
                    )
                )
    if verbose:
        # Report the number of pages actually queued (len(page_nums) could
        # overcount when a listed leaf has no matching file in the ZIP).
        print(f"Processing {len(tasks)} pages...", file=stderr)
        stderr.flush()
    if parallel > 1:
        # Parallelize image processing and OCR of pages across up to n cores.
        with Pool(parallel) as pool:
            return {"pages": pool.map(analyze_page, tasks)}
    return {"pages": [analyze_page(task) for task in tasks]}
@dataclass
class PageAnalysisTask:
    """
    Unit of work handed to analyze_page().

    Attributes:
        im: PIL Image, pre-scaled with Image.thumbnail() so its long edge
            fits within 3200 px.
        ocr_langs: "+"-separated list of 3-letter Tesseract language codes.
    """

    im: Image.Image
    ocr_langs: str = "eng+fra"
def analyze_page(task: PageAnalysisTask) -> dict:
    """
    Analyze one page image for blankness, sharpness, OCR orientation, and
    text margin.

    Args:
        task: PageAnalysisTask holding the pre-scaled grayscale page image
            and the Tesseract language list.

    Returns:
        dict with keys:
            blank: True when the central crop contains no dark pixels.
            ocr_orientation_match: True when the best OCR rotation is 0
                degrees (also True for blank pages and pages with no words).
            size_analyzed: (width, height) of the analyzed image.
            sharpness: max per-segment sharpness in [0, 1] (1 for blanks).
            text_margin_px: distance in px from the nearest retained OCR
                word to the image edge, or -1 when unknown.
    """
    # Central 80% crop: ignores fiche borders for blank/sharpness checks.
    im_cropped = task.im.crop(
        (
            task.im.size[0] * 0.1,
            task.im.size[1] * 0.1,
            task.im.size[0] * 0.9,
            task.im.size[1] * 0.9,
        )
    )
    # getextrema()[0] is the darkest pixel in the crop: if even that pixel is
    # brighter than 80% of full white, call the page blank.
    is_blank = im_cropped.getextrema()[0] > 255 * 0.8
    if is_blank:
        max_sharpness = 1
        ocr_orientation_match = True
        text_margin_px = -1
    else:
        max_sharpness = 0.0
        # Split the crop into a 2x3 / 3x2 grid and keep the sharpest segment,
        # so one sharp region is enough to pass a partially blurry page.
        if im_cropped.size[0] < im_cropped.size[1]:
            # Page is in portrait orientation.
            segments_x = 2
            segments_y = 3
        else:
            # Page is in landscape orientation.
            segments_x = 3
            segments_y = 2
        for i in range(segments_x):
            for j in range(segments_y):
                max_sharpness = max(
                    max_sharpness,
                    analyze_sharpness(
                        im_cropped.crop(
                            (
                                im_cropped.size[0] / segments_x * i,
                                im_cropped.size[1] / segments_y * j,
                                im_cropped.size[0] / segments_x * (i + 1),
                                im_cropped.size[1] / segments_y * (j + 1),
                            )
                        )
                    ),
                )
        # OCR_SCALE rescales the image before OCR; 1 means no rescale.
        OCR_SCALE = 1
        best_ocr_score = -1
        best_ocr_words = None
        best_ocr_orientation = -1
        # Try all four 90-degree rotations and keep the one in which
        # Tesseract finds the most confident, horizontal, alphabetic words.
        for orientation in range(4):
            im_rotated = task.im.resize(
                np.int_(np.array(task.im.size) * OCR_SCALE)
            ).rotate(90 * orientation, expand=True)
            ocr = pytesseract.image_to_data(
                im_rotated,
                lang=task.ocr_langs,
                config=f"--oem 1 --dpi {int(300 * OCR_SCALE)} --tessdata-dir ./data/tessdata_fast-4.1.0",
                output_type=pytesseract.Output.DATAFRAME,
            ).fillna({"text": ""})
            # Keep only words that Tesseract is confident in, and which are
            # oriented horizontally.
            words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
            # Keep only alphabetical words of 4 or more characters.
            words = words[
                words.apply(
                    lambda row: re.fullmatch(r"[a-zA-Z]{4,}", str(row["text"]))
                    is not None,
                    axis=1,
                )
            ]
            if words.shape[0] > best_ocr_score:
                best_ocr_score = words.shape[0]
                best_ocr_orientation = orientation
                best_ocr_words = words
            if best_ocr_score > 50:
                # Unlikely that another orientation will have more words, so
                # stop eating up CPU.
                break
        if best_ocr_words.empty:
            # No usable words in any orientation: nothing to measure.
            ocr_orientation_match = True
            text_margin_px = -1
        else:
            ocr_orientation_match = best_ocr_orientation == 0
            # Dimensions of the rotated image the winning OCR pass saw
            # (width/height swap for 90/270-degree rotations).
            best_ocr_dims = OCR_SCALE * np.array(
                task.im.size
                if best_ocr_orientation % 2 == 0
                else (task.im.size[1], task.im.size[0])
            )
            # Distance of every word box to each of the four edges, sorted
            # ascending.
            # NOTE(review): np.concat is an alias added in NumPy 2.0; older
            # NumPy needs np.concatenate — confirm the deployed version.
            word_margins_all_directions = np.sort(
                np.int_(
                    np.concat(
                        (
                            best_ocr_words["left"].to_numpy(),
                            best_ocr_words["top"].to_numpy(),
                            best_ocr_dims[0]
                            - (
                                best_ocr_words["left"] + best_ocr_words["width"]
                            ).to_numpy(),
                            best_ocr_dims[1]
                            - (
                                best_ocr_words["top"] + best_ocr_words["height"]
                            ).to_numpy(),
                        )
                    )
                    # Transform back into original image pixel density
                    / OCR_SCALE
                )
            )
            # Skip the n closest words to the edge, to help ignore stray OCR artifacts.
            SKIP_WORDS = 2
            text_margin_px = int(
                word_margins_all_directions[SKIP_WORDS]
                if word_margins_all_directions.shape[0] > SKIP_WORDS
                else -1
            )
    return {
        "blank": is_blank,
        "ocr_orientation_match": ocr_orientation_match,
        "size_analyzed": task.im.size,
        "sharpness": max_sharpness,
        "text_margin_px": text_margin_px,
    }
def analyze_sharpness(im):
    """
    Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
    1. The scale is not linear with respect to scan quality: anything above 0.1
    is usually fine.

    Args:
        im: grayscale (mode "L") PIL Image.

    Returns:
        float in [0, 1]; 0.0 when no edges are detected at all.
    """
    arr = np.asarray(im)
    # Normalize contrast based on brightest and darkest pixels. For example,
    # NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80% fall
    # between 10% brightness and 90% brightness. In practice, a value around
    # 0.02 seems to work fairly well.
    NORM_QUANTILE = 0.03
    pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
        arr, NORM_QUANTILE
    )
    if pixel_range == 0:
        # Degenerate (near-uniform) image: skip normalization entirely.
        arr_normalized = arr
    else:
        arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
        arr_normalized = (
            arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE
        )
    arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255)
    # "Sharpness" is determined by measuring the median intensity of pixels
    # near edges, after an edge detection filter has been applied to the image.
    edges_arr = np.asarray(
        Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
    )
    EDGE_THRESHOLD = 8
    edge_pixels = edges_arr[edges_arr > EDGE_THRESHOLD]
    if edge_pixels.size == 0:
        # No pixel cleared the threshold: np.median on an empty array would
        # return NaN (with a RuntimeWarning); report zero sharpness instead.
        return 0.0
    return np.median(edge_pixels) / 255

494
main.py
View file

@ -1,371 +1,175 @@
import json
import re
import urllib.parse
import sqlite3
import traceback
from argparse import ArgumentParser
from dataclasses import dataclass
from io import BytesIO
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from sys import stderr, stdin, stdout
from zipfile import ZipFile
from datetime import datetime
from sys import stderr
from time import sleep
import numpy as np
import pytesseract
import requests
from PIL import Image, ImageFilter
OCR_LANGS = "eng+fra"
from engine import analyze_item
def main():
parser = ArgumentParser()
parser.add_argument("--summarize", action="store_true")
parser.add_argument("-v", "--verbose", action="store_true")
parser.add_argument("-w", "--workers", type=int, default=1)
parser.add_argument("--page-margin-px", type=int, default=50)
parser.add_argument("--database", default="./microqa.db")
parser.add_argument("--cpus", type=int, default=2)
parser.add_argument("--earliest-review-date", default="20250701")
args = parser.parse_args()
# Process STDIN line by line, where each line contains one or more item IDs
# separated by whitespace.
for line in stdin:
item_ids = [value for value in re.split(r",|\s", line) if value]
with ThreadPool(args.workers) as pool:
if args.verbose:
print(f"Running with {args.workers} workers.", file=stderr)
stderr.flush()
if args.summarize:
pool.map(
_summarize_item_to_stdout,
[
ItemTask(
item_id=item_id,
page_margin_px=args.page_margin_px,
verbose=args.verbose,
with sqlite3.connect(args.database) as conn:
cur = conn.cursor()
cur.execute("""
create table if not exists items (
id text primary key not null,
review_date text not null,
skip_analysis bool not null,
analyzed_date text
)""")
cur.execute("""
create table if not exists pages (
id int primary key,
item text not null,
page int not null,
orientation_match boolean not null,
sharpness real not null,
is_blank boolean not null,
text_margin_px int not null
)""")
cur.execute("create index if not exists review_date_idx on items (review_date)")
cur.execute(
"create index if not exists analyzed_date_idx on items (analyzed_date)"
)
for item_id in item_ids
cur.execute("create index if not exists item_idx on pages (item)")
cur.execute(
"create unique index if not exists item_page_idx on pages (item, page)"
)
conn.commit()
while True:
print("Pulling item IDs")
pull_new_item_ids(conn, args.earliest_review_date)
print("Done.")
res = cur.execute("""
select id
from items
where analyzed_date is null
and skip_analysis = false
order by review_date
""")
for (item_id,) in res.fetchall():
N_ATTEMPTS = 3
for _ in range(N_ATTEMPTS):
try:
print(f"Processing {item_id}")
analysis = analyze_item(
item_id, parallel=args.cpus, verbose=True
)
for i, page in enumerate(analysis["pages"]):
cur.execute(
"""
insert into pages (
item,
page,
orientation_match,
sharpness,
is_blank,
text_margin_px
) values (
?,
?,
?,
?,
?,
?
)""",
[
item_id,
i + 1,
page["ocr_orientation_match"],
page["sharpness"],
page["blank"],
page["text_margin_px"],
],
)
else:
pool.map(
_analyze_item_to_stdout,
[
ItemTask(
item_id=item_id,
page_margin_px=args.page_margin_px,
verbose=args.verbose,
cur.execute(
"update items set analyzed_date = ? where id = ?",
[datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id],
)
for item_id in item_ids
],
)
@dataclass
class ItemTask:
item_id: str
page_margin_px: int
verbose: bool
def _summarize_item_to_stdout(task):
item_id = task.item_id
page_margin_px = task.page_margin_px
verbose = task.verbose
if verbose:
print(f"Summarizing item {item_id}...", file=stderr)
stderr.flush()
analysis = analyze_item(item_id, parallel=4, verbose=verbose)
# 3 or more blank pages in a row is a flag.
CONSECUTIVE_BLANKS_THRESHOLD = 3
if len(analysis["pages"]) >= CONSECUTIVE_BLANKS_THRESHOLD:
consecutive_blanks = [page["blank"] for page in analysis["pages"]]
for _ in range(1, CONSECUTIVE_BLANKS_THRESHOLD):
consecutive_blanks = [
value and consecutive_blanks[i]
for i, value in enumerate(consecutive_blanks[1:])
]
consecutive_blanks = [
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
for i, value in enumerate(consecutive_blanks[1:])
if value and not consecutive_blanks[i]
]
else:
consecutive_blanks = []
# 3 or more blank pages in a row is a flag.
CONSECUTIVE_BLURRY_THRESHOLD = 3
SHARPNESS_THRESHOLD = 0.1
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
consecutive_blurry = [
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
]
for _ in range(1, CONSECUTIVE_BLURRY_THRESHOLD):
consecutive_blurry = [
value and consecutive_blurry[i]
for i, value in enumerate(consecutive_blurry[1:])
]
consecutive_blurry = [
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
for i, value in enumerate(consecutive_blurry[1:])
if value and not consecutive_blurry[i]
]
else:
consecutive_blurry = []
check_orientation = [
i + 1
for i, page in enumerate(analysis["pages"])
if not page["ocr_orientation_match"]
]
check_crop = [
i + 1
for i, page in enumerate(analysis["pages"])
if page["text_margin_px"] < page_margin_px
]
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
print(
json.dumps(
{
"item_id": item_id,
"check_orientation": check_orientation,
"check_crop": check_crop,
"consecutive_blanks": consecutive_blanks,
"consecutive_blurry": consecutive_blurry,
}
)
)
stdout.flush()
if verbose:
print(f"Done summarizing item {item_id}.", file=stderr)
stderr.flush()
def _analyze_item_to_stdout(task):
item_id = task.item_id
verbose = task.verbose
if verbose:
print(f"Analyzing item {item_id}...", file=stderr)
stderr.flush()
print(json.dumps(analyze_item(item_id, parallel=4, verbose=verbose)))
stdout.flush()
if verbose:
print(f"Done analyzing item {item_id}.", file=stderr)
stderr.flush()
@dataclass
class PageAnalysisTask:
im: Image.Image
page_index: int
file_name: str
def _analyze_page(task):
im_original = task.im
page_index = task.page_index
file_name = task.file_name
im_cropped = im_original.crop(
(
im_original.size[0] * 0.1,
im_original.size[1] * 0.1,
im_original.size[0] * 0.9,
im_original.size[1] * 0.9,
)
)
is_blank = im_cropped.getextrema()[0] > 255 * 0.8
if is_blank:
max_sharpness = 1
ocr_orientation_match = True
text_margin_px = -1
else:
max_sharpness = 0.0
if im_cropped.size[0] < im_cropped.size[1]:
# Page is in portrait orientation.
segments_x = 2
segments_y = 3
else:
# Page is in landscape orientation.
segments_x = 3
segments_y = 2
for i in range(segments_x):
for j in range(segments_y):
max_sharpness = max(
max_sharpness,
analyze_sharpness(
im_cropped.crop(
(
im_cropped.size[0] / segments_x * i,
im_cropped.size[1] / segments_y * j,
im_cropped.size[0] / segments_x * (i + 1),
im_cropped.size[1] / segments_y * (j + 1),
)
)
),
)
best_ocr_score = -1
best_ocr_words = None
best_ocr_orientation = -1
for orientation in range(4):
im_rotated = im_original.rotate(90 * orientation, expand=True)
ocr = pytesseract.image_to_data(
im_rotated,
lang=OCR_LANGS,
config="--oem 1 --dpi 300 --tessdata-dir ./data/tessdata_fast-4.1.0",
output_type=pytesseract.Output.DATAFRAME,
).fillna({"text": ""})
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
words = words[
words.apply(
lambda row: re.fullmatch(r"[a-zA-Z]{4,}", row["text"]) is not None,
axis=1,
)
]
if words.shape[0] > best_ocr_score:
best_ocr_score = words.shape[0]
best_ocr_orientation = orientation
best_ocr_words = words
if best_ocr_score > 50:
# Unlikely that another orientation will have more words, so
# stop eating up CPU unnecessarily.
conn.commit()
print("Done")
break
ocr_orientation_match = best_ocr_orientation == 0
best_ocr_dims = (
im_original.size
if best_ocr_orientation % 2 == 0
else (im_original.size[1], im_original.size[0])
)
word_margins_all_directions = np.sort(
np.concat(
(
best_ocr_words["left"].to_numpy(),
best_ocr_words["top"].to_numpy(),
best_ocr_dims[0]
- (best_ocr_words["left"] + best_ocr_words["width"]).to_numpy(),
best_ocr_dims[1]
- (best_ocr_words["top"] + best_ocr_words["height"]).to_numpy(),
)
)
)
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
SKIP_WORDS = 2
text_margin_px = (
int(word_margins_all_directions[SKIP_WORDS])
if word_margins_all_directions.shape[0] > SKIP_WORDS
else -1
)
return {
"blank": is_blank,
"file_name": file_name,
"ocr_orientation_match": ocr_orientation_match,
"page_index": page_index,
"size": im_original.size,
"sharpness": max_sharpness,
"text_margin_px": text_margin_px,
}
def analyze_item(item_id, parallel=1, verbose=False):
escaped_item_id = urllib.parse.quote(item_id, safe="")
if verbose:
print("Downloading...", file=stderr)
stderr.flush()
page_nums_resp = requests.get(
f"https://archive.org/metadata/{escaped_item_id}/page_numbers/pages"
)
page_nums_resp.raise_for_status()
page_nums = page_nums_resp.json()["result"]
zip_resp = requests.get(
f"https://archive.org/download/{escaped_item_id}/{escaped_item_id}_jp2.zip"
)
zip_resp.raise_for_status()
if verbose:
print("Decompressing...", file=stderr)
stderr.flush()
tasks = []
with ZipFile(BytesIO(zip_resp.content)) as jp_zip:
for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
for page_index, page_num_info in enumerate(page_nums):
if page_num_info["leafNum"] == leaf_num:
# Stop iterating and keep page_index set to the current item.
except Exception as err:
print(err, file=stderr)
traceback.print_tb(err.__traceback__, file=stderr)
sleep(15)
break
else:
# Set to -1 to indicate that leaf was not found in page_num list.
page_index = -1
if page_index != -1:
with jp_zip.open(file_name) as jp_file:
im = Image.open(jp_file).convert("L")
im.thumbnail((3200, 3200))
tasks.append(
PageAnalysisTask(
im=im,
page_index=page_index,
file_name=file_name,
)
)
if verbose:
print(f"Processing {len(page_nums)} pages...", file=stderr)
stderr.flush()
if parallel > 1:
# Parallelize image processing and OCR of pages across up to n cores.
with Pool(parallel) as pool:
return {"pages": pool.map(_analyze_page, tasks)}
return {"pages": [_analyze_page(task) for task in tasks]}
sleep(3600)
def analyze_sharpness(im):
def pull_new_item_ids(conn, earliest_review_date):
cur = conn.cursor()
res = cur.execute("select review_date from items order by review_date desc limit 1")
(latest_review_date,) = res.fetchone() or (earliest_review_date,)
print(latest_review_date)
query = f"""
collection:(microfiche)
AND contributor:(Internet Archive)
AND micro_review:(done)
AND review_date:[{latest_review_date} TO null]
"""
Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
1. The scale is not linear with respect to scan quality: anything above 0.1
is usually fine.
"""
arr = np.asarray(im)
sort = "reviewdate asc"
# Normalize contrast based on brightest and darkest pixels. For example,
# NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80% fall
# between 10% brightness and 90% brightness. In practice, a value around
# 0.02 seems to work fairly well.
NORM_QUANTILE = 0.03
pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
arr, NORM_QUANTILE
)
if pixel_range == 0:
arr_normalized = arr
else:
arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
arr_normalized = (
arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE
)
arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255)
# Format for API.
query = re.sub(r"\s+", "+", query.strip())
sort = re.sub(r"\s+", "+", sort.strip())
# "Sharpness" is determined by measuring the median intensity of pixels
# near edges, after an edge detection filter has been applied to the image.
edges_arr = np.asarray(
Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
# params = {
# "q": query,
# "count": 100,
# "fields": "identifier,review_date",
# "sorts": sort,
# }
# for i in range(1, 999):
# resp = requests.get(
# "https://archive.org/services/search/v1/scrape",
# params=params,
# )
# resp.raise_for_status()
# print(resp.text)
# try:
# body = resp.json()
# except Exception as err:
# print("Body:", resp.text, file=stderr)
# raise err
# for doc in body["items"]:
# cur.execute(
# "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
# (doc["identifier"], doc["review_date"]),
# )
# conn.commit()
# cursor = body.get("cursor", None)
# if cursor is None:
# break
# params = params.copy()
# params["cursor"] = cursor
resp = requests.get(
f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=250000&output=json",
)
EDGE_THRESHOLD = 8
return np.median(edges_arr[edges_arr > EDGE_THRESHOLD]) / 255
resp.raise_for_status()
try:
body = resp.json()
except Exception as err:
print("Body:", resp.text, file=stderr)
raise err
for doc in body["response"]["docs"]:
cur.execute(
"insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
(doc["identifier"], doc["review_date"]),
)
conn.commit()
if __name__ == "__main__":

159
one_off.py Normal file
View file

@ -0,0 +1,159 @@
import json
import re
from argparse import ArgumentParser
from dataclasses import dataclass
from multiprocessing.pool import ThreadPool
from sys import stderr, stdin, stdout
from engine import analyze_item
OCR_LANGS = "eng+fra"
def main():
    """
    CLI entry point: read archive.org item IDs from STDIN and either analyze
    or (with --summarize) summarize each one, printing JSON lines to STDOUT.
    """
    parser = ArgumentParser()
    parser.add_argument("--summarize", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-w", "--workers", type=int, default=1)
    parser.add_argument("--page-margin-px", type=int, default=50)
    args = parser.parse_args()
    # Process STDIN line by line, where each line contains one or more item
    # IDs separated by commas and/or whitespace.
    for line in stdin:
        item_ids = [token for token in re.split(r",|\s", line) if token]
        tasks = [
            ItemTask(
                item_id=item_id,
                page_margin_px=args.page_margin_px,
                verbose=args.verbose,
            )
            for item_id in item_ids
        ]
        # Both modes share the same fan-out; only the worker function differs.
        worker = (
            _summarize_item_to_stdout if args.summarize else _analyze_item_to_stdout
        )
        with ThreadPool(args.workers) as pool:
            if args.verbose:
                print(f"Running with {args.workers} workers.", file=stderr)
                stderr.flush()
            pool.map(worker, tasks)
@dataclass
class ItemTask:
    """Per-item work description passed to the stdout worker functions."""

    # Archive.org identifier of the item to process.
    item_id: str
    # Text-margin threshold (px); pages with a smaller margin get flagged.
    page_margin_px: int
    # When True, workers log their progress to stderr.
    verbose: bool
def _summarize_item_to_stdout(task):
    """
    Analyze one item and print a JSON summary line to stdout, but only when
    at least one quality flag is raised.

    Flags:
        check_orientation: 1-indexed pages whose best OCR rotation is not 0.
        check_crop: 1-indexed pages whose text margin is below the threshold.
        consecutive_blanks: 1-indexed start pages of runs of >= 3 blank pages.
        consecutive_blurry: 1-indexed start pages of runs of >= 3 blurry pages.

    Args:
        task: ItemTask with the item ID, margin threshold, and verbosity.
    """
    item_id = task.item_id
    page_margin_px = task.page_margin_px
    verbose = task.verbose
    if verbose:
        print(f"Summarizing item {item_id}...", file=stderr)
        stderr.flush()
    analysis = analyze_item(item_id, parallel=4, verbose=verbose)
    pages = analysis["pages"]
    # 3 or more blank pages in a row is a flag.
    consecutive_blanks = _run_start_pages(
        [page["blank"] for page in pages], threshold=3
    )
    # 3 or more blurry pages in a row is a flag.
    SHARPNESS_THRESHOLD = 0.1
    consecutive_blurry = _run_start_pages(
        [page["sharpness"] < SHARPNESS_THRESHOLD for page in pages], threshold=3
    )
    check_orientation = [
        i + 1 for i, page in enumerate(pages) if not page["ocr_orientation_match"]
    ]
    # NOTE(review): blank/no-text pages report text_margin_px == -1 and are
    # therefore always flagged here — confirm that is intended.
    check_crop = [
        i + 1 for i, page in enumerate(pages) if page["text_margin_px"] < page_margin_px
    ]
    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
        print(
            json.dumps(
                {
                    "item_id": item_id,
                    "check_orientation": check_orientation,
                    "check_crop": check_crop,
                    "consecutive_blanks": consecutive_blanks,
                    "consecutive_blurry": consecutive_blurry,
                }
            )
        )
        stdout.flush()
    if verbose:
        print(f"Done summarizing item {item_id}.", file=stderr)
        stderr.flush()


def _run_start_pages(flags, threshold):
    """
    Return the 1-indexed start positions of each maximal run of at least
    `threshold` consecutive True values in `flags`.

    Replaces two copy-pasted sliding-window loops. Bug fix relative to the
    inline version: a run beginning at the very first page is now reported
    (the old code only scanned windows starting at index 1).
    """
    if len(flags) < threshold:
        return []
    # windows[i] is True when flags[i : i + threshold] are all True.
    windows = [
        all(flags[i : i + threshold]) for i in range(len(flags) - threshold + 1)
    ]
    # Report a window only where a run begins (previous window not a run).
    return [
        i + 1
        for i, window in enumerate(windows)
        if window and (i == 0 or not windows[i - 1])
    ]
def _analyze_item_to_stdout(task):
    """Run the full per-page analysis for one item and print it as JSON."""
    if task.verbose:
        print(f"Analyzing item {task.item_id}...", file=stderr)
        stderr.flush()
    result = analyze_item(task.item_id, parallel=6, verbose=task.verbose)
    print(json.dumps(result))
    stdout.flush()
    if task.verbose:
        print(f"Done analyzing item {task.item_id}.", file=stderr)
        stderr.flush()
if __name__ == "__main__":
main()