rewrite to engine.py
This commit is contained in:
parent
815934ad23
commit
4d9161b043
6 changed files with 550 additions and 479 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -1,2 +1,4 @@
|
||||||
/target
|
/target
|
||||||
/data
|
/data
|
||||||
|
*.db
|
||||||
|
__pycache__
|
||||||
|
|
|
||||||
|
|
@ -38,3 +38,4 @@ order by items.id;
|
||||||
- Blurry pages: `micro_IA40244209_0984`
|
- Blurry pages: `micro_IA40244209_0984`
|
||||||
- Contrast, page orientation: `micro_IA40244211_2290`
|
- Contrast, page orientation: `micro_IA40244211_2290`
|
||||||
- Crop, low quality fiche: `micro_IA40386420_0689`
|
- Crop, low quality fiche: `micro_IA40386420_0689`
|
||||||
|
- "Bite sized" SCOTUS doc with multiple viewable files and some blurry pages: `micro_IA40386007_0012`
|
||||||
|
|
|
||||||
134
cache.py
134
cache.py
|
|
@ -1,134 +0,0 @@
|
||||||
import re
|
|
||||||
import sqlite3
|
|
||||||
import traceback
|
|
||||||
from argparse import ArgumentParser
|
|
||||||
from datetime import datetime
|
|
||||||
from time import sleep
|
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
from main import analyze_item
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = ArgumentParser()
|
|
||||||
parser.add_argument("--database", default="./microqa.db")
|
|
||||||
parser.add_argument("--cpus", type=int, default=2)
|
|
||||||
parser.add_argument("--earliest-review-date", default="20250701")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
with sqlite3.connect(args.database) as conn:
|
|
||||||
cur = conn.cursor()
|
|
||||||
cur.execute("""
|
|
||||||
create table if not exists items (
|
|
||||||
id text primary key not null,
|
|
||||||
review_date text not null,
|
|
||||||
analyzed_date text
|
|
||||||
)""")
|
|
||||||
cur.execute("""
|
|
||||||
create table if not exists pages (
|
|
||||||
id int primary key,
|
|
||||||
item text not null,
|
|
||||||
page int not null,
|
|
||||||
orientation_match boolean not null,
|
|
||||||
sharpness real not null,
|
|
||||||
is_blank boolean not null,
|
|
||||||
text_margin_px int not null
|
|
||||||
)""")
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
while True:
|
|
||||||
print("Pulling item IDs")
|
|
||||||
pull_new_item_ids(conn, args.earliest_review_date)
|
|
||||||
print("Done.")
|
|
||||||
res = cur.execute(
|
|
||||||
"select id from items where analyzed_date is null order by review_date"
|
|
||||||
)
|
|
||||||
for (item_id,) in res.fetchall():
|
|
||||||
N_ATTEMPTS = 3
|
|
||||||
for _ in range(N_ATTEMPTS):
|
|
||||||
try:
|
|
||||||
print(f"Processing {item_id}")
|
|
||||||
analysis = analyze_item(
|
|
||||||
item_id, parallel=args.cpus, verbose=True
|
|
||||||
)
|
|
||||||
for i, page in enumerate(analysis["pages"]):
|
|
||||||
cur.execute(
|
|
||||||
"""
|
|
||||||
insert into pages (
|
|
||||||
item,
|
|
||||||
page,
|
|
||||||
orientation_match,
|
|
||||||
sharpness,
|
|
||||||
is_blank,
|
|
||||||
text_margin_px
|
|
||||||
) values (
|
|
||||||
?,
|
|
||||||
?,
|
|
||||||
?,
|
|
||||||
?,
|
|
||||||
?,
|
|
||||||
?
|
|
||||||
)""",
|
|
||||||
[
|
|
||||||
item_id,
|
|
||||||
i + 1,
|
|
||||||
page["ocr_orientation_match"],
|
|
||||||
page["sharpness"],
|
|
||||||
page["blank"],
|
|
||||||
page["text_margin_px"],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
cur.execute(
|
|
||||||
"update items set analyzed_date = ? where id = ?",
|
|
||||||
[datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id],
|
|
||||||
)
|
|
||||||
conn.commit()
|
|
||||||
print("Done")
|
|
||||||
break
|
|
||||||
except Exception as err:
|
|
||||||
print(err)
|
|
||||||
traceback.print_tb(err.__traceback__)
|
|
||||||
sleep(15)
|
|
||||||
break
|
|
||||||
sleep(3600)
|
|
||||||
|
|
||||||
|
|
||||||
def pull_new_item_ids(conn, earliest_review_date):
|
|
||||||
cur = conn.cursor()
|
|
||||||
res = cur.execute("select review_date from items order by review_date desc limit 1")
|
|
||||||
(latest_review_date,) = res.fetchone() or (earliest_review_date,)
|
|
||||||
print(latest_review_date)
|
|
||||||
|
|
||||||
query = f"""
|
|
||||||
collection:(microfiche)
|
|
||||||
AND contributor:(Internet Archive)
|
|
||||||
AND micro_review:(done)
|
|
||||||
AND review_date:[{latest_review_date} TO null]
|
|
||||||
"""
|
|
||||||
sort = "reviewdate asc"
|
|
||||||
|
|
||||||
# Format for API.
|
|
||||||
query = re.sub(r"\s+", "+", query.strip())
|
|
||||||
sort = re.sub(r"\s+", "+", sort.strip())
|
|
||||||
|
|
||||||
for i in range(1, 999):
|
|
||||||
resp = requests.get(
|
|
||||||
f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=100&page={i}&output=json",
|
|
||||||
)
|
|
||||||
resp.raise_for_status()
|
|
||||||
body = resp.json()
|
|
||||||
if len(body["response"]["docs"]) == 0:
|
|
||||||
break
|
|
||||||
cur.executemany(
|
|
||||||
"insert into items (id, review_date) values (?, ?) on conflict do nothing",
|
|
||||||
[
|
|
||||||
(doc["identifier"], doc["review_date"])
|
|
||||||
for doc in body["response"]["docs"]
|
|
||||||
],
|
|
||||||
)
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
239
engine.py
Normal file
239
engine.py
Normal file
|
|
@ -0,0 +1,239 @@
|
||||||
|
import urllib.parse
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from io import BytesIO
|
||||||
|
from multiprocessing import Pool
|
||||||
|
from sys import stderr
|
||||||
|
from zipfile import ZipFile
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytesseract
|
||||||
|
import requests
|
||||||
|
from PIL import Image, ImageFilter
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_item(item_id, ocr_langs="eng+fra", parallel=1, verbose=False):
|
||||||
|
escaped_item_id = urllib.parse.quote(item_id, safe="")
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Downloading...", file=stderr)
|
||||||
|
stderr.flush()
|
||||||
|
page_nums_resp = requests.get(
|
||||||
|
f"https://archive.org/metadata/{escaped_item_id}/page_numbers/pages"
|
||||||
|
)
|
||||||
|
page_nums_resp.raise_for_status()
|
||||||
|
page_nums = page_nums_resp.json()["result"]
|
||||||
|
|
||||||
|
zip_resp = requests.get(
|
||||||
|
f"https://archive.org/download/{escaped_item_id}/{escaped_item_id}_jp2.zip"
|
||||||
|
)
|
||||||
|
zip_resp.raise_for_status()
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print("Decompressing...", file=stderr)
|
||||||
|
stderr.flush()
|
||||||
|
tasks = []
|
||||||
|
with ZipFile(BytesIO(zip_resp.content)) as jp_zip:
|
||||||
|
for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
|
||||||
|
for page_index, page_num_info in enumerate(page_nums):
|
||||||
|
if page_num_info["leafNum"] == leaf_num:
|
||||||
|
# Stop iterating and keep page_index set to the current item.
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
# Set to -1 to indicate that leaf was not found in page_num list.
|
||||||
|
page_index = -1
|
||||||
|
|
||||||
|
if page_index != -1:
|
||||||
|
with jp_zip.open(file_name) as jp_file:
|
||||||
|
im = Image.open(jp_file).convert("L")
|
||||||
|
im.thumbnail((3200, 3200))
|
||||||
|
tasks.append(
|
||||||
|
PageAnalysisTask(
|
||||||
|
im=im,
|
||||||
|
ocr_langs=ocr_langs,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(f"Processing {len(page_nums)} pages...", file=stderr)
|
||||||
|
stderr.flush()
|
||||||
|
|
||||||
|
if parallel > 1:
|
||||||
|
# Parallelize image processing and OCR of pages across up to n cores.
|
||||||
|
with Pool(parallel) as pool:
|
||||||
|
return {"pages": pool.map(analyze_page, tasks)}
|
||||||
|
|
||||||
|
return {"pages": [analyze_page(task) for task in tasks]}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PageAnalysisTask:
|
||||||
|
"""
|
||||||
|
Attributes:
|
||||||
|
im PIL Image, pre-scaled using .thumbnail() to fit the long
|
||||||
|
edge to 3200 px.
|
||||||
|
ocr_langs Tesseract language codes (3 letters each, in a "+"-separated
|
||||||
|
list).
|
||||||
|
"""
|
||||||
|
|
||||||
|
im: Image.Image
|
||||||
|
ocr_langs: str = "eng+fra"
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_page(task):
|
||||||
|
im_cropped = task.im.crop(
|
||||||
|
(
|
||||||
|
task.im.size[0] * 0.1,
|
||||||
|
task.im.size[1] * 0.1,
|
||||||
|
task.im.size[0] * 0.9,
|
||||||
|
task.im.size[1] * 0.9,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
is_blank = im_cropped.getextrema()[0] > 255 * 0.8
|
||||||
|
|
||||||
|
if is_blank:
|
||||||
|
max_sharpness = 1
|
||||||
|
ocr_orientation_match = True
|
||||||
|
text_margin_px = -1
|
||||||
|
else:
|
||||||
|
max_sharpness = 0.0
|
||||||
|
if im_cropped.size[0] < im_cropped.size[1]:
|
||||||
|
# Page is in portrait orientation.
|
||||||
|
segments_x = 2
|
||||||
|
segments_y = 3
|
||||||
|
else:
|
||||||
|
# Page is in landscape orientation.
|
||||||
|
segments_x = 3
|
||||||
|
segments_y = 2
|
||||||
|
for i in range(segments_x):
|
||||||
|
for j in range(segments_y):
|
||||||
|
max_sharpness = max(
|
||||||
|
max_sharpness,
|
||||||
|
analyze_sharpness(
|
||||||
|
im_cropped.crop(
|
||||||
|
(
|
||||||
|
im_cropped.size[0] / segments_x * i,
|
||||||
|
im_cropped.size[1] / segments_y * j,
|
||||||
|
im_cropped.size[0] / segments_x * (i + 1),
|
||||||
|
im_cropped.size[1] / segments_y * (j + 1),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
OCR_SCALE = 1
|
||||||
|
best_ocr_score = -1
|
||||||
|
best_ocr_words = None
|
||||||
|
best_ocr_orientation = -1
|
||||||
|
for orientation in range(4):
|
||||||
|
im_rotated = task.im.resize(
|
||||||
|
np.int_(np.array(task.im.size) * OCR_SCALE)
|
||||||
|
).rotate(90 * orientation, expand=True)
|
||||||
|
ocr = pytesseract.image_to_data(
|
||||||
|
im_rotated,
|
||||||
|
lang=task.ocr_langs,
|
||||||
|
config=f"--oem 1 --dpi {int(300 * OCR_SCALE)} --tessdata-dir ./data/tessdata_fast-4.1.0",
|
||||||
|
output_type=pytesseract.Output.DATAFRAME,
|
||||||
|
).fillna({"text": ""})
|
||||||
|
# Keep only words that Tesseract is confident in, and which are
|
||||||
|
# oriented horizontally.
|
||||||
|
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
|
||||||
|
# Keep only alphabetical words of 4 or more characters.
|
||||||
|
words = words[
|
||||||
|
words.apply(
|
||||||
|
lambda row: re.fullmatch(r"[a-zA-Z]{4,}", str(row["text"]))
|
||||||
|
is not None,
|
||||||
|
axis=1,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
if words.shape[0] > best_ocr_score:
|
||||||
|
best_ocr_score = words.shape[0]
|
||||||
|
best_ocr_orientation = orientation
|
||||||
|
best_ocr_words = words
|
||||||
|
if best_ocr_score > 50:
|
||||||
|
# Unlikely that another orientation will have more words, so
|
||||||
|
# stop eating up CPU.
|
||||||
|
break
|
||||||
|
|
||||||
|
if best_ocr_words.empty:
|
||||||
|
ocr_orientation_match = True
|
||||||
|
text_margin_px = -1
|
||||||
|
else:
|
||||||
|
ocr_orientation_match = best_ocr_orientation == 0
|
||||||
|
|
||||||
|
best_ocr_dims = OCR_SCALE * np.array(
|
||||||
|
task.im.size
|
||||||
|
if best_ocr_orientation % 2 == 0
|
||||||
|
else (task.im.size[1], task.im.size[0])
|
||||||
|
)
|
||||||
|
|
||||||
|
word_margins_all_directions = np.sort(
|
||||||
|
np.int_(
|
||||||
|
np.concat(
|
||||||
|
(
|
||||||
|
best_ocr_words["left"].to_numpy(),
|
||||||
|
best_ocr_words["top"].to_numpy(),
|
||||||
|
best_ocr_dims[0]
|
||||||
|
- (
|
||||||
|
best_ocr_words["left"] + best_ocr_words["width"]
|
||||||
|
).to_numpy(),
|
||||||
|
best_ocr_dims[1]
|
||||||
|
- (
|
||||||
|
best_ocr_words["top"] + best_ocr_words["height"]
|
||||||
|
).to_numpy(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# Transform back into original image pixel density
|
||||||
|
/ OCR_SCALE
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
||||||
|
SKIP_WORDS = 2
|
||||||
|
text_margin_px = int(
|
||||||
|
word_margins_all_directions[SKIP_WORDS]
|
||||||
|
if word_margins_all_directions.shape[0] > SKIP_WORDS
|
||||||
|
else -1
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"blank": is_blank,
|
||||||
|
"ocr_orientation_match": ocr_orientation_match,
|
||||||
|
"size_analyzed": task.im.size,
|
||||||
|
"sharpness": max_sharpness,
|
||||||
|
"text_margin_px": text_margin_px,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_sharpness(im):
|
||||||
|
"""
|
||||||
|
Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
|
||||||
|
1. The scale is not linear with respect to scan quality: anything above 0.1
|
||||||
|
is usually fine.
|
||||||
|
"""
|
||||||
|
arr = np.asarray(im)
|
||||||
|
|
||||||
|
# Normalize contrast based on brightest and darkest pixels. For example,
|
||||||
|
# NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80% fall
|
||||||
|
# between 10% brightness and 90% brightness. In practice, a value around
|
||||||
|
# 0.02 seems to work fairly well.
|
||||||
|
NORM_QUANTILE = 0.03
|
||||||
|
pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
|
||||||
|
arr, NORM_QUANTILE
|
||||||
|
)
|
||||||
|
if pixel_range == 0:
|
||||||
|
arr_normalized = arr
|
||||||
|
else:
|
||||||
|
arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
|
||||||
|
arr_normalized = (
|
||||||
|
arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE
|
||||||
|
)
|
||||||
|
arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255)
|
||||||
|
|
||||||
|
# "Sharpness" is determined by measuring the median intensity of pixels
|
||||||
|
# near edges, after an edge detection filter has been applied to the image.
|
||||||
|
edges_arr = np.asarray(
|
||||||
|
Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
|
||||||
|
)
|
||||||
|
EDGE_THRESHOLD = 8
|
||||||
|
return np.median(edges_arr[edges_arr > EDGE_THRESHOLD]) / 255
|
||||||
494
main.py
494
main.py
|
|
@ -1,371 +1,175 @@
|
||||||
import json
|
|
||||||
import re
|
import re
|
||||||
import urllib.parse
|
import sqlite3
|
||||||
|
import traceback
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
from dataclasses import dataclass
|
from datetime import datetime
|
||||||
from io import BytesIO
|
from sys import stderr
|
||||||
from multiprocessing import Pool
|
from time import sleep
|
||||||
from multiprocessing.pool import ThreadPool
|
|
||||||
from sys import stderr, stdin, stdout
|
|
||||||
from zipfile import ZipFile
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pytesseract
|
|
||||||
import requests
|
import requests
|
||||||
from PIL import Image, ImageFilter
|
|
||||||
|
|
||||||
|
from engine import analyze_item
|
||||||
OCR_LANGS = "eng+fra"
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = ArgumentParser()
|
parser = ArgumentParser()
|
||||||
parser.add_argument("--summarize", action="store_true")
|
parser.add_argument("--database", default="./microqa.db")
|
||||||
parser.add_argument("-v", "--verbose", action="store_true")
|
parser.add_argument("--cpus", type=int, default=2)
|
||||||
parser.add_argument("-w", "--workers", type=int, default=1)
|
parser.add_argument("--earliest-review-date", default="20250701")
|
||||||
parser.add_argument("--page-margin-px", type=int, default=50)
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Process STDIN line by line, where each line contains one or more item IDs
|
with sqlite3.connect(args.database) as conn:
|
||||||
# separated by whitespace.
|
cur = conn.cursor()
|
||||||
for line in stdin:
|
cur.execute("""
|
||||||
item_ids = [value for value in re.split(r",|\s", line) if value]
|
create table if not exists items (
|
||||||
with ThreadPool(args.workers) as pool:
|
id text primary key not null,
|
||||||
if args.verbose:
|
review_date text not null,
|
||||||
print(f"Running with {args.workers} workers.", file=stderr)
|
skip_analysis bool not null,
|
||||||
stderr.flush()
|
analyzed_date text
|
||||||
if args.summarize:
|
)""")
|
||||||
pool.map(
|
cur.execute("""
|
||||||
_summarize_item_to_stdout,
|
create table if not exists pages (
|
||||||
[
|
id int primary key,
|
||||||
ItemTask(
|
item text not null,
|
||||||
item_id=item_id,
|
page int not null,
|
||||||
page_margin_px=args.page_margin_px,
|
orientation_match boolean not null,
|
||||||
verbose=args.verbose,
|
sharpness real not null,
|
||||||
|
is_blank boolean not null,
|
||||||
|
text_margin_px int not null
|
||||||
|
)""")
|
||||||
|
cur.execute("create index if not exists review_date_idx on items (review_date)")
|
||||||
|
cur.execute(
|
||||||
|
"create index if not exists analyzed_date_idx on items (analyzed_date)"
|
||||||
)
|
)
|
||||||
for item_id in item_ids
|
cur.execute("create index if not exists item_idx on pages (item)")
|
||||||
|
cur.execute(
|
||||||
|
"create unique index if not exists item_page_idx on pages (item, page)"
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
while True:
|
||||||
|
print("Pulling item IDs")
|
||||||
|
pull_new_item_ids(conn, args.earliest_review_date)
|
||||||
|
print("Done.")
|
||||||
|
res = cur.execute("""
|
||||||
|
select id
|
||||||
|
from items
|
||||||
|
where analyzed_date is null
|
||||||
|
and skip_analysis = false
|
||||||
|
order by review_date
|
||||||
|
""")
|
||||||
|
for (item_id,) in res.fetchall():
|
||||||
|
N_ATTEMPTS = 3
|
||||||
|
for _ in range(N_ATTEMPTS):
|
||||||
|
try:
|
||||||
|
print(f"Processing {item_id}")
|
||||||
|
analysis = analyze_item(
|
||||||
|
item_id, parallel=args.cpus, verbose=True
|
||||||
|
)
|
||||||
|
for i, page in enumerate(analysis["pages"]):
|
||||||
|
cur.execute(
|
||||||
|
"""
|
||||||
|
insert into pages (
|
||||||
|
item,
|
||||||
|
page,
|
||||||
|
orientation_match,
|
||||||
|
sharpness,
|
||||||
|
is_blank,
|
||||||
|
text_margin_px
|
||||||
|
) values (
|
||||||
|
?,
|
||||||
|
?,
|
||||||
|
?,
|
||||||
|
?,
|
||||||
|
?,
|
||||||
|
?
|
||||||
|
)""",
|
||||||
|
[
|
||||||
|
item_id,
|
||||||
|
i + 1,
|
||||||
|
page["ocr_orientation_match"],
|
||||||
|
page["sharpness"],
|
||||||
|
page["blank"],
|
||||||
|
page["text_margin_px"],
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
else:
|
cur.execute(
|
||||||
pool.map(
|
"update items set analyzed_date = ? where id = ?",
|
||||||
_analyze_item_to_stdout,
|
[datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id],
|
||||||
[
|
|
||||||
ItemTask(
|
|
||||||
item_id=item_id,
|
|
||||||
page_margin_px=args.page_margin_px,
|
|
||||||
verbose=args.verbose,
|
|
||||||
)
|
)
|
||||||
for item_id in item_ids
|
conn.commit()
|
||||||
],
|
print("Done")
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class ItemTask:
|
|
||||||
item_id: str
|
|
||||||
page_margin_px: int
|
|
||||||
verbose: bool
|
|
||||||
|
|
||||||
|
|
||||||
def _summarize_item_to_stdout(task):
|
|
||||||
item_id = task.item_id
|
|
||||||
page_margin_px = task.page_margin_px
|
|
||||||
verbose = task.verbose
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Summarizing item {item_id}...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
|
|
||||||
analysis = analyze_item(item_id, parallel=4, verbose=verbose)
|
|
||||||
|
|
||||||
# 3 or more blank pages in a row is a flag.
|
|
||||||
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
|
||||||
if len(analysis["pages"]) >= CONSECUTIVE_BLANKS_THRESHOLD:
|
|
||||||
consecutive_blanks = [page["blank"] for page in analysis["pages"]]
|
|
||||||
for _ in range(1, CONSECUTIVE_BLANKS_THRESHOLD):
|
|
||||||
consecutive_blanks = [
|
|
||||||
value and consecutive_blanks[i]
|
|
||||||
for i, value in enumerate(consecutive_blanks[1:])
|
|
||||||
]
|
|
||||||
consecutive_blanks = [
|
|
||||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
|
||||||
for i, value in enumerate(consecutive_blanks[1:])
|
|
||||||
if value and not consecutive_blanks[i]
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
consecutive_blanks = []
|
|
||||||
|
|
||||||
# 3 or more blank pages in a row is a flag.
|
|
||||||
CONSECUTIVE_BLURRY_THRESHOLD = 3
|
|
||||||
SHARPNESS_THRESHOLD = 0.1
|
|
||||||
if len(analysis["pages"]) >= CONSECUTIVE_BLURRY_THRESHOLD:
|
|
||||||
consecutive_blurry = [
|
|
||||||
page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]
|
|
||||||
]
|
|
||||||
for _ in range(1, CONSECUTIVE_BLURRY_THRESHOLD):
|
|
||||||
consecutive_blurry = [
|
|
||||||
value and consecutive_blurry[i]
|
|
||||||
for i, value in enumerate(consecutive_blurry[1:])
|
|
||||||
]
|
|
||||||
consecutive_blurry = [
|
|
||||||
i + 2 # +1 to account for enumeration offset, and +1 to 1-index
|
|
||||||
for i, value in enumerate(consecutive_blurry[1:])
|
|
||||||
if value and not consecutive_blurry[i]
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
consecutive_blurry = []
|
|
||||||
|
|
||||||
check_orientation = [
|
|
||||||
i + 1
|
|
||||||
for i, page in enumerate(analysis["pages"])
|
|
||||||
if not page["ocr_orientation_match"]
|
|
||||||
]
|
|
||||||
|
|
||||||
check_crop = [
|
|
||||||
i + 1
|
|
||||||
for i, page in enumerate(analysis["pages"])
|
|
||||||
if page["text_margin_px"] < page_margin_px
|
|
||||||
]
|
|
||||||
|
|
||||||
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
|
||||||
print(
|
|
||||||
json.dumps(
|
|
||||||
{
|
|
||||||
"item_id": item_id,
|
|
||||||
"check_orientation": check_orientation,
|
|
||||||
"check_crop": check_crop,
|
|
||||||
"consecutive_blanks": consecutive_blanks,
|
|
||||||
"consecutive_blurry": consecutive_blurry,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
)
|
|
||||||
stdout.flush()
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Done summarizing item {item_id}.", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
|
|
||||||
|
|
||||||
def _analyze_item_to_stdout(task):
|
|
||||||
item_id = task.item_id
|
|
||||||
verbose = task.verbose
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Analyzing item {item_id}...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
|
|
||||||
print(json.dumps(analyze_item(item_id, parallel=4, verbose=verbose)))
|
|
||||||
stdout.flush()
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Done analyzing item {item_id}.", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class PageAnalysisTask:
|
|
||||||
im: Image.Image
|
|
||||||
page_index: int
|
|
||||||
file_name: str
|
|
||||||
|
|
||||||
|
|
||||||
def _analyze_page(task):
|
|
||||||
im_original = task.im
|
|
||||||
page_index = task.page_index
|
|
||||||
file_name = task.file_name
|
|
||||||
|
|
||||||
im_cropped = im_original.crop(
|
|
||||||
(
|
|
||||||
im_original.size[0] * 0.1,
|
|
||||||
im_original.size[1] * 0.1,
|
|
||||||
im_original.size[0] * 0.9,
|
|
||||||
im_original.size[1] * 0.9,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
is_blank = im_cropped.getextrema()[0] > 255 * 0.8
|
|
||||||
|
|
||||||
if is_blank:
|
|
||||||
max_sharpness = 1
|
|
||||||
ocr_orientation_match = True
|
|
||||||
text_margin_px = -1
|
|
||||||
else:
|
|
||||||
max_sharpness = 0.0
|
|
||||||
if im_cropped.size[0] < im_cropped.size[1]:
|
|
||||||
# Page is in portrait orientation.
|
|
||||||
segments_x = 2
|
|
||||||
segments_y = 3
|
|
||||||
else:
|
|
||||||
# Page is in landscape orientation.
|
|
||||||
segments_x = 3
|
|
||||||
segments_y = 2
|
|
||||||
for i in range(segments_x):
|
|
||||||
for j in range(segments_y):
|
|
||||||
max_sharpness = max(
|
|
||||||
max_sharpness,
|
|
||||||
analyze_sharpness(
|
|
||||||
im_cropped.crop(
|
|
||||||
(
|
|
||||||
im_cropped.size[0] / segments_x * i,
|
|
||||||
im_cropped.size[1] / segments_y * j,
|
|
||||||
im_cropped.size[0] / segments_x * (i + 1),
|
|
||||||
im_cropped.size[1] / segments_y * (j + 1),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
best_ocr_score = -1
|
|
||||||
best_ocr_words = None
|
|
||||||
best_ocr_orientation = -1
|
|
||||||
for orientation in range(4):
|
|
||||||
im_rotated = im_original.rotate(90 * orientation, expand=True)
|
|
||||||
ocr = pytesseract.image_to_data(
|
|
||||||
im_rotated,
|
|
||||||
lang=OCR_LANGS,
|
|
||||||
config="--oem 1 --dpi 300 --tessdata-dir ./data/tessdata_fast-4.1.0",
|
|
||||||
output_type=pytesseract.Output.DATAFRAME,
|
|
||||||
).fillna({"text": ""})
|
|
||||||
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
|
|
||||||
words = words[
|
|
||||||
words.apply(
|
|
||||||
lambda row: re.fullmatch(r"[a-zA-Z]{4,}", row["text"]) is not None,
|
|
||||||
axis=1,
|
|
||||||
)
|
|
||||||
]
|
|
||||||
if words.shape[0] > best_ocr_score:
|
|
||||||
best_ocr_score = words.shape[0]
|
|
||||||
best_ocr_orientation = orientation
|
|
||||||
best_ocr_words = words
|
|
||||||
if best_ocr_score > 50:
|
|
||||||
# Unlikely that another orientation will have more words, so
|
|
||||||
# stop eating up CPU unnecessarily.
|
|
||||||
break
|
break
|
||||||
|
except Exception as err:
|
||||||
ocr_orientation_match = best_ocr_orientation == 0
|
print(err, file=stderr)
|
||||||
|
traceback.print_tb(err.__traceback__, file=stderr)
|
||||||
best_ocr_dims = (
|
sleep(15)
|
||||||
im_original.size
|
|
||||||
if best_ocr_orientation % 2 == 0
|
|
||||||
else (im_original.size[1], im_original.size[0])
|
|
||||||
)
|
|
||||||
|
|
||||||
word_margins_all_directions = np.sort(
|
|
||||||
np.concat(
|
|
||||||
(
|
|
||||||
best_ocr_words["left"].to_numpy(),
|
|
||||||
best_ocr_words["top"].to_numpy(),
|
|
||||||
best_ocr_dims[0]
|
|
||||||
- (best_ocr_words["left"] + best_ocr_words["width"]).to_numpy(),
|
|
||||||
best_ocr_dims[1]
|
|
||||||
- (best_ocr_words["top"] + best_ocr_words["height"]).to_numpy(),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
|
||||||
SKIP_WORDS = 2
|
|
||||||
text_margin_px = (
|
|
||||||
int(word_margins_all_directions[SKIP_WORDS])
|
|
||||||
if word_margins_all_directions.shape[0] > SKIP_WORDS
|
|
||||||
else -1
|
|
||||||
)
|
|
||||||
|
|
||||||
return {
|
|
||||||
"blank": is_blank,
|
|
||||||
"file_name": file_name,
|
|
||||||
"ocr_orientation_match": ocr_orientation_match,
|
|
||||||
"page_index": page_index,
|
|
||||||
"size": im_original.size,
|
|
||||||
"sharpness": max_sharpness,
|
|
||||||
"text_margin_px": text_margin_px,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def analyze_item(item_id, parallel=1, verbose=False):
|
|
||||||
escaped_item_id = urllib.parse.quote(item_id, safe="")
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print("Downloading...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
page_nums_resp = requests.get(
|
|
||||||
f"https://archive.org/metadata/{escaped_item_id}/page_numbers/pages"
|
|
||||||
)
|
|
||||||
page_nums_resp.raise_for_status()
|
|
||||||
page_nums = page_nums_resp.json()["result"]
|
|
||||||
|
|
||||||
zip_resp = requests.get(
|
|
||||||
f"https://archive.org/download/{escaped_item_id}/{escaped_item_id}_jp2.zip"
|
|
||||||
)
|
|
||||||
zip_resp.raise_for_status()
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print("Decompressing...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
tasks = []
|
|
||||||
with ZipFile(BytesIO(zip_resp.content)) as jp_zip:
|
|
||||||
for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
|
|
||||||
for page_index, page_num_info in enumerate(page_nums):
|
|
||||||
if page_num_info["leafNum"] == leaf_num:
|
|
||||||
# Stop iterating and keep page_index set to the current item.
|
|
||||||
break
|
break
|
||||||
else:
|
sleep(3600)
|
||||||
# Set to -1 to indicate that leaf was not found in page_num list.
|
|
||||||
page_index = -1
|
|
||||||
|
|
||||||
if page_index != -1:
|
|
||||||
with jp_zip.open(file_name) as jp_file:
|
|
||||||
im = Image.open(jp_file).convert("L")
|
|
||||||
im.thumbnail((3200, 3200))
|
|
||||||
tasks.append(
|
|
||||||
PageAnalysisTask(
|
|
||||||
im=im,
|
|
||||||
page_index=page_index,
|
|
||||||
file_name=file_name,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Processing {len(page_nums)} pages...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
if parallel > 1:
|
|
||||||
# Parallelize image processing and OCR of pages across up to n cores.
|
|
||||||
with Pool(parallel) as pool:
|
|
||||||
return {"pages": pool.map(_analyze_page, tasks)}
|
|
||||||
return {"pages": [_analyze_page(task) for task in tasks]}
|
|
||||||
|
|
||||||
|
|
||||||
def analyze_sharpness(im):
|
def pull_new_item_ids(conn, earliest_review_date):
|
||||||
|
cur = conn.cursor()
|
||||||
|
res = cur.execute("select review_date from items order by review_date desc limit 1")
|
||||||
|
(latest_review_date,) = res.fetchone() or (earliest_review_date,)
|
||||||
|
print(latest_review_date)
|
||||||
|
|
||||||
|
query = f"""
|
||||||
|
collection:(microfiche)
|
||||||
|
AND contributor:(Internet Archive)
|
||||||
|
AND micro_review:(done)
|
||||||
|
AND review_date:[{latest_review_date} TO null]
|
||||||
"""
|
"""
|
||||||
Crudely quantifies the "sharpness" of edges in an image, on a scale of 0 to
|
sort = "reviewdate asc"
|
||||||
1. The scale is not linear with respect to scan quality: anything above 0.1
|
|
||||||
is usually fine.
|
|
||||||
"""
|
|
||||||
arr = np.asarray(im)
|
|
||||||
|
|
||||||
# Normalize contrast based on brightest and darkest pixels. For example,
|
# Format for API.
|
||||||
# NORM_QUANTILE=0.1 will attempt to transform pixel values so that 80% fall
|
query = re.sub(r"\s+", "+", query.strip())
|
||||||
# between 10% brightness and 90% brightness. In practice, a value around
|
sort = re.sub(r"\s+", "+", sort.strip())
|
||||||
# 0.02 seems to work fairly well.
|
|
||||||
NORM_QUANTILE = 0.03
|
|
||||||
pixel_range = np.quantile(arr, 1.0 - NORM_QUANTILE) - np.quantile(
|
|
||||||
arr, NORM_QUANTILE
|
|
||||||
)
|
|
||||||
if pixel_range == 0:
|
|
||||||
arr_normalized = arr
|
|
||||||
else:
|
|
||||||
arr_normalized = arr * (1.0 - NORM_QUANTILE * 2) / pixel_range
|
|
||||||
arr_normalized = (
|
|
||||||
arr_normalized - np.quantile(arr_normalized, NORM_QUANTILE) + NORM_QUANTILE
|
|
||||||
)
|
|
||||||
arr_normalized = np.uint8(np.clip(arr_normalized, 0, 1) * 255)
|
|
||||||
|
|
||||||
# "Sharpness" is determined by measuring the median intensity of pixels
|
# params = {
|
||||||
# near edges, after an edge detection filter has been applied to the image.
|
# "q": query,
|
||||||
edges_arr = np.asarray(
|
# "count": 100,
|
||||||
Image.fromarray(arr_normalized).filter(ImageFilter.FIND_EDGES)
|
# "fields": "identifier,review_date",
|
||||||
|
# "sorts": sort,
|
||||||
|
# }
|
||||||
|
# for i in range(1, 999):
|
||||||
|
# resp = requests.get(
|
||||||
|
# "https://archive.org/services/search/v1/scrape",
|
||||||
|
# params=params,
|
||||||
|
# )
|
||||||
|
# resp.raise_for_status()
|
||||||
|
# print(resp.text)
|
||||||
|
# try:
|
||||||
|
# body = resp.json()
|
||||||
|
# except Exception as err:
|
||||||
|
# print("Body:", resp.text, file=stderr)
|
||||||
|
# raise err
|
||||||
|
# for doc in body["items"]:
|
||||||
|
# cur.execute(
|
||||||
|
# "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
|
||||||
|
# (doc["identifier"], doc["review_date"]),
|
||||||
|
# )
|
||||||
|
# conn.commit()
|
||||||
|
# cursor = body.get("cursor", None)
|
||||||
|
# if cursor is None:
|
||||||
|
# break
|
||||||
|
# params = params.copy()
|
||||||
|
# params["cursor"] = cursor
|
||||||
|
resp = requests.get(
|
||||||
|
f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=250000&output=json",
|
||||||
)
|
)
|
||||||
EDGE_THRESHOLD = 8
|
resp.raise_for_status()
|
||||||
return np.median(edges_arr[edges_arr > EDGE_THRESHOLD]) / 255
|
try:
|
||||||
|
body = resp.json()
|
||||||
|
except Exception as err:
|
||||||
|
print("Body:", resp.text, file=stderr)
|
||||||
|
raise err
|
||||||
|
for doc in body["response"]["docs"]:
|
||||||
|
cur.execute(
|
||||||
|
"insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
|
||||||
|
(doc["identifier"], doc["review_date"]),
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
159
one_off.py
Normal file
159
one_off.py
Normal file
|
|
@ -0,0 +1,159 @@
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from multiprocessing.pool import ThreadPool
|
||||||
|
from sys import stderr, stdin, stdout
|
||||||
|
|
||||||
|
from engine import analyze_item
|
||||||
|
|
||||||
|
|
||||||
|
# OCR language selection — presumably a Tesseract `lang` string (English +
# French). Not referenced elsewhere in this module's visible code;
# TODO(review): confirm it is consumed by engine.py.
OCR_LANGS = "eng+fra"
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Read item IDs from STDIN and analyze (or summarize) each one.

    Each input line may contain one or more item IDs separated by commas
    and/or whitespace. The IDs on a line are dispatched to a thread pool
    of ``--workers`` threads; results are written to STDOUT as JSON lines
    by the worker functions.
    """
    parser = ArgumentParser()
    parser.add_argument("--summarize", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-w", "--workers", type=int, default=1)
    parser.add_argument("--page-margin-px", type=int, default=50)
    args = parser.parse_args()

    # The two modes differ only in the per-item worker function, so pick
    # it once here rather than duplicating the whole pool.map call.
    worker = _summarize_item_to_stdout if args.summarize else _analyze_item_to_stdout

    # Process STDIN line by line, where each line contains one or more item IDs
    # separated by whitespace.
    for line in stdin:
        item_ids = [value for value in re.split(r",|\s", line) if value]
        with ThreadPool(args.workers) as pool:
            if args.verbose:
                print(f"Running with {args.workers} workers.", file=stderr)
                stderr.flush()
            pool.map(
                worker,
                [
                    ItemTask(
                        item_id=item_id,
                        page_margin_px=args.page_margin_px,
                        verbose=args.verbose,
                    )
                    for item_id in item_ids
                ],
            )
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ItemTask:
    """Bundle of per-item arguments for one worker invocation.

    ``ThreadPool.map`` passes a single argument to each worker, so the
    CLI options relevant to one item are packed into this record.
    """

    # Internet Archive identifier of the item to process.
    item_id: str
    # Minimum acceptable text margin in pixels (used by summarize mode's
    # crop check).
    page_margin_px: int
    # When true, workers report progress on stderr.
    verbose: bool
|
||||||
|
|
||||||
|
|
||||||
|
def _summarize_item_to_stdout(task):
    """Analyze one item and print a one-line JSON summary when it needs review.

    Nothing is printed for items that pass every check. The summary flags:
      * ``consecutive_blanks`` — 1-indexed pages starting a run of >= 3
        blank pages,
      * ``consecutive_blurry`` — 1-indexed pages starting a run of >= 3
        pages whose sharpness is below ``SHARPNESS_THRESHOLD``,
      * ``check_orientation`` — pages whose OCR orientation did not match,
      * ``check_crop`` — pages whose text margin is below
        ``task.page_margin_px``.
    """
    item_id = task.item_id
    page_margin_px = task.page_margin_px
    verbose = task.verbose

    if verbose:
        print(f"Summarizing item {item_id}...", file=stderr)
        stderr.flush()

    analysis = analyze_item(item_id, parallel=4, verbose=verbose)

    # 3 or more blank pages in a row is a flag.
    CONSECUTIVE_BLANKS_THRESHOLD = 3
    consecutive_blanks = _run_start_pages(
        [page["blank"] for page in analysis["pages"]],
        CONSECUTIVE_BLANKS_THRESHOLD,
    )

    # 3 or more blurry pages in a row is a flag.
    CONSECUTIVE_BLURRY_THRESHOLD = 3
    SHARPNESS_THRESHOLD = 0.1
    consecutive_blurry = _run_start_pages(
        [page["sharpness"] < SHARPNESS_THRESHOLD for page in analysis["pages"]],
        CONSECUTIVE_BLURRY_THRESHOLD,
    )

    check_orientation = [
        i + 1
        for i, page in enumerate(analysis["pages"])
        if not page["ocr_orientation_match"]
    ]

    check_crop = [
        i + 1
        for i, page in enumerate(analysis["pages"])
        if page["text_margin_px"] < page_margin_px
    ]

    if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
        print(
            json.dumps(
                {
                    "item_id": item_id,
                    "check_orientation": check_orientation,
                    "check_crop": check_crop,
                    "consecutive_blanks": consecutive_blanks,
                    "consecutive_blurry": consecutive_blurry,
                }
            )
        )
        stdout.flush()

    if verbose:
        print(f"Done summarizing item {item_id}.", file=stderr)
        stderr.flush()


def _run_start_pages(flags, threshold):
    """Return 1-indexed page numbers that begin a run of >= threshold flagged pages.

    Fixes an off-by-one in the original reduction: the start-of-run scan
    iterated ``enumerate(x[1:])`` and therefore could never report a run
    beginning on the very first page.
    """
    if len(flags) < threshold:
        return []
    # After threshold-1 passes, runs[i] is True exactly when pages
    # i..i+threshold-1 are all flagged.
    runs = list(flags)
    for _ in range(1, threshold):
        runs = [runs[i] and runs[i + 1] for i in range(len(runs) - 1)]
    # A run starts where runs flips False -> True, or at index 0.
    return [
        i + 1  # convert 0-based index to 1-based page number
        for i, value in enumerate(runs)
        if value and (i == 0 or not runs[i - 1])
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def _analyze_item_to_stdout(task):
    """Run the full analysis for one item and emit the result as a JSON line."""
    if task.verbose:
        print(f"Analyzing item {task.item_id}...", file=stderr)
        stderr.flush()

    report = analyze_item(task.item_id, parallel=6, verbose=task.verbose)
    print(json.dumps(report))
    stdout.flush()

    if task.verbose:
        print(f"Done analyzing item {task.item_id}.", file=stderr)
        stderr.flush()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: read item IDs from STDIN and emit JSON to STDOUT.
if __name__ == "__main__":
    main()
|
||||||
Loading…
Add table
Reference in a new issue