Store results to SQLite
This commit is contained in:
parent
d33a7dc515
commit
815934ad23
3 changed files with 177 additions and 34 deletions
16
README.md
16
README.md
|
|
@ -17,6 +17,22 @@ a single line so that items are summarized in parallel):
|
||||||
pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
|
pbpaste | tr '\n' ',' | uv run main.py --summarize -workers 4 -v | jq
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Query a pre-populated database for suspect pages:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
select 'https://archive.org/details/' || items.id,
|
||||||
|
pages.page,
|
||||||
|
pages.orientation_match,
|
||||||
|
pages.sharpness,
|
||||||
|
pages.text_margin_px
|
||||||
|
from items
|
||||||
|
join pages on pages.item = items.id
|
||||||
|
where pages.orientation_match = 0
|
||||||
|
or pages.sharpness < 0.07
|
||||||
|
or (pages.text_margin_px > -1 and pages.text_margin_px < 50)
|
||||||
|
order by items.id;
|
||||||
|
```
|
||||||
|
|
||||||
## Test Cases
|
## Test Cases
|
||||||
|
|
||||||
- Blurry pages: `micro_IA40244209_0984`
|
- Blurry pages: `micro_IA40244209_0984`
|
||||||
|
|
|
||||||
134
cache.py
Normal file
134
cache.py
Normal file
|
|
@ -0,0 +1,134 @@
|
||||||
|
import re
import sqlite3
import traceback
from argparse import ArgumentParser
from datetime import datetime, timezone
from time import sleep

import requests

from main import analyze_item
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Continuously cache per-page QA analysis of reviewed microfiche items.

    Runs as a daemon: every hour it pulls newly reviewed item IDs from the
    archive.org search API into the ``items`` table, then analyzes every
    item that has no ``analyzed_date`` yet, storing one row per page in
    the ``pages`` table. Each item gets up to ``N_ATTEMPTS`` tries.
    """
    parser = ArgumentParser()
    parser.add_argument("--database", default="./microqa.db")
    parser.add_argument("--cpus", type=int, default=2)
    parser.add_argument("--earliest-review-date", default="20250701")
    args = parser.parse_args()

    with sqlite3.connect(args.database) as conn:
        cur = conn.cursor()
        cur.execute("""
            create table if not exists items (
                id text primary key not null,
                review_date text not null,
                analyzed_date text
            )""")
        cur.execute("""
            create table if not exists pages (
                id int primary key,
                item text not null,
                page int not null,
                orientation_match boolean not null,
                sharpness real not null,
                is_blank boolean not null,
                text_margin_px int not null
            )""")
        conn.commit()

        N_ATTEMPTS = 3
        while True:
            print("Pulling item IDs")
            pull_new_item_ids(conn, args.earliest_review_date)
            print("Done.")
            res = cur.execute(
                "select id from items where analyzed_date is null order by review_date"
            )
            for (item_id,) in res.fetchall():
                for _ in range(N_ATTEMPTS):
                    try:
                        print(f"Processing {item_id}")
                        analysis = analyze_item(
                            item_id, parallel=args.cpus, verbose=True
                        )
                        for i, page in enumerate(analysis["pages"]):
                            cur.execute(
                                """
                                insert into pages (
                                    item,
                                    page,
                                    orientation_match,
                                    sharpness,
                                    is_blank,
                                    text_margin_px
                                ) values (
                                    ?, ?, ?, ?, ?, ?
                                )""",
                                [
                                    item_id,
                                    i + 1,
                                    page["ocr_orientation_match"],
                                    page["sharpness"],
                                    page["blank"],
                                    page["text_margin_px"],
                                ],
                            )
                        cur.execute(
                            "update items set analyzed_date = ? where id = ?",
                            # timezone-aware now(); utcnow() is deprecated.
                            [datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"), item_id],
                        )
                        # Commit the item's pages and its analyzed_date atomically,
                        # only after the whole item succeeded.
                        conn.commit()
                        print("Done")
                        break
                    except Exception as err:
                        print(err)
                        traceback.print_tb(err.__traceback__)
                        # Discard any partially-inserted pages so a retry does
                        # not leave duplicate/orphan rows pending on the
                        # connection (commits only happen on success above).
                        conn.rollback()
                        sleep(15)
                        # BUG FIX: previously this handler ended with `break`,
                        # which made N_ATTEMPTS a no-op (every item got exactly
                        # one try). Falling through lets the loop retry.
            sleep(3600)
|
||||||
|
|
||||||
|
|
||||||
|
def pull_new_item_ids(conn, earliest_review_date):
    """Fetch IDs of newly reviewed microfiche items into the ``items`` table.

    Resumes from the most recent ``review_date`` already stored, falling
    back to ``earliest_review_date`` when the table is empty. Pages through
    the archive.org advanced-search API 100 rows at a time; rows already
    present are left untouched (``on conflict do nothing``).

    :param conn: open ``sqlite3`` connection with an ``items`` table.
    :param earliest_review_date: ``YYYYMMDD`` string lower bound for the
        first run.
    :raises requests.HTTPError: on a non-2xx API response.
    """
    cur = conn.cursor()
    res = cur.execute("select review_date from items order by review_date desc limit 1")
    (latest_review_date,) = res.fetchone() or (earliest_review_date,)
    print(latest_review_date)

    query = f"""
    collection:(microfiche)
    AND contributor:(Internet Archive)
    AND micro_review:(done)
    AND review_date:[{latest_review_date} TO null]
    """
    sort = "reviewdate asc"

    # Format for API: collapse whitespace runs into `+` separators.
    query = re.sub(r"\s+", "+", query.strip())
    sort = re.sub(r"\s+", "+", sort.strip())

    for i in range(1, 999):
        resp = requests.get(
            f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=100&page={i}&output=json",
            # BUG FIX: requests has no default timeout; without one a stalled
            # connection would hang the polling daemon forever.
            timeout=60,
        )
        resp.raise_for_status()
        docs = resp.json()["response"]["docs"]
        if not docs:
            # Past the last page of results.
            break
        cur.executemany(
            "insert into items (id, review_date) values (?, ?) on conflict do nothing",
            [(doc["identifier"], doc["review_date"]) for doc in docs],
        )
        conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
# Run the caching daemon when executed as a script.
if __name__ == "__main__":
    main()
|
||||||
61
main.py
61
main.py
|
|
@ -16,7 +16,6 @@ from PIL import Image, ImageFilter
|
||||||
|
|
||||||
|
|
||||||
OCR_LANGS = "eng+fra"
|
OCR_LANGS = "eng+fra"
|
||||||
N_OCR_PROCESSES = 4
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
@ -77,9 +76,7 @@ def _summarize_item_to_stdout(task):
|
||||||
print(f"Summarizing item {item_id}...", file=stderr)
|
print(f"Summarizing item {item_id}...", file=stderr)
|
||||||
stderr.flush()
|
stderr.flush()
|
||||||
|
|
||||||
analysis = analyze_item(
|
analysis = analyze_item(item_id, parallel=4, verbose=verbose)
|
||||||
item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose
|
|
||||||
)
|
|
||||||
|
|
||||||
# 3 or more blank pages in a row is a flag.
|
# 3 or more blank pages in a row is a flag.
|
||||||
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
||||||
|
|
@ -124,11 +121,10 @@ def _summarize_item_to_stdout(task):
|
||||||
if not page["ocr_orientation_match"]
|
if not page["ocr_orientation_match"]
|
||||||
]
|
]
|
||||||
|
|
||||||
WORDS_NEAR_EDGE_THRESHOLD = 2
|
|
||||||
check_crop = [
|
check_crop = [
|
||||||
i + 1
|
i + 1
|
||||||
for i, page in enumerate(analysis["pages"])
|
for i, page in enumerate(analysis["pages"])
|
||||||
if page["words_near_edge"] > WORDS_NEAR_EDGE_THRESHOLD
|
if page["text_margin_px"] < page_margin_px
|
||||||
]
|
]
|
||||||
|
|
||||||
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
||||||
|
|
@ -152,20 +148,13 @@ def _summarize_item_to_stdout(task):
|
||||||
|
|
||||||
def _analyze_item_to_stdout(task):
|
def _analyze_item_to_stdout(task):
|
||||||
item_id = task.item_id
|
item_id = task.item_id
|
||||||
page_margin_px = task.page_margin_px
|
|
||||||
verbose = task.verbose
|
verbose = task.verbose
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"Analyzing item {item_id}...", file=stderr)
|
print(f"Analyzing item {item_id}...", file=stderr)
|
||||||
stderr.flush()
|
stderr.flush()
|
||||||
|
|
||||||
print(
|
print(json.dumps(analyze_item(item_id, parallel=4, verbose=verbose)))
|
||||||
json.dumps(
|
|
||||||
analyze_item(
|
|
||||||
item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
stdout.flush()
|
stdout.flush()
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
|
|
@ -177,14 +166,12 @@ def _analyze_item_to_stdout(task):
|
||||||
class PageAnalysisTask:
|
class PageAnalysisTask:
|
||||||
im: Image.Image
|
im: Image.Image
|
||||||
page_index: int
|
page_index: int
|
||||||
page_margin_px: int
|
|
||||||
file_name: str
|
file_name: str
|
||||||
|
|
||||||
|
|
||||||
def _analyze_page(task):
|
def _analyze_page(task):
|
||||||
im_original = task.im
|
im_original = task.im
|
||||||
page_index = task.page_index
|
page_index = task.page_index
|
||||||
page_margin_px = task.page_margin_px
|
|
||||||
file_name = task.file_name
|
file_name = task.file_name
|
||||||
|
|
||||||
im_cropped = im_original.crop(
|
im_cropped = im_original.crop(
|
||||||
|
|
@ -201,7 +188,7 @@ def _analyze_page(task):
|
||||||
if is_blank:
|
if is_blank:
|
||||||
max_sharpness = 1
|
max_sharpness = 1
|
||||||
ocr_orientation_match = True
|
ocr_orientation_match = True
|
||||||
words_near_edge = 0
|
text_margin_px = -1
|
||||||
else:
|
else:
|
||||||
max_sharpness = 0.0
|
max_sharpness = 0.0
|
||||||
if im_cropped.size[0] < im_cropped.size[1]:
|
if im_cropped.size[0] < im_cropped.size[1]:
|
||||||
|
|
@ -262,19 +249,26 @@ def _analyze_page(task):
|
||||||
if best_ocr_orientation % 2 == 0
|
if best_ocr_orientation % 2 == 0
|
||||||
else (im_original.size[1], im_original.size[0])
|
else (im_original.size[1], im_original.size[0])
|
||||||
)
|
)
|
||||||
words_near_edge = best_ocr_words[
|
|
||||||
(best_ocr_words["left"] < page_margin_px)
|
word_margins_all_directions = np.sort(
|
||||||
| (best_ocr_words["top"] < page_margin_px)
|
np.concat(
|
||||||
| (
|
(
|
||||||
best_ocr_words["left"] + best_ocr_words["width"]
|
best_ocr_words["left"].to_numpy(),
|
||||||
> best_ocr_dims[0] - page_margin_px
|
best_ocr_words["top"].to_numpy(),
|
||||||
|
best_ocr_dims[0]
|
||||||
|
- (best_ocr_words["left"] + best_ocr_words["width"]).to_numpy(),
|
||||||
|
best_ocr_dims[1]
|
||||||
|
- (best_ocr_words["top"] + best_ocr_words["height"]).to_numpy(),
|
||||||
|
)
|
||||||
)
|
)
|
||||||
| (
|
)
|
||||||
best_ocr_words["top"] + best_ocr_words["height"]
|
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
||||||
> best_ocr_dims[1] - page_margin_px
|
SKIP_WORDS = 2
|
||||||
)
|
text_margin_px = (
|
||||||
]
|
int(word_margins_all_directions[SKIP_WORDS])
|
||||||
words_near_edge = words_near_edge.shape[0]
|
if word_margins_all_directions.shape[0] > SKIP_WORDS
|
||||||
|
else -1
|
||||||
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"blank": is_blank,
|
"blank": is_blank,
|
||||||
|
|
@ -283,11 +277,11 @@ def _analyze_page(task):
|
||||||
"page_index": page_index,
|
"page_index": page_index,
|
||||||
"size": im_original.size,
|
"size": im_original.size,
|
||||||
"sharpness": max_sharpness,
|
"sharpness": max_sharpness,
|
||||||
"words_near_edge": words_near_edge,
|
"text_margin_px": text_margin_px,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def analyze_item(item_id, page_margin_px, parallel=False, verbose=False):
|
def analyze_item(item_id, parallel=1, verbose=False):
|
||||||
escaped_item_id = urllib.parse.quote(item_id, safe="")
|
escaped_item_id = urllib.parse.quote(item_id, safe="")
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
|
|
@ -326,7 +320,6 @@ def analyze_item(item_id, page_margin_px, parallel=False, verbose=False):
|
||||||
PageAnalysisTask(
|
PageAnalysisTask(
|
||||||
im=im,
|
im=im,
|
||||||
page_index=page_index,
|
page_index=page_index,
|
||||||
page_margin_px=page_margin_px,
|
|
||||||
file_name=file_name,
|
file_name=file_name,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
@ -334,9 +327,9 @@ def analyze_item(item_id, page_margin_px, parallel=False, verbose=False):
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"Processing {len(page_nums)} pages...", file=stderr)
|
print(f"Processing {len(page_nums)} pages...", file=stderr)
|
||||||
stderr.flush()
|
stderr.flush()
|
||||||
if parallel:
|
if parallel > 1:
|
||||||
# Parallelize image processing and OCR of pages across up to n cores.
|
# Parallelize image processing and OCR of pages across up to n cores.
|
||||||
with Pool(N_OCR_PROCESSES) as pool:
|
with Pool(parallel) as pool:
|
||||||
return {"pages": pool.map(_analyze_page, tasks)}
|
return {"pages": pool.map(_analyze_page, tasks)}
|
||||||
return {"pages": [_analyze_page(task) for task in tasks]}
|
return {"pages": [_analyze_page(task) for task in tasks]}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue