rewrite data fetching into archive_item.py
This commit is contained in:
parent
4d9161b043
commit
d5757e3811
5 changed files with 446 additions and 106 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -1,4 +1,6 @@
|
||||||
/target
|
/target
|
||||||
/data
|
/data
|
||||||
*.db
|
*.db
|
||||||
|
*.db-journal
|
||||||
|
/archive_cache
|
||||||
__pycache__
|
__pycache__
|
||||||
|
|
|
||||||
307
archive_item.py
Normal file
307
archive_item.py
Normal file
|
|
@ -0,0 +1,307 @@
|
||||||
|
"""
|
||||||
|
Python utilities for structuring data and metadata pulled from archive.org
|
||||||
|
microfiche scans.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
import os
import urllib
import urllib.parse
from contextlib import nullcontext
from dataclasses import dataclass
from io import BytesIO
from typing import Optional
from zipfile import ZipFile

import requests
from PIL import Image
||||||
|
|
||||||
|
|
||||||
|
CACHE_DIR = "./archive_cache"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ArchiveLeaf:
    """
    A leaf corresponds to a single image from one of the "Single Page Processed
    JP2 Zip" files from an `ArchiveItem`. Not all leaves become part of the
    final processed PDF displayed to the user, as some contain metadata or
    superfluous information scanned off of the microfiche cards and retained for
    posterity. To identify whether a leaf is pertinent or not, refer to the page
    number metadata pulled as JSON from the archive.org API.

    Attributes:

        image           PIL Image, pre-scaled using .thumbnail() to fit the long
                        edge to 3200 pixels.

        page_number     `None` if the leaf is not included in the processed PDF
                        presented to users, otherwise a (potentially empty)
                        string with the inferred page number as defined by the
                        document being scanned.
    """

    # `Image.Image` is the image class; the bare name `Image` refers to the
    # `PIL.Image` *module*, which is not a valid type for this field.
    image: Image.Image
    page_number: Optional[str]
|
|
||||||
|
|
||||||
|
@dataclass
class ArchiveDoc:
    """
    Information pertaining to a single set of processed pages, of which there
    may be multiple for any given ArchiveItem. For example, one SCOTUS case may
    contain several briefs/petitions/etc., each presented as a distinct PDF but
    all held within the parent `ArchiveItem`.

    Note that this is a slightly different concept than the literal "files"
    available via the archive.org API: an `ArchiveDoc` may combine information
    from, say, both a `_page_numbers.json` file and a `_jp2.zip` file to store
    image data and page number data conveniently within the same Python object.

    Attributes:

        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        name        Document name, with the item identifier, leading whitespace,
                    and file extension stripped.

        title       Optional `title` metadata field assigned to the `_jp2.zip`
                    file, usually indicating that this file represents a subset
                    of the parent item's content, for example a specific brief
                    or opinion from a larger SCOTUS case document.

                    For QA intents and purposes, it's usually easiest to skip
                    over any documents where `title is not None`, assuming that
                    the item has at least one processed `_jp2.zip` file for
                    which `title is None`.
    """

    identifier: str
    name: str
    title: Optional[str]

    def fetch_leaves(self, numbered_only=True, use_cache=False) -> list[ArchiveLeaf]:
        """
        Fetch images and page number data for this document from archive.org,
        over the Internet.

        Params:

            numbered_only   If `True`, discards any leaves with no corresponding
                            page number entries. Leaves for which the page
                            number is an empty string are retained.

            use_cache       If `True`, locally cached zip files under the
                            `./archive_cache` directory (relative to the working
                            directory) will be used instead of fetching over
                            HTTPS.
        """

        if use_cache:
            # Cached file names are derived from the percent-encoded version of
            # `self.name`, so that there's no need to worry about directory
            # separators or other disallowed characters in the file names
            # defined by archive.org.
            with open(
                f"{CACHE_DIR}/{_url_encode(self.name)}_page_numbers.json", "r"
            ) as f:
                page_nums = json.load(f)["pages"]
            zip_reader_ctx = open(f"{CACHE_DIR}/{_url_encode(self.name)}_jp2.zip", "rb")

        else:
            page_nums = _fetch_page_nums(self.identifier, self.name)["pages"]

            # Wrap in a context manager so that the reader can be used in a `with`
            # block in the same way as a file accessed with `open()`.
            zip_reader_ctx = nullcontext(
                BytesIO(_fetch_jp2_zip(self.identifier, self.name))
            )

        leaves = []

        with zip_reader_ctx as zip_reader, ZipFile(zip_reader) as jp_zip:
            for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
                # Find this leaf's page-number entry, if any. `None` means the
                # leaf has no entry and is not part of the processed PDF.
                page_num = None
                for page_num_info in page_nums:
                    if page_num_info["leafNum"] == leaf_num:
                        # NOTE(review): assumes the archive.org page-number
                        # JSON entries carry a "pageNumber" string field —
                        # confirm against a live `_page_numbers.json` file.
                        page_num = page_num_info["pageNumber"]
                        break

                # Fixed two defects here: the original stored the leaf's
                # *index* into `page_nums` (an int) in the `page_number` field
                # documented as `Optional[str]`, and it ignored `numbered_only`
                # entirely.
                if numbered_only and page_num is None:
                    continue

                with jp_zip.open(file_name) as jp_file:
                    # Convert to single-channel greyscale ("L").
                    image = Image.open(jp_file).convert("L")
                    # Rescale long edge to no more than 3200 px.
                    image.thumbnail((3200, 3200))
                    leaves.append(ArchiveLeaf(image=image, page_number=page_num))

        return leaves
|
|
||||||
|
|
||||||
|
@dataclass
class ArchiveItem:
    """
    Information pertaining to an archive.org item. Documents are held as
    lightweight `ArchiveDoc` records (identifier/name/title only — no image
    content), so that content downloads for individual `ArchiveDoc`s may be
    skipped, staggered, or performed in parallel if desired, rather than in one
    chunk per item.

    Attributes:

        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        docs        List of `ArchiveDoc` metadata records for the item's
                    processed documents. (The previous docstring described this
                    as a list of names; it holds `ArchiveDoc` objects, as the
                    annotation states.)
    """

    identifier: str
    docs: list[ArchiveDoc]
|
|
||||||
|
|
||||||
|
def fetch_item(identifier: str, use_cache=False) -> ArchiveItem:
    """
    Fetch the relevant top-level information for an `ArchiveItem` from
    archive.org. This assumes a specific naming convention for the item's files:
    - `<identifier>[ Title]_jp2.zip` for processed scans
    - `<identifier>[ Title]_page_numbers.json` for page number metadata
    - `<identifier>_micro_jp2.zip` for unprocessed scans

    This function treats file names as case-insensitive, but preserves casing in
    its output.

    Params:

        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        use_cache   If `True`, locally cached zip files under the
                    `./archive_cache` directory (relative to the working
                    directory) will be used instead of fetching over HTTPS.

    Raises:

        Exception if a `_jp2.zip` file is present without its matching
        `_page_numbers.json` companion.
    """

    if use_cache:
        # File names should be treated as case-insensitive, in case the file
        # system is case-insensitive. As I understand it, this applies to FAT
        # and APFS in their default configurations. Both are case-preserving, so
        # this shouldn't usually be an issue, but if/when it is, it can be very
        # frustrating to troubleshoot user-side.
        file_names = [
            _url_decode(name)
            for name in os.listdir(CACHE_DIR)
            if name.lower().startswith(identifier.lower())
        ]
    else:
        files_resp = requests.get(
            f"https://archive.org/metadata/{_url_encode(identifier)}/files"
        )
        files_resp.raise_for_status()
        file_names = [item["name"] for item in files_resp.json()["result"]]

    doc_names = [
        # Strip suffix, to just leave the identifier, and title if present.
        name[: -len("_jp2.zip")]
        for name in file_names
        if name.lower().endswith("_jp2.zip")
        # Exclude unprocessed scans, which are also named `..._jp2.zip`.
        and name.lower() != f"{identifier.lower()}_micro_jp2.zip"
    ]

    # Assert that all files we expect to find are actually present. Both
    # branches above leave *decoded* names in `file_names`, so the expected
    # name must not be percent-encoded before comparison — the original
    # encoded it, which could miss files whose names contain characters that
    # encoding alters. A set gives O(1) membership instead of rebuilding the
    # lowered list per document.
    lower_file_names = {name.lower() for name in file_names}
    for doc_name in doc_names:
        expected = f"{doc_name.lower()}_page_numbers.json"
        if expected not in lower_file_names:
            # Fixed: the message previously named a `_page_numbers.zip` file
            # even though the check looks for `_page_numbers.json`.
            raise Exception(f"expected file not found: {expected}")

    return ArchiveItem(
        identifier=identifier,
        docs=[
            ArchiveDoc(
                identifier=identifier,
                name=name,
                title=name[len(identifier) :].strip() or None,
            )
            for name in doc_names
        ],
    )
|
|
||||||
|
|
||||||
|
def cache_item(identifier: str, overwrite=True):
    """
    Load the relevant files for an `ArchiveItem` and its component `ArchiveDoc`s
    and store them within the `archive_cache` directory (relative to the working
    directory). The `archive_cache` directory will be created if it does not
    exist.

    Params:

        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        overwrite   If set to `False` and any file names in the cache already
                    match the item, fetching the item is skipped.
    """

    os.makedirs(CACHE_DIR, exist_ok=True)

    # Honor `overwrite`: the cache-hit early return previously ran
    # unconditionally, so `overwrite=True` (the default) never actually
    # refetched an already-cached item, contradicting the docstring.
    if not overwrite:
        for name in os.listdir(CACHE_DIR):
            if _url_decode(name.lower()).startswith(identifier.lower()):
                return

    item = fetch_item(identifier)
    for doc in item.docs:
        page_nums = _fetch_page_nums(identifier, doc.name)
        zip_file = _fetch_jp2_zip(identifier, doc.name)
        with open(f"{CACHE_DIR}/{_url_encode(doc.name)}_page_numbers.json", "w") as f:
            json.dump(page_nums, f)
        with open(f"{CACHE_DIR}/{_url_encode(doc.name)}_jp2.zip", "wb") as f:
            f.write(zip_file)
|
|
||||||
|
|
||||||
|
def _url_encode(string: str) -> str:
|
||||||
|
"""
|
||||||
|
Helper to encode to a URL-encoded (in other words, percent-encoded) string.
|
||||||
|
"""
|
||||||
|
|
||||||
|
return urllib.parse.quote(string, safe=" ._")
|
||||||
|
|
||||||
|
|
||||||
|
def _url_decode(string: str) -> str:
|
||||||
|
"""
|
||||||
|
Helper to decode from a URL-encoded (in other words, percent-encoded)
|
||||||
|
string.
|
||||||
|
"""
|
||||||
|
|
||||||
|
return urllib.parse.unquote(string)
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_page_nums(identifier: str, doc_name: str) -> dict:
    """
    Fetch the JSON file holding page number metadata for an `ArchiveDoc`.
    """

    # `doc_name` is intentionally left unencoded: it comes straight from the
    # file path as defined by archive.org, and percent-encoding it a second
    # time may result in a 404 error.
    url = (
        f"https://archive.org/download/{_url_encode(identifier)}"
        f"/{doc_name}_page_numbers.json"
    )
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.json()
|
|
||||||
|
|
||||||
|
def _fetch_jp2_zip(identifier: str, doc_name: str) -> bytes:
    """
    Fetch the zip file holding processed page scans for an `ArchiveDoc`.
    """

    # `doc_name` is intentionally left unencoded: it comes straight from the
    # file path as defined by archive.org, and percent-encoding it a second
    # time may result in a 404 error.
    url = (
        f"https://archive.org/download/{_url_encode(identifier)}"
        f"/{doc_name}_jp2.zip"
    )
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.content
65
diagnostics.py
Normal file
65
diagnostics.py
Normal file
|
|
@ -0,0 +1,65 @@
|
||||||
|
import json
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
from time import time
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from archive_item import cache_item, fetch_item
|
||||||
|
from engine import analyze_doc
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Cache and analyze a single archive.org item, then print a JSON diagnostics
    summary (per-page analyses, timing, disoriented pages, sharpness stats) to
    stdout.
    """

    parser = ArgumentParser()
    parser.add_argument("--item-id")
    parser.add_argument("--cpus", type=int, default=4)
    args = parser.parse_args()

    cache_item(
        args.item_id,
        # Will not refetch if value is already cached.
        overwrite=False,
    )
    item = fetch_item(args.item_id, use_cache=True)

    t_start = time()

    # NOTE(review): `doc.name != ""` looks like a no-op filter — `ArchiveDoc`
    # names always start with the item identifier, so they are never empty.
    # Possibly `doc.title is None` was intended (skip titled subset docs);
    # confirm before changing, as that would alter which docs get analyzed.
    minimal_docs = (
        [doc for doc in item.docs if doc.name != ""]
        if len(item.docs) > 1
        else item.docs
    )
    analyses = [
        analyze_doc(doc, parallel=args.cpus, use_cache=True) for doc in minimal_docs
    ]

    t_end = time()

    # Collect all sharpness values once. The original rebuilt this list three
    # times and called `max(*values)` / `min(*values)`, which unpacks the list
    # into separate arguments and raises a TypeError when there is exactly one
    # page overall (e.g. `max(5.0)`); `max(values)` handles any non-empty list.
    sharpness_vals = [
        page["sharpness"] for doc in analyses for page in doc["pages"]
    ]

    print(
        json.dumps(
            {
                "analyses": analyses,
                "duration_secs": t_end - t_start,
                "disoriented_pages": [
                    [
                        i
                        for i, page in enumerate(doc["pages"])
                        if not page["ocr_orientation_match"]
                    ]
                    for doc in analyses
                ],
                "sharpness_max": max(sharpness_vals),
                # `np.median` returns a numpy scalar; `.tolist()` converts it
                # to a plain float for JSON serialization.
                "sharpness_median": np.median(sharpness_vals).tolist(),
                "sharpness_min": min(sharpness_vals),
            }
        )
    )


if __name__ == "__main__":
    main()
|
||||||
64
engine.py
64
engine.py
|
|
@ -1,62 +1,30 @@
|
||||||
import urllib.parse
|
|
||||||
import re
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from io import BytesIO
|
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool
|
||||||
from sys import stderr
|
from sys import stdout
|
||||||
from zipfile import ZipFile
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pytesseract
|
import pytesseract
|
||||||
import requests
|
|
||||||
from PIL import Image, ImageFilter
|
from PIL import Image, ImageFilter
|
||||||
|
|
||||||
|
from archive_item import ArchiveDoc
|
||||||
|
|
||||||
def analyze_item(item_id, ocr_langs="eng+fra", parallel=1, verbose=False):
|
|
||||||
escaped_item_id = urllib.parse.quote(item_id, safe="")
|
def analyze_doc(
|
||||||
|
doc: ArchiveDoc, ocr_langs="eng+fra", parallel=1, use_cache=False, verbose=False
|
||||||
|
):
|
||||||
|
if verbose:
|
||||||
|
print(f"Loading {doc.name}...")
|
||||||
|
stdout.flush()
|
||||||
|
|
||||||
|
tasks: PageAnalysisTask = [
|
||||||
|
PageAnalysisTask(im=leaf.image, ocr_langs=ocr_langs)
|
||||||
|
for leaf in doc.fetch_leaves(use_cache=use_cache)
|
||||||
|
]
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print("Downloading...", file=stderr)
|
print(f"Processing {len(tasks)} pages...", file=stdout)
|
||||||
stderr.flush()
|
stdout.flush()
|
||||||
page_nums_resp = requests.get(
|
|
||||||
f"https://archive.org/metadata/{escaped_item_id}/page_numbers/pages"
|
|
||||||
)
|
|
||||||
page_nums_resp.raise_for_status()
|
|
||||||
page_nums = page_nums_resp.json()["result"]
|
|
||||||
|
|
||||||
zip_resp = requests.get(
|
|
||||||
f"https://archive.org/download/{escaped_item_id}/{escaped_item_id}_jp2.zip"
|
|
||||||
)
|
|
||||||
zip_resp.raise_for_status()
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print("Decompressing...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
tasks = []
|
|
||||||
with ZipFile(BytesIO(zip_resp.content)) as jp_zip:
|
|
||||||
for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
|
|
||||||
for page_index, page_num_info in enumerate(page_nums):
|
|
||||||
if page_num_info["leafNum"] == leaf_num:
|
|
||||||
# Stop iterating and keep page_index set to the current item.
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
# Set to -1 to indicate that leaf was not found in page_num list.
|
|
||||||
page_index = -1
|
|
||||||
|
|
||||||
if page_index != -1:
|
|
||||||
with jp_zip.open(file_name) as jp_file:
|
|
||||||
im = Image.open(jp_file).convert("L")
|
|
||||||
im.thumbnail((3200, 3200))
|
|
||||||
tasks.append(
|
|
||||||
PageAnalysisTask(
|
|
||||||
im=im,
|
|
||||||
ocr_langs=ocr_langs,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
print(f"Processing {len(page_nums)} pages...", file=stderr)
|
|
||||||
stderr.flush()
|
|
||||||
|
|
||||||
if parallel > 1:
|
if parallel > 1:
|
||||||
# Parallelize image processing and OCR of pages across up to n cores.
|
# Parallelize image processing and OCR of pages across up to n cores.
|
||||||
|
|
|
||||||
114
main.py
114
main.py
|
|
@ -8,14 +8,28 @@ from time import sleep
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from engine import analyze_item
|
from archive_item import fetch_item
|
||||||
|
from engine import analyze_doc
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = ArgumentParser()
|
parser = ArgumentParser()
|
||||||
parser.add_argument("--database", default="./microqa.db")
|
parser.add_argument(
|
||||||
parser.add_argument("--cpus", type=int, default=2)
|
"--database",
|
||||||
parser.add_argument("--earliest-review-date", default="20250701")
|
help="path to sqlite database for analysis output",
|
||||||
|
default="./microqa.db",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--cpus",
|
||||||
|
type=int,
|
||||||
|
help="number of concurrent subprocesses to use; higher is generally faster but consumes more resources",
|
||||||
|
default=2,
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--earliest-review-date",
|
||||||
|
help="script will attempt to analyze all items with a review date greater than or equal to this value (YYYYMMDD)",
|
||||||
|
default="20250701",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with sqlite3.connect(args.database) as conn:
|
with sqlite3.connect(args.database) as conn:
|
||||||
|
|
@ -28,9 +42,14 @@ create table if not exists items (
|
||||||
analyzed_date text
|
analyzed_date text
|
||||||
)""")
|
)""")
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
|
create table if not exists docs (
|
||||||
|
name text primary key not null,
|
||||||
|
item text not null
|
||||||
|
)""")
|
||||||
|
cur.execute("""
|
||||||
create table if not exists pages (
|
create table if not exists pages (
|
||||||
id int primary key,
|
id int primary key,
|
||||||
item text not null,
|
doc text not null,
|
||||||
page int not null,
|
page int not null,
|
||||||
orientation_match boolean not null,
|
orientation_match boolean not null,
|
||||||
sharpness real not null,
|
sharpness real not null,
|
||||||
|
|
@ -41,9 +60,10 @@ create table if not exists pages (
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"create index if not exists analyzed_date_idx on items (analyzed_date)"
|
"create index if not exists analyzed_date_idx on items (analyzed_date)"
|
||||||
)
|
)
|
||||||
cur.execute("create index if not exists item_idx on pages (item)")
|
cur.execute("create index if not exists item_idx on docs (item)")
|
||||||
|
cur.execute("create index if not exists doc_idx on pages (doc)")
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"create unique index if not exists item_page_idx on pages (item, page)"
|
"create unique index if not exists doc_page_idx on pages (doc, page)"
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
|
|
@ -63,36 +83,40 @@ order by review_date
|
||||||
for _ in range(N_ATTEMPTS):
|
for _ in range(N_ATTEMPTS):
|
||||||
try:
|
try:
|
||||||
print(f"Processing {item_id}")
|
print(f"Processing {item_id}")
|
||||||
analysis = analyze_item(
|
item = fetch_item(item_id)
|
||||||
item_id, parallel=args.cpus, verbose=True
|
minimal_docs = (
|
||||||
|
[doc for doc in item.docs if doc.name != ""]
|
||||||
|
if len(item.docs) > 1
|
||||||
|
else item.docs
|
||||||
)
|
)
|
||||||
for i, page in enumerate(analysis["pages"]):
|
for doc in minimal_docs:
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"""
|
"insert into docs (name, item) values (?, ?) on conflict do nothing",
|
||||||
|
[doc.name, item_id],
|
||||||
|
)
|
||||||
|
analysis = analyze_doc(
|
||||||
|
doc, parallel=args.cpus, verbose=True
|
||||||
|
)
|
||||||
|
for i, page in enumerate(analysis["pages"]):
|
||||||
|
cur.execute(
|
||||||
|
"""
|
||||||
insert into pages (
|
insert into pages (
|
||||||
item,
|
doc,
|
||||||
page,
|
page,
|
||||||
orientation_match,
|
orientation_match,
|
||||||
sharpness,
|
sharpness,
|
||||||
is_blank,
|
is_blank,
|
||||||
text_margin_px
|
text_margin_px
|
||||||
) values (
|
) values (?, ?, ?, ?, ?, ?)""",
|
||||||
?,
|
[
|
||||||
?,
|
doc.name,
|
||||||
?,
|
i + 1,
|
||||||
?,
|
page["ocr_orientation_match"],
|
||||||
?,
|
page["sharpness"],
|
||||||
?
|
page["blank"],
|
||||||
)""",
|
page["text_margin_px"],
|
||||||
[
|
],
|
||||||
item_id,
|
)
|
||||||
i + 1,
|
|
||||||
page["ocr_orientation_match"],
|
|
||||||
page["sharpness"],
|
|
||||||
page["blank"],
|
|
||||||
page["text_margin_px"],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"update items set analyzed_date = ? where id = ?",
|
"update items set analyzed_date = ? where id = ?",
|
||||||
[datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id],
|
[datetime.utcnow().strftime("%Y%m%d%H%M%S"), item_id],
|
||||||
|
|
@ -126,35 +150,9 @@ def pull_new_item_ids(conn, earliest_review_date):
|
||||||
query = re.sub(r"\s+", "+", query.strip())
|
query = re.sub(r"\s+", "+", query.strip())
|
||||||
sort = re.sub(r"\s+", "+", sort.strip())
|
sort = re.sub(r"\s+", "+", sort.strip())
|
||||||
|
|
||||||
# params = {
|
# Archive.org has a paginated scraping API, but the query feature seems to
|
||||||
# "q": query,
|
# be broken in mysterious ways and more or less impossible to use for our
|
||||||
# "count": 100,
|
# purposes.
|
||||||
# "fields": "identifier,review_date",
|
|
||||||
# "sorts": sort,
|
|
||||||
# }
|
|
||||||
# for i in range(1, 999):
|
|
||||||
# resp = requests.get(
|
|
||||||
# "https://archive.org/services/search/v1/scrape",
|
|
||||||
# params=params,
|
|
||||||
# )
|
|
||||||
# resp.raise_for_status()
|
|
||||||
# print(resp.text)
|
|
||||||
# try:
|
|
||||||
# body = resp.json()
|
|
||||||
# except Exception as err:
|
|
||||||
# print("Body:", resp.text, file=stderr)
|
|
||||||
# raise err
|
|
||||||
# for doc in body["items"]:
|
|
||||||
# cur.execute(
|
|
||||||
# "insert into items (id, review_date, skip_analysis) values (?, ?, false) on conflict do nothing",
|
|
||||||
# (doc["identifier"], doc["review_date"]),
|
|
||||||
# )
|
|
||||||
# conn.commit()
|
|
||||||
# cursor = body.get("cursor", None)
|
|
||||||
# if cursor is None:
|
|
||||||
# break
|
|
||||||
# params = params.copy()
|
|
||||||
# params["cursor"] = cursor
|
|
||||||
resp = requests.get(
|
resp = requests.get(
|
||||||
f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=250000&output=json",
|
f"https://archive.org/advancedsearch.php?q={query}&sort[]={sort}&fl[]=identifier&fl[]=review_date&rows=250000&output=json",
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue