308 lines
11 KiB
Python
308 lines
11 KiB
Python
"""
|
|
Python utilities for structuring data and metadata pulled from archive.org
|
|
microfiche scans.
|
|
"""
|
|
|
|
import json
import os
import urllib.parse
from contextlib import nullcontext
from dataclasses import dataclass
from io import BytesIO
from typing import Optional
from zipfile import ZipFile

import requests
from PIL import Image
|
|
|
|
|
|
CACHE_DIR = "./archive_cache"
|
|
|
|
|
|
@dataclass
class ArchiveLeaf:
    """
    A leaf corresponds to a single image from one of the "Single Page Processed
    JP2 Zip" files from an `ArchiveItem`. Not all leaves become part of the
    final processed PDF displayed to the user, as some contain metadata or
    superfluous information scanned off of the microfiche cards and retained for
    posterity. To identify whether a leaf is pertinent or not, refer to the page
    number metadata pulled as JSON from the archive.org API.

    Attributes:

        image        PIL image, pre-scaled using .thumbnail() to fit the long
                     edge to 3200 pixels.

        page_number  `None` if the leaf is not included in the processed PDF
                     presented to users, otherwise a (potentially empty)
                     string with the inferred page number as defined by the
                     document being scanned.
    """

    # `Image.Image` is the image class; the bare name `Image` is the PIL
    # *module*, which is not a type.
    image: Image.Image
    page_number: Optional[str]
|
|
|
|
|
|
@dataclass
class ArchiveDoc:
    """
    Information pertaining to a single set of processed pages, of which there
    may be multiple for any given ArchiveItem. For example, one SCOTUS case may
    contain several briefs/petitions/etc., each presented as a distinct PDF but
    all held within the parent `ArchiveItem`.

    Note that this is a slightly different concept than the literal "files"
    available via the archive.org API: an `ArchiveDoc` may combine information
    from, say, both a `_page_numbers.json` file and a `_jp2.zip` file to store
    image data and page number data conveniently within the same Python object.

    Attributes:

        identifier   archive.org identifier string, for example
                     `"micro_IA40386007_0012"`.

        name         Document name, with the item identifier, leading
                     whitespace, and file extension stripped.

        title        Optional `title` metadata field assigned to the `_jp2.zip`
                     file, usually indicating that this file represents a
                     subset of the parent item's content, for example a
                     specific brief or opinion from a larger SCOTUS case
                     document.

                     For QA intents and purposes, it's usually easiest to skip
                     over any documents where `title is not None`, assuming
                     that the item has at least one processed `_jp2.zip` file
                     for which `title is None`.
    """

    identifier: str
    name: str
    title: Optional[str]

    def fetch_leaves(self, numbered_only=True, use_cache=False) -> list[ArchiveLeaf]:
        """
        Fetch images and page number data for this document from archive.org,
        over the Internet.

        Params:

            numbered_only  If `True`, discards any leaves with no corresponding
                           page number entries. Leaves for which the page
                           number is an empty string are retained.

            use_cache      If `True`, locally cached zip files under the
                           `./archive_cache` directory (relative to the working
                           directory) will be used instead of fetching over
                           HTTPS.
        """

        if use_cache:
            # Cached file names are derived from the percent-encoded version of
            # `self.name`, so that there's no need to worry about directory
            # separators or other disallowed characters in the file names
            # defined by archive.org.
            with open(
                f"{CACHE_DIR}/{_url_encode(self.name)}_page_numbers.json", "r"
            ) as f:
                page_nums = json.load(f)["pages"]
            zip_reader_ctx = open(f"{CACHE_DIR}/{_url_encode(self.name)}_jp2.zip", "rb")
        else:
            page_nums = _fetch_page_nums(self.identifier, self.name)["pages"]

            # Wrap in a context manager so that the reader can be used in a
            # `with` block in the same way as a file accessed with `open()`.
            zip_reader_ctx = nullcontext(
                BytesIO(_fetch_jp2_zip(self.identifier, self.name))
            )

        # Map each leaf number to its page-number string up front, rather than
        # re-scanning the `page_nums` list once per leaf in the zip.
        # NOTE(review): assumes each entry carries "leafNum" and "pageNumber"
        # keys, per the archive.org `_page_numbers.json` schema — confirm.
        page_num_by_leaf = {
            page_info["leafNum"]: page_info["pageNumber"] for page_info in page_nums
        }

        leaves = []

        with zip_reader_ctx as zip_reader, ZipFile(zip_reader) as jp_zip:
            for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
                # `None` when the leaf has no page-number entry at all; an
                # empty-string page number counts as present and is retained.
                page_number = page_num_by_leaf.get(leaf_num)

                if not numbered_only or page_number is not None:
                    with jp_zip.open(file_name) as jp_file:
                        # Convert to single-channel greyscale ("L"). This also
                        # forces PIL's lazy loader to read the image data
                        # before the zip member is closed.
                        image = Image.open(jp_file).convert("L")
                        # Rescale long edge to no more than 3200 px.
                        image.thumbnail((3200, 3200))
                    # Store the page-number *string*, matching the documented
                    # `Optional[str]` contract of `ArchiveLeaf.page_number`.
                    leaves.append(ArchiveLeaf(image=image, page_number=page_number))

        return leaves
|
|
|
|
|
|
@dataclass
class ArchiveItem:
    """
    Information pertaining to an archive.org item. The `docs` list holds
    lightweight `ArchiveDoc` metadata records only — no page images or
    page-number data — so that content downloads for individual `ArchiveDoc`s
    may be skipped, staggered, or performed in parallel if desired, rather
    than in one chunk per item.

    Attributes:

        identifier   archive.org identifier string, for example
                     `"micro_IA40386007_0012"`.

        docs         List of `ArchiveDoc`s belonging to this item, one per
                     processed `_jp2.zip` file found for the item.
    """

    identifier: str
    docs: list[ArchiveDoc]
|
|
|
|
|
|
def fetch_item(identifier: str, use_cache=False) -> ArchiveItem:
    """
    Fetch the relevant top-level information for an `ArchiveItem` from
    archive.org. This assumes a specific naming convention for the item's files:
    - `<identifier>[ Title]_jp2.zip` for processed scans
    - `<identifier>[ Title]_page_numbers.json` for page number metadata
    - `<identifier>_micro_jp2.zip` for unprocessed scans

    This function treats file names as case-insensitive, but preserves casing in
    its output.

    Params:

        identifier   archive.org identifier string, for example
                     `"micro_IA40386007_0012"`.

        use_cache    If `True`, locally cached zip files under the
                     `./archive_cache` directory (relative to the working
                     directory) will be used instead of fetching over HTTPS.

    Raises:

        Exception if a `_page_numbers.json` file expected to accompany a
        processed `_jp2.zip` file is missing.
    """

    if use_cache:
        # File names should be treated as case-insensitive, in case the file
        # system is case-insensitive. As I understand it, this applies to FAT
        # and APFS in their default configurations. Both are case-preserving, so
        # this shouldn't usually be an issue, but if/when it is, it can be very
        # frustrating to troubleshoot user-side.
        file_names = [
            _url_decode(name)
            for name in os.listdir(CACHE_DIR)
            if name.lower().startswith(identifier.lower())
        ]
    else:
        files_resp = requests.get(
            f"https://archive.org/metadata/{_url_encode(identifier)}/files"
        )
        files_resp.raise_for_status()
        file_names = [item["name"] for item in files_resp.json()["result"]]

    doc_names = [
        # Strip suffix, to just leave the identifier, and title if present.
        name[: -len("_jp2.zip")]
        for name in file_names
        if name.lower().endswith("_jp2.zip")
        # Exclude unprocessed scans, which are also named `..._jp2.zip`.
        and name.lower() != f"{identifier.lower()}_micro_jp2.zip"
    ]

    # Assert that all files we expect to find are actually present. Both
    # branches above leave `file_names` holding *decoded* names, so the
    # expected names must not be percent-encoded before comparison.
    lower_file_names = {name.lower() for name in file_names}
    for doc_name in doc_names:
        expected = f"{doc_name}_page_numbers.json"
        if expected.lower() not in lower_file_names:
            raise Exception(f"expected file not found: {expected}")

    return ArchiveItem(
        identifier=identifier,
        docs=[
            ArchiveDoc(
                identifier=identifier,
                name=name,
                # Anything left after the identifier is the document's title;
                # an empty remainder means this is the item's main document.
                title=name[len(identifier) :].strip() or None,
            )
            for name in doc_names
        ],
    )
|
|
|
|
|
|
def cache_item(identifier: str, overwrite=True):
    """
    Load the relevant files for an `ArchiveItem` and its component `ArchiveDoc`s
    and store them within the `archive_cache` directory (relative to the working
    directory). The `archive_cache` directory will be created if it does not
    exist.

    Params:

        identifier   archive.org identifier string, for example
                     `"micro_IA40386007_0012"`.

        overwrite    If set to `False` and any file names in the cache already
                     match the item, fetching the item is skipped. If `True`
                     (the default), the item is always re-fetched and existing
                     cache files are overwritten.
    """

    os.makedirs(CACHE_DIR, exist_ok=True)

    # Only skip already-cached items when the caller opted out of
    # overwriting; with `overwrite=True` we always re-fetch.
    if not overwrite:
        for name in os.listdir(CACHE_DIR):
            if _url_decode(name).lower().startswith(identifier.lower()):
                return

    item = fetch_item(identifier)
    for doc in item.docs:
        page_nums = _fetch_page_nums(identifier, doc.name)
        zip_file = _fetch_jp2_zip(identifier, doc.name)
        # Cache file names are percent-encoded so that characters disallowed
        # on the local file system (e.g. directory separators) are safe.
        with open(f"{CACHE_DIR}/{_url_encode(doc.name)}_page_numbers.json", "w") as f:
            json.dump(page_nums, f)
        with open(f"{CACHE_DIR}/{_url_encode(doc.name)}_jp2.zip", "wb") as f:
            f.write(zip_file)
|
|
|
|
|
|
def _url_encode(string: str) -> str:
|
|
"""
|
|
Helper to encode to a URL-encoded (in other words, percent-encoded) string.
|
|
"""
|
|
|
|
return urllib.parse.quote(string, safe=" ._")
|
|
|
|
|
|
def _url_decode(string: str) -> str:
|
|
"""
|
|
Helper to decode from a URL-encoded (in other words, percent-encoded)
|
|
string.
|
|
"""
|
|
|
|
return urllib.parse.unquote(string)
|
|
|
|
|
|
def _fetch_page_nums(identifier: str, doc_name: str) -> dict:
    """
    Fetch the JSON file with page number metadata for an `ArchiveDoc`.

    Raises `requests.HTTPError` if archive.org responds with an error status.
    """

    # `doc_name` does not get percent-encoded, because it is derived from the
    # file path itself as defined by archive.org. Percent-encoding it further
    # may result in a 404 error.
    page_nums_resp = requests.get(
        f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}_page_numbers.json"
    )
    page_nums_resp.raise_for_status()
    return page_nums_resp.json()
|
|
|
|
|
|
def _fetch_jp2_zip(identifier: str, doc_name: str) -> bytes:
    """
    Fetch the zip file with processed page scans for an `ArchiveDoc`.

    Raises `requests.HTTPError` if archive.org responds with an error status.
    """

    # `doc_name` does not get percent-encoded, because it is derived from the
    # file path itself as defined by archive.org. Percent-encoding it further
    # may result in a 404 error.
    zip_resp = requests.get(
        f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}_jp2.zip"
    )
    zip_resp.raise_for_status()
    return zip_resp.content
|