"""
Python utilities for structuring data and metadata pulled from archive.org
microfiche scans.
"""

import json
import os
import urllib.parse
from contextlib import nullcontext
from dataclasses import dataclass
from io import BytesIO
from typing import Optional
from zipfile import ZipFile

import requests
from PIL import Image


CACHE_DIR = "./archive_cache"


@dataclass
class ArchiveLeaf:
    """
    A leaf corresponds to a single image from one of the "Single Page Processed
    JP2 Zip" files from an `ArchiveItem`. Not all leaves become part of the
    final processed PDF displayed to the user, as some contain metadata or
    superfluous information scanned off of the microfiche cards and retained
    for posterity. To identify whether a leaf is pertinent or not, refer to the
    page number metadata pulled as JSON from the archive.org API.

    Attributes:
        image       PIL Image, pre-scaled using .thumbnail() to fit the long
                    edge to 3200 pixels.

        page_number `None` if the leaf is not included in the processed PDF
                    presented to users, otherwise a (potentially empty) string
                    with the inferred page number as defined by the document
                    being scanned.

                    NOTE(review): `ArchiveDoc.fetch_leaves()` currently stores
                    the integer position of the leaf's entry within the page
                    number metadata here, not the page number string. Confirm
                    which of the two callers expect before changing either.
    """

    image: Image.Image
    page_number: Optional[str]


@dataclass
class ArchiveDoc:
    """
    Information pertaining to a single set of processed pages, of which there
    may be multiple for any given ArchiveItem. For example, one SCOTUS case may
    contain several briefs/petitions/etc., each presented as a distinct PDF but
    all held within the parent `ArchiveItem`.

    Note that this is a slightly different concept than the literal "files"
    available via the archive.org API: an `ArchiveDoc` may combine information
    from, say, both a `_page_numbers.json` file and a `_jp2.zip` file to store
    image data and page number data conveniently within the same Python object.

    Attributes:
        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        name        Document name, with the item identifier, leading whitespace,
                    and file extension stripped.

        title       Optional `title` metadata field assigned to the `_jp2.zip`
                    file, usually indicating that this file represents a subset
                    of the parent item's content, for example a specific brief
                    or opinion from a larger SCOTUS case document.

                    For QA intents and purposes, it's usually easiest to skip
                    over any documents where `title is not None`, assuming that
                    the item has at least one processed `_jp2.zip` file for
                    which `title is None`.
    """

    identifier: str
    name: str
    title: Optional[str]

    def fetch_leaves(self, numbered_only=True, use_cache=False) -> list[ArchiveLeaf]:
        """
        Fetch images and page number data for this document from archive.org,
        over the Internet.

        Params:
            numbered_only   If `True`, discards any leaves with no corresponding
                            page number entries. Leaves for which the page
                            number is an empty string are retained.

            use_cache       If `True`, locally cached zip files under the
                            `./archive_cache` directory (relative to the working
                            directory) will be used instead of fetching over
                            HTTPS.

        Returns:
            One `ArchiveLeaf` per retained image, ordered by the sorted member
            names of the zip file. A leaf's number is its position in that
            sorted list.
        """

        if use_cache:
            # Cached file names are derived from the percent-encoded version of
            # `self.name`, so that there's no need to worry about directory
            # separators or other disallowed characters in the file names
            # defined by archive.org.
            cache_stem = f"{CACHE_DIR}/{_url_encode(self.name)}"
            with open(f"{cache_stem}_page_numbers.json", "r") as f:
                page_nums = json.load(f)["pages"]
            zip_reader_ctx = open(f"{cache_stem}_jp2.zip", "rb")
        else:
            page_nums = _fetch_page_nums(self.identifier, self.name)["pages"]
            # Wrap in a context manager so that the reader can be used in a
            # `with` block in the same way as a file accessed with `open()`.
            zip_reader_ctx = nullcontext(
                BytesIO(_fetch_jp2_zip(self.identifier, self.name))
            )

        leaves = []
        with zip_reader_ctx as zip_reader, ZipFile(zip_reader) as jp_zip:
            # Map each leaf number to the index of its first entry in the page
            # number list, so that each leaf is resolved with a single lookup
            # instead of a scan over every page entry. `setdefault` keeps the
            # first match if a leaf number appears more than once.
            page_index_by_leaf = {}
            for page_index, page_num_info in enumerate(page_nums):
                page_index_by_leaf.setdefault(page_num_info["leafNum"], page_index)

            for leaf_num, file_name in enumerate(sorted(jp_zip.namelist())):
                # `None` indicates that the leaf was not found in the page
                # number list.
                page_index = page_index_by_leaf.get(leaf_num)
                if numbered_only and page_index is None:
                    continue

                with jp_zip.open(file_name) as jp_file:
                    # Convert to single-channel greyscale ("L").
                    image = Image.open(jp_file).convert("L")
                    # Rescale long edge to no more than 3200 px.
                    image.thumbnail((3200, 3200))

                # NOTE(review): this stores the index of the page entry, while
                # `ArchiveLeaf.page_number` is documented as the page number
                # string. Left as-is because callers may rely on it.
                leaves.append(ArchiveLeaf(image=image, page_number=page_index))

        return leaves


@dataclass
class ArchiveItem:
    """
    Information pertaining to an archive.org item. The `ArchiveDoc` objects
    held here carry identifying metadata only (no images or page numbers), so
    that content downloads for individual `ArchiveDoc`s may be skipped,
    staggered, or performed in parallel if desired, rather than in one chunk
    per item.

    Attributes:
        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        docs        List of `ArchiveDoc`s belonging to the item. Call
                    `ArchiveDoc.fetch_leaves()` on each to load its content.
    """

    identifier: str
    docs: list[ArchiveDoc]


def fetch_item(identifier: str, use_cache=False) -> ArchiveItem:
    """
    Fetch the relevant top-level information for an `ArchiveItem` from
    archive.org. This assumes a specific naming convention for the item's files:
    - `<identifier>[ Title]_jp2.zip` for processed scans
    - `<identifier>[ Title]_page_numbers.json` for page number metadata
    - `<identifier>_micro_jp2.zip` for unprocessed scans

    This function treats file names as case-insensitive, but preserves casing in
    its output.

    Params:
        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        use_cache   If `True`, locally cached zip files under the
                    `./archive_cache` directory (relative to the working
                    directory) will be used instead of fetching over HTTPS.

    Returns:
        An `ArchiveItem` with one `ArchiveDoc` per processed `_jp2.zip` file.

    Raises:
        Exception   If a processed `_jp2.zip` file has no matching
                    `_page_numbers.json` file.
    """

    if use_cache:
        # File names should be treated as case-insensitive, in case the file
        # system is case-insensitive. As I understand it, this applies to FAT
        # and APFS in their default configurations. Both are case-preserving, so
        # this shouldn't usually be an issue, but if/when it is, it can be very
        # frustrating to troubleshoot user-side.
        #
        # Cached names are percent-encoded, so decode before comparing against
        # the identifier, in the same way that `cache_item()` does.
        file_names = [
            decoded_name
            for decoded_name in map(_url_decode, os.listdir(CACHE_DIR))
            if decoded_name.lower().startswith(identifier.lower())
        ]
    else:
        files_resp = requests.get(
            f"https://archive.org/metadata/{_url_encode(identifier)}/files"
        )
        files_resp.raise_for_status()
        file_names = [item["name"] for item in files_resp.json()["result"]]

    jp2_suffix = "_jp2.zip"
    # Unprocessed scans are also named `..._jp2.zip`, and must be excluded.
    unprocessed_name = f"{identifier.lower()}_micro_jp2.zip"
    doc_names = [
        # Strip suffix, to just leave the identifier, and title if present.
        name[: -len(jp2_suffix)]
        for name in file_names
        if name.lower().endswith(jp2_suffix) and name.lower() != unprocessed_name
    ]

    # Assert that all files we expect to find are actually present. Both sides
    # of the comparison are plain (not percent-encoded) names: `file_names` has
    # already been decoded in the cached case, and comes straight from the API
    # otherwise.
    known_names = {name.lower() for name in file_names}
    for doc_name in doc_names:
        expected_name = f"{doc_name}_page_numbers.json"
        if expected_name.lower() not in known_names:
            raise Exception(f"expected file not found: {expected_name}")

    return ArchiveItem(
        identifier=identifier,
        docs=[
            ArchiveDoc(
                identifier=identifier,
                name=name,
                # Whatever follows the identifier is the title, if anything.
                title=name[len(identifier) :].strip() or None,
            )
            for name in doc_names
        ],
    )


def cache_item(identifier: str, overwrite=True):
    """
    Load the relevant files for an `ArchiveItem` and its component `ArchiveDoc`s
    and store them within the `archive_cache` directory (relative to the working
    directory). The `archive_cache` directory will be created if it does not
    exist.

    Params:
        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        overwrite   If set to `False` and any file names in the cache already
                    match the item, fetching the item is skipped. Otherwise
                    the item is always fetched and its cached files rewritten.
    """

    os.makedirs(CACHE_DIR, exist_ok=True)

    if not overwrite:
        # A cached file matches when its decoded name starts with the
        # identifier, compared case-insensitively.
        prefix = identifier.lower()
        if any(
            _url_decode(name.lower()).startswith(prefix)
            for name in os.listdir(CACHE_DIR)
        ):
            return

    item = fetch_item(identifier)
    for doc in item.docs:
        # Download both files before writing either, so that a failed download
        # does not leave a document half-cached.
        page_nums = _fetch_page_nums(identifier, doc.name)
        zip_file = _fetch_jp2_zip(identifier, doc.name)

        cache_stem = f"{CACHE_DIR}/{_url_encode(doc.name)}"
        with open(f"{cache_stem}_page_numbers.json", "w") as f:
            json.dump(page_nums, f)
        with open(f"{cache_stem}_jp2.zip", "wb") as f:
            f.write(zip_file)


def _url_encode(string: str) -> str:
    """
    Helper to encode to a URL-encoded (in other words, percent-encoded) string.
    Spaces, periods, and underscores are left as-is.
    """

    return urllib.parse.quote(string, safe=" ._")


def _url_decode(string: str) -> str:
    """
    Helper to decode from a URL-encoded (in other words, percent-encoded)
    string.
    """

    return urllib.parse.unquote(string)


def _fetch_page_nums(identifier: str, doc_name: str) -> dict:
    """
    Fetch JSON file with page number metadata for an `ArchiveDoc`.

    Returns the parsed JSON body. Raises `requests.HTTPError` if the response
    status indicates an error.
    """

    # `doc_name` does not get percent-encoded, because it is derived from the
    # file path itself as defined by archive.org. Percent-encoding it further
    # may result in a 404 error.
    page_nums_resp = requests.get(
        f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}_page_numbers.json"
    )
    page_nums_resp.raise_for_status()
    return page_nums_resp.json()


def _fetch_jp2_zip(identifier: str, doc_name: str) -> bytes:
    """
    Fetch zip file with processed page scans for an `ArchiveDoc`.

    Returns the raw bytes of the zip file. Raises `requests.HTTPError` if the
    response status indicates an error.
    """

    # `doc_name` does not get percent-encoded, because it is derived from the
    # file path itself as defined by archive.org. Percent-encoding it further
    # may result in a 404 error.
    zip_resp = requests.get(
        f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}_jp2.zip"
    )
    zip_resp.raise_for_status()
    return zip_resp.content