302 lines
10 KiB
Python
302 lines
10 KiB
Python
"""
|
|
Python utilities for structuring data and metadata pulled from archive.org
|
|
microfiche scans.
|
|
"""
|
|
|
|
import os
import urllib
import urllib.parse
from dataclasses import dataclass
from io import BytesIO
from typing import Optional

import pymupdf
import requests
from PIL import Image

from .ocr import TextBlock
|
|
|
|
|
|
CACHE_DIR = "./archive_cache"
|
|
|
|
|
|
@dataclass
class ArchiveLeaf:
    """
    One image taken from a "Single Page Processed JP2 Zip" file belonging to
    an `ArchiveItem`. Not every leaf ends up in the processed PDF shown to the
    user: some carry metadata or superfluous information scanned off of the
    microfiche cards and kept only for posterity. Whether a leaf is pertinent
    is determined by the page number metadata pulled as JSON from the
    archive.org API.
    """

    # PIL Image, pre-scaled with .thumbnail() so the long edge fits within
    # 3200 pixels.
    image: Image.Image

    # `None` when the leaf is excluded from the processed PDF presented to
    # users; otherwise a (possibly empty) string holding the inferred page
    # number as defined by the document being scanned.
    page_number: Optional[str]

    # Text blocks extracted via PyMuPDF's TextPage.extractBlocks() method.
    text_blocks: list[TextBlock]
|
|
|
|
|
|
@dataclass
class ArchiveDoc:
    """
    Information pertaining to a single set of processed pages, of which there
    may be multiple for any given ArchiveItem. For example, one SCOTUS case may
    contain several briefs/petitions/etc., each presented as a distinct PDF but
    all held within the parent `ArchiveItem`.

    Note that this is a slightly different concept than the literal "files"
    available via the archive.org API: an `ArchiveDoc` may combine information
    from, say, both a `_page_numbers.json` file and a `_jp2.zip` file to store
    image data and page number data conveniently within the same Python object.

    Attributes:

        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        name        Document name, with the item identifier intact but file
                    extension stripped.

        title       Optional `title` metadata field assigned to the `_jp2.zip`
                    file, usually indicating that this file represents a subset
                    of the parent item's content, for example a specific brief
                    or opinion from a larger SCOTUS case document.

                    For QA intents and purposes, it's usually easiest to skip
                    over any documents where `title is not None`, assuming that
                    the item has at least one processed `_jp2.zip` file for
                    which `title is None`.
    """

    identifier: str
    name: str
    title: Optional[str]

    def fetch_leaves(self, use_cache=False) -> list[ArchiveLeaf]:
        """
        Fetch images and OCR text data for this document from archive.org PDF
        files.

        Params:

            use_cache   If `True`, locally cached PDF files under the
                        `./archive_cache` directory (relative to the working
                        directory) will be used instead of fetching over
                        HTTPS.
        """

        if use_cache:
            with open(f"{CACHE_DIR}/{_url_encode(self.name)}.pdf", "rb") as f:
                pdf_data = f.read()
        else:
            pdf_data = _fetch_pdf(self.identifier, self.name)

        leaves = []

        # Open the PDF from bytes. pymupdf.Document is a context manager, so
        # it is closed even if extraction/rendering raises (replaces the
        # previous manual try/finally).
        with pymupdf.open(stream=pdf_data, filetype="pdf") as pdf_doc:
            # Page numbers are 1-indexed for human readability.
            for page_num, page in enumerate(pdf_doc, start=1):
                # Extract text blocks with coordinates, converting them to
                # TextBlock objects and discarding the trailing
                # block_no/block_type fields.
                text_blocks = [
                    TextBlock(
                        x0=int(x0),
                        y0=int(y0),
                        x1=int(x1),
                        y1=int(y1),
                        text=text,
                    )
                    for x0, y0, x1, y1, text, *_ in page.get_text("blocks")
                ]

                # Render the page to an image. The matrix scales up from the
                # default 72 DPI; a factor of 4.44 gives approximately 320
                # DPI, which should produce images with a long edge around
                # 3200 px for typical page sizes.
                mat = pymupdf.Matrix(4.44, 4.44)
                pix = page.get_pixmap(matrix=mat, alpha=False)

                # Convert the PyMuPDF pixmap to a grayscale PIL Image.
                img_data = pix.tobytes("ppm")
                image = Image.open(BytesIO(img_data)).convert("L")

                # Ensure the long edge is no more than 3200 px.
                image.thumbnail((3200, 3200))

                leaves.append(
                    ArchiveLeaf(
                        image=image,
                        page_number=str(page_num),
                        text_blocks=text_blocks,
                    )
                )

        return leaves
|
|
|
|
|
|
@dataclass
class ArchiveItem:
    """
    Information pertaining to an archive.org item. Document content is not
    downloaded when this class is built: `docs` holds lightweight `ArchiveDoc`
    metadata records only, so that content downloads for individual
    `ArchiveDoc`s may be skipped, staggered, or performed in parallel if
    desired, rather than in one chunk per item.

    Attributes:

        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        docs        List of `ArchiveDoc` metadata records (identifier, name,
                    title). Each doc's `name` retains the item identifier but
                    has the file extension stripped; image/text content is
                    fetched lazily via `ArchiveDoc.fetch_leaves()`.
    """

    identifier: str
    docs: list[ArchiveDoc]
|
|
|
|
|
|
def fetch_item(identifier: str, use_cache=False) -> ArchiveItem:
    """
    Fetch the relevant top-level information for an `ArchiveItem` from
    archive.org. Documents are discovered by enumerating the item's `.pdf`
    files; each document name is the file name with the `.pdf` extension
    stripped, retaining the item identifier and any title suffix.

    This function treats file names as case-insensitive, but preserves casing
    in its output.

    Params:

        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        use_cache   If `True`, locally cached files under the
                    `./archive_cache` directory (relative to the working
                    directory) will be used instead of fetching over HTTPS.
    """

    if use_cache:
        # File names should be treated as case-insensitive, in case the file
        # system is case-insensitive. As I understand it, this applies to FAT
        # and APFS in their default configurations. Both are case-preserving,
        # so this shouldn't usually be an issue, but if/when it is, it can be
        # very frustrating to troubleshoot user-side.
        file_names = [
            _url_decode(name)
            for name in os.listdir(CACHE_DIR)
            if name.lower().startswith(identifier.lower())
        ]
    else:
        files_resp = requests.get(
            f"https://archive.org/metadata/{_url_encode(identifier)}/files"
        )
        files_resp.raise_for_status()
        file_names = [item["name"] for item in files_resp.json()["result"]]

    doc_names = [
        # Strip suffix, to just leave the identifier, and title if present.
        name[: -len(".pdf")]
        for name in file_names
        if name.lower().endswith(".pdf")
    ]

    # Assert that all files we expect to find are actually present. Build the
    # lowered name set once (previously rebuilt per document, O(n^2)), and
    # compare raw names on both sides: percent-encoding only the candidate
    # would spuriously fail for names containing characters outside the
    # encoder's safe set, since `file_names` holds decoded/raw names in both
    # branches above.
    lowered_file_names = {name.lower() for name in file_names}
    for doc_name in doc_names:
        if f"{doc_name.lower()}.pdf" not in lowered_file_names:
            raise Exception(
                f"expected file not found: {doc_name.lower()}.pdf"
            )

    return ArchiveItem(
        identifier=identifier,
        docs=[
            ArchiveDoc(
                identifier=identifier,
                name=name,
                # Whatever follows the identifier (whitespace-stripped) is
                # the title; an empty remainder means no title.
                title=name[len(identifier) :].strip() or None,
            )
            for name in doc_names
        ],
    )
|
|
|
|
|
|
def cache_item(identifier: str, overwrite=True):
    """
    Download the PDF files for an `ArchiveItem` and its component
    `ArchiveDoc`s into the `archive_cache` directory (relative to the working
    directory), creating that directory first if it does not exist.

    Params:

        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        overwrite   If set to `False` and any file names in the cache already
                    match the item, fetching the item is skipped.
    """

    os.makedirs(CACHE_DIR, exist_ok=True)

    # Cached file names are percent-encoded on disk; decode before matching
    # against the (case-folded) identifier.
    already_cached = any(
        _url_decode(cached.lower()).startswith(identifier.lower())
        for cached in os.listdir(CACHE_DIR)
    )
    if already_cached and not overwrite:
        return

    item = fetch_item(identifier)
    for doc in item.docs:
        pdf_data = _fetch_pdf(identifier, doc.name)
        with open(f"{CACHE_DIR}/{_url_encode(doc.name)}.pdf", "wb") as out:
            out.write(pdf_data)
|
|
|
|
|
|
def _url_encode(string: str) -> str:
|
|
"""
|
|
Helper to encode to a URL-encoded (in other words, percent-encoded) string.
|
|
"""
|
|
|
|
return urllib.parse.quote(string, safe=" ._")
|
|
|
|
|
|
def _url_decode(string: str) -> str:
|
|
"""
|
|
Helper to decode from a URL-encoded (in other words, percent-encoded)
|
|
string.
|
|
"""
|
|
|
|
return urllib.parse.unquote(string)
|
|
|
|
|
|
def _fetch_pdf(identifier: str, doc_name: str) -> bytes:
    """
    Download the raw PDF bytes for an `ArchiveDoc` from archive.org.
    """

    # `doc_name` is deliberately left un-encoded: it is derived from the file
    # path exactly as defined by archive.org, and percent-encoding it a
    # second time may result in a 404 error.
    url = (
        f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}.pdf"
    )
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.content
|