MicroQA/microqa/items.py

"""
Python utilities for structuring data and metadata pulled from archive.org
microfiche scans.
"""

import os
import urllib
from dataclasses import dataclass
from io import BytesIO
from typing import Optional

import pymupdf
import requests
from PIL import Image

from .ocr import TextBlock


CACHE_DIR = "./archive_cache"


@dataclass
class ArchiveLeaf:
    """
    A leaf corresponds to a single image from one of the "Single Page Processed
    JP2 Zip" files from an `ArchiveItem`. Not all leaves become part of the
    final processed PDF displayed to the user, as some contain metadata or
    superfluous information scanned off of the microfiche cards and retained for
    posterity. To identify whether a leaf is pertinent or not, refer to the page
    number metadata pulled as JSON from the archive.org API.

    Attributes:

        image           PIL Image, pre-scaled using .thumbnail() to fit the long
                        edge to 3200 pixels.

        page_number     `None` if the leaf is not included in the processed PDF
                        presented to users, otherwise a (potentially empty)
                        string with the inferred page number as defined by the
                        document being scanned.

        text_blocks     List of text blocks extracted from PyMuPDF's
                        TextPage.extractBlocks() method.
    """

    image: Image.Image
    page_number: Optional[str]
    text_blocks: list[TextBlock]


@dataclass
class ArchiveDoc:
    """
    Information pdertaining to a single set of processed pages, of which there
    may be multiple for any given ArchiveItem. For example, one SCOTUS case may
    contain several briefs/petitions/etc., each presented as a distinct PDF but
    all held within the parent `ArchiveItem`.

    Note that this is a slightly different concept than the literal "files"
    available via the archive.org API: an `ArchiveDoc` may combine information
    from, say, both a `_page_numbers.json` file and a `_jp2.zip` file to store
    image data and page number data conveniently within the same Python object.

    Attributes:

        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        name        Document name, with the item identifier intact but file
                    extension stripped.

        title       Optional `title` metadata field assigned to the `_jp2.zip`
                    file, usually indicating that this file represents a subset
                    of the parent item's content, for example a specific brief
                    or opinion from a larger SCOTUS case document.

                    For QA intents and purposes, it's usually easiest to skip
                    over any documents where `title is not None`, assuming that
                    the item has at least one processed `_jp2.zip` file for
                    which `title is None`.
    """

    identifier: str
    name: str
    title: Optional[str]

    def fetch_leaves(self, use_cache=False) -> list[ArchiveLeaf]:
        """
        Fetch images and OCR text data for this document from archive.org PDF files.

        Params:

            use_cache   If `True`, locally cached PDF files under the
                        `./archive_cache` directory (relative to the working
                        directory) will be used instead of fetching over
                        HTTPS.
        """

        if use_cache:
            with open(f"{CACHE_DIR}/{_url_encode(self.name)}.pdf", "rb") as f:
                pdf_data = f.read()
        else:
            pdf_data = _fetch_pdf(self.identifier, self.name)

        leaves = []

        # Open PDF from bytes
        pdf_doc = pymupdf.open(stream=pdf_data, filetype="pdf")

        try:
            for page_num in range(len(pdf_doc)):
                page = pdf_doc[page_num]

                # Extract text blocks with coordinates
                # Convert to TextBlock objects, discarding block_no and block_type
                text_blocks = [
                    TextBlock(
                        x0=int(x0),
                        y0=int(y0),
                        x1=int(x1),
                        y1=int(y1),
                        text=text,
                    )
                    for x0, y0, x1, y1, text, *_ in page.get_text("blocks")
                ]

                # Render page to image
                # Use a matrix to scale appropriately (default is 72 DPI)
                # Scale factor 4.44 gives approximately 320 DPI, which should produce
                # images with long edge around 3200px for typical page sizes
                mat = pymupdf.Matrix(4.44, 4.44)
                pix = page.get_pixmap(matrix=mat, alpha=False)

                # Convert PyMuPDF pixmap to PIL Image
                img_data = pix.tobytes("ppm")
                image = Image.open(BytesIO(img_data)).convert("L")

                # Ensure long edge is no more than 3200 px
                image.thumbnail((3200, 3200))

                # Page numbers are 1-indexed for human readability
                leaves.append(
                    ArchiveLeaf(
                        image=image,
                        page_number=str(page_num + 1),
                        text_blocks=text_blocks,
                    )
                )

        finally:
            pdf_doc.close()

        return leaves


@dataclass
class ArchiveItem:
    """
    Information pertaining to an archive.org item. Documents, ultimately of type
    `ArchiveDoc`, are referenced by name only in this class so that content
    downloads for individual `ArchiveDoc`s may be skipped, staggered, or
    performed in parallel if desired, rather than in one chunk per item.

    Attributes:

        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.

        docs        List of `ArchiveDoc` names, with the item identifier,
                    leading whitespace, and file extension stripped.
    """

    identifier: str
    docs: list[ArchiveDoc]


def fetch_item(identifier: str, use_cache=False) -> ArchiveItem:
    """
    Fetch the relevant top-level information for an `ArchiveItem` from
    archive.org. This assumes a specific naming convention for the item's files:
    - `<identifier>[ Title]_jp2.zip` for processed scans
    - `<identifier>[ Title]_page_numbers.json` for page number metadata
    - `<identifier>_micro_jp2.zip` for unprocessed scans

    This function treats file names as case-insensitive, but preserves casing in
    its output.

    Params:

        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.
        use_cache   If `True`, locally cached zip files under the
                    `./archive_cache` directory (relative to the working
                    directory) will be used instead of fetching over HTTPS.
    """

    if use_cache:
        # File names should be treated as case-insensitive, in case the file
        # system is case-insensitive. As I understand it, this applies to FAT
        # and APFS in their default configurations. Both are case-preserving, so
        # this shouldn't usually be an issue, but if/when it is, it can be very
        # frustrating to troubleshoot user-side.
        file_names = [
            _url_decode(name)
            for name in os.listdir(CACHE_DIR)
            if name.lower().startswith(identifier.lower())
        ]
    else:
        files_resp = requests.get(
            f"https://archive.org/metadata/{_url_encode(identifier)}/files"
        )
        files_resp.raise_for_status()
        file_names = [item["name"] for item in files_resp.json()["result"]]

    doc_names = [
        # Strip suffix, to just leave the identifier, and title if present.
        name[: -len(".pdf")]
        for name in file_names
        if name.lower().endswith(".pdf")
    ]

    # Assert that all files we expect to find are actually present.
    for doc_name in doc_names:
        if f"{_url_encode(doc_name.lower())}.pdf" not in [
            name.lower() for name in file_names
        ]:
            raise Exception(
                f"expected file not found: {_url_encode(doc_name.lower())}.pdf"
            )

    return ArchiveItem(
        identifier=identifier,
        docs=[
            ArchiveDoc(
                identifier=identifier,
                name=name,
                title=name[len(identifier) :].strip() or None,
            )
            for name in doc_names
        ],
    )


def cache_item(identifier: str, overwrite=True):
    """
    Load the PDF files for an `ArchiveItem` and its component `ArchiveDoc`s
    and store them within the `archive_cache` directory (relative to the working
    directory). The `archive_cache` directory will be created if it does not
    exist.

    Params:

        identifier  archive.org identifier string, for example
                    `"micro_IA40386007_0012"`.
        overwrite   If set to `False` and any file names in the cache already
                    match the item, fetching the item is skipped.
    """

    os.makedirs(CACHE_DIR, exist_ok=True)

    for name in os.listdir(CACHE_DIR):
        if _url_decode(name.lower()).startswith(identifier.lower()):
            if not overwrite:
                return

    item = fetch_item(identifier)
    for doc in item.docs:
        pdf_data = _fetch_pdf(identifier, doc.name)
        with open(f"{CACHE_DIR}/{_url_encode(doc.name)}.pdf", "wb") as f:
            f.write(pdf_data)


def _url_encode(string: str) -> str:
    """
    Helper to encode to a URL-encoded (in other words, percent-encoded) string.
    """

    return urllib.parse.quote(string, safe=" ._")


def _url_decode(string: str) -> str:
    """
    Helper to decode from a URL-encoded (in other words, percent-encoded)
    string.
    """

    return urllib.parse.unquote(string)


def _fetch_pdf(identifier: str, doc_name: str) -> bytes:
    """
    Fetch PDF file for an `ArchiveDoc`.
    """

    # `doc_name` does not get percent-encoded, because it is derived from the
    # file path itself as defined by archive.org. Percent-encoding it further
    # may result in a 404 error.
    resp = requests.get(
        f"https://archive.org/download/{_url_encode(identifier)}/{doc_name}.pdf"
    )
    resp.raise_for_status()
    return resp.content