Fix sloppy LLM code for PDF parsing
This commit is contained in:
parent
1ca2238c5d
commit
ce42ab58f1
1 changed files with 17 additions and 34 deletions
|
|
@ -6,7 +6,6 @@ microfiche scans.
|
|||
import os
|
||||
import urllib
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from typing import Optional
|
||||
|
||||
import pymupdf
|
||||
|
|
@ -104,52 +103,36 @@ class ArchiveDoc:
|
|||
|
||||
leaves = []
|
||||
|
||||
# Open PDF from bytes
|
||||
pdf_doc = pymupdf.open(stream=pdf_data, filetype="pdf")
|
||||
with pymupdf.open(stream=pdf_data, filetype="pdf") as pdf:
|
||||
for i, page in enumerate(pdf):
|
||||
bound = page.bound()
|
||||
im = page.get_pixmap(dpi=300).pil_image().convert("L")
|
||||
scale_factor_x = im.size[0] / bound.width
|
||||
scale_factor_y = im.size[1] / bound.height
|
||||
|
||||
try:
|
||||
for page_num in range(len(pdf_doc)):
|
||||
page = pdf_doc[page_num]
|
||||
|
||||
# Extract text blocks with coordinates
|
||||
# Convert to TextBlock objects, discarding block_no and block_type
|
||||
BLOCK_TYPE_TEXT = 0
|
||||
text_blocks = [
|
||||
TextBlock(
|
||||
x0=int(x0),
|
||||
y0=int(y0),
|
||||
x1=int(x1),
|
||||
y1=int(y1),
|
||||
x0=int((x0 - bound.x0) * scale_factor_x),
|
||||
y0=int((y0 - bound.y0) * scale_factor_y),
|
||||
x1=int((x1 - bound.x0) * scale_factor_x),
|
||||
y1=int((y1 - bound.y0) * scale_factor_y),
|
||||
text=text,
|
||||
)
|
||||
for x0, y0, x1, y1, text, *_ in page.get_text("blocks")
|
||||
# Tuple schema is documented here (accessed 2026-01-15):
|
||||
# https://pymupdf.readthedocs.io/en/latest/textpage.html#TextPage.extractBLOCKS
|
||||
for x0, y0, x1, y1, text, _, block_type in page.get_text("blocks")
|
||||
if block_type == BLOCK_TYPE_TEXT
|
||||
]
|
||||
|
||||
# Render page to image
|
||||
# Use a matrix to scale appropriately (default is 72 DPI)
|
||||
# Scale factor 4.44 gives approximately 320 DPI, which should produce
|
||||
# images with long edge around 3200px for typical page sizes
|
||||
mat = pymupdf.Matrix(4.44, 4.44)
|
||||
pix = page.get_pixmap(matrix=mat, alpha=False)
|
||||
|
||||
# Convert PyMuPDF pixmap to PIL Image
|
||||
img_data = pix.tobytes("ppm")
|
||||
image = Image.open(BytesIO(img_data)).convert("L")
|
||||
|
||||
# Ensure long edge is no more than 3200 px
|
||||
image.thumbnail((3200, 3200))
|
||||
|
||||
# Page numbers are 1-indexed for human readability
|
||||
leaves.append(
|
||||
ArchiveLeaf(
|
||||
image=image,
|
||||
page_number=str(page_num + 1),
|
||||
image=im,
|
||||
page_number=str(page.number),
|
||||
text_blocks=text_blocks,
|
||||
)
|
||||
)
|
||||
|
||||
finally:
|
||||
pdf_doc.close()
|
||||
|
||||
return leaves
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue