fix sloppy LLM code for PDF parsing
This commit is contained in:
parent
1ca2238c5d
commit
ce42ab58f1
1 changed file with 17 additions and 34 deletions
|
|
@ -6,7 +6,6 @@ microfiche scans.
|
||||||
import os
|
import os
|
||||||
import urllib
|
import urllib
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from io import BytesIO
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import pymupdf
|
import pymupdf
|
||||||
|
|
@ -104,52 +103,36 @@ class ArchiveDoc:
|
||||||
|
|
||||||
leaves = []
|
leaves = []
|
||||||
|
|
||||||
# Open PDF from bytes
|
with pymupdf.open(stream=pdf_data, filetype="pdf") as pdf:
|
||||||
pdf_doc = pymupdf.open(stream=pdf_data, filetype="pdf")
|
for i, page in enumerate(pdf):
|
||||||
|
bound = page.bound()
|
||||||
|
im = page.get_pixmap(dpi=300).pil_image().convert("L")
|
||||||
|
scale_factor_x = im.size[0] / bound.width
|
||||||
|
scale_factor_y = im.size[1] / bound.height
|
||||||
|
|
||||||
try:
|
BLOCK_TYPE_TEXT = 0
|
||||||
for page_num in range(len(pdf_doc)):
|
|
||||||
page = pdf_doc[page_num]
|
|
||||||
|
|
||||||
# Extract text blocks with coordinates
|
|
||||||
# Convert to TextBlock objects, discarding block_no and block_type
|
|
||||||
text_blocks = [
|
text_blocks = [
|
||||||
TextBlock(
|
TextBlock(
|
||||||
x0=int(x0),
|
x0=int((x0 - bound.x0) * scale_factor_x),
|
||||||
y0=int(y0),
|
y0=int((y0 - bound.y0) * scale_factor_y),
|
||||||
x1=int(x1),
|
x1=int((x1 - bound.x0) * scale_factor_x),
|
||||||
y1=int(y1),
|
y1=int((y1 - bound.y0) * scale_factor_y),
|
||||||
text=text,
|
text=text,
|
||||||
)
|
)
|
||||||
for x0, y0, x1, y1, text, *_ in page.get_text("blocks")
|
# Tuple schema is documented here (accessed 2026-01-15):
|
||||||
|
# https://pymupdf.readthedocs.io/en/latest/textpage.html#TextPage.extractBLOCKS
|
||||||
|
for x0, y0, x1, y1, text, _, block_type in page.get_text("blocks")
|
||||||
|
if block_type == BLOCK_TYPE_TEXT
|
||||||
]
|
]
|
||||||
|
|
||||||
# Render page to image
|
|
||||||
# Use a matrix to scale appropriately (default is 72 DPI)
|
|
||||||
# Scale factor 4.44 gives approximately 320 DPI, which should produce
|
|
||||||
# images with long edge around 3200px for typical page sizes
|
|
||||||
mat = pymupdf.Matrix(4.44, 4.44)
|
|
||||||
pix = page.get_pixmap(matrix=mat, alpha=False)
|
|
||||||
|
|
||||||
# Convert PyMuPDF pixmap to PIL Image
|
|
||||||
img_data = pix.tobytes("ppm")
|
|
||||||
image = Image.open(BytesIO(img_data)).convert("L")
|
|
||||||
|
|
||||||
# Ensure long edge is no more than 3200 px
|
|
||||||
image.thumbnail((3200, 3200))
|
|
||||||
|
|
||||||
# Page numbers are 1-indexed for human readability
|
|
||||||
leaves.append(
|
leaves.append(
|
||||||
ArchiveLeaf(
|
ArchiveLeaf(
|
||||||
image=image,
|
image=im,
|
||||||
page_number=str(page_num + 1),
|
page_number=str(page.number),
|
||||||
text_blocks=text_blocks,
|
text_blocks=text_blocks,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
finally:
|
|
||||||
pdf_doc.close()
|
|
||||||
|
|
||||||
return leaves
|
return leaves
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue