ignore spurious text in clipping detection

This commit is contained in:
Brent Schroeter 2026-01-15 22:14:01 +00:00
parent 24e59e0bfc
commit df786f103f

View file

@@ -1,3 +1,4 @@
import re
from sys import stdout from sys import stdout
from typing import Optional from typing import Optional
@@ -243,23 +244,22 @@ def compute_text_margin_px(
Integer pixel count if text is present; otherwise `None`. Integer pixel count if text is present; otherwise `None`.
""" """
word_margins_all_directions = ( block_margins = [
np.sort( np.array(
np.concat( [
[ block.x0,
np.array( block.y0,
[ im_size[0] - block.x1,
block.x0, im_size[1] - block.y1,
block.y0, ]
im_size[0] - block.x1,
im_size[1] - block.y1,
]
)
for block in text_blocks
]
).astype(np.int_)
) )
if len(text_blocks) > 0 for block in text_blocks
# Exclude text without clear alphanumeric substance.
if len(re.sub(r"[^a-zA-Z0-9]", "", block.text)) > 1
]
word_margins_all_directions = (
np.sort(np.concat(block_margins).astype(np.int_))
if len(block_margins) > 0
else np.array([]) else np.array([])
) )
# Skip the n closest words to the edge, to help ignore stray OCR artifacts. # Skip the n closest words to the edge, to help ignore stray OCR artifacts.