ignore spurious text in clipping detection

This commit is contained in:
Brent Schroeter 2026-01-15 22:14:01 +00:00
parent 24e59e0bfc
commit df786f103f

View file

@@ -1,3 +1,4 @@
import re
from sys import stdout from sys import stdout
from typing import Optional from typing import Optional
@@ -243,10 +244,7 @@ def compute_text_margin_px(
Integer pixel count if text is present; otherwise `None`. Integer pixel count if text is present; otherwise `None`.
""" """
word_margins_all_directions = ( block_margins = [
np.sort(
np.concat(
[
np.array( np.array(
[ [
block.x0, block.x0,
@@ -256,10 +254,12 @@ def compute_text_margin_px(
] ]
) )
for block in text_blocks for block in text_blocks
# Exclude text without clear alphanumeric substance.
if len(re.sub(r"[^a-zA-Z0-9]", "", block.text)) > 1
] ]
).astype(np.int_) word_margins_all_directions = (
) np.sort(np.concat(block_margins).astype(np.int_))
if len(text_blocks) > 0 if len(block_margins) > 0
else np.array([]) else np.array([])
) )
# Skip the n closest words to the edge, to help ignore stray OCR artifacts. # Skip the n closest words to the edge, to help ignore stray OCR artifacts.