ignore spurious text in clipping detection

This commit is contained in:
Brent Schroeter 2026-01-15 22:14:01 +00:00
parent 24e59e0bfc
commit df786f103f

View file

@@ -1,3 +1,4 @@
import re
from sys import stdout
from typing import Optional
@@ -243,10 +244,7 @@ def compute_text_margin_px(
Integer pixel count if text is present; otherwise `None`.
"""
word_margins_all_directions = (
np.sort(
np.concat(
[
block_margins = [
np.array(
[
block.x0,
@@ -256,10 +254,12 @@ def compute_text_margin_px(
]
)
for block in text_blocks
# Exclude spurious blocks whose text has fewer than two ASCII alphanumeric characters.
if len(re.sub(r"[^a-zA-Z0-9]", "", block.text)) > 1
]
).astype(np.int_)
)
if len(text_blocks) > 0
word_margins_all_directions = (
np.sort(np.concat(block_margins).astype(np.int_))
if len(block_margins) > 0
else np.array([])
)
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.