diff --git a/microqa/engine.py b/microqa/engine.py index 57fa07e..2a285ef 100644 --- a/microqa/engine.py +++ b/microqa/engine.py @@ -1,3 +1,4 @@ +import re from sys import stdout from typing import Optional @@ -243,23 +244,22 @@ def compute_text_margin_px( Integer pixel count if text is present; otherwise `None`. """ - word_margins_all_directions = ( - np.sort( - np.concat( - [ - np.array( - [ - block.x0, - block.y0, - im_size[0] - block.x1, - im_size[1] - block.y1, - ] - ) - for block in text_blocks - ] - ).astype(np.int_) + block_margins = [ + np.array( + [ + block.x0, + block.y0, + im_size[0] - block.x1, + im_size[1] - block.y1, + ] ) - if len(text_blocks) > 0 + for block in text_blocks + # Exclude text without clear alphanumeric substance. + if len(re.sub(r"[^a-zA-Z0-9]", "", block.text)) > 1 + ] + word_margins_all_directions = ( + np.sort(np.concat(block_margins).astype(np.int_)) + if len(block_margins) > 0 else np.array([]) ) # Skip the n closest words to the edge, to help ignore stray OCR artifacts.