ignore spurious text in clipping detection
This commit is contained in:
parent
24e59e0bfc
commit
df786f103f
1 changed file with 16 additions and 16 deletions
|
|
@@ -1,3 +1,4 @@
|
|||
import re
|
||||
from sys import stdout
|
||||
from typing import Optional
|
||||
|
||||
|
|
@@ -243,23 +244,22 @@ def compute_text_margin_px(
|
|||
Integer pixel count if text is present; otherwise `None`.
|
||||
"""
|
||||
|
||||
word_margins_all_directions = (
|
||||
np.sort(
|
||||
np.concat(
|
||||
[
|
||||
np.array(
|
||||
[
|
||||
block.x0,
|
||||
block.y0,
|
||||
im_size[0] - block.x1,
|
||||
im_size[1] - block.y1,
|
||||
]
|
||||
)
|
||||
for block in text_blocks
|
||||
]
|
||||
).astype(np.int_)
|
||||
block_margins = [
|
||||
np.array(
|
||||
[
|
||||
block.x0,
|
||||
block.y0,
|
||||
im_size[0] - block.x1,
|
||||
im_size[1] - block.y1,
|
||||
]
|
||||
)
|
||||
if len(text_blocks) > 0
|
||||
for block in text_blocks
|
||||
# Exclude blocks whose text has fewer than two ASCII alphanumeric characters.
|
||||
if len(re.sub(r"[^a-zA-Z0-9]", "", block.text)) > 1
|
||||
]
|
||||
word_margins_all_directions = (
|
||||
np.sort(np.concat(block_margins).astype(np.int_))
|
||||
if len(block_margins) > 0
|
||||
else np.array([])
|
||||
)
|
||||
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue