ignore spurious text in clipping detection
This commit is contained in:
parent
24e59e0bfc
commit
df786f103f
1 changed file with 16 additions and 16 deletions
|
|
@@ -1,3 +1,4 @@
|
||||||
|
import re
|
||||||
from sys import stdout
|
from sys import stdout
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
|
@@ -243,23 +244,22 @@ def compute_text_margin_px(
|
||||||
Integer pixel count if text is present; otherwise `None`.
|
Integer pixel count if text is present; otherwise `None`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
word_margins_all_directions = (
|
block_margins = [
|
||||||
np.sort(
|
np.array(
|
||||||
np.concat(
|
[
|
||||||
[
|
block.x0,
|
||||||
np.array(
|
block.y0,
|
||||||
[
|
im_size[0] - block.x1,
|
||||||
block.x0,
|
im_size[1] - block.y1,
|
||||||
block.y0,
|
]
|
||||||
im_size[0] - block.x1,
|
|
||||||
im_size[1] - block.y1,
|
|
||||||
]
|
|
||||||
)
|
|
||||||
for block in text_blocks
|
|
||||||
]
|
|
||||||
).astype(np.int_)
|
|
||||||
)
|
)
|
||||||
if len(text_blocks) > 0
|
for block in text_blocks
|
||||||
|
# Exclude text without clear alphanumeric substance.
|
||||||
|
if len(re.sub(r"[^a-zA-Z0-9]", "", block.text)) > 1
|
||||||
|
]
|
||||||
|
word_margins_all_directions = (
|
||||||
|
np.sort(np.concat(block_margins).astype(np.int_))
|
||||||
|
if len(block_margins) > 0
|
||||||
else np.array([])
|
else np.array([])
|
||||||
)
|
)
|
||||||
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
# Skip the n closest words to the edge, to help ignore stray OCR artifacts.
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue