ignore spurious text in clipping detection

2026-01-15 22:14:01 +00:00 · 2026-01-15 22:14:01 +00:00 · df786f103f
commit df786f103f
parent 24e59e0bfc
1 changed file with 16 additions and 16 deletions
--- a/microqa/engine.py
+++ b/microqa/engine.py
@@ -1,3 +1,4 @@
 import re
 from sys import stdout
 from typing import Optional
@@ -243,23 +244,22 @@ def compute_text_margin_px(
        Integer pixel count if text is present; otherwise `None`.
    """
-    word_margins_all_directions = (
+    block_margins = [
-        np.sort(
+        np.array(
-            np.concat(
+            [
-                [
+                block.x0,
-                    np.array(
+                block.y0,
-                        [
+                im_size[0] - block.x1,
-                            block.x0,
+                im_size[1] - block.y1,
-                            block.y0,
+            ]
                            im_size[0] - block.x1,
                            im_size[1] - block.y1,
                        ]
                    )
                    for block in text_blocks
                ]
            ).astype(np.int_)
        )
-        if len(text_blocks) > 0
+        for block in text_blocks
        # Exclude text without clear alphanumeric substance.
        if len(re.sub(r"[^a-zA-Z0-9]", "", block.text)) > 1
    ]
    word_margins_all_directions = (
        np.sort(np.concat(block_margins).astype(np.int_))
        if len(block_margins) > 0
        else np.array([])
    )
    # Skip the n closest words to the edge, to help ignore stray OCR artifacts.