crop detection tuning
This commit is contained in:
parent
a5e3a2a429
commit
d33a7dc515
2 changed files with 41 additions and 13 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -1 +1,2 @@
|
||||||
/target
|
/target
|
||||||
|
/data
|
||||||
|
|
|
||||||
53
main.py
53
main.py
|
|
@ -24,6 +24,7 @@ def main():
|
||||||
parser.add_argument("--summarize", action="store_true")
|
parser.add_argument("--summarize", action="store_true")
|
||||||
parser.add_argument("-v", "--verbose", action="store_true")
|
parser.add_argument("-v", "--verbose", action="store_true")
|
||||||
parser.add_argument("-w", "--workers", type=int, default=1)
|
parser.add_argument("-w", "--workers", type=int, default=1)
|
||||||
|
parser.add_argument("--page-margin-px", type=int, default=50)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Process STDIN line by line, where each line contains one or more item IDs
|
# Process STDIN line by line, where each line contains one or more item IDs
|
||||||
|
|
@ -38,7 +39,11 @@ def main():
|
||||||
pool.map(
|
pool.map(
|
||||||
_summarize_item_to_stdout,
|
_summarize_item_to_stdout,
|
||||||
[
|
[
|
||||||
ItemTask(item_id=item_id, verbose=args.verbose)
|
ItemTask(
|
||||||
|
item_id=item_id,
|
||||||
|
page_margin_px=args.page_margin_px,
|
||||||
|
verbose=args.verbose,
|
||||||
|
)
|
||||||
for item_id in item_ids
|
for item_id in item_ids
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
@ -46,7 +51,11 @@ def main():
|
||||||
pool.map(
|
pool.map(
|
||||||
_analyze_item_to_stdout,
|
_analyze_item_to_stdout,
|
||||||
[
|
[
|
||||||
ItemTask(item_id=item_id, verbose=args.verbose)
|
ItemTask(
|
||||||
|
item_id=item_id,
|
||||||
|
page_margin_px=args.page_margin_px,
|
||||||
|
verbose=args.verbose,
|
||||||
|
)
|
||||||
for item_id in item_ids
|
for item_id in item_ids
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
@ -55,18 +64,22 @@ def main():
|
||||||
@dataclass
|
@dataclass
|
||||||
class ItemTask:
|
class ItemTask:
|
||||||
item_id: str
|
item_id: str
|
||||||
|
page_margin_px: int
|
||||||
verbose: bool
|
verbose: bool
|
||||||
|
|
||||||
|
|
||||||
def _summarize_item_to_stdout(task):
|
def _summarize_item_to_stdout(task):
|
||||||
item_id = task.item_id
|
item_id = task.item_id
|
||||||
|
page_margin_px = task.page_margin_px
|
||||||
verbose = task.verbose
|
verbose = task.verbose
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"Summarizing item {item_id}...", file=stderr)
|
print(f"Summarizing item {item_id}...", file=stderr)
|
||||||
stderr.flush()
|
stderr.flush()
|
||||||
|
|
||||||
analysis = analyze_item(item_id, True, verbose)
|
analysis = analyze_item(
|
||||||
|
item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose
|
||||||
|
)
|
||||||
|
|
||||||
# 3 or more blank pages in a row is a flag.
|
# 3 or more blank pages in a row is a flag.
|
||||||
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
CONSECUTIVE_BLANKS_THRESHOLD = 3
|
||||||
|
|
@ -111,8 +124,11 @@ def _summarize_item_to_stdout(task):
|
||||||
if not page["ocr_orientation_match"]
|
if not page["ocr_orientation_match"]
|
||||||
]
|
]
|
||||||
|
|
||||||
|
WORDS_NEAR_EDGE_THRESHOLD = 2
|
||||||
check_crop = [
|
check_crop = [
|
||||||
i + 1 for i, page in enumerate(analysis["pages"]) if page["words_near_edge"] > 2
|
i + 1
|
||||||
|
for i, page in enumerate(analysis["pages"])
|
||||||
|
if page["words_near_edge"] > WORDS_NEAR_EDGE_THRESHOLD
|
||||||
]
|
]
|
||||||
|
|
||||||
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
if check_orientation or check_crop or consecutive_blanks or consecutive_blurry:
|
||||||
|
|
@ -136,13 +152,20 @@ def _summarize_item_to_stdout(task):
|
||||||
|
|
||||||
def _analyze_item_to_stdout(task):
|
def _analyze_item_to_stdout(task):
|
||||||
item_id = task.item_id
|
item_id = task.item_id
|
||||||
|
page_margin_px = task.page_margin_px
|
||||||
verbose = task.verbose
|
verbose = task.verbose
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"Analyzing item {item_id}...", file=stderr)
|
print(f"Analyzing item {item_id}...", file=stderr)
|
||||||
stderr.flush()
|
stderr.flush()
|
||||||
|
|
||||||
print(json.dumps(analyze_item(item_id, True, verbose)))
|
print(
|
||||||
|
json.dumps(
|
||||||
|
analyze_item(
|
||||||
|
item_id, page_margin_px=page_margin_px, parallel=True, verbose=verbose
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
stdout.flush()
|
stdout.flush()
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
|
|
@ -154,12 +177,14 @@ def _analyze_item_to_stdout(task):
|
||||||
class PageAnalysisTask:
|
class PageAnalysisTask:
|
||||||
im: Image.Image
|
im: Image.Image
|
||||||
page_index: int
|
page_index: int
|
||||||
|
page_margin_px: int
|
||||||
file_name: str
|
file_name: str
|
||||||
|
|
||||||
|
|
||||||
def _analyze_page(task):
|
def _analyze_page(task):
|
||||||
im_original = task.im
|
im_original = task.im
|
||||||
page_index = task.page_index
|
page_index = task.page_index
|
||||||
|
page_margin_px = task.page_margin_px
|
||||||
file_name = task.file_name
|
file_name = task.file_name
|
||||||
|
|
||||||
im_cropped = im_original.crop(
|
im_cropped = im_original.crop(
|
||||||
|
|
@ -211,7 +236,7 @@ def _analyze_page(task):
|
||||||
ocr = pytesseract.image_to_data(
|
ocr = pytesseract.image_to_data(
|
||||||
im_rotated,
|
im_rotated,
|
||||||
lang=OCR_LANGS,
|
lang=OCR_LANGS,
|
||||||
config="--oem 1 --dpi 300 --tessdata-dir ./tessdata_fast-4.1.0",
|
config="--oem 1 --dpi 300 --tessdata-dir ./data/tessdata_fast-4.1.0",
|
||||||
output_type=pytesseract.Output.DATAFRAME,
|
output_type=pytesseract.Output.DATAFRAME,
|
||||||
).fillna({"text": ""})
|
).fillna({"text": ""})
|
||||||
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
|
words = ocr[(ocr["conf"] > 90) & (ocr["width"] > ocr["height"])]
|
||||||
|
|
@ -237,17 +262,16 @@ def _analyze_page(task):
|
||||||
if best_ocr_orientation % 2 == 0
|
if best_ocr_orientation % 2 == 0
|
||||||
else (im_original.size[1], im_original.size[0])
|
else (im_original.size[1], im_original.size[0])
|
||||||
)
|
)
|
||||||
EDGE_TOLERANCE = 0.03
|
|
||||||
words_near_edge = best_ocr_words[
|
words_near_edge = best_ocr_words[
|
||||||
(best_ocr_words["left"] < best_ocr_dims[0] * EDGE_TOLERANCE)
|
(best_ocr_words["left"] < page_margin_px)
|
||||||
| (best_ocr_words["top"] < best_ocr_dims[1] * EDGE_TOLERANCE)
|
| (best_ocr_words["top"] < page_margin_px)
|
||||||
| (
|
| (
|
||||||
best_ocr_words["left"] + best_ocr_words["width"]
|
best_ocr_words["left"] + best_ocr_words["width"]
|
||||||
> best_ocr_dims[0] * (1 - EDGE_TOLERANCE)
|
> best_ocr_dims[0] - page_margin_px
|
||||||
)
|
)
|
||||||
| (
|
| (
|
||||||
best_ocr_words["top"] + best_ocr_words["height"]
|
best_ocr_words["top"] + best_ocr_words["height"]
|
||||||
> best_ocr_dims[1] * (1 - EDGE_TOLERANCE)
|
> best_ocr_dims[1] - page_margin_px
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
words_near_edge = words_near_edge.shape[0]
|
words_near_edge = words_near_edge.shape[0]
|
||||||
|
|
@ -263,7 +287,7 @@ def _analyze_page(task):
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def analyze_item(item_id, parallel=False, verbose=False):
|
def analyze_item(item_id, page_margin_px, parallel=False, verbose=False):
|
||||||
escaped_item_id = urllib.parse.quote(item_id, safe="")
|
escaped_item_id = urllib.parse.quote(item_id, safe="")
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
|
|
@ -300,7 +324,10 @@ def analyze_item(item_id, parallel=False, verbose=False):
|
||||||
im.thumbnail((3200, 3200))
|
im.thumbnail((3200, 3200))
|
||||||
tasks.append(
|
tasks.append(
|
||||||
PageAnalysisTask(
|
PageAnalysisTask(
|
||||||
im=im, page_index=page_index, file_name=file_name
|
im=im,
|
||||||
|
page_index=page_index,
|
||||||
|
page_margin_px=page_margin_px,
|
||||||
|
file_name=file_name,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue