File size: 4,898 Bytes
c9019cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
#!/usr/bin/env python
# Copyright (c) 2022, National Diet Library, Japan
#
# This software is released under the CC BY 4.0.
# https://creativecommons.org/licenses/by/4.0/
import argparse
import xml.etree.ElementTree as ET
def parse_args(argv=None):
    """Parse the command-line options of the refinement script.

    Args:
        argv: Optional list of argument strings. When None (the default)
            argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``xml``, ``out``,
        ``vh_overlap_th``, ``inclusion_margin``, ``category_option``,
        ``rm_vh_confusion_only`` and ``rm_inclusion_only``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('xml', help='input xml file path')
    parser.add_argument('-o', '--out', default=None,
                        help='output xml file path')
    parser.add_argument('-ov', '--vh_overlap_th', type=int, default=2,
                        help='How many intersecting vertical and horizontal boxes should be removed')
    # type=float is required: without it a margin given on the command line
    # stays a string and cannot be used in the margin arithmetic.
    parser.add_argument('-im', '--inclusion_margin', type=float, default=0.05,
                        help='inclusion margin ratio. default 0.05')
    parser.add_argument('-co', '--category_option', default='SAME',
                        help='SAME(default) : investigate whether inclusion is only for the same category\n'
                        'SIM : investigate inclusion for similar categories(line/block).\n'
                        'ALL : investigate category-independent inclusions.\n')
    parser.add_argument('--rm_vh_confusion_only', action='store_true')
    parser.add_argument('--rm_inclusion_only', action='store_true')
    return parser.parse_args(argv)
def get_points(elm):
    """Return the bounding box of *elm* as two corner points.

    The X, Y, WIDTH and HEIGHT attributes of the element are read as
    integers.

    Returns:
        Tuple ``(x1, y1, x2, y2)`` where ``(x1, y1)`` is the X/Y attribute
        pair and ``(x2, y2)`` is that point shifted by WIDTH and HEIGHT.
    """
    attrs = elm.attrib
    left, top, width, height = (
        int(attrs[key]) for key in ('X', 'Y', 'WIDTH', 'HEIGHT'))
    return left, top, left + width, top + height
def vh_comp(elm_a, elm_b):
    """Tell whether two elements have the same orientation.

    The orientation of an element is the sign of WIDTH - HEIGHT. The
    result is True only when both differences are non-zero and share the
    same sign; a square element (difference 0) never matches anything.
    """
    def aspect(elm):
        # positive: wider than tall, negative: taller than wide
        return int(elm.attrib['WIDTH']) - int(elm.attrib['HEIGHT'])

    return aspect(elm_a) * aspect(elm_b) > 0
def vh_overlapping(elm_a, elm_b):
    """Tell whether two differently oriented boxes intersect.

    Returns False right away when vh_comp() reports the same orientation
    for both elements (vertical/vertical or horizontal/horizontal).
    Otherwise the bounding boxes are intersected; boxes that merely touch
    along an edge still count as intersecting.
    """
    if vh_comp(elm_a, elm_b):
        return False

    a_left, a_top, a_right, a_bottom = get_points(elm_a)
    b_left, b_top, b_right, b_bottom = get_points(elm_b)

    # Corners of the candidate intersection rectangle.
    left = max(a_left, b_left)
    top = max(a_top, b_top)
    right = min(a_right, b_right)
    bottom = min(a_bottom, b_bottom)

    # The rectangle exists only if it is not inverted on either axis.
    return left <= right and top <= bottom
def refine_vh_confusion(root, overlap_th):
    """Remove LINE elements that cross too many differently oriented lines.

    For every child of *root* (a page), each direct LINE child is compared
    with the other direct LINE children of the same TYPE. The LINE is
    removed from its page as soon as the number of those for which
    vh_overlapping() is true reaches *overlap_th*.

    Args:
        root: Root element; each child must carry an IMAGENAME attribute,
            which is printed as progress output.
        overlap_th: Number of overlaps at which an element is removed.

    Returns:
        The same *root* element, modified in place.
    """
    print('Refine VH Confusion')
    for page in root:
        print(page.attrib['IMAGENAME'])
        # Walk backwards so that removing the current element does not
        # shift the positions that are still to be visited.
        for elm in reversed(page):
            if elm.tag != 'LINE':
                continue  # only LINE elements are candidates for removal
            elm_type = elm.attrib['TYPE']
            vh_overlap_count = 0
            for elm_ref in page:
                # An element must not be counted against itself: a square
                # box never has the "same orientation" and would otherwise
                # always register one overlap with its own area.
                if elm_ref is elm:
                    continue
                if elm_ref.tag != 'LINE' or elm_ref.attrib['TYPE'] != elm_type:
                    continue
                if vh_overlapping(elm, elm_ref):
                    vh_overlap_count += 1
                    if vh_overlap_count >= overlap_th:
                        page.remove(elm)
                        break
    return root
def include(parent, child, margin=0.05):
    """Tell whether the box of *child* lies inside the box of *parent*.

    The parent box is enlarged on every side by a tolerance proportional
    to the child's own size before the containment test, and all four
    edges are tested inclusively.

    Args:
        parent: Element with X, Y, WIDTH and HEIGHT attributes.
        child: Element with X, Y, WIDTH and HEIGHT attributes.
        margin: Tolerance as a ratio of the child's width/height. Anything
            accepted by float() works, so a numeric string is fine too.

    Returns:
        True if the child is contained in the enlarged parent box. False
        otherwise, and always False when both boxes are identical, so
        that an element never "includes" itself or an exact duplicate.
    """
    p_x1, p_y1, p_x2, p_y2 = get_points(parent)
    c_x1, c_y1, c_x2, c_y2 = get_points(child)
    if (p_x1, p_y1, p_x2, p_y2) == (c_x1, c_y1, c_x2, c_y2):
        return False

    margin = float(margin)
    w_m = (c_x2 - c_x1) * margin  # horizontal tolerance (child width based)
    h_m = (c_y2 - c_y1) * margin  # vertical tolerance (child height based)
    # The bottom edge uses >= like the other three edges, so a child whose
    # bottom edge sits exactly on the tolerance boundary is still included.
    return (p_x1 - w_m <= c_x1 and p_y1 - h_m <= c_y1
            and p_x2 + w_m >= c_x2 and p_y2 + h_m >= c_y2)
def refine_inclusion(root, margin=0.05, category_option='SAME'):
    """Remove elements whose box is included in another element's box.

    For every child of *root* (a page), each direct child is tested with
    include() against the page's direct children and removed from the
    page when one of them includes it.

    Args:
        root: Root element; each child must carry an IMAGENAME attribute,
            which is printed as progress output.
        margin: Margin ratio handed to include().
        category_option: 'SAME' compares only elements with equal TYPE
            attributes, 'SIM' only elements with equal tags, any other
            value compares every pair.

    Returns:
        The same *root* element, modified in place.
    """
    def comparable(inner, outer):
        # Decide whether the pair has to be checked under category_option.
        if category_option == 'SAME':
            return inner.attrib['TYPE'] == outer.attrib['TYPE']
        if category_option == 'SIM':
            return inner.tag == outer.tag
        return True

    print('Refine inclusion')
    for page in root:
        print(page.attrib['IMAGENAME'])
        # Walk backwards so that removing the current element does not
        # shift the positions that are still to be visited.
        for candidate in reversed(page):
            enclosed = any(
                include(parent=other, child=candidate, margin=margin)
                for other in page
                if comparable(candidate, other))
            if enclosed:
                page.remove(candidate)
    return root
def refine(xml, out_xml, vh_overlap_th=2, margin=0.05, category_option='SAME', vh=True, inc=True):
    """Refine the layout XML at *xml* and write the result to *out_xml*.

    Args:
        xml: Path of the input XML file.
        out_xml: Path of the output XML file, written with UTF-8 encoding.
        vh_overlap_th: Threshold handed to refine_vh_confusion().
        margin: Margin ratio handed to refine_inclusion().
        category_option: Category mode handed to refine_inclusion().
        vh: Run the vertical/horizontal confusion pass when true.
        inc: Run the inclusion pass when true.
    """
    document = ET.parse(xml)
    pages = document.getroot()

    # The confusion pass runs first; the inclusion pass sees its result.
    if vh:
        pages = refine_vh_confusion(pages, vh_overlap_th)
    if inc:
        pages = refine_inclusion(pages, margin, category_option)

    document.write(out_xml, encoding='UTF-8')
def main():
    """Command-line entry point: parse the options, refine, report.

    The output path falls back to 'out.xml' when --out is not given.
    --rm_inclusion_only switches the vertical/horizontal confusion pass
    off and --rm_vh_confusion_only switches the inclusion pass off.
    """
    args = parse_args()
    out_xml_path = args.out if args.out is not None else 'out.xml'

    run_vh_pass = not args.rm_inclusion_only
    run_inclusion_pass = not args.rm_vh_confusion_only

    refine(xml=args.xml,
           out_xml=out_xml_path,
           vh_overlap_th=args.vh_overlap_th,
           margin=args.inclusion_margin,
           category_option=args.category_option,
           vh=run_vh_pass,
           inc=run_inclusion_pass)
    print('Export: {}'.format(out_xml_path))


# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
|