Spaces:
Sleeping
Sleeping
File size: 4,039 Bytes
eca7f4a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import re
from collections import Counter
from spacy.tokens import SpanGroup
def preprocess(text):
text = re.sub("--- Para SEP ---", '\n', text)
text = re.sub("\n\n", ' &&&&&&&&#&#&#&#&', text)
text = re.sub('\n', ' ', text)
text = re.sub(r'\s+', " ", text)
text = re.sub('&&&&&&&&#&#&#&#&', '\n\n', text)
return text
def del_spans(span_sc, indexes: list):
    """Delete entries of *span_sc* at position ``idx + 1`` for each idx in
    *indexes*, working from the back so earlier deletions do not shift the
    positions still pending.

    NOTE(review): the ``+ 1`` offset is preserved from the original and both
    callers depend on it, but it looks like an off-by-one (it also silently
    skips any idx whose target falls past the end) -- confirm it is
    deliberate.  The in-place descending sort also mutates the caller's
    *indexes* list.
    """
    indexes.sort(reverse=True)
    for idx in indexes:
        target = idx + 1
        if target < len(span_sc):
            del span_sc[target]
def delete_overlapping_span(span_sc: SpanGroup):
    """Prune a span group in place.

    Among spans that share the same start token, keep the higher-scoring one
    when the labels match (otherwise the later span is dropped), and also drop
    any span that stretches across more than one sentence.

    NOTE(review): deletion is delegated to ``del_spans``, which removes index
    ``idx + 1`` rather than ``idx`` -- confirm that offset is intended.  A span
    can also be appended to ``id_del`` twice (once as an overlap loser, once
    for crossing a sentence boundary), which makes ``del_spans`` remove two
    different elements -- verify that is the desired outcome.
    """
    # print(span_sc)
    # Start tokens occurring more than once mark groups of overlapping spans.
    start_token_list = [spn.start for spn in span_sc]
    dict_ = Counter(start_token_list)
    overlap = {k: v for k, v in dict_.items() if v > 1}
    id_del = []   # enumeration indexes of spans scheduled for deletion
    id_comp = {}  # start token -> index of the current best span at that start
    info = {}     # enumeration index -> per-span metadata collected below
    for n, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores']),
                                     start=0):
        res = {
            'score': score,
            'spn': spn,
            'label': spn.label_,
            'start': spn.start,
            'end': spn.end,
            'compare': spn.start in overlap,
            "sents": len(list(spn.sents))
        }
        # print(res)
        info[n] = res
        if res['compare']:
            # First span seen at this start token becomes the incumbent;
            # later spans challenge it.
            if spn.start not in id_comp:
                id_comp[spn.start] = n
            else:
                same_lbl = res['label'] == info[id_comp[spn.start]]['label']
                update = res['score'] > info[id_comp[spn.start]]['score']
                # A same-label span with a higher score replaces the
                # incumbent; anything else is dropped.
                if update and same_lbl:
                    print(res['label'], info[id_comp[spn.start]]['label'])
                    print(same_lbl)
                    id_del.append(id_comp[spn.start])
                    id_comp[spn.start] = n
                else:
                    id_del.append(n)
                # print(update)
        # delete span beyond sentences
        if len(list(spn.sents)) > 1:
            id_del.append(n)
    # print(id_comp)
    del_spans(span_sc, id_del)
    # for n, idx in enumerate(id_del):
    #     # print(idx)
    #     try:
    #         del span_sc[idx - n]
    #     except IndexError:
    #         continue
def cleanup_justify(doc, span_sc: SpanGroup):
    """Rebuild JUSTIFYING spans around their syntactic subtrees.

    Every existing JUSTIFYING span is removed from *span_sc*; for each one
    whose root is not dominated by another JUSTIFYING root, a replacement
    span covering the root's full left/right edge subtree is appended.

    NOTE(review): both branches of the if/else set ``info['del'] = True``, so
    every JUSTIFYING span is scheduled for deletion regardless -- presumably
    intentional (replace-all), but confirm.  ``to_delete_span`` is only
    consumed by commented-out debug prints.  ``del_spans`` deletes at
    ``idx + 1`` rather than ``idx`` -- confirm that offset is intended here.
    """
    # This function adjusts the JUSTIFYING span
    # First create an index of span with JUSTIFYING tags
    justifies = {}  # span.root token -> metadata for each JUSTIFYING span
    for idx, span in enumerate(span_sc):
        # temp_root = span.root
        # while span.start <= temp_root.head.i <= span.end:
        #     temp_root = temp_root.head
        if span.label_ in ['JUSTIFYING']:
            justifies[span.root] = {
                "span": span,
                "head": span.root.head,
                "start": span.start,
                "end": span.end,
                "del": False,
                "dependent": False,
                "span_idx": idx
            }
    # print(justifies)
    # flagging the dependency
    # A span whose root's head is itself a JUSTIFYING root is subsumed by
    # that parent span's subtree, so it needs no replacement of its own.
    for spanroot, info in justifies.items():
        if spanroot.head in justifies:
            info['dependent'] = True
            info['del'] = True
    # print(justifies)
    new_spans = []
    for spanroot, info in justifies.items():
        if not info['dependent']:
            # print("New Justifying candidate span:")
            # print(doc[spanroot.left_edge.i:spanroot.right_edge.i + 1])
            # Widen the span to the root token's whole dependency subtree.
            new_span = doc[spanroot.left_edge.i:spanroot.right_edge.i + 1]
            new_span.label_ = "JUSTIFYING"
            if new_span not in span_sc:
                new_spans.append(new_span)
            info['del'] = True
        else:
            info['del'] = True
    to_delete = [
        info['span_idx'] for spanroot, info in justifies.items() if info['del']
    ]
    to_delete_span = [
        info['span'] for spanroot, info in justifies.items() if info['del']
    ]
    # print(to_delete)
    # print(to_delete_span)
    del_spans(span_sc, to_delete)
    # Register the widened replacements back onto the group.
    span_grp = SpanGroup(doc, spans=new_spans)
    span_sc.extend(span_grp)
    # print(justifies)
|