Spaces:

egumasa
/

engagement-analyzer-demo7

Sleeping

App Files Files Community

engagement-analyzer-demo7 / utils /utility.py

egumasa

push

eca7f4a 9 months ago

raw

history blame contribute delete

4.04 kB

	import re
	from collections import Counter
	from spacy.tokens import SpanGroup


	def preprocess(text):
	    """Flatten *text* onto single-spaced lines while keeping paragraph breaks.

	    The substitutions run strictly in the order listed below:

	    1. each literal ``--- Para SEP ---`` marker becomes a newline;
	    2. each pair of consecutive newlines is hidden behind a placeholder;
	    3. every remaining newline becomes a space;
	    4. every run of whitespace is squeezed into one space;
	    5. the placeholder is turned back into a blank line.

	    The placeholder is inserted with a leading space that is not removed
	    again, so every restored paragraph break is preceded by one space.

	    Returns the normalised string.
	    """
	    substitutions = (
	        ("--- Para SEP ---", '\n'),
	        ("\n\n", ' &&&&&&&&#&#&#&#&'),
	        ('\n', ' '),
	        (r'\s+', " "),
	        ('&&&&&&&&#&#&#&#&', '\n\n'),
	    )
	    for pattern, replacement in substitutions:
	        text = re.sub(pattern, replacement, text)
	    return text


	def del_spans(span_sc, indexes: list):
	    """Delete the items at the given positions from *span_sc* in place.

	    Args:
	        span_sc: Mutable sequence supporting ``len()`` and ``del seq[i]``.
	        indexes: Zero-based positions to remove. The list itself is left
	            untouched, a position listed more than once is removed only
	            once, and positions outside the sequence are ignored.

	    Returns:
	        None. *span_sc* is modified in place.
	    """
	    # Work from the highest position down so that each deletion leaves the
	    # positions still waiting to be processed unchanged.
	    for idx in sorted(set(indexes), reverse=True):
	        if 0 <= idx < len(span_sc):
	            del span_sc[idx]


	def delete_overlapping_span(span_sc: dict):
	    """Prune spans that share a start token or cover more than one sentence.

	    Spans are visited in order together with ``span_sc.attrs['scores']``.
	    For every start token used by more than one span, the first span seen
	    is kept provisionally. A later span with the same start replaces it
	    only when it carries the same label and a strictly higher score; in
	    every other case (including a different label) the later span is the
	    one removed. Independently of that, any span whose ``sents`` yields
	    more than one sentence is removed.

	    Args:
	        span_sc: Span container to prune in place. Although annotated as
	            ``dict``, the body iterates it as a sequence of spans and reads
	            ``span_sc.attrs['scores']`` (presumably a spaCy ``SpanGroup``;
	            confirm against the caller).

	    Returns:
	        None. The collected positions are handed to ``del_spans``.

	    Note:
	        ``span_sc.attrs['scores']`` is only read here, never rewritten, so
	        after pruning it still holds one entry per original span.
	    """
	    start_counts = Counter(spn.start for spn in span_sc)
	    shared_starts = {
	        start for start, count in start_counts.items() if count > 1
	    }

	    # A set, so a span flagged by both rules is queued for deletion once.
	    doomed = set()
	    # start token index -> (position, label, score) of the span kept so far
	    kept = {}

	    for n, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores'])):
	        if spn.start in shared_starts:
	            if spn.start not in kept:
	                kept[spn.start] = (n, spn.label_, score)
	            else:
	                kept_n, kept_label, kept_score = kept[spn.start]
	                if spn.label_ == kept_label and score > kept_score:
	                    # Same label, better score: the newcomer takes over.
	                    doomed.add(kept_n)
	                    kept[spn.start] = (n, spn.label_, score)
	                else:
	                    doomed.add(n)

	        # delete span beyond sentences
	        if len(list(spn.sents)) > 1:
	            doomed.add(n)

	    del_spans(span_sc, sorted(doomed))


	def cleanup_justify(doc, span_sc: dict):
	    """Adjust the JUSTIFYING spans held in *span_sc*.

	    Every span labelled ``JUSTIFYING`` is indexed by its root token. Then,
	    for each indexed span:

	    * if the head of its root token is itself the root of an indexed
	      JUSTIFYING span, the span is marked for removal;
	    * otherwise a replacement span running from the root's ``left_edge`` to
	      its ``right_edge`` (inclusive) is built and labelled ``JUSTIFYING``.
	      When that replacement is not already in *span_sc*, it is queued for
	      insertion and the original span is marked for removal; when it is
	      already present, the original span is left alone.

	    Marked spans are passed to ``del_spans`` and the queued replacements are
	    appended afterwards, wrapped in a ``SpanGroup``.

	    Args:
	        doc: Document the spans belong to; sliced to build replacements.
	        span_sc: Span container, modified in place. Although annotated as
	            ``dict``, the body iterates it as a sequence of spans and calls
	            ``span_sc.extend``.

	    Returns:
	        None.
	    """
	    # Index the JUSTIFYING spans by root token, remembering each position.
	    by_root = {}
	    for position, candidate in enumerate(span_sc):
	        if candidate.label_ != 'JUSTIFYING':
	            continue
	        by_root[candidate.root] = {"span_idx": position, "del": False}

	    widened = []
	    for root_token, entry in by_root.items():
	        # NOTE(review): if a token can be its own head (e.g. a sentence
	        # root), this test also matches the span itself - confirm that
	        # dropping such spans is intended.
	        if root_token.head in by_root:
	            entry['del'] = True
	            continue

	        subtree = doc[root_token.left_edge.i:root_token.right_edge.i + 1]
	        subtree.label_ = "JUSTIFYING"

	        if subtree not in span_sc:
	            widened.append(subtree)
	            entry['del'] = True

	    stale_positions = [
	        entry['span_idx'] for entry in by_root.values() if entry['del']
	    ]
	    del_spans(span_sc, stale_positions)

	    span_sc.extend(SpanGroup(doc, spans=widened))