Commit 8f7c535
Parent(s): 44e368b
Update helper.py

helper.py CHANGED
@@ -65,9 +65,6 @@ def find_comptives_straight_patterns(sentence):
         if next_token.text.lower() == "than":
             prev_token = token.nbor(-1)

-            # this part is to check what will be before more/less. We can add a NOUN as mandatory (e.g magnitude) or even specifically the word magnitude
-            # for the moment we have disable it
-
             if token.text.lower() == 'more':
                 comparatives.append({'comparative': [token.text+" "+next_token.text, '>']})
             elif token.text.lower() == 'less':
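For orientation, here is a minimal, self-contained sketch of the pattern this hunk operates on: scan spaCy tokens for "more"/"less" immediately followed by "than". The model name en_core_web_sm and the function name are assumptions for illustration; helper.py may load a different pipeline.

    import spacy

    nlp = spacy.load("en_core_web_sm")

    def sketch_straight_patterns(sentence):
        comparatives = []
        doc = nlp(sentence)
        # slice off the last token so nbor(1) never runs past the doc
        for token in doc[:-1]:
            next_token = token.nbor(1)
            if next_token.text.lower() == "than":
                if token.text.lower() == "more":
                    comparatives.append({'comparative': [token.text + " " + next_token.text, '>']})
                elif token.text.lower() == "less":
                    comparatives.append({'comparative': [token.text + " " + next_token.text, '<']})
        return comparatives

    print(sketch_straight_patterns("This star is more than ten times brighter."))
    # [{'comparative': ['more than', '>']}]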
@@ -283,6 +280,7 @@ def identify_bigger_smaller_advanced(sentence):
     return bigger_list + smaller_list


+
 def find_equal_to_comptives_ngrams(sentence):
     """
     This function takes a sentence as input and returns a reference phrase based on semantic similarity using n-grams.
@@ -347,23 +345,41 @@ def single_verb_comptives(sentence):

     # search for all verbs and examine their lemma with all the synonyms of each of the previous references. Assign a label accordingly
     for token in doc:
-    [17 removed lines not shown]
+
+        # first examine for 1-1 pair matching and 1-1 lemma pair matching
+        if token.text in bigger_references_sg or token.lemma_ in bigger_references_sg:
+            bigger_list.append({'comparative': [token.text, ">"]})
+            break
+
+        elif token.text in lesser_references_sg or token.lemma_ in lesser_references_sg:
+            smaller_list.append({'comparative': [token.text, "<"]})
+            break
+
+        elif token.text in equal_references_sg or token.lemma_ in equal_references_sg:
+            equal_list.append({'comparative': [token.text, "="]})
+            break
+
+        else:
+
+            # if not, then try with synonyms only for verbs
+            if token.pos_ == "VERB":
+
+                for lemma in token.lemma_.split('|'):
+                    synsets = wordnet.synsets(lemma, pos='v')
+
+                    for syn in synsets:
+                        if any(lemma in bigger_references_sg for lemma in syn.lemma_names()):
+                            bigger_list.append({'comparative': [token.text, ">"]})
+                            break
+
+                        elif any(lemma in lesser_references_sg for lemma in syn.lemma_names()):
+                            smaller_list.append({'comparative': [token.text, "<"]})
+                            break
+
+                        elif any(lemma in equal_references_sg for lemma in syn.lemma_names()):
+                            equal_list.append({'comparative': [token.text, "="]})
+                            break
+

     final_list = bigger_list + smaller_list + equal_list

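The added branch falls back to WordNet when a verb's surface form and lemma match none of the reference sets. Below is a minimal sketch of that lookup, with a hypothetical stand-in for bigger_references_sg; the comprehension variable is renamed to name here to avoid shadowing the outer lemma, which the committed lines reuse.

    from nltk.corpus import wordnet  # requires a one-time nltk.download('wordnet')

    bigger_references_sg = {"exceed", "surpass", "outweigh"}  # hypothetical stand-in

    def verb_implies_bigger(token_lemma):
        # scan every verb synset of the lemma for a synonym in the reference set
        for syn in wordnet.synsets(token_lemma, pos='v'):
            if any(name in bigger_references_sg for name in syn.lemma_names()):
                return True
        return False

    print(verb_implies_bigger("top"))   # True: 'top' shares a verb synset with 'exceed'
    print(verb_implies_bigger("walk"))  # False

Also worth noting: in the synonym branch the break exits only the synset loop, so scanning continues over later lemmas and tokens, unlike the exact-match branches above it, whose break ends the token loop after the first hit.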
@@ -406,6 +422,7 @@ def cosine_sim(a, b):
     return cosine_similarity(a.reshape(1,-1), b.reshape(1,-1))[0][0]


+
 # we examine the n-grams reversely and any time we find a match, we "delete" that match, so that lesser ngrams will not be matched \
 # (e.g. is on a par with, would also match afterwords on a par with, par with, etc)

@@ -433,7 +450,7 @@ def multiword_verb_comptives(sentence):
     matched_ngrams = set()

     # Iterate through n-grams of sentence, starting with the largest n-grams
-    for n in range(5,
+    for n in range(5, 0, -1):
         for i in range(len(tokens)-n+1):
             ngram = ' '.join(tokens[i:i+n])

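The corrected loop header now steps n from 5 down to 1, which is what the "delete the match" comment above relies on. A self-contained sketch of that largest-first strategy, with a hypothetical phrase set standing in for the module's reference phrases:

    equal_phrases = {"is on a par with", "on a par with", "par with"}  # hypothetical

    def match_ngrams(sentence):
        tokens = sentence.split()
        matched_ngrams = set()
        used = set()                      # token positions already consumed
        for n in range(5, 0, -1):         # largest n-grams first
            for i in range(len(tokens) - n + 1):
                span = range(i, i + n)
                if any(pos in used for pos in span):
                    continue              # overlaps an earlier, longer match
                ngram = ' '.join(tokens[i:i + n])
                if ngram in equal_phrases:
                    matched_ngrams.add(ngram)
                    used.update(span)     # "delete" the match
        return matched_ngrams

    print(match_ngrams("this method is on a par with the baseline"))
    # {'is on a par with'} -- the shorter 'on a par with' and 'par with' are skipped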
@@ -522,23 +539,41 @@ def identify_comparatives(sentence):
     # return all the patterns that were captured
     comparatives = straight_comptives + bigger_smaller_comparatives + equal_to_comparatives + single_verb + multi_verb

-    # since those different techniques might capture similar patterns, we keep only unique references. More precisely
-
+    # since those different techniques might capture similar patterns, we keep only unique references. More precisely
+    # we discard any unique reference while also any reference thay may exist as a substring on any other reference

-
-
-
+    # sort the list by length of the comparatives, in descending order
+    comparatives.sort(key=lambda item: len(item['comparative'][0]), reverse=False)
+
+    unique_comparatives = {}
+    for i, item in enumerate(comparatives):
+        comparative = item['comparative'][0]
+        # check if the comparative is already in the dictionary or a substring/similar string of an existing comparative
+        is_unique = True
+        for existing_comp in unique_comparatives:
+            if (comparative in existing_comp) or (existing_comp in comparative):
+                is_unique = False
+                break
+        if is_unique:
+            unique_comparatives[comparative] = item
+        elif i == len(comparatives) - 1:
+            # if it's the last item and it's not unique, replace the first unique item in the list with this item
+            for j, existing_item in enumerate(unique_comparatives.values()):
+                if (existing_item['comparative'][0] in comparative) or (comparative in existing_item['comparative'][0]):
+                    unique_comparatives.pop(list(unique_comparatives.keys())[j])
+                    unique_comparatives[comparative] = item
+                    break

     unique_output = list(unique_comparatives.values())

     return unique_output


-
 def comparatives_binding(sentence):

     try:
         comparative_symbols = find_comptives_symbols(sentence)
+
         comparative_mentions = identify_comparatives(sentence)

         # starting with the symbols, if one was captured
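One observation on the new block: the comment announces a descending sort, but reverse=False sorts ascending, so shorter phrases are kept first and longer supersets get rejected; the elif branch then patches this up only for the final item. A minimal sketch of the same substring de-duplication done with a genuinely longest-first sort (the sample data is hypothetical):

    comparatives = [
        {'comparative': ['more than', '>']},
        {'comparative': ['is more than', '>']},
        {'comparative': ['equal to', '=']},
    ]

    # longest first, so any shorter phrase contained in a kept one is dropped
    comparatives.sort(key=lambda item: len(item['comparative'][0]), reverse=True)

    unique_comparatives = {}
    for item in comparatives:
        phrase = item['comparative'][0]
        if not any(phrase in kept or kept in phrase for kept in unique_comparatives):
            unique_comparatives[phrase] = item

    print(list(unique_comparatives.values()))
    # [{'comparative': ['is more than', '>']}, {'comparative': ['equal to', '=']}]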