ValadisCERTH committed on
Commit
8f7c535
·
1 Parent(s): 44e368b

Update helper.py

Browse files
Files changed (1) hide show
  1. helper.py +62 -27
helper.py CHANGED
@@ -65,9 +65,6 @@ def find_comptives_straight_patterns(sentence):
65
  if next_token.text.lower() == "than":
66
  prev_token = token.nbor(-1)
67
 
68
- # this part is to check what will be before more/less. We can add a NOUN as mandatory (e.g magnitude) or even specifically the word magnitude
69
- # for the moment we have disable it
70
-
71
  if token.text.lower() == 'more':
72
  comparatives.append({'comparative': [token.text+" "+next_token.text, '>']})
73
  elif token.text.lower() == 'less':
@@ -283,6 +280,7 @@ def identify_bigger_smaller_advanced(sentence):
283
  return bigger_list + smaller_list
284
 
285
 
 
286
  def find_equal_to_comptives_ngrams(sentence):
287
  """
288
  This function takes a sentence as input and returns a reference phrase based on semantic similarity using n-grams.
@@ -347,23 +345,41 @@ def single_verb_comptives(sentence):
347
 
348
  # search for all verbs and examine their lemma with all the synonyms of each of the previous references. Assign a label accordingly
349
  for token in doc:
350
- if token.pos_ == "VERB":
351
-
352
- for lemma in token.lemma_.split('|'):
353
- synsets = wordnet.synsets(lemma, pos='v')
354
-
355
- for syn in synsets:
356
- if any(lemma in bigger_references_sg for lemma in syn.lemma_names()):
357
- bigger_list.append({'comparative': [token.text, ">"]})
358
- break
359
-
360
- elif any(lemma in lesser_references_sg for lemma in syn.lemma_names()):
361
- smaller_list.append({'comparative': [token.text, "<"]})
362
- break
363
-
364
- elif any(lemma in equal_references_sg for lemma in syn.lemma_names()):
365
- equal_list.append({'comparative': [token.text, "="]})
366
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
 
368
  final_list = bigger_list + smaller_list + equal_list
369
 
@@ -406,6 +422,7 @@ def cosine_sim(a, b):
406
  return cosine_similarity(a.reshape(1,-1), b.reshape(1,-1))[0][0]
407
 
408
 
 
409
  # we examine the n-grams in reverse order (largest first) and any time we find a match, we "delete" that match, so that smaller n-grams will not be matched \
410
  # (e.g. is on a par with, would also match afterwards on a par with, par with, etc)
411
 
@@ -433,7 +450,7 @@ def multiword_verb_comptives(sentence):
433
  matched_ngrams = set()
434
 
435
  # Iterate through n-grams of sentence, starting with the largest n-grams
436
- for n in range(5, 1, -1):
437
  for i in range(len(tokens)-n+1):
438
  ngram = ' '.join(tokens[i:i+n])
439
 
@@ -522,23 +539,41 @@ def identify_comparatives(sentence):
522
  # return all the patterns that were captured
523
  comparatives = straight_comptives + bigger_smaller_comparatives + equal_to_comparatives + single_verb + multi_verb
524
 
525
- # since those different techniques might capture similar patterns, we keep only unique references. More precisely
526
- unique_comparatives = {}
527
 
528
- for item in comparatives:
529
- if item['comparative'][0] not in unique_comparatives:
530
- unique_comparatives[item['comparative'][0]] = item
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
531
 
532
  unique_output = list(unique_comparatives.values())
533
 
534
  return unique_output
535
 
536
 
537
-
538
  def comparatives_binding(sentence):
539
 
540
  try:
541
  comparative_symbols = find_comptives_symbols(sentence)
 
542
  comparative_mentions = identify_comparatives(sentence)
543
 
544
  # starting with the symbols, if one was captured
 
65
  if next_token.text.lower() == "than":
66
  prev_token = token.nbor(-1)
67
 
 
 
 
68
  if token.text.lower() == 'more':
69
  comparatives.append({'comparative': [token.text+" "+next_token.text, '>']})
70
  elif token.text.lower() == 'less':
 
280
  return bigger_list + smaller_list
281
 
282
 
283
+
284
  def find_equal_to_comptives_ngrams(sentence):
285
  """
286
  This function takes a sentence as input and returns a reference phrase based on semantic similarity using n-grams.
 
345
 
346
  # search for all verbs and examine their lemma with all the synonyms of each of the previous references. Assign a label accordingly
347
  for token in doc:
348
+
349
+ # first check for a direct match of the token text or its lemma against the reference lists
350
+ if token.text in bigger_references_sg or token.lemma_ in bigger_references_sg:
351
+ bigger_list.append({'comparative': [token.text, ">"]})
352
+ break
353
+
354
+ elif token.text in lesser_references_sg or token.lemma_ in lesser_references_sg:
355
+ smaller_list.append({'comparative': [token.text, "<"]})
356
+ break
357
+
358
+ elif token.text in equal_references_sg or token.lemma_ in equal_references_sg:
359
+ equal_list.append({'comparative': [token.text, "="]})
360
+ break
361
+
362
+ else:
363
+
364
+ # if not, then try with synonyms only for verbs
365
+ if token.pos_ == "VERB":
366
+
367
+ for lemma in token.lemma_.split('|'):
368
+ synsets = wordnet.synsets(lemma, pos='v')
369
+
370
+ for syn in synsets:
371
+ if any(lemma in bigger_references_sg for lemma in syn.lemma_names()):
372
+ bigger_list.append({'comparative': [token.text, ">"]})
373
+ break
374
+
375
+ elif any(lemma in lesser_references_sg for lemma in syn.lemma_names()):
376
+ smaller_list.append({'comparative': [token.text, "<"]})
377
+ break
378
+
379
+ elif any(lemma in equal_references_sg for lemma in syn.lemma_names()):
380
+ equal_list.append({'comparative': [token.text, "="]})
381
+ break
382
+
383
 
384
  final_list = bigger_list + smaller_list + equal_list
385
 
 
422
  return cosine_similarity(a.reshape(1,-1), b.reshape(1,-1))[0][0]
423
 
424
 
425
+
426
  # we examine the n-grams in reverse order (largest first) and any time we find a match, we "delete" that match, so that smaller n-grams will not be matched \
427
  # (e.g. is on a par with, would also match afterwards on a par with, par with, etc)
428
 
 
450
  matched_ngrams = set()
451
 
452
  # Iterate through n-grams of sentence, starting with the largest n-grams
453
+ for n in range(5, 0, -1):
454
  for i in range(len(tokens)-n+1):
455
  ngram = ' '.join(tokens[i:i+n])
456
 
 
539
  # return all the patterns that were captured
540
  comparatives = straight_comptives + bigger_smaller_comparatives + equal_to_comparatives + single_verb + multi_verb
541
 
542
+ # since those different techniques might capture similar patterns, we keep only unique references. More precisely
543
+ # we discard any duplicate reference, as well as any reference that appears as a substring of any other reference
544
 
545
+ # sort the list by length of the comparatives, in ascending order (reverse=False puts the shortest first)
546
+ comparatives.sort(key=lambda item: len(item['comparative'][0]), reverse=False)
547
+
548
+ unique_comparatives = {}
549
+ for i, item in enumerate(comparatives):
550
+ comparative = item['comparative'][0]
551
+ # check if the comparative is already in the dictionary or a substring/similar string of an existing comparative
552
+ is_unique = True
553
+ for existing_comp in unique_comparatives:
554
+ if (comparative in existing_comp) or (existing_comp in comparative):
555
+ is_unique = False
556
+ break
557
+ if is_unique:
558
+ unique_comparatives[comparative] = item
559
+ elif i == len(comparatives) - 1:
560
+ # if it's the last item and it's not unique, replace the first overlapping existing item in the dictionary with this item
561
+ for j, existing_item in enumerate(unique_comparatives.values()):
562
+ if (existing_item['comparative'][0] in comparative) or (comparative in existing_item['comparative'][0]):
563
+ unique_comparatives.pop(list(unique_comparatives.keys())[j])
564
+ unique_comparatives[comparative] = item
565
+ break
566
 
567
  unique_output = list(unique_comparatives.values())
568
 
569
  return unique_output
570
 
571
 
 
572
  def comparatives_binding(sentence):
573
 
574
  try:
575
  comparative_symbols = find_comptives_symbols(sentence)
576
+
577
  comparative_mentions = identify_comparatives(sentence)
578
 
579
  # starting with the symbols, if one was captured