ikarasz committed on
Commit
f0b228b
·
1 Parent(s): aa6c0b3

add wordnet check for irregular plurals

Browse files
Files changed (1) hide show
  1. utils.py +10 -1
utils.py CHANGED
@@ -2,6 +2,7 @@ import torch
2
  import nltk
3
  from nltk import pos_tag
4
  from nltk.tokenize import word_tokenize
 
5
  from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
6
  from torch import nn
7
  from itertools import chain
@@ -16,7 +17,7 @@ nltk.download('punkt')
16
  nltk.download('punkt_tab')
17
  nltk.download('averaged_perceptron_tagger')
18
  nltk.download('averaged_perceptron_tagger_eng')
19
- # nltk.download('wordnet')
20
  # nltk.download('omw-1.4')
21
 
22
  punct_chars = list((set(string.punctuation) | {'’', 'β€˜', '–', 'β€”', '~', '|', 'β€œ', '”', '…', "'", "`", '_'}))
@@ -835,6 +836,12 @@ PLURAL_TO_SINGULAR_EXCLUSIONS = [
835
 
836
  p = inflect.engine()
837
 
 
 
 
 
 
 
838
  def is_plural(word):
839
  """Determine if a word is plural using NLTK's part-of-speech tagging."""
840
  # Tokenize the input word (necessary for NLTK tagging)
@@ -853,6 +860,8 @@ def plural_to_singular(word):
853
  """Convert plural word to singular using inflect."""
854
  if is_plural(word):
855
  return p.singular_noun(word) or word
 
 
856
  return word
857
 
858
  plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
 
2
  import nltk
3
  from nltk import pos_tag
4
  from nltk.tokenize import word_tokenize
5
+ from nltk.corpus import wordnet
6
  from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
7
  from torch import nn
8
  from itertools import chain
 
17
  nltk.download('punkt_tab')
18
  nltk.download('averaged_perceptron_tagger')
19
  nltk.download('averaged_perceptron_tagger_eng')
20
+ nltk.download('wordnet')
21
  # nltk.download('omw-1.4')
22
 
23
  punct_chars = list((set(string.punctuation) | {'’', 'β€˜', '–', 'β€”', '~', '|', 'β€œ', '”', '…', "'", "`", '_'}))
 
836
 
837
  p = inflect.engine()
838
 
839
def is_plural_wordnet(word):
    """Heuristically decide whether *word* is a plural noun using WordNet.

    The word is looked up as a noun twice: once as given and once with a
    single trailing 's' removed. It is reported as plural when the
    shortened form has strictly more noun synsets than the word itself.

    Args:
        word: The candidate word (a string).

    Returns:
        True if the form without the trailing 's' has more WordNet noun
        synsets than ``word``; False otherwise. Words that do not end in
        's' (including irregular plurals such as "children") always yield
        False, because both lookups would be for the same string.
    """
    # Without a trailing 's' the two lookups are identical and the strict
    # comparison can never succeed, so skip the WordNet queries entirely.
    if not word.endswith('s'):
        return False

    # Drop exactly one 's'. str.rstrip('s') would strip *every* trailing
    # 's' ("ass" -> "a", "pass" -> "pa") and compare against an unrelated
    # word, producing false positives.
    stem = word[:-1]

    word_synsets = wordnet.synsets(word, pos=wordnet.NOUN)
    stem_synsets = wordnet.synsets(stem, pos=wordnet.NOUN)
    return len(stem_synsets) > len(word_synsets)
844
+
845
  def is_plural(word):
846
  """Determine if a word is plural using NLTK's part-of-speech tagging."""
847
  # Tokenize the input word (necessary for NLTK tagging)
 
860
  """Convert plural word to singular using inflect."""
861
  if is_plural(word):
862
  return p.singular_noun(word) or word
863
+ if is_plural_wordnet(word):
864
+ return p.singular_noun(word) or word
865
  return word
866
 
867
  plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]