add wordnet check for irregular plurals
Browse files
utils.py
CHANGED
@@ -2,6 +2,7 @@ import torch
|
|
2 |
import nltk
|
3 |
from nltk import pos_tag
|
4 |
from nltk.tokenize import word_tokenize
|
|
|
5 |
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
|
6 |
from torch import nn
|
7 |
from itertools import chain
|
@@ -16,7 +17,7 @@ nltk.download('punkt')
|
|
16 |
nltk.download('punkt_tab')
|
17 |
nltk.download('averaged_perceptron_tagger')
|
18 |
nltk.download('averaged_perceptron_tagger_eng')
|
19 |
-
|
20 |
# nltk.download('omw-1.4')
|
21 |
|
22 |
punct_chars = list((set(string.punctuation) | {'β', 'β', 'β', 'β', '~', '|', 'β', 'β', 'β¦', "'", "`", '_'}))
|
@@ -835,6 +836,12 @@ PLURAL_TO_SINGULAR_EXCLUSIONS = [
|
|
835 |
|
836 |
p = inflect.engine()
|
837 |
|
|
|
|
|
|
|
|
|
|
|
|
|
838 |
def is_plural(word):
|
839 |
"""Determine if a word is plural using NLTK's part-of-speech tagging."""
|
840 |
# Tokenize the input word (necessary for NLTK tagging)
|
@@ -853,6 +860,8 @@ def plural_to_singular(word):
|
|
853 |
"""Convert plural word to singular using inflect."""
|
854 |
if is_plural(word):
|
855 |
return p.singular_noun(word) or word
|
|
|
|
|
856 |
return word
|
857 |
|
858 |
plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
|
|
|
2 |
import nltk
|
3 |
from nltk import pos_tag
|
4 |
from nltk.tokenize import word_tokenize
|
5 |
+
from nltk.corpus import wordnet
|
6 |
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
|
7 |
from torch import nn
|
8 |
from itertools import chain
|
|
|
17 |
nltk.download('punkt_tab')
|
18 |
nltk.download('averaged_perceptron_tagger')
|
19 |
nltk.download('averaged_perceptron_tagger_eng')
|
20 |
+
nltk.download('wordnet')
|
21 |
# nltk.download('omw-1.4')
|
22 |
|
23 |
punct_chars = list((set(string.punctuation) | {'β', 'β', 'β', 'β', '~', '|', 'β', 'β', 'β¦', "'", "`", '_'}))
|
|
|
836 |
|
837 |
p = inflect.engine()
|
838 |
|
839 |
+
def is_plural_wordnet(word):
    """Return True if WordNet maps ``word`` to a different noun lemma.

    The word is passed to WordNet's morphological analyser (``morphy``)
    restricted to nouns.  When the analyser yields a base form that differs
    from the input (e.g. an inflected form such as "children" resolving to
    "child"), the input is treated as a plural.  Words that are unknown to
    WordNet, or that are already their own lemma, are reported as not plural.

    Args:
        word: A single token to test.

    Returns:
        bool: True when WordNet resolves ``word`` to a noun lemma other than
        ``word`` itself, False otherwise (including for empty input).
    """
    if not word:
        return False

    # Lower-case first so capitalised input ("Children") is looked up the
    # same way as its lower-case form.
    normalized = word.lower()

    # Stripping a trailing "s" by hand cannot detect irregular plurals and
    # mangles words such as "class"; let WordNet do the lemmatisation.
    lemma = wordnet.morphy(normalized, wordnet.NOUN)
    return lemma is not None and lemma != normalized
|
844 |
+
|
845 |
def is_plural(word):
|
846 |
"""Determine if a word is plural using NLTK's part-of-speech tagging."""
|
847 |
# Tokenize the input word (necessary for NLTK tagging)
|
|
|
860 |
"""Convert plural word to singular using inflect."""
|
861 |
if is_plural(word):
|
862 |
return p.singular_noun(word) or word
|
863 |
+
if is_plural_wordnet(word):
|
864 |
+
return p.singular_noun(word) or word
|
865 |
return word
|
866 |
|
867 |
plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
|