check-plurality (#5)
Browse files- check if word is plural (c43526515b87b34635528399a590940ce146da01)
- add missing resource (aa6c0b383f2970f9d6150ec7069342f9f5d46a36)
- add wordnet check for irregular plurals (f0b228b5a95e3d2aeb883af6f78edf58edbc2a57)
- add extra regex check to detect plurals (81d16871e38117d64ad905fb3a4133ddbd27f90f)
- put back plural exclusions (42608082e9a377f9ba190ee1be8e9fef9864b7cd)
utils.py
CHANGED
@@ -1,4 +1,8 @@
|
|
1 |
import torch
|
|
|
|
|
|
|
|
|
2 |
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
|
3 |
from torch import nn
|
4 |
from itertools import chain
|
@@ -9,6 +13,13 @@ import re
|
|
9 |
import string
|
10 |
import inflect
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
punct_chars = list((set(string.punctuation) | {'β', 'β', 'β', 'β', '~', '|', 'β', 'β', 'β¦', "'", "`", '_'}))
|
13 |
punct_chars.sort()
|
14 |
punctuation = ''.join(punct_chars)
|
@@ -825,6 +836,32 @@ PLURAL_TO_SINGULAR_EXCLUSIONS = [
|
|
825 |
|
826 |
p = inflect.engine()
|
827 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
828 |
def singular_to_plural(word):
|
829 |
"""Convert singular words to plural using inflect."""
|
830 |
plural = p.plural(word)
|
@@ -832,9 +869,9 @@ def singular_to_plural(word):
|
|
832 |
|
833 |
def plural_to_singular(word):
|
834 |
"""Convert plural word to singular using inflect."""
|
835 |
-
if word
|
836 |
-
return word
|
837 |
-
return
|
838 |
|
839 |
plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
|
840 |
|
|
|
1 |
import torch
|
2 |
+
import nltk
|
3 |
+
from nltk import pos_tag
|
4 |
+
from nltk.tokenize import word_tokenize
|
5 |
+
from nltk.corpus import wordnet
|
6 |
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
|
7 |
from torch import nn
|
8 |
from itertools import chain
|
|
|
13 |
import string
|
14 |
import inflect
|
15 |
|
16 |
+
# NOTE(review): this and the following nltk.download(...) calls execute at
# module import time, so merely importing this module invokes them.
# Presumably they fetch the resources needed by word_tokenize, pos_tag and
# wordnet, which this module imports from nltk -- confirm.
nltk.download('punkt')
|
17 |
+
nltk.download('punkt_tab')
|
18 |
+
nltk.download('averaged_perceptron_tagger')
|
19 |
+
nltk.download('averaged_perceptron_tagger_eng')
|
20 |
+
nltk.download('wordnet')
|
21 |
+
# nltk.download('omw-1.4')
|
22 |
+
|
23 |
punct_chars = list((set(string.punctuation) | {'β', 'β', 'β', 'β', '~', '|', 'β', 'β', 'β¦', "'", "`", '_'}))
|
24 |
# Sort in place so the joined `punctuation` string has a deterministic order;
# the characters were collected from a set, whose iteration order is not
# guaranteed.
punct_chars.sort()
|
25 |
punctuation = ''.join(punct_chars)
|
|
|
836 |
|
837 |
# Module-level inflect engine shared by singular_to_plural() and
# plural_to_singular(), which call p.plural() and p.singular_noun().
p = inflect.engine()
|
838 |
|
839 |
+
def is_plural_regex(word):
    """Heuristically detect a plural from the word's spelling.

    A word is treated as plural when, ignoring case, it ends in ``s`` but
    not in ``ss`` (so "class" and "glass" are rejected). This is purely a
    suffix heuristic: singular words that happen to end in a single ``s``
    (e.g. "bus") also match.

    Args:
        word: The word to inspect.

    Returns:
        bool: True if the suffix heuristic matches, False otherwise.
    """
    lowered = word.lower()
    # A bare trailing "s" already covers the "es" and "ies" endings, so a
    # single pattern is sufficient for the positive check.
    ends_in_s = re.search(r's$', lowered) is not None
    ends_in_double_s = re.search(r'ss$', lowered) is not None
    # Return a real bool rather than leaking a re.Match / None to callers.
    return ends_in_s and not ends_in_double_s
|
843 |
+
|
844 |
+
def is_plural_wordnet(word):
    """Guess plurality by comparing WordNet noun synset counts.

    The word is compared against itself with one trailing ``s`` removed:
    it is reported as plural when the shortened form has strictly more
    noun synsets in WordNet than the word as given.

    Args:
        word: The word to inspect.

    Returns:
        bool: True if the shortened form has more noun synsets than
        ``word``; False otherwise, including when ``word`` does not end
        in a lowercase ``s``.
    """
    if not word.endswith('s'):
        # Nothing to strip: both lookups would be for the same string, so
        # the strictly-greater comparison below could never hold.
        return False
    # Remove exactly one trailing "s". str.rstrip('s') would strip *every*
    # trailing "s" (e.g. "class" -> "cla"), which is not a plural suffix.
    stem = word[:-1]
    word_synsets = wordnet.synsets(word, pos=wordnet.NOUN)
    stem_synsets = wordnet.synsets(stem, pos=wordnet.NOUN)
    return len(stem_synsets) > len(word_synsets)
|
849 |
+
|
850 |
+
def is_plural_pos(word):
    """Determine if a word is plural using NLTK's part-of-speech tagging.

    The input is tokenized and only the first resulting token's tag is
    examined.

    Args:
        word: The word to inspect.

    Returns:
        bool: True if the first token is tagged ``NNS`` or ``NNPS`` (the
        plural noun tags checked by this function); False otherwise,
        including when tokenization yields no tokens.
    """
    # The tagger operates on a token list, so tokenize the input first.
    tokens = word_tokenize(word)
    if not tokens:
        # Empty / whitespace-only input produces no tokens; indexing the
        # tagger output would raise IndexError, so treat it as not plural.
        return False
    _, tag = pos_tag(tokens)[0]
    return tag in ("NNS", "NNPS")
|
858 |
+
|
859 |
+
def is_plural(word):
    """Check if a word is plural.

    Words listed in ``PLURAL_TO_SINGULAR_EXCLUSIONS`` are never considered
    plural. Otherwise the regex, part-of-speech and WordNet checks are
    tried in that order, stopping at the first one that succeeds.

    Args:
        word: The word to inspect.

    Returns:
        bool: True if any of the checks reports the word as plural.
    """
    if word in PLURAL_TO_SINGULAR_EXCLUSIONS:
        return False
    # any() over a generator keeps the original evaluation order and
    # short-circuiting, while guaranteeing a bool result instead of whatever
    # truthy object an individual check happens to return.
    checks = (is_plural_regex, is_plural_pos, is_plural_wordnet)
    return any(check(word) for check in checks)
|
864 |
+
|
865 |
def singular_to_plural(word):
|
866 |
"""Convert singular words to plural using inflect."""
|
867 |
plural = p.plural(word)
|
|
|
869 |
|
870 |
def plural_to_singular(word):
    """Return the singular form of ``word`` when it is detected as plural.

    Words that ``is_plural`` does not report as plural are returned
    unchanged. For plural words the inflect engine is asked for the
    singular noun; if it gives back a falsy result, the original word is
    returned instead.
    """
    if not is_plural(word):
        return word
    singular = p.singular_noun(word)
    return singular if singular else word
|
875 |
|
876 |
plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
|
877 |
|