ikarasz committed on
Commit
5bfe606
·
verified ·
1 Parent(s): a643a4a

check-plurality (#5)

Browse files

- check if word is plural (c43526515b87b34635528399a590940ce146da01)
- add missing resource (aa6c0b383f2970f9d6150ec7069342f9f5d46a36)
- add wordnet check for irregular plurals (f0b228b5a95e3d2aeb883af6f78edf58edbc2a57)
- add extra regex check to detect plurals (81d16871e38117d64ad905fb3a4133ddbd27f90f)
- put back plural exclusions (42608082e9a377f9ba190ee1be8e9fef9864b7cd)

Files changed (1) hide show
  1. utils.py +40 -3
utils.py CHANGED
@@ -1,4 +1,8 @@
1
  import torch
 
 
 
 
2
  from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
3
  from torch import nn
4
  from itertools import chain
@@ -9,6 +13,13 @@ import re
9
  import string
10
  import inflect
11
 
 
 
 
 
 
 
 
12
  punct_chars = list((set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}))
13
  punct_chars.sort()
14
  punctuation = ''.join(punct_chars)
@@ -825,6 +836,32 @@ PLURAL_TO_SINGULAR_EXCLUSIONS = [
825
 
826
  p = inflect.engine()
827
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
828
  def singular_to_plural(word):
829
  """Convert singular words to plural using inflect."""
830
  plural = p.plural(word)
@@ -832,9 +869,9 @@ def singular_to_plural(word):
832
 
833
  def plural_to_singular(word):
834
  """Convert plural word to singular using inflect."""
835
- if word in PLURAL_TO_SINGULAR_EXCLUSIONS:
836
- return word
837
- return p.singular_noun(word) or word
838
 
839
  plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
840
 
 
1
  import torch
2
+ import nltk
3
+ from nltk import pos_tag
4
+ from nltk.tokenize import word_tokenize
5
+ from nltk.corpus import wordnet
6
  from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
7
  from torch import nn
8
  from itertools import chain
 
13
  import string
14
  import inflect
15
 
16
+ nltk.download('punkt')
17
+ nltk.download('punkt_tab')
18
+ nltk.download('averaged_perceptron_tagger')
19
+ nltk.download('averaged_perceptron_tagger_eng')
20
+ nltk.download('wordnet')
21
+ # nltk.download('omw-1.4')
22
+
23
  punct_chars = list((set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}))
24
  punct_chars.sort()
25
  punctuation = ''.join(punct_chars)
 
836
 
837
  p = inflect.engine()
838
 
839
def is_plural_regex(word):
    """Heuristically detect a plural from the ending of *word*.

    A word is treated as plural when its lower-cased form ends in ``s``
    (this also covers the ``es`` and ``ies`` endings) but not in ``ss``.

    Args:
        word: The word to inspect.

    Returns:
        bool: ``True`` if the ending looks plural, ``False`` otherwise
        (including for the empty string).
    """
    lowered = word.lower()
    # A single negative look-behind expresses "ends in 's' but not in 'ss'",
    # replacing the two separate searches. Comparing against None makes the
    # function return a real bool instead of a Match object / None.
    return re.search(r'(?<!s)s$', lowered) is not None
843
+
844
def is_plural_wordnet(word):
    """Detect a plural noun using WordNet's morphological analysis.

    The word is lower-cased and passed to ``wordnet.morphy`` with the noun
    part of speech. It is reported as plural when WordNet resolves it to a
    base form that differs from the word itself.

    The previous implementation compared synset counts for ``word`` and
    ``word.rstrip('s')``. For any word that does not end in "s" the two
    lookups were identical, so the check could never fire for irregular
    plurals. ``rstrip('s')`` also removes *every* trailing "s", so words
    ending in "ss" were compared against an unrelated stem.

    Args:
        word: The word to inspect.

    Returns:
        bool: ``True`` if WordNet maps the word to a different noun base
        form, ``False`` if the word is its own base form or is unknown.
    """
    lowered = word.lower()
    # morphy returns None when it finds no noun entry for the word.
    base_form = wordnet.morphy(lowered, wordnet.NOUN)
    return base_form is not None and base_form != lowered
849
+
850
def is_plural_pos(word):
    """Determine if a word is plural using NLTK's part-of-speech tagging.

    The input is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``; only the tag of the first token is examined.

    Args:
        word: The word to inspect.

    Returns:
        bool: ``True`` if the first token is tagged ``NNS`` or ``NNPS``
        (the plural noun tags in the Penn Treebank tag set), ``False``
        otherwise, including when tokenization yields no tokens.
    """
    tokens = word_tokenize(word)
    if not tokens:
        # Empty or whitespace-only input produces no tokens; indexing the
        # tagger output would raise IndexError, so report "not plural".
        return False
    _, tag = pos_tag(tokens)[0]
    return tag in ("NNS", "NNPS")
858
+
859
def is_plural(word):
    """Check if a word is plural.

    Words listed in ``PLURAL_TO_SINGULAR_EXCLUSIONS`` are never considered
    plural. Otherwise the word is plural if any of the detectors says so;
    they are tried in order (suffix regex, POS tag, WordNet) and evaluation
    stops at the first positive answer.

    Args:
        word: The word to inspect.

    Returns:
        bool: ``True`` if the word is judged to be plural, else ``False``.
    """
    if word in PLURAL_TO_SINGULAR_EXCLUSIONS:
        return False
    # Normalise to a real bool so callers never receive a helper's raw
    # truthy/falsy object (e.g. a regex match) from the `or` chain.
    return bool(
        is_plural_regex(word)
        or is_plural_pos(word)
        or is_plural_wordnet(word)
    )
864
+
865
  def singular_to_plural(word):
866
  """Convert singular words to plural using inflect."""
867
  plural = p.plural(word)
 
869
 
870
def plural_to_singular(word):
    """Return the singular form of *word*, leaving non-plurals untouched.

    The word is converted with the inflect engine only when ``is_plural``
    accepts it. If the engine yields a falsy result, the original word is
    returned unchanged.
    """
    if not is_plural(word):
        return word
    singular = p.singular_noun(word)
    return singular or word
875
 
876
  plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
877