ikarasz committed on
Commit
c435265
·
1 Parent(s): a643a4a

check if word is plural

Browse files
Files changed (1) hide show
  1. utils.py +18 -3
utils.py CHANGED
@@ -1,4 +1,7 @@
1
  import torch
 
 
 
2
  from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
3
  from torch import nn
4
  from itertools import chain
@@ -9,6 +12,9 @@ import re
9
  import string
10
  import inflect
11
 
 
 
 
12
  punct_chars = list((set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}))
13
  punct_chars.sort()
14
  punctuation = ''.join(punct_chars)
@@ -825,6 +831,15 @@ PLURAL_TO_SINGULAR_EXCLUSIONS = [
825
 
826
  p = inflect.engine()
827
 
 
 
 
 
 
 
 
 
 
828
  def singular_to_plural(word):
829
  """Convert singular words to plural using inflect."""
830
  plural = p.plural(word)
@@ -832,9 +847,9 @@ def singular_to_plural(word):
832
 
833
  def plural_to_singular(word):
834
  """Convert plural word to singular using inflect."""
835
- if word in PLURAL_TO_SINGULAR_EXCLUSIONS:
836
- return word
837
- return p.singular_noun(word) or word
838
 
839
  plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
840
 
 
1
  import torch
2
+ import nltk
3
+ from nltk import pos_tag
4
+ from nltk.tokenize import word_tokenize
5
  from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
6
  from torch import nn
7
  from itertools import chain
 
12
  import string
13
  import inflect
14
 
15
+ nltk.download("averaged_perceptron_tagger")
16
+ nltk.download("punkt")
17
+
18
  punct_chars = list((set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}))
19
  punct_chars.sort()
20
  punctuation = ''.join(punct_chars)
 
831
 
832
  p = inflect.engine()
833
 
834
def is_plural(word):
    """Return True when NLTK tags ``word`` as a plural noun.

    The input is split with ``word_tokenize`` and only the first resulting
    token is inspected: its part-of-speech tag from ``pos_tag`` must be
    ``NNS`` or ``NNPS`` (the Penn Treebank plural-noun tags).

    Args:
        word: The text to classify. Expected to be a single word; if it
            tokenizes into several tokens, only the first one decides the
            result.

    Returns:
        bool: True if the first token is tagged ``NNS`` or ``NNPS``;
        False otherwise, including when the input yields no tokens.
    """
    # NLTK's tagger operates on a token list, so tokenize first.
    tokens = word_tokenize(word)
    # Empty or whitespace-only input produces no tokens; indexing the tagger
    # output would raise IndexError, so treat that case as "not plural".
    if not tokens:
        return False
    first_tag = pos_tag(tokens)[0][1]
    return first_tag in ("NNS", "NNPS")
842
+
843
  def singular_to_plural(word):
844
  """Convert singular words to plural using inflect."""
845
  plural = p.plural(word)
 
847
 
848
  def plural_to_singular(word):
849
  """Convert plural word to singular using inflect."""
850
+ if is_plural(word):
851
+ return p.singular_noun(word) or word
852
+ return word
853
 
854
  plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
855