saritha5 committed on
Commit
8cd729a
1 Parent(s): 1094158

Update clean_data.py

Browse files
Files changed (1) hide show
  1. clean_data.py +2 -0
clean_data.py CHANGED
@@ -37,6 +37,7 @@ def cleaned_complaints(text):
37
  import nltk
38
  from nltk.corpus import stopwords
39
  from nltk.stem import WordNetLemmatizer
 
40
  import warnings
41
  import re
42
 
@@ -76,6 +77,7 @@ def cleaned_complaints(text):
76
  letters_only = re.sub("[^a-zA-Z]", " ", newString) #Fetching out only letters
77
  lower_case = letters_only.lower() #converting all words to lowercase
78
  tokens = [w for w in lower_case.split() if not w in stop_words]#stopwords removal
 
79
  # tokens= lower_case.split()
80
  newString=''
81
  for i in tokens:
 
37
  import nltk
38
  from nltk.corpus import stopwords
39
  from nltk.stem import WordNetLemmatizer
40
+ from nltk.corpus import words
41
  import warnings
42
  import re
43
 
 
77
  letters_only = re.sub("[^a-zA-Z]", " ", newString) #Fetching out only letters
78
  lower_case = letters_only.lower() #converting all words to lowercase
79
  tokens = [w for w in lower_case.split() if not w in stop_words]#stopwords removal
80
+ tokens = [x for x in tokens if x in words.words()]
81
  # tokens= lower_case.split()
82
  newString=''
83
  for i in tokens: