Update clean_data.py
Browse files
- clean_data.py (+2 -0)
clean_data.py
CHANGED
@@ -37,6 +37,7 @@ def cleaned_complaints(text):
|
|
37 |
import nltk
|
38 |
from nltk.corpus import stopwords
|
39 |
from nltk.stem import WordNetLemmatizer
|
|
|
40 |
import warnings
|
41 |
import re
|
42 |
|
@@ -76,6 +77,7 @@ def cleaned_complaints(text):
|
|
76 |
letters_only = re.sub("[^a-zA-Z]", " ", newString) #Fetching out only letters
|
77 |
lower_case = letters_only.lower() #converting all words to lowercase
|
78 |
tokens = [w for w in lower_case.split() if not w in stop_words]#stopwords removal
|
|
|
79 |
# tokens= lower_case.split()
|
80 |
newString=''
|
81 |
for i in tokens:
|
|
|
37 |
import nltk
|
38 |
from nltk.corpus import stopwords
|
39 |
from nltk.stem import WordNetLemmatizer
|
40 |
+
from nltk.corpus import words
|
41 |
import warnings
|
42 |
import re
|
43 |
|
|
|
77 |
letters_only = re.sub("[^a-zA-Z]", " ", newString) #Fetching out only letters
|
78 |
lower_case = letters_only.lower() #converting all words to lowercase
|
79 |
tokens = [w for w in lower_case.split() if not w in stop_words]#stopwords removal
|
80 |
+
tokens = [x for x in tokens if x in words.words()]
|
81 |
# tokens= lower_case.split()
|
82 |
newString=''
|
83 |
for i in tokens:
|