Spaces:

wilmerags
/

tweet-snest

Build error

wilmerags committed on Nov 24, 2021

Commit

4d8d3df

1 Parent(s): 0d13483

fix: Add stopwords removal

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,10 +4,13 @@ import string
 import re
 import tweepy
 import hdbscan
 import numpy as np
 import streamlit as st
 from gensim.utils import deaccent
 from bokeh.models import ColumnDataSource, HoverTool, Label
 from bokeh.palettes import Colorblind as Pallete
@@ -46,10 +49,16 @@ def _remove_punctuation(txt_list: List[str]):
     txt_list = [tweet.split(' ') for tweet in txt_list]
     return [' '.join([word.translate(str.maketrans('', '', punctuation)) for word in tweet]) for tweet in txt_list]
 preprocess_pipeline = [
     _remove_unk_chars,
     _remove_urls,
-    _remove_punctuation
 ]
 def preprocess(txt_list: str):

 import re
 import tweepy
 import hdbscan
+import nltk
 import numpy as np
 import streamlit as st
+# Fetch only the stopwords corpus. A bare nltk.download() opens NLTK's
+# interactive downloader, which cannot run in a headless build.
+nltk.download('stopwords')
+from nltk.corpus import stopwords
 from gensim.utils import deaccent
 from bokeh.models import ColumnDataSource, HoverTool, Label
 from bokeh.palettes import Colorblind as Pallete
     txt_list = [tweet.split(' ') for tweet in txt_list]
     return [' '.join([word.translate(str.maketrans('', '', punctuation)) for word in tweet]) for tweet in txt_list]
+def _remove_stopwords(txt_list: List[str]):
+    """Drop English stopwords from every tweet.
+
+    Each tweet is split on single spaces, words found in NLTK's English
+    stopword list are discarded, and the remaining words are re-joined
+    with single spaces.
+
+    Args:
+        txt_list: Tweets as plain strings.
+
+    Returns:
+        A new list with one filtered string per input tweet.
+    """
+    # A set gives constant-time membership tests for every word checked.
+    stopword_set = frozenset(stopwords.words('english'))
+    # NLTK's English list is lowercase, so compare on the lowercased word;
+    # this also drops capitalised stopwords such as "The".
+    return [
+        ' '.join(word for word in tweet.split(' ') if word.lower() not in stopword_set)
+        for tweet in txt_list
+    ]
 # Text-cleaning steps for tweets, listed in order.
 # NOTE(review): presumably preprocess() applies these in sequence, so
 # punctuation is stripped before stopwords are matched; confirm.
 preprocess_pipeline = [
     _remove_unk_chars,
     _remove_urls,
+    _remove_punctuation,
+    _remove_stopwords,
 ]
 def preprocess(txt_list: str):