wilmerags committed on
Commit
4d8d3df
·
1 Parent(s): 0d13483

fix: Add stopwords removal

Browse files
Files changed (1) hide show
  1. app.py +10 -1
app.py CHANGED
@@ -4,10 +4,13 @@ import string
4
  import re
5
  import tweepy
6
  import hdbscan
 
7
  import numpy as np
8
  import streamlit as st
9
 
 
10
 
 
11
  from gensim.utils import deaccent
12
  from bokeh.models import ColumnDataSource, HoverTool, Label
13
  from bokeh.palettes import Colorblind as Pallete
@@ -46,10 +49,16 @@ def _remove_punctuation(txt_list: List[str]):
46
  txt_list = [tweet.split(' ') for tweet in txt_list]
47
  return [' '.join([word.translate(str.maketrans('', '', punctuation)) for word in tweet]) for tweet in txt_list]
48
 
 
 
 
 
 
49
  preprocess_pipeline = [
50
  _remove_unk_chars,
51
  _remove_urls,
52
- _remove_punctuation
 
53
  ]
54
 
55
  def preprocess(txt_list: str):
 
4
  import re
5
  import tweepy
6
  import hdbscan
7
+ import nltk
8
  import numpy as np
9
  import streamlit as st
10
 
11
# Fetch only the stopword corpus used by the stopword-removal step. A bare
# nltk.download() opens NLTK's interactive downloader, which would block a
# headless app while the module is being imported.
nltk.download('stopwords')
12
 
13
+ from nltk.corpus import stopwords
14
  from gensim.utils import deaccent
15
  from bokeh.models import ColumnDataSource, HoverTool, Label
16
  from bokeh.palettes import Colorblind as Pallete
 
49
  txt_list = [tweet.split(' ') for tweet in txt_list]
50
  return [' '.join([word.translate(str.maketrans('', '', punctuation)) for word in tweet]) for tweet in txt_list]
51
 
52
def _remove_stopwords(txt_list: List[str]):
    """Drop English stopwords from every text in ``txt_list``.

    Each text is split on single spaces, tokens found in NLTK's English
    stopword list are discarded, and the surviving tokens are re-joined
    with single spaces.

    Args:
        txt_list: Texts (e.g. tweets) to filter.

    Returns:
        A new list holding one filtered string per input text.
    """
    # Build a set once: membership tests are O(1), whereas the corpus list
    # would be scanned linearly for every token.
    english_stopwords = set(stopwords.words('english'))
    # NOTE(review): matching is case-sensitive; confirm texts are lowercased
    # upstream if capitalized stopwords (e.g. "The") should be removed too.
    return [
        ' '.join(word for word in tweet.split(' ') if word not in english_stopwords)
        for tweet in txt_list
    ]
56
+
57
  preprocess_pipeline = [
58
  _remove_unk_chars,
59
  _remove_urls,
60
+ _remove_punctuation,
61
+ _remove_stopwords,
62
  ]
63
 
64
  def preprocess(txt_list: str):