Spaces:
Build error
Build error
fix: Add stopwords removal
Browse files
app.py
CHANGED
@@ -4,10 +4,13 @@ import string
|
|
4 |
import re
|
5 |
import tweepy
|
6 |
import hdbscan
|
|
|
7 |
import numpy as np
|
8 |
import streamlit as st
|
9 |
|
|
|
10 |
|
|
|
11 |
from gensim.utils import deaccent
|
12 |
from bokeh.models import ColumnDataSource, HoverTool, Label
|
13 |
from bokeh.palettes import Colorblind as Pallete
|
@@ -46,10 +49,16 @@ def _remove_punctuation(txt_list: List[str]):
|
|
46 |
txt_list = [tweet.split(' ') for tweet in txt_list]
|
47 |
return [' '.join([word.translate(str.maketrans('', '', punctuation)) for word in tweet]) for tweet in txt_list]
|
48 |
|
|
|
|
|
|
|
|
|
|
|
49 |
preprocess_pipeline = [
|
50 |
_remove_unk_chars,
|
51 |
_remove_urls,
|
52 |
-
_remove_punctuation
|
|
|
53 |
]
|
54 |
|
55 |
def preprocess(txt_list: str):
|
|
|
4 |
import re
|
5 |
import tweepy
|
6 |
import hdbscan
|
7 |
+
import nltk
|
8 |
import numpy as np
|
9 |
import streamlit as st
|
10 |
|
11 |
+
# Fetch only the stopwords corpus used below. A bare nltk.download() starts
# NLTK's interactive downloader, which cannot be driven in a headless build.
nltk.download('stopwords')
|
12 |
|
13 |
+
from nltk.corpus import stopwords
|
14 |
from gensim.utils import deaccent
|
15 |
from bokeh.models import ColumnDataSource, HoverTool, Label
|
16 |
from bokeh.palettes import Colorblind as Pallete
|
|
|
49 |
txt_list = [tweet.split(' ') for tweet in txt_list]
|
50 |
return [' '.join([word.translate(str.maketrans('', '', punctuation)) for word in tweet]) for tweet in txt_list]
|
51 |
|
52 |
+
def _remove_stopwords(txt_list: List[str]):
    """Drop English stopwords from every tweet.

    Each tweet is split on single spaces, words found in NLTK's English
    stopword list are discarded, and the remaining words are re-joined
    with single spaces.

    Args:
        txt_list: Tweets to filter, one string per tweet.

    Returns:
        A new list with one filtered string per input tweet, in input order.
    """
    # Build a set once per call: membership tests against the raw corpus
    # list would rescan it for every word of every tweet.
    stopword_set = set(stopwords.words('english'))
    return [
        ' '.join(
            word
            for word in tweet.split(' ')
            # The corpus entries are lowercase, so compare case-insensitively
            # to also catch capitalised stopwords such as "The".
            if word.lower() not in stopword_set
        )
        for tweet in txt_list
    ]
|
56 |
+
|
57 |
# Text-cleaning stages, in list order.
# NOTE(review): presumably preprocess() applies these in sequence — confirm.
preprocess_pipeline = [
    _remove_unk_chars,
    _remove_urls,
    _remove_punctuation,
    _remove_stopwords,
]
|
63 |
|
64 |
def preprocess(txt_list: str):
|