wilmerags committed on
Commit
8b81843
·
1 Parent(s): 69b5115

fix: Improve handling of stopwords comparison without nltk

Browse files
Files changed (1) hide show
  1. app.py +7 -6
app.py CHANGED
@@ -2,15 +2,13 @@ from typing import List
2
 
3
  import string
4
  import re
 
5
  import tweepy
6
  import hdbscan
7
- import nltk
8
  import numpy as np
9
  import streamlit as st
10
 
11
- nltk.download()
12
-
13
- from nltk.corpus import stopwords
14
  from gensim.utils import deaccent
15
  from bokeh.models import ColumnDataSource, HoverTool, Label
16
  from bokeh.palettes import Colorblind as Pallete
@@ -27,6 +25,10 @@ model_to_use = {
27
  "Use all the ones you know (~15 lang)": "paraphrase-multilingual-MiniLM-L12-v2"
28
  }
29
 
 
 
 
 
30
  def _remove_unk_chars(txt_list: List[str]):
31
  txt_list = [re.sub('\s+', ' ', tweet) for tweet in txt_list]
32
  txt_list = [re.sub("\'", "", tweet) for tweet in txt_list]
@@ -50,9 +52,8 @@ def _remove_punctuation(txt_list: List[str]):
50
  return [' '.join([word.translate(str.maketrans('', '', punctuation)) for word in tweet]) for tweet in txt_list]
51
 
52
  def _remove_stopwords(txt_list: List[str]):
53
- stopwords_str = stopwords.words('english')
54
  txt_list = [tweet.split(' ') for tweet in txt_list]
55
- return [' '.join([word for word in tweet if word not in stopwords_str]) for tweet in txt_list]
56
 
57
  preprocess_pipeline = [
58
  _remove_unk_chars,
 
2
 
3
  import string
4
  import re
5
+ import requests
6
  import tweepy
7
  import hdbscan
8
+
9
  import numpy as np
10
  import streamlit as st
11
 
 
 
 
12
  from gensim.utils import deaccent
13
  from bokeh.models import ColumnDataSource, HoverTool, Label
14
  from bokeh.palettes import Colorblind as Pallete
 
25
  "Use all the ones you know (~15 lang)": "paraphrase-multilingual-MiniLM-L12-v2"
26
  }
27
 
28
+
29
+ stopwords_list = requests.get("https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt").content
30
+ stopwords = set(stopwords_list.decode().splitlines())
31
+
32
  def _remove_unk_chars(txt_list: List[str]):
33
  txt_list = [re.sub('\s+', ' ', tweet) for tweet in txt_list]
34
  txt_list = [re.sub("\'", "", tweet) for tweet in txt_list]
 
52
  return [' '.join([word.translate(str.maketrans('', '', punctuation)) for word in tweet]) for tweet in txt_list]
53
 
54
  def _remove_stopwords(txt_list: List[str]):
 
55
  txt_list = [tweet.split(' ') for tweet in txt_list]
56
+ return [' '.join([word for word in tweet if word not in stopwords]) for tweet in txt_list]
57
 
58
  preprocess_pipeline = [
59
  _remove_unk_chars,