wilmerags committed on
Commit
8b81843
·
1 Parent(s): 69b5115

fix: Improve handling of stopwords comparison without nltk

Browse files
Files changed (1) hide show
  1. app.py +7 -6
app.py CHANGED
@@ -2,15 +2,13 @@ from typing import List
2
 
3
  import string
4
  import re
 
5
  import tweepy
6
  import hdbscan
7
- import nltk
8
  import numpy as np
9
  import streamlit as st
10
 
11
- nltk.download()
12
-
13
- from nltk.corpus import stopwords
14
  from gensim.utils import deaccent
15
  from bokeh.models import ColumnDataSource, HoverTool, Label
16
  from bokeh.palettes import Colorblind as Pallete
@@ -27,6 +25,10 @@ model_to_use = {
27
  "Use all the ones you know (~15 lang)": "paraphrase-multilingual-MiniLM-L12-v2"
28
  }
29
 
 
 
 
 
30
  def _remove_unk_chars(txt_list: List[str]):
31
  txt_list = [re.sub('\s+', ' ', tweet) for tweet in txt_list]
32
  txt_list = [re.sub("\'", "", tweet) for tweet in txt_list]
@@ -50,9 +52,8 @@ def _remove_punctuation(txt_list: List[str]):
50
  return [' '.join([word.translate(str.maketrans('', '', punctuation)) for word in tweet]) for tweet in txt_list]
51
 
52
  def _remove_stopwords(txt_list: List[str]):
53
- stopwords_str = stopwords.words('english')
54
  txt_list = [tweet.split(' ') for tweet in txt_list]
55
- return [' '.join([word for word in tweet if word not in stopwords_str]) for tweet in txt_list]
56
 
57
  preprocess_pipeline = [
58
  _remove_unk_chars,
 
2
 
3
  import string
4
  import re
5
+ import requests
6
  import tweepy
7
  import hdbscan
8
+
9
  import numpy as np
10
  import streamlit as st
11
 
 
 
 
12
  from gensim.utils import deaccent
13
  from bokeh.models import ColumnDataSource, HoverTool, Label
14
  from bokeh.palettes import Colorblind as Pallete
 
25
  "Use all the ones you know (~15 lang)": "paraphrase-multilingual-MiniLM-L12-v2"
26
  }
27
 
28
+
29
+ stopwords_list = requests.get("https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt").content
30
+ stopwords = set(stopwords_list.decode().splitlines())
31
+
32
  def _remove_unk_chars(txt_list: List[str]):
33
  txt_list = [re.sub('\s+', ' ', tweet) for tweet in txt_list]
34
  txt_list = [re.sub("\'", "", tweet) for tweet in txt_list]
 
52
  return [' '.join([word.translate(str.maketrans('', '', punctuation)) for word in tweet]) for tweet in txt_list]
53
 
54
  def _remove_stopwords(txt_list: List[str]):
 
55
  txt_list = [tweet.split(' ') for tweet in txt_list]
56
+ return [' '.join([word for word in tweet if word not in stopwords]) for tweet in txt_list]
57
 
58
  preprocess_pipeline = [
59
  _remove_unk_chars,