fix: Improve handling of stopwords comparison without nltk
app.py
CHANGED
@@ -2,15 +2,13 @@ from typing import List
 
 import string
 import re
+import requests
 import tweepy
 import hdbscan
-
+
 import numpy as np
 import streamlit as st
 
-nltk.download()
-
-from nltk.corpus import stopwords
 from gensim.utils import deaccent
 from bokeh.models import ColumnDataSource, HoverTool, Label
 from bokeh.palettes import Colorblind as Pallete
@@ -27,6 +25,10 @@ model_to_use = {
     "Use all the ones you know (~15 lang)": "paraphrase-multilingual-MiniLM-L12-v2"
 }
 
+
+stopwords_list = requests.get("https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt").content
+stopwords = set(stopwords_list.decode().splitlines())
+
 def _remove_unk_chars(txt_list: List[str]):
     txt_list = [re.sub('\s+', ' ', tweet) for tweet in txt_list]
     txt_list = [re.sub("\'", "", tweet) for tweet in txt_list]
@@ -50,9 +52,8 @@ def _remove_punctuation(txt_list: List[str]):
     return [' '.join([word.translate(str.maketrans('', '', punctuation)) for word in tweet]) for tweet in txt_list]
 
 def _remove_stopwords(txt_list: List[str]):
-    stopwords_str = stopwords.words('english')
     txt_list = [tweet.split(' ') for tweet in txt_list]
-    return [' '.join([word for word in tweet if word not in stopwords_str]) for tweet in txt_list]
+    return [' '.join([word for word in tweet if word not in stopwords]) for tweet in txt_list]
 
 preprocess_pipeline = [
    _remove_unk_chars,