wilmerags committed on
Commit b2c3406 · 1 parent: 9460aa5

feat: Add preprocessing function to improve quality of topic detection

Files changed (1)
  1. app.py +39 -3
app.py CHANGED
@@ -1,10 +1,13 @@
 from typing import List
 
-import numpy as np
-
-import streamlit as st
+import re
+import string
 import tweepy
 import hdbscan
+import numpy as np
+import streamlit as st
+
+from gensim.utils import deaccent # gensim==3.8.1
 
 from bokeh.models import ColumnDataSource, HoverTool, Label
 from bokeh.palettes import Colorblind as Pallete
 
@@ -21,6 +24,38 @@ model_to_use = {
     "Use all the ones you know (~15 lang)": "paraphrase-multilingual-MiniLM-L12-v2"
 }
 
+def _remove_unk_chars(txt_list: List[str]):
+    txt_list = [re.sub(r'\s+', ' ', tweet) for tweet in txt_list]
+    txt_list = [re.sub("'", "", tweet) for tweet in txt_list]
+    return [deaccent(tweet).lower() for tweet in txt_list]
+
+def _remove_urls(txt_list: List[str]):
+    url_regex = re.compile(
+        r'^(?:http|ftp)s?://'  # http:// or https://
+        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
+        r'localhost|'  # localhost...
+        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
+        r'(?::\d+)?'  # optional port
+        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
+    txt_list = [tweet.split(' ') for tweet in txt_list]
+    return [' '.join([word for word in tweet if not url_regex.match(word)]) for tweet in txt_list]
+
+def _remove_punctuation(txt_list: List[str]):
+    punctuation = string.punctuation + '¿¡|'
+    txt_list = [tweet.split(' ') for tweet in txt_list]
+    return [' '.join([word.translate(str.maketrans('', '', punctuation)) for word in tweet]) for tweet in txt_list]
+
+preprocess_pipeline = [
+    _remove_unk_chars,
+    _remove_urls,
+    _remove_punctuation
+]
+
+def preprocess(txt_list: List[str]) -> List[str]:
+    for op in preprocess_pipeline:
+        txt_list = op(txt_list)
+    return txt_list
+
 # Original implementation from: https://huggingface.co/spaces/edugp/embedding-lenses/blob/main/app.py
 SEED = 42
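For context, here is an illustrative, self-contained condensation of the three preprocessing steps added above, run on a made-up tweet. `demo_preprocess`, `URL_RE`, `PUNCT` and the sample text are not part of the repository, and the URL pattern is a simplified stand-in for the commit's `url_regex`:

import re
import string

from gensim.utils import deaccent  # same dependency the commit notes (gensim==3.8.1)

URL_RE = re.compile(r'^(?:http|ftp)s?://\S+$', re.IGNORECASE)  # simplified stand-in for url_regex
PUNCT = string.punctuation + '¿¡|'

def demo_preprocess(tweets):
    # collapse whitespace, drop apostrophes, deaccent and lowercase (as _remove_unk_chars)
    tweets = [deaccent(re.sub(r'\s+', ' ', t).replace("'", "")).lower() for t in tweets]
    # drop tokens that look like URLs (as _remove_urls)
    tweets = [' '.join(w for w in t.split(' ') if not URL_RE.match(w)) for t in tweets]
    # strip punctuation from the remaining tokens (as _remove_punctuation)
    return [' '.join(w.translate(str.maketrans('', '', PUNCT)) for w in t.split(' ')) for t in tweets]

print(demo_preprocess(["¡Qué  día! Check this: https://example.com/post?id=1 #topic"]))
# -> ['que dia check this topic']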
 
 
@@ -137,6 +172,7 @@ if go_btn and tw_user != '':
         tweets_objs += tweets_response.data
     tweets_txt = [tweet.text for tweet in tweets_objs]
     tweets_txt = list(set(tweets_txt))
+    tweets_txt = preprocess(tweets_txt)
     # plot = generate_plot(df, text_column, label_column, sample, dimensionality_reduction_function, model)
     plot = generate_plot(tweets_txt, model, tw_user)
     st.bokeh_chart(plot)
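And a toy walk-through of the order of operations in the last hunk, using invented sample strings in place of tweepy tweet objects and the `demo_preprocess` sketch above in place of app.py's `preprocess`:

tweets_txt = [
    "Check https://example.com  right now!",
    "Check https://example.org right now!",   # differs only in the URL
    "Check https://example.com  right now!",  # exact duplicate, dropped by set()
]
tweets_txt = list(set(tweets_txt))        # deduplication runs on the raw text first
tweets_txt = demo_preprocess(tweets_txt)  # then every remaining tweet is normalized
print(tweets_txt)  # both survivors reduce to 'check right now' before embedding/plotting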