File size: 9,155 Bytes
f040401
 
7f1a60e
53bad35
b2c3406
8b81843
25a6734
7964bf4
8b81843
b2c3406
 
 
53bad35
2f7dfa0
5b5b795
 
f040401
 
 
 
c7010dd
25a6734
a5482a3
 
 
 
 
 
8b81843
 
 
 
d1cba0c
b2c3406
 
 
76a8518
b2c3406
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d2c094
b2c3406
4d8d3df
 
8b81843
4d8d3df
b2c3406
 
d556203
4d8d3df
 
b2c3406
 
 
 
 
 
 
f040401
a5482a3
 
d548156
 
 
 
 
f040401
 
 
 
1731f45
f040401
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b05a76
f040401
5b5b795
 
 
085853b
a27bca6
1323e99
f040401
61ce655
 
d71c853
 
97ec5a0
 
c959b1a
9460aa5
2f7dfa0
dc798d1
8e41537
2f7dfa0
 
f040401
 
 
 
9c5d67e
0d13483
f040401
9c5d67e
f040401
30f3de6
0d13483
a5482a3
20c61de
2d7af1c
bff1960
20c61de
 
7964bf4
7f1a60e
30f3de6
7f1a60e
da31fda
1323e99
 
7f1a60e
 
 
 
1dcaf3f
7f1a60e
 
 
 
 
 
29060ca
7f1a60e
 
d447a02
7f1a60e
4b05a76
7f1a60e
f040401
 
7f1a60e
f040401
 
 
d548156
1bbc870
 
9c5d67e
0a699f8
 
1bbc870
0a699f8
f040401
61a29bd
 
 
 
 
 
 
 
 
 
a38bdb0
d548156
30f3de6
f040401
d548156
691ccbe
dbebcc0
 
30f3de6
f040401
 
 
59ff44b
f040401
 
5c585a4
59ff44b
f040401
92e2ae9
 
0d13483
 
5e899e5
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
from typing import List

import itertools
import string
import re
import requests
import tweepy
import hdbscan

import numpy as np
import streamlit as st

from gensim.utils import deaccent
from bokeh.models import ColumnDataSource, HoverTool, Label
from bokeh.palettes import Colorblind as Pallete
from bokeh.palettes import Set3 as AuxPallete
from bokeh.plotting import Figure, figure
from bokeh.transform import factor_cmap

from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer, util

client = tweepy.Client(bearer_token=st.secrets["tw_bearer_token"])
model_to_use = {
    "English": "all-MiniLM-L12-v2",
    "Use all the ones you know (~15 lang)": "paraphrase-multilingual-MiniLM-L12-v2"
}


stopwords_list = requests.get("https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt").content
stopwords = set(stopwords_list.decode().splitlines()) 

def _remove_unk_chars(txt_list: List[str]):
    txt_list = [re.sub('\s+', ' ', tweet) for tweet in txt_list]
    txt_list = [re.sub("\'", "", tweet) for tweet in txt_list]
    txt_list = [deaccent(tweet).lower() for tweet in txt_list]
    return txt_list

def _remove_urls(txt_list: List[str]):
    url_regex = re.compile(
        r'^(?:http|ftp)s?://' # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
        r'localhost|' #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    txt_list = [tweet.split(' ') for tweet in txt_list]
    return [' '.join([word for word in tweet if not bool(re.match(url_regex, word))]) for tweet in txt_list]

def _remove_punctuation(txt_list: List[str]):
    punctuation = string.punctuation + 'ΒΏΒ‘|'
    txt_list = [tweet.split(' ') for tweet in txt_list]
    return [' '.join([word.translate(str.maketrans('', '', punctuation)) for word in tweet]) for tweet in txt_list]

def _remove_stopwords(txt_list: List[str]):
    txt_list = [tweet.split(' ') for tweet in txt_list]
    return [' '.join([word for word in tweet if word not in stopwords]) for tweet in txt_list]

preprocess_pipeline = [
    _remove_urls,
    _remove_unk_chars,
    _remove_punctuation,
    _remove_stopwords,
]

def preprocess(txt_list: str):
    for op in preprocess_pipeline:
        txt_list = op(txt_list)
    return txt_list

# Original implementation from: https://huggingface.co/spaces/edugp/embedding-lenses/blob/main/app.py
SEED = 42

@st.cache(show_spinner=False, allow_output_mutation=True)
def load_model(model_name: str) -> SentenceTransformer:
    embedder = model_name
    return SentenceTransformer(embedder)

def embed_text(text: List[str], model: SentenceTransformer) -> np.ndarray:
    return model.encode(text)

def get_tsne_embeddings(
    embeddings: np.ndarray, perplexity: int = 10, n_components: int = 2, init: str = "pca", n_iter: int = 5000, random_state: int = SEED
) -> np.ndarray:
    tsne = TSNE(perplexity=perplexity, n_components=n_components, init=init, n_iter=n_iter, random_state=random_state)
    return tsne.fit_transform(embeddings)

def draw_interactive_scatter_plot(
    texts: np.ndarray, xs: np.ndarray, ys: np.ndarray, values: np.ndarray, labels: np.ndarray, text_column: str, label_column: str
) -> Figure:
    # Normalize values to range between 0-255, to assign a color for each value
    max_value = values.max()
    min_value = values.min()
    if max_value - min_value == 0:
        values_color = np.ones(len(values))
    else:
        values_color = ((values - min_value) / (max_value - min_value) * 255).round().astype(int).astype(str)
    values_color_set = sorted(values_color)
    values_list = values.astype(str).tolist()
    values_set = sorted(values_list)
    source = ColumnDataSource(data=dict(x=xs, y=ys, text=texts, label=values_list, original_label=labels))
    hover = HoverTool(tooltips=[(text_column, "@text{safe}"), (label_column, "@original_label")])
    n_colors = len(set(values_color_set))
    if n_colors not in Pallete:
        Palette = AuxPallete
    p = figure(plot_width=800, plot_height=800, tools=[hover], title='2D visualization of tweets', background_fill_color="#fafafa")
    colors = factor_cmap("label", palette=[Pallete[n_colors][int(id_) + 1] for id_ in values_set], factors=values_set)
    p.circle("x", "y", size=12, source=source, fill_alpha=0.4, line_color=colors, fill_color=colors, legend_group="original_label")
    p.axis.visible = False
    p.xgrid.grid_line_dash = "dashed"
    p.ygrid.grid_line_dash = "dashed"
    # p.xgrid.grid_line_color = None
    # p.ygrid.grid_line_color = None
    # p.toolbar.logo = None
    p.legend.location = "bottom_right"
    p.legend.title = "Topics ID"
    p.legend.background_fill_alpha = 0.2
    
    disclaimer = Label(x=0, y=0, x_units="screen", y_units="screen",
                   text_font_size="14px", text_color="gray",
                   text="Topic equals -1 means no topic was detected for such tweet")
    p.add_layout(disclaimer, "below")
    return p

# Up to here
def generate_plot(
    tws: List[str],
    tws_cleaned: List[str],
    model: SentenceTransformer,
    tw_user: str
) -> Figure:
    with st.spinner(text=f"Trying to understand '{tw_user}' tweets... πŸ€”"):
        embeddings = embed_text(tws_cleaned, model)
    # encoded_labels = encode_labels(labels)
    cluster = hdbscan.HDBSCAN(
        min_cluster_size=3,
        metric='euclidean',
        cluster_selection_method='eom'
    ).fit(embeddings)
    encoded_labels = cluster.labels_
    cluster_keyword = {}
    with st.spinner("Now trying to express them with my own words... πŸ’¬"):
        for label in set(encoded_labels):
            if label == -1:
                cluster_keyword[label] = '-1'
                continue
            cluster_keyword[label] = []
            cluster_tws = []
            for ix, obs in enumerate(encoded_labels):
                if obs == label:
                    cluster_tws.append(tws_cleaned[ix])
            cluster_words = [tw.split(' ') for tw in cluster_tws]
            cluster_words = list(set(itertools.chain.from_iterable(cluster_words)))
            cluster_embeddings = embed_text(cluster_tws, model)
            cluster_embeddings_avg = np.mean(cluster_embeddings, axis=0)
            cluster_words_embeddings = embed_text(cluster_words, model)
            cluster_to_words_similarities = util.dot_score(cluster_embeddings_avg, cluster_words_embeddings)
            cluster_to_words_similarities = cluster_to_words_similarities.numpy()
            while len(cluster_keyword[label]) < 3:
                most_descriptive = np.argmax(cluster_to_words_similarities)
                cluster_to_words_similarities = np.delete(cluster_to_words_similarities, most_descriptive)
                cluster_keyword[label].append(cluster_words[most_descriptive])
            cluster_keyword[label] = ', '.join(cluster_keyword[label])
        encoded_labels_keywords = [cluster_keyword[encoded_label] for encoded_label in encoded_labels]
        embeddings_2d = get_tsne_embeddings(embeddings)
    plot = draw_interactive_scatter_plot(
        tws, embeddings_2d[:, 0], embeddings_2d[:, 1], encoded_labels, encoded_labels_keywords, 'Tweet', 'Topic'
    )
    return plot


st.title("Tweet-SNEst")
st.write("Visualize tweets embeddings in 2D using colors for topics labels.")
st.caption('Please beware this is using Twitter free version of their API and might be needed to wait sometimes.')
col1, col2 = st.columns(2)
with col1:
    tw_user = st.text_input("Twitter handle", "huggingface")
with col2:
    tw_sample = st.number_input("Maximum number of tweets to use", 1, 300, 100, 10)

col1, col2 = st.columns(2)

with col1:
    expected_lang = st.radio(
        "What language should be assumed to be found?",
        ('English', 'Use all the ones you know (~15 lang)'),
        0
    )
with col2:
    go_btn = st.button('Visualize πŸš€')

with st.spinner(text="Loading brain... 🧠"):
    model = load_model(model_to_use[expected_lang])

if go_btn and tw_user != '':
    usr = client.get_user(username=tw_user)
    tw_user = tw_user.replace(' ', '')
    with st.spinner(f"Getting to know the '{tw_user}'... πŸ”"):
        tweets_objs = []
        while tw_sample >= 100:
            current_sample = min(100, tw_sample)
            tweets_response = client.get_users_tweets(usr.data.id, max_results=current_sample, exclude=['retweets', 'replies'])
            tweets_objs += tweets_response.data
            tw_sample -= current_sample
        if tw_sample > 0:
            tweets_response = client.get_users_tweets(usr.data.id, max_results=tw_sample, exclude=['retweets', 'replies'])
        tweets_objs += tweets_response.data
    tweets_txt = [tweet.text for tweet in tweets_objs]
    tweets_txt = list(set(tweets_txt))
    tweets_txt_cleaned = preprocess(tweets_txt)
    plot = generate_plot(tweets_txt, tweets_txt_cleaned, model, tw_user)
    st.bokeh_chart(plot)
elif go_btn and tw_user == '':
    st.warning('Twitter handler field is empty πŸ™„')