Spaces:
Build error
Build error
File size: 4,653 Bytes
f040401 7eae2ec 25a6734 7964bf4 25a6734 f040401 25a6734 a5482a3 f040401 a5482a3 d548156 f040401 dd9138c f040401 a5482a3 20c61de 3bc7bbb 7964bf4 20c61de 7964bf4 f040401 2e82d12 f040401 d548156 1bbc870 0a699f8 1bbc870 0a699f8 f040401 25a6734 d548156 f040401 d548156 f040401 d548156 4700cf0 25a6734 f040401 568b321 f040401 5c585a4 f040401 7964bf4 f040401 7964bf4 a5482a3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
from typing import List
import numpy as np
import streamlit as st
import tweepy
import hdbscan
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Cividis256 as Pallete
from bokeh.plotting import Figure, figure
from bokeh.transform import factor_cmap
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer
# Twitter API client, authenticated with the bearer token stored in Streamlit secrets.
client = tweepy.Client(bearer_token=st.secrets["tw_bearer_token"])

# Maps each language option offered in the UI to the sentence-transformers
# model name that is loaded for it.
model_to_use = {
    "English": "all-MiniLM-L12-v2",
    "Use all the ones you know (~15 lang)": "paraphrase-multilingual-MiniLM-L12-v2",
}

# Original implementation from: https://huggingface.co/spaces/edugp/embedding-lenses/blob/main/app.py

# Default random seed used for the t-SNE projection.
SEED = 42
@st.cache(show_spinner=False, allow_output_mutation=True)
def load_model(model_name: str) -> SentenceTransformer:
    """Instantiate the sentence-transformers model called *model_name*.

    The function is wrapped in ``st.cache`` with the spinner hidden and
    output mutation allowed.

    Args:
        model_name: Name passed straight to ``SentenceTransformer``.

    Returns:
        The constructed ``SentenceTransformer`` instance.
    """
    model = SentenceTransformer(model_name)
    return model
def embed_text(text: List[str], model: SentenceTransformer) -> np.ndarray:
    """Encode *text* with *model* and return the result of ``model.encode``.

    Args:
        text: The strings to embed.
        model: The sentence-transformers model used for encoding.

    Returns:
        Whatever ``model.encode(text)`` produces.
    """
    vectors = model.encode(text)
    return vectors
def get_tsne_embeddings(
    embeddings: np.ndarray,
    perplexity: int = 15,
    n_components: int = 2,
    init: str = "pca",
    n_iter: int = 5000,
    random_state: int = SEED,
) -> np.ndarray:
    """Project *embeddings* into a low-dimensional space with t-SNE.

    Args:
        embeddings: One row per sample.
        perplexity: Requested t-SNE perplexity. It is lowered to
            ``n_samples - 1`` when the input has too few rows for it.
        n_components: Number of output dimensions.
        init: Initialisation strategy forwarded to ``TSNE``.
        n_iter: Number of optimisation iterations forwarded to ``TSNE``.
        random_state: Seed forwarded to ``TSNE``.

    Returns:
        The array returned by ``TSNE.fit_transform``.
    """
    n_samples = len(embeddings)
    # scikit-learn requires perplexity < n_samples and rejects larger values
    # in recent releases. The UI lets the user pick very small samples, so
    # shrink the perplexity instead of failing. With fewer than two samples
    # no valid perplexity exists, so the value is passed through unchanged.
    if 1 < n_samples <= perplexity:
        perplexity = n_samples - 1
    tsne = TSNE(
        perplexity=perplexity,
        n_components=n_components,
        init=init,
        n_iter=n_iter,
        random_state=random_state,
    )
    return tsne.fit_transform(embeddings)
def draw_interactive_scatter_plot(
    texts: np.ndarray,
    xs: np.ndarray,
    ys: np.ndarray,
    values: np.ndarray,
    labels: np.ndarray,
    text_column: str,
    label_column: str,
) -> Figure:
    """Build a Bokeh scatter plot with hover tooltips, coloured by *values*.

    Args:
        texts: Text shown in the hover tooltip for each point.
        xs: X coordinate of each point.
        ys: Y coordinate of each point.
        values: Numeric value per point; points sharing a value share a colour.
        labels: Label per point, shown in the hover tooltip.
        text_column: Tooltip caption for the text.
        label_column: Tooltip caption for the label.

    Returns:
        The configured Bokeh figure.
    """
    values = np.asarray(values)
    labels = np.asarray(labels)

    # Work on the distinct values so that every factor appears exactly once
    # and its colour is computed from that very value. Pairing two separately
    # sorted string lists would not keep factor i and colour i in step,
    # because string sorting is lexicographic ("10" < "2").
    distinct_values = np.unique(values)
    min_value = distinct_values.min()
    max_value = distinct_values.max()
    span = max_value - min_value
    if span == 0:
        # Single distinct value: nothing to normalise, use one fixed colour.
        palette_indices = np.ones(len(distinct_values), dtype=int)
    else:
        # Normalise to the 0-255 index range of the 256-colour palette.
        palette_indices = ((distinct_values - min_value) / span * 255).round().astype(int)

    factors = distinct_values.astype(str).tolist()
    palette = [Pallete[int(index)] for index in palette_indices]

    source = ColumnDataSource(
        data=dict(
            x=xs,
            y=ys,
            text=texts,
            label=values.astype(str).tolist(),
            original_label=labels.astype(str).tolist(),
        )
    )
    hover = HoverTool(tooltips=[(text_column, "@text{safe}"), (label_column, "@original_label")])
    p = figure(plot_width=800, plot_height=800, tools=[hover])
    p.circle(
        "x",
        "y",
        size=10,
        source=source,
        fill_color=factor_cmap("label", palette=palette, factors=factors),
    )

    # Strip axes, grid lines and the Bokeh logo so only the points remain.
    p.axis.visible = False
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    p.toolbar.logo = None
    return p
# End of the code adapted from the embedding-lenses Space linked above.
def generate_plot(
    df: List[str],
    model: SentenceTransformer,
) -> Figure:
    """Embed, cluster and project the texts, then draw them as a scatter plot.

    Args:
        df: The texts to visualise.
        model: The sentence-transformers model used to embed them.

    Returns:
        The figure produced by ``draw_interactive_scatter_plot``, where the
        HDBSCAN cluster labels serve both as colour values and as labels.
    """
    with st.spinner(text="Embedding text..."):
        embeddings = embed_text(df, model)

    # Cluster in the original embedding space; the labels drive the colours.
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=5,
        metric='euclidean',
        cluster_selection_method='eom',
    )
    clusterer.fit(embeddings)
    encoded_labels = clusterer.labels_

    with st.spinner("Reducing dimensionality..."):
        embeddings_2d = get_tsne_embeddings(embeddings)

    x_coords = embeddings_2d[:, 0]
    y_coords = embeddings_2d[:, 1]
    return draw_interactive_scatter_plot(
        df, x_coords, y_coords, encoded_labels, encoded_labels, 'text', 'label'
    )
# --- Streamlit page: collect the inputs, fetch the tweets, draw the plot ---
st.title("Tweet-SNEst")
st.write("Visualize tweets embeddings in 2D using colors for topics labels.")

col1, col2 = st.columns(2)
with col1:
    tw_user = st.text_input("Twitter handle", "huggingface")
with col2:
    tw_sample = st.number_input("Maximum number of tweets to use", 1, 300, 100, 10)

expected_lang = st.radio(
    "What language should be assumed to be found?",
    ('English', 'Use all the ones you know (~15 lang)'),
    0
)

with st.spinner(text="Loading model..."):
    model = load_model(model_to_use[expected_lang])

# Only contact the Twitter API once a handle has actually been entered.
if tw_user:
    with st.spinner(f"Getting to know the '{tw_user}'..."):
        usr = client.get_user(username=tw_user)
        tweets_txt = []
        if usr.data is not None:
            wanted = int(tw_sample)
            # The endpoint only accepts page sizes between 5 and 100. The
            # paginator follows the pagination tokens so that each page holds
            # new tweets, and flatten() stops after `wanted` tweets.
            paginator = tweepy.Paginator(
                client.get_users_tweets,
                usr.data.id,
                max_results=min(100, max(5, wanted)),
            )
            tweets_txt = [tweet.text for tweet in paginator.flatten(limit=wanted)]

    if usr.data is None:
        st.error(f"Could not find the Twitter user '{tw_user}'.")
    elif not tweets_txt:
        st.warning(f"No tweets found for '{tw_user}'.")
    else:
        plot = generate_plot(tweets_txt, model)
        st.bokeh_chart(plot)