Spaces:

wilmerags
/

tweet-snest

Build error

App Files Community

wilmerags committed on Nov 24, 2021

Commit

7f1a60e

1 Parent(s): e2fe46b

test: Experiment with keyword selection for topics

Browse files

Files changed (1) hide show

app.py +20 -1

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from typing import List
 import string
 import re
 import requests
@@ -138,10 +139,28 @@ def generate_plot(
         cluster_selection_method='eom'
     ).fit(embeddings)
     encoded_labels = cluster.labels_
     with st.spinner("Now trying to express them with my own words... 💬"):
         embeddings_2d = get_tsne_embeddings(embeddings)
     plot = draw_interactive_scatter_plot(
-        tws, embeddings_2d[:, 0], embeddings_2d[:, 1], encoded_labels, encoded_labels, 'Tweet', 'Topic'
     )
     return plot

 from typing import List
+import itertools
 import string
 import re
 import requests
         cluster_selection_method='eom'
     ).fit(embeddings)
     encoded_labels = cluster.labels_
+    cluster_keyword = {}
     with st.spinner("Now trying to express them with my own words... 💬"):
+        for label in set(encoded_labels):
+            cluster_keyword[label] = []
+            cluster_tws = []
+            for ix, obs in enumerate(encoded_labels):
+                if obs == label:
+                    cluster_tws.append(tws_cleaned)
+            cluster_words = [tw.split(' ') for tw in cluster_tws]
+            cluster_words = list(set(itertools.chain.from_iterable(cluster_words)))
+            cluster_embeddings = embed_text(cluster_tws, model)
+            cluster_embeddings_avg = np.mean(cluster_embeddings, axis=0)
+            cluster_words_embeddings = embed_text(cluster_words, model)
+            cluster_to_words_similarities = util.dot_score(cluster_embeddings_avg, cluster_words_embeddings)
+            while len(cluster_keyword[label]) < 3:
+                most_descriptive = np.argmax(cluster_to_words_similarities)
+                del cluster_to_words_similarities[most_descriptive]
+                cluster_keyword[label].append(cluster_words[most_descriptive])
+        encoded_labels_keywords = [cluster_keyword[encoded_label] for encoded_label in encoded_labels]
         embeddings_2d = get_tsne_embeddings(embeddings)
     plot = draw_interactive_scatter_plot(
+        tws, embeddings_2d[:, 0], embeddings_2d[:, 1], encoded_labels, encoded_labels_keywords, 'Tweet', 'Topic'
     )
     return plot