Spaces:
Runtime error
Runtime error
Fixed some code smell
Browse files
app.py
CHANGED
@@ -8,12 +8,11 @@ import nltk, spacy, gensim
|
|
8 |
from sklearn.decomposition import LatentDirichletAllocation
|
9 |
from sklearn.feature_extraction.text import CountVectorizer
|
10 |
from pprint import pprint
|
|
|
|
|
11 |
|
12 |
-
def concat_comments(
|
13 |
-
|
14 |
-
return [
|
15 |
-
format_s.format(s=s, c=c) for s, c in zip(sup_comment, comment)
|
16 |
-
]
|
17 |
|
18 |
def sent_to_words(sentences):
|
19 |
for sentence in sentences:
|
@@ -28,38 +27,34 @@ def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], nlp=Non
|
|
28 |
]))
|
29 |
return texts_out
|
30 |
|
31 |
-
|
32 |
-
def main(choose_context):
|
33 |
df = pd.read_csv('./data/results.csv', index_col=0)
|
34 |
-
|
35 |
-
print(choose_context)
|
36 |
-
|
37 |
-
if choose_context == 'comment':
|
38 |
-
data = df.comment
|
39 |
-
elif choose_context == 'sup comment':
|
40 |
-
data = df.sup_comment
|
41 |
-
elif choose_context == 'sup comment + comment':
|
42 |
-
data = concat_comments(df.sup_comment, df.comment)
|
43 |
-
|
44 |
data_words = list(sent_to_words(data))
|
45 |
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
|
|
50 |
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
|
51 |
data_lemmatized = lemmatization(data_words, allowed_postags=["NOUN", "ADJ"], nlp=nlp) # keep only nouns and adjectives
|
52 |
|
|
|
53 |
vectorizer = CountVectorizer(
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
)
|
60 |
-
data_vectorized = vectorizer.fit_transform(data_lemmatized)
|
61 |
|
|
|
|
|
62 |
|
|
|
63 |
lda_model = LatentDirichletAllocation(
|
64 |
n_components=5,
|
65 |
max_iter=10,
|
@@ -68,19 +63,28 @@ def main(choose_context):
|
|
68 |
batch_size=128,
|
69 |
evaluate_every = -1,
|
70 |
n_jobs = -1,
|
|
|
71 |
)
|
|
|
|
|
72 |
lda_output = lda_model.fit_transform(data_vectorized)
|
73 |
print(lda_model) # Model attributes
|
74 |
|
|
|
|
|
75 |
# Log Likelihood: higher is better
|
76 |
-
print("Log Likelihood: ",
|
77 |
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
|
78 |
-
print("Perplexity: ",
|
|
|
|
|
79 |
# See model parameters
|
80 |
pprint(lda_model.get_params())
|
81 |
|
|
|
82 |
best_lda_model = lda_model
|
83 |
|
|
|
84 |
lda_output = best_lda_model.transform(data_vectorized)
|
85 |
|
86 |
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
|
@@ -96,8 +100,6 @@ def main(choose_context):
|
|
96 |
# Assign Column and Index
|
97 |
df_topic_keywords.columns = vectorizer.get_feature_names_out()
|
98 |
df_topic_keywords.index = topicnames
|
99 |
-
# View
|
100 |
-
df_topic_keywords
|
101 |
|
102 |
# Show top n keywords for each topic
|
103 |
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
|
@@ -190,25 +192,28 @@ def main(choose_context):
|
|
190 |
return fig
|
191 |
|
192 |
|
|
|
|
|
|
|
|
|
|
|
193 |
with gr.Blocks() as demo:
|
194 |
-
gr.Markdown("
|
195 |
-
gr.Markdown("
|
196 |
-
button = gr.Radio(
|
197 |
-
label="Plot type",
|
198 |
-
choices=['scatter_plot', 'heatmap', 'us_map', 'interactive_barplot', "radial", "multiline"], value='scatter_plot'
|
199 |
-
)
|
200 |
# gradio.Dataframe(···)
|
201 |
-
|
202 |
-
|
203 |
-
choices=['comment', 'sup comment', 'sup comment + comment'], value='sup comment'
|
204 |
-
)
|
205 |
-
plot = gr.Plot(label="Plot")
|
206 |
-
choose_context.change(main, inputs=[choose_context], outputs=[plot])
|
207 |
|
208 |
btn = gr.Button(value="Submit")
|
209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
210 |
|
211 |
-
demo.load(main, inputs=[
|
212 |
|
213 |
|
214 |
# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
|
|
|
8 |
from sklearn.decomposition import LatentDirichletAllocation
|
9 |
from sklearn.feature_extraction.text import CountVectorizer
|
10 |
from pprint import pprint
|
11 |
+
import matplotlib
|
12 |
+
matplotlib.use('agg')
|
13 |
|
14 |
+
def concat_comments(*columns):
    """Join parallel text columns row by row, one field per line.

    Each positional argument is an iterable of cell values (for example a
    pandas Series). Row ``i`` of the result is the ``i``-th value of every
    column joined with a newline. As with ``zip``, the result stops at the
    shortest column; calling with no columns returns an empty list.

    Non-string cells are converted with ``str``. Missing cells (``None`` or
    a float NaN) become an empty string so that ``str.join`` does not raise
    ``TypeError`` and no literal "nan"/"None" token leaks into the text.

    Returns:
        list[str]: one newline-joined string per row.
    """
    def _as_text(value):
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    return ['\n'.join(_as_text(field) for field in row) for row in zip(*columns)]
|
|
|
|
|
|
|
16 |
|
17 |
def sent_to_words(sentences):
|
18 |
for sentence in sentences:
|
|
|
27 |
]))
|
28 |
return texts_out
|
29 |
|
30 |
+
def get_lda(n_components):
|
|
|
31 |
df = pd.read_csv('./data/results.csv', index_col=0)
|
32 |
+
data = concat_comments(df.subreddit, df.sup_comment, df.comment)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
data_words = list(sent_to_words(data))
|
34 |
|
35 |
+
|
36 |
+
if not spacy.util.is_package("en_core_web_sm"):
|
37 |
+
print('[x] en_core_web_sm not found, downloading...')
|
38 |
+
os.system("python -m spacy download en_core_web_sm")
|
39 |
+
print('[x] en_core_web_sm downloaded')
|
40 |
+
|
41 |
+
print('[x] Lemmatization begins')
|
42 |
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
|
43 |
data_lemmatized = lemmatization(data_words, allowed_postags=["NOUN", "ADJ"], nlp=nlp) # keep only nouns and adjectives
|
44 |
|
45 |
+
print('[x] Vectorizing')
|
46 |
vectorizer = CountVectorizer(
|
47 |
+
analyzer='word',
|
48 |
+
min_df=10,
|
49 |
+
stop_words='english',
|
50 |
+
lowercase=True,
|
51 |
+
token_pattern='[a-zA-Z0-9]{3,}'
|
52 |
+
)
|
|
|
53 |
|
54 |
+
print('[x] Fitting vectorized data on lemmatization')
|
55 |
+
data_vectorized = vectorizer.fit_transform(data_lemmatized)
|
56 |
|
57 |
+
print('[x] Init LDA model')
|
58 |
lda_model = LatentDirichletAllocation(
|
59 |
n_components=5,
|
60 |
max_iter=10,
|
|
|
63 |
batch_size=128,
|
64 |
evaluate_every = -1,
|
65 |
n_jobs = -1,
|
66 |
+
verbose=1,
|
67 |
)
|
68 |
+
|
69 |
+
print('[x] Fitting LDA model')
|
70 |
lda_output = lda_model.fit_transform(data_vectorized)
|
71 |
print(lda_model) # Model attributes
|
72 |
|
73 |
+
print('[x] Getting performances')
|
74 |
+
performances = lda_model.score(data_vectorized), lda_model.perplexity(data_vectorized)
|
75 |
# Log Likelihood: higher is better
|
76 |
+
print("Log Likelihood: ", performances[0])
|
77 |
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
|
78 |
+
print("Perplexity: ", performances[1])
|
79 |
+
|
80 |
+
print('[x] Check parameters if they look correct')
|
81 |
# See model parameters
|
82 |
pprint(lda_model.get_params())
|
83 |
|
84 |
+
# switching to the best model
|
85 |
best_lda_model = lda_model
|
86 |
|
87 |
+
print('[x] Getting LDA output')
|
88 |
lda_output = best_lda_model.transform(data_vectorized)
|
89 |
|
90 |
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
|
|
|
100 |
# Assign Column and Index
|
101 |
df_topic_keywords.columns = vectorizer.get_feature_names_out()
|
102 |
df_topic_keywords.index = topicnames
|
|
|
|
|
103 |
|
104 |
# Show top n keywords for each topic
|
105 |
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
|
|
|
192 |
return fig
|
193 |
|
194 |
|
195 |
+
# def main():
|
196 |
+
|
197 |
+
|
198 |
+
|
199 |
+
|
200 |
with gr.Blocks() as demo:
|
201 |
+
gr.Markdown("# Dashboard per l'analisi con LDA")
|
202 |
+
gr.Markdown("### Questo è un sottotitolo")
|
|
|
|
|
|
|
|
|
203 |
# gradio.Dataframe(···)
|
204 |
+
|
205 |
+
n_comp = gr.Slider(2, 25, value=5, step = 1, label="N components", info="Scegli il numero di componenti per LDA"),
|
|
|
|
|
|
|
|
|
206 |
|
207 |
btn = gr.Button(value="Submit")
|
208 |
+
|
209 |
+
plot = gr.Plot(label="Plot")
|
210 |
+
|
211 |
+
btn.click(get_lda, inputs=[n_comp[0]], outputs=[plot])
|
212 |
+
|
213 |
+
|
214 |
+
|
215 |
|
216 |
+
# demo.load(main, inputs=[], outputs=[plot])
|
217 |
|
218 |
|
219 |
# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
|