Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -7,6 +7,8 @@ import torch
|
|
7 |
import numpy as np
|
8 |
from collections import Counter
|
9 |
import os
|
|
|
|
|
10 |
# Add Arabic stop words
|
11 |
ARABIC_STOP_WORDS = {
|
12 |
'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
|
@@ -69,6 +71,21 @@ def split_text(text, max_length=512):
|
|
69 |
|
70 |
return chunks
|
71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
def clean_arabic_text(text):
|
73 |
"""Clean Arabic text by removing stop words and normalizing."""
|
74 |
words = text.split()
|
@@ -202,18 +219,22 @@ def format_emotions(emotion_counts):
|
|
202 |
})
|
203 |
return formatted_emotions
|
204 |
|
205 |
-
def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=
|
206 |
"""Process the data and generate summaries with flexible topic configuration."""
|
207 |
summaries = []
|
208 |
|
209 |
topic_model_params = {
|
210 |
"language": "arabic",
|
211 |
"calculate_probabilities": True,
|
212 |
-
"min_topic_size":
|
213 |
"n_gram_range": (1, 2),
|
214 |
"top_n_words": 15,
|
215 |
"verbose": True
|
|
|
216 |
}
|
|
|
|
|
|
|
217 |
|
218 |
if topic_strategy == "Manual":
|
219 |
topic_model_params["nr_topics"] = n_topics
|
@@ -250,7 +271,8 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
|
|
250 |
|
251 |
try:
|
252 |
topics, probs = topic_model.fit_transform(texts, embeddings)
|
253 |
-
|
|
|
254 |
topic_counts = Counter(topics)
|
255 |
if -1 in topic_counts:
|
256 |
del topic_counts[-1]
|
@@ -385,6 +407,12 @@ if uploaded_file is not None:
|
|
385 |
st.subheader("Emotions")
|
386 |
for emotion in summary['top_emotions']:
|
387 |
st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
|
|
|
|
|
|
|
|
|
|
|
|
|
388 |
|
389 |
with tab2:
|
390 |
st.subheader("Global Topic Distribution")
|
|
|
7 |
import numpy as np
|
8 |
from collections import Counter
|
9 |
import os
|
10 |
+
from wordcloud import WordCloud
|
11 |
+
import matplotlib.pyplot as plt
|
12 |
# Add Arabic stop words
|
13 |
ARABIC_STOP_WORDS = {
|
14 |
'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
|
|
|
71 |
|
72 |
return chunks
|
73 |
|
74 |
+
def create_arabic_wordcloud(text, title, font_path=None):
    """Render a word cloud for *text* and return it as a Matplotlib figure.

    Args:
        text: The text whose words are drawn in the cloud.
        title: Title placed above the word cloud image.
        font_path: Optional path to a TrueType/OpenType font *file* to draw
            the words with. When omitted, the DejaVu Sans file located by
            Matplotlib's font manager is used.

    Returns:
        The Matplotlib figure containing the rendered word cloud. The caller
        owns the figure (e.g. passes it to ``st.pyplot``).

    NOTE(review): ``WordCloud.generate`` is expected to raise ``ValueError``
    when *text* contains no words -- confirm and guard at the call site if
    empty inputs are possible.
    """
    if font_path is None:
        # WordCloud forwards font_path to Pillow, which needs an actual font
        # file. The bare name 'arial' only resolves on machines that happen
        # to have an Arial file installed, so look up a concrete file instead.
        from matplotlib import font_manager

        font_path = font_manager.findfont("DejaVu Sans")

    # NOTE(review): correct joined/right-to-left Arabic rendering may also
    # require reshaping the text before generation -- verify the output.
    wordcloud = WordCloud(
        width=1200,
        height=600,
        background_color='white',
        font_path=font_path,
        max_words=200,
    ).generate(text)

    fig, ax = plt.subplots(figsize=(15, 8))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')  # the cloud is an image; axes ticks carry no meaning
    ax.set_title(title, fontsize=16, pad=20)
    return fig
|
88 |
+
|
89 |
def clean_arabic_text(text):
|
90 |
"""Clean Arabic text by removing stop words and normalizing."""
|
91 |
words = text.split()
|
|
|
219 |
})
|
220 |
return formatted_emotions
|
221 |
|
222 |
+
def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=1):
|
223 |
"""Process the data and generate summaries with flexible topic configuration."""
|
224 |
summaries = []
|
225 |
|
226 |
topic_model_params = {
|
227 |
"language": "arabic",
|
228 |
"calculate_probabilities": True,
|
229 |
+
"min_topic_size": 1,
|
230 |
"n_gram_range": (1, 2),
|
231 |
"top_n_words": 15,
|
232 |
"verbose": True,
|
233 |
+
"seed_topic_list": None
|
234 |
}
|
235 |
+
st.write(f"Total documents: {len(df)}")
|
236 |
+
st.write(f"Topic strategy: {topic_strategy}")
|
237 |
+
st.write(f"Min topic size: {min_topic_size}")
|
238 |
|
239 |
if topic_strategy == "Manual":
|
240 |
topic_model_params["nr_topics"] = n_topics
|
|
|
271 |
|
272 |
try:
|
273 |
topics, probs = topic_model.fit_transform(texts, embeddings)
|
274 |
+
st.write(f"Number of unique topics: {len(set(topics))}")
|
275 |
+
st.write(f"Topic distribution: {Counter(topics)}")
|
276 |
topic_counts = Counter(topics)
|
277 |
if -1 in topic_counts:
|
278 |
del topic_counts[-1]
|
|
|
407 |
st.subheader("Emotions")
|
408 |
for emotion in summary['top_emotions']:
|
409 |
st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
|
410 |
+
|
411 |
+
st.subheader("Word Cloud Visualization")
|
412 |
+
country_poems = df[df['country'] == summary['country']]['poem']
|
413 |
+
combined_text = ' '.join(country_poems)
|
414 |
+
wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
|
415 |
+
st.pyplot(wordcloud_fig)
|
416 |
|
417 |
with tab2:
|
418 |
st.subheader("Global Topic Distribution")
|