Update app.py
app.py (CHANGED)
@@ -10,7 +10,9 @@ import os
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 import pkg_resources
-import
+import folium
+import country_converter as coco
+

 current_dir = os.path.dirname(os.path.abspath(__file__))
 font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
@@ -52,16 +54,7 @@ def load_models():
         return_all_scores=True
     )
     return tokenizer, bert_model, emotion_classifier
-
-@st.cache_data
-def cache_embeddings(text, _tokenizer, _model):
-    return get_embedding_for_text(text, _tokenizer, _model)
-
-@st.cache_data
-def cache_emotion_classification(text, _classifier):
-    return classify_emotion(text, _classifier)

-@st.cache_data
 def split_text(text, max_length=512):
     """Split text into chunks of maximum token length while preserving word boundaries."""
     words = text.split()
@@ -84,6 +77,62 @@ def split_text(text, max_length=512):
         chunks.append(' '.join(current_chunk))

     return chunks
+
+def get_country_coordinates():
+    """Returns dictionary of Arab country coordinates"""
+    return {
+        'Egypt': [26.8206, 30.8025],
+        'Saudi Arabia': [23.8859, 45.0792],
+        'UAE': [23.4241, 53.8478],
+        'Kuwait': [29.3117, 47.4818],
+        'Iraq': [33.2232, 43.6793],
+        'Syria': [34.8021, 38.9968],
+        'Lebanon': [33.8547, 35.8623],
+        'Jordan': [30.5852, 36.2384],
+        'Palestine': [31.9522, 35.2332],
+        'Yemen': [15.5527, 48.5164],
+        'Oman': [21.4735, 55.9754],
+        'Qatar': [25.3548, 51.1839],
+        'Bahrain': [26.0667, 50.5577],
+        'Sudan': [12.8628, 30.2176],
+        'Libya': [26.3351, 17.2283],
+        'Tunisia': [33.8869, 9.5375],
+        'Algeria': [28.0339, 1.6596],
+        'Morocco': [31.7917, -7.0926],
+        'Mauritania': [21.0079, -10.9408]
+    }
+def create_topic_map(summaries):
+    """Create an interactive map showing topic distribution"""
+    coordinates = get_country_coordinates()
+
+    # Create base map centered on Arab world
+    m = folium.Map(location=[25.0, 30.0], zoom_start=4)
+
+    for summary in summaries:
+        country = summary['country']
+        if country in coordinates:
+            # Get top topic
+            top_topic = summary['top_topics'][0]['topic'] if summary['top_topics'] else "No topics"
+            top_emotion = summary['top_emotions'][0]['emotion'] if summary['top_emotions'] else "No emotion"
+
+            # Create popup content
+            popup_content = f"""
+            <b>{country}</b><br>
+            Top Topic: {top_topic}<br>
+            Main Emotion: {top_emotion}<br>
+            Total Poems: {summary['total_poems']}
+            """
+
+            # Add marker
+            folium.CircleMarker(
+                location=coordinates[country],
+                radius=10,
+                popup=folium.Popup(popup_content, max_width=300),
+                color='red',
+                fill=True
+            ).add_to(m)
+
+    return m

 def create_arabic_wordcloud(text, title):
     wordcloud = WordCloud(
@@ -170,9 +219,9 @@ def classify_emotion(text, classifier):
     return "LABEL_2"

 def get_embedding_for_text(text, tokenizer, model):
+    """Get embedding for complete text."""
     chunks = split_text(text)
     chunk_embeddings = []
-    embedding_size = model.config.hidden_size

     for chunk in chunks:
         try:
@@ -189,16 +238,18 @@ def get_embedding_for_text(text, tokenizer, model):
             outputs = model(**inputs)

             embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
-            chunk_embeddings.append(embedding)
+            chunk_embeddings.append(embedding[0])
         except Exception as e:
+            st.warning(f"Error processing chunk: {str(e)}")
             continue

     if chunk_embeddings:
-
-
-
-
-
+        weights = np.array([len(chunk.split()) for chunk in chunks])
+        weights = weights / weights.sum()
+        weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
+        return weighted_embedding
+    return np.zeros(model.config.hidden_size)
+
 def format_topics(topic_model, topic_counts):
     """Format topics for display."""
     formatted_topics = []
@@ -233,17 +284,20 @@ def format_emotions(emotion_counts):
     return formatted_emotions

 def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
+    """Process the data and generate summaries with flexible topic configuration."""
     summaries = []
-    embedding_size = bert_model.config.hidden_size

     topic_model_params = {
         "language": "arabic",
         "calculate_probabilities": True,
-        "min_topic_size":
+        "min_topic_size": 3,
         "n_gram_range": (1, 1),
         "top_n_words": 15,
         "verbose": True,
     }
+    st.write(f"Total documents: {len(df)}")
+    st.write(f"Topic strategy: {topic_strategy}")
+    st.write(f"Min topic size: {min_topic_size}")

     if topic_strategy == "Manual":
         topic_model_params["nr_topics"] = n_topics
@@ -251,15 +305,12 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
         topic_model_params["nr_topics"] = "auto"

     topic_model = BERTopic(
-        embedding_model=
-        **topic_model_params
-    )
+        embedding_model=bert_model,
+        **topic_model_params)

-    vectorizer = CountVectorizer(
-
-
-        max_df=1.0
-    )
+    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
+                                 min_df=1,
+                                 max_df=1.0)
     topic_model.vectorizer_model = vectorizer

     for country, group in df.groupby('country'):
@@ -268,48 +319,42 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to

         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
-        embeddings = []

-
+        embeddings = []
         for i, text in enumerate(texts):
             try:
                 embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
-                if embedding is not None and embedding.
+                if embedding is not None and not np.isnan(embedding).any():
                     embeddings.append(embedding)
-
-
+                else:
+                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
+                    continue
             except Exception as e:
-                st.warning(f"Error
+                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
                 continue
+            progress = (i + 1) / len(texts) * 0.4
+            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")

-
-
-
-
-
-
-
-
-
-            try:
-                emotion = classify_emotion(text, emotion_classifier)
-                all_emotions.append(emotion)
-                progress = 0.4 + ((i + 1) / len(texts) * 0.3)
-                progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
-            except Exception as e:
-                st.warning(f"Error classifying emotion for poem {i+1} in {country}: {str(e)}")
-                continue
+        if len(embeddings) != len(texts):
+            texts = texts[:len(embeddings)]
+        embeddings = np.array(embeddings)
+
+        for i, text in enumerate(texts):
+            emotion = classify_emotion(text, emotion_classifier)
+            all_emotions.append(emotion)
+            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
+            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")

         try:
+
             if len(texts) < min_topic_size:
                 st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
                 continue
+

-            # Ensure texts and embeddings match
-            texts = texts[:len(embeddings)]
-
-            # Fit and transform the topic model
             topics, probs = topic_model.fit_transform(texts, embeddings)
+
+
             topic_counts = Counter(topics)

             top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
@@ -329,7 +374,6 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to

     return summaries, topic_model

-
 try:
     bert_tokenizer, bert_model, emotion_classifier = load_models()
     st.success("Models loaded successfully!")
@@ -412,7 +456,7 @@ if uploaded_file is not None:
         if summaries:
             st.success("Analysis complete!")

-            tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
+            tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Topic Map"])

             with tab1:
                 for summary in summaries:
@@ -445,6 +489,12 @@ if uploaded_file is not None:
                     words = topic_model.get_topic(row['Topic'])
                     topic_name = " | ".join([word for word, _ in words[:5]])
                     st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
+
+            with tab3:
+                st.subheader("Topic Distribution Map")
+                topic_map = create_topic_map(summaries)
+                # Display the map
+                st.components.v1.html(topic_map._repr_html_(), height=600)

     except Exception as e:
         st.error(f"Error processing file: {str(e)}")