kambris committed
Commit 5835cbd · verified · 1 Parent(s): 729733d

Update app.py

Files changed (1):
  app.py  +106 −56
app.py CHANGED

@@ -10,7 +10,9 @@ import os
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 import pkg_resources
-import gc
+import folium
+import country_converter as coco
+
 
 current_dir = os.path.dirname(os.path.abspath(__file__))
 font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
@@ -52,16 +54,7 @@ def load_models():
         return_all_scores=True
     )
     return tokenizer, bert_model, emotion_classifier
-
-@st.cache_data
-def cache_embeddings(text, _tokenizer, _model):
-    return get_embedding_for_text(text, _tokenizer, _model)
-
-@st.cache_data
-def cache_emotion_classification(text, _classifier):
-    return classify_emotion(text, _classifier)
 
-@st.cache_data
 def split_text(text, max_length=512):
     """Split text into chunks of maximum token length while preserving word boundaries."""
     words = text.split()
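Note on the removed caching: `st.cache_data` skips hashing for any parameter whose name starts with an underscore, which is how the deleted helpers passed unhashable tokenizer/model objects through; only `text` formed the cache key. If caching is restored later, a minimal sketch of that pattern, reusing this file's `get_embedding_for_text`:

```python
import streamlit as st

@st.cache_data
def cache_embeddings(text, _tokenizer, _model):
    # Leading underscores exclude the tokenizer/model from the cache key,
    # so cache hits are determined by `text` alone.
    return get_embedding_for_text(text, _tokenizer, _model)
```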
@@ -84,6 +77,62 @@ def split_text(text, max_length=512):
     chunks.append(' '.join(current_chunk))
 
     return chunks
+
+def get_country_coordinates():
+    """Returns dictionary of Arab country coordinates"""
+    return {
+        'Egypt': [26.8206, 30.8025],
+        'Saudi Arabia': [23.8859, 45.0792],
+        'UAE': [23.4241, 53.8478],
+        'Kuwait': [29.3117, 47.4818],
+        'Iraq': [33.2232, 43.6793],
+        'Syria': [34.8021, 38.9968],
+        'Lebanon': [33.8547, 35.8623],
+        'Jordan': [30.5852, 36.2384],
+        'Palestine': [31.9522, 35.2332],
+        'Yemen': [15.5527, 48.5164],
+        'Oman': [21.4735, 55.9754],
+        'Qatar': [25.3548, 51.1839],
+        'Bahrain': [26.0667, 50.5577],
+        'Sudan': [12.8628, 30.2176],
+        'Libya': [26.3351, 17.2283],
+        'Tunisia': [33.8869, 9.5375],
+        'Algeria': [28.0339, 1.6596],
+        'Morocco': [31.7917, -7.0926],
+        'Mauritania': [21.0079, -10.9408]
+    }
+def create_topic_map(summaries):
+    """Create an interactive map showing topic distribution"""
+    coordinates = get_country_coordinates()
+
+    # Create base map centered on Arab world
+    m = folium.Map(location=[25.0, 30.0], zoom_start=4)
+
+    for summary in summaries:
+        country = summary['country']
+        if country in coordinates:
+            # Get top topic
+            top_topic = summary['top_topics'][0]['topic'] if summary['top_topics'] else "No topics"
+            top_emotion = summary['top_emotions'][0]['emotion'] if summary['top_emotions'] else "No emotion"
+
+            # Create popup content
+            popup_content = f"""
+            <b>{country}</b><br>
+            Top Topic: {top_topic}<br>
+            Main Emotion: {top_emotion}<br>
+            Total Poems: {summary['total_poems']}
+            """
+
+            # Add marker
+            folium.CircleMarker(
+                location=coordinates[country],
+                radius=10,
+                popup=folium.Popup(popup_content, max_width=300),
+                color='red',
+                fill=True
+            ).add_to(m)
+
+    return m
 
 def create_arabic_wordcloud(text, title):
     wordcloud = WordCloud(
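The newly imported `country_converter` is not used in any hunk shown here, and the coordinate lookup matches country names as exact strings (including informal keys like 'UAE'). A hedged smoke-test sketch for the two new dependencies, runnable outside Streamlit; the output file name and the `name_short` target are illustrative choices:

```python
import folium
import country_converter as coco

# country_converter normalizes name variants, but its short names
# (e.g. "United Arab Emirates") differ from this app's key "UAE",
# so a mapping step would still be needed before the dict lookup.
print(coco.convert(names="Egypt", to="name_short"))

# Render one marker and inspect the map in a browser.
m = folium.Map(location=[25.0, 30.0], zoom_start=4)
folium.CircleMarker(location=[26.8206, 30.8025], radius=10,
                    popup="Egypt", color="red", fill=True).add_to(m)
m.save("topic_map_preview.html")
```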
@@ -170,9 +219,9 @@ def classify_emotion(text, classifier):
     return "LABEL_2"
 
 def get_embedding_for_text(text, tokenizer, model):
+    """Get embedding for complete text."""
     chunks = split_text(text)
     chunk_embeddings = []
-    embedding_size = model.config.hidden_size
 
     for chunk in chunks:
         try:
@@ -189,16 +238,18 @@ def get_embedding_for_text(text, tokenizer, model):
             outputs = model(**inputs)
 
             embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
-            chunk_embeddings.append(embedding.reshape(-1))
+            chunk_embeddings.append(embedding[0])
         except Exception as e:
+            st.warning(f"Error processing chunk: {str(e)}")
             continue
 
     if chunk_embeddings:
-        # Ensure consistent shape
-        final_embedding = np.mean(chunk_embeddings, axis=0)
-        return final_embedding.reshape(-1)
-    return np.zeros(embedding_size)
-
+        weights = np.array([len(chunk.split()) for chunk in chunks])
+        weights = weights / weights.sum()
+        weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
+        return weighted_embedding
+    return np.zeros(model.config.hidden_size)
+
 def format_topics(topic_model, topic_counts):
     """Format topics for display."""
    formatted_topics = []
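Review note on the new weighted mean: `weights` is computed over every chunk in `chunks`, while `chunk_embeddings` only holds chunks that survived the `try` block, so a single failed chunk makes `np.average` raise a length mismatch. A defensive sketch that keeps the two aligned; `weighted_chunk_mean` and `embed_fn` are hypothetical names:

```python
import numpy as np

def weighted_chunk_mean(chunks, embed_fn, hidden_size):
    """Length-weighted mean of chunk embeddings, keeping each weight
    paired with a chunk that actually produced an embedding."""
    pairs = []
    for chunk in chunks:
        try:
            pairs.append((len(chunk.split()), embed_fn(chunk)))
        except Exception:
            continue  # a failed chunk drops its weight as well
    if not pairs:
        return np.zeros(hidden_size)
    weights = np.array([w for w, _ in pairs], dtype=float)
    embeddings = np.stack([emb for _, emb in pairs])
    return np.average(embeddings, axis=0, weights=weights / weights.sum())
```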
@@ -233,17 +284,20 @@ def format_emotions(emotion_counts):
     return formatted_emotions
 
 def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
+    """Process the data and generate summaries with flexible topic configuration."""
     summaries = []
-    embedding_size = bert_model.config.hidden_size
 
     topic_model_params = {
         "language": "arabic",
         "calculate_probabilities": True,
-        "min_topic_size": min_topic_size,
+        "min_topic_size": 3,
         "n_gram_range": (1, 1),
         "top_n_words": 15,
         "verbose": True,
     }
+    st.write(f"Total documents: {len(df)}")
+    st.write(f"Topic strategy: {topic_strategy}")
+    st.write(f"Min topic size: {min_topic_size}")
 
     if topic_strategy == "Manual":
         topic_model_params["nr_topics"] = n_topics
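Review note: the params dict now hardcodes `"min_topic_size": 3`, so the function's `min_topic_size` argument only affects the per-country document check and the diagnostics printed above. If the caller's value is meant to reach BERTopic, the dict would presumably forward it:

```python
topic_model_params = {
    "language": "arabic",
    "calculate_probabilities": True,
    "min_topic_size": min_topic_size,  # forward the argument instead of hardcoding 3
    "n_gram_range": (1, 1),
    "top_n_words": 15,
    "verbose": True,
}
```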
@@ -251,15 +305,12 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
         topic_model_params["nr_topics"] = "auto"
 
     topic_model = BERTopic(
-        embedding_model=None,
-        **topic_model_params
-    )
+        embedding_model=bert_model,
+        **topic_model_params)
 
-    vectorizer = CountVectorizer(
-        stop_words=list(ARABIC_STOP_WORDS),
-        min_df=1,
-        max_df=1.0
-    )
+    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
+                                 min_df=1,
+                                 max_df=1.0)
     topic_model.vectorizer_model = vectorizer
 
     for country, group in df.groupby('country'):
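Review note: `embedding_model=bert_model` hands BERTopic a raw `transformers` model rather than the sentence-transformers-style encoder its backends expect. Because precomputed embeddings are passed to `fit_transform` below, fitting never calls that model, but a later `transform` on unseen text would. A minimal sketch of the precomputed-embeddings pattern this code relies on, assuming `texts` and a matching 2-D `embeddings` array:

```python
import numpy as np
from bertopic import BERTopic

topic_model = BERTopic(language="arabic", calculate_probabilities=True)
# Supplying embeddings makes BERTopic skip its own embedding step.
topics, probs = topic_model.fit_transform(texts, np.asarray(embeddings))
```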
@@ -268,48 +319,42 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
 
         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
-        embeddings = []
 
-        # Generate embeddings
+        embeddings = []
         for i, text in enumerate(texts):
             try:
                 embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
-                if embedding is not None and embedding.shape[0] == embedding_size:
+                if embedding is not None and not np.isnan(embedding).any():
                     embeddings.append(embedding)
-                progress = (i + 1) / len(texts) * 0.4
-                progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
+                else:
+                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
+                    continue
             except Exception as e:
-                st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
+                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
                 continue
+            progress = (i + 1) / len(texts) * 0.4
+            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
 
-        # Convert to numpy array and ensure 2D shape
-        if embeddings:
-            embeddings = np.vstack(embeddings)
-        else:
-            st.warning(f"No valid embeddings generated for {country}")
-            continue
-
-        # Process emotions
-        for i, text in enumerate(texts[:len(embeddings)]):
-            try:
-                emotion = classify_emotion(text, emotion_classifier)
-                all_emotions.append(emotion)
-                progress = 0.4 + ((i + 1) / len(texts) * 0.3)
-                progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
-            except Exception as e:
-                st.warning(f"Error classifying emotion for poem {i+1} in {country}: {str(e)}")
-                continue
+        if len(embeddings) != len(texts):
+            texts = texts[:len(embeddings)]
+        embeddings = np.array(embeddings)
+
+        for i, text in enumerate(texts):
+            emotion = classify_emotion(text, emotion_classifier)
+            all_emotions.append(emotion)
+            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
+            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
 
         try:
+
             if len(texts) < min_topic_size:
                 st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
                 continue
+
 
-            # Ensure texts and embeddings match
-            texts = texts[:len(embeddings)]
-
-            # Fit and transform the topic model
             topics, probs = topic_model.fit_transform(texts, embeddings)
+
+
             topic_counts = Counter(topics)
 
             top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
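Review note on `texts = texts[:len(embeddings)]` above: truncation only realigns the two lists if every failure happened at the tail; a failure in the middle silently pairs poems with the wrong embeddings. Collecting pairs avoids that; a sketch using this file's own helpers, with `pairs` as a hypothetical local:

```python
# Keep each poem attached to its own embedding so a mid-list failure
# cannot shift the alignment between texts and embeddings.
pairs = []
for i, text in enumerate(texts):
    try:
        emb = get_embedding_for_text(text, bert_tokenizer, bert_model)
        if emb is not None and not np.isnan(emb).any():
            pairs.append((text, emb))
    except Exception as e:
        st.warning(f"Skipping poem {i+1}: {str(e)}")
texts = [t for t, _ in pairs]
embeddings = np.array([emb for _, emb in pairs])
```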
@@ -329,7 +374,6 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
 
     return summaries, topic_model
 
-
 try:
     bert_tokenizer, bert_model, emotion_classifier = load_models()
     st.success("Models loaded successfully!")
@@ -412,7 +456,7 @@ if uploaded_file is not None:
         if summaries:
             st.success("Analysis complete!")
 
-            tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
+            tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Topic Map"])
 
             with tab1:
                 for summary in summaries:
@@ -445,6 +489,12 @@ if uploaded_file is not None:
                     words = topic_model.get_topic(row['Topic'])
                     topic_name = " | ".join([word for word, _ in words[:5]])
                     st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
+
+            with tab3:
+                st.subheader("Topic Distribution Map")
+                topic_map = create_topic_map(summaries)
+                # Display the map
+                st.components.v1.html(topic_map._repr_html_(), height=600)
 
         except Exception as e:
             st.error(f"Error processing file: {str(e)}")
 