kambris committed
Commit 6b0bce1 · verified · 1 Parent(s): b449fa6

Update app.py

Files changed (1): app.py +154 -229
app.py CHANGED
@@ -10,10 +10,6 @@ import os
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 import pkg_resources
-import folium
-from folium.plugins import HeatMap
-import country_converter as coco
-from streamlit_folium import folium_static
 
 current_dir = os.path.dirname(os.path.abspath(__file__))
 font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
@@ -43,98 +39,19 @@ st.set_page_config(
 
 @st.cache_resource
 def load_models():
-    """Load and cache the models"""
-    # + Added use_fast=True for faster tokenization
-    tokenizer = AutoTokenizer.from_pretrained(
-        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
-        use_fast=True
-    )
-
-    # + Added torchscript and low_cpu_mem_usage
-    bert_model = AutoModel.from_pretrained(
-        "aubmindlab/bert-base-arabertv2",
-        torchscript=True,
-        low_cpu_mem_usage=True
-    )
-
-    # + Added optimizations for emotion model
-    emotion_model = AutoModelForSequenceClassification.from_pretrained(
-        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
-        torchscript=True,
-        low_cpu_mem_usage=True
-    )
-
-    # ~ Changed pipeline configuration to use batching
+    """Load and cache the models to prevent reloading"""
+    tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
+    bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
+    emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
+    emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
     emotion_classifier = pipeline(
         "sentiment-analysis",
         model=emotion_model,
-        tokenizer=tokenizer,
-        batch_size=32,
-        device=-1  # + Added to force CPU usage
+        tokenizer=emotion_tokenizer,
+        return_all_scores=True
     )
-
     return tokenizer, bert_model, emotion_classifier
 
-# + Added new batch processing function
-def process_texts_in_batches(texts, batch_size=32):
-    """Process texts in batches for better CPU utilization"""
-    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
-    results = []
-
-    for batch in batches:
-        batch_results = emotion_classifier(batch, truncation=True, max_length=512)
-        results.extend(batch_results)
-
-    return results
-
-# + Added caching decorator for embeddings
-@st.cache_data
-def get_cached_embeddings(text, tokenizer, model):
-    """Cache embeddings to avoid recomputation"""
-    return get_embedding_for_text(text, tokenizer, model)
-
-def create_theme_map(summaries, topic_model):
-    """Create an interactive map showing theme distributions across countries"""
-    try:
-        # Create a base map centered on the Arab world
-        m = folium.Map(location=[25, 45], zoom_start=4)
-
-        # Convert country names to coordinates
-        cc = coco.CountryConverter()
-
-        for summary in summaries:
-            try:
-                # Get country coordinates
-                country_iso = cc.convert(names=[summary['country']], to='ISO2')
-                country_data = cc.convert(names=[summary['country']], to='name_short')
-
-                # Create popup content with theme information
-                popup_content = f"""
-                <h4>{summary['country']}</h4>
-                <b>Top Themes:</b><br>
-                {'<br>'.join([f"• {topic['topic']}: {topic['count']}"
-                              for topic in summary['top_topics'][:5]])}
-                """
-
-                # Add marker for each country
-                folium.CircleMarker(
-                    location=[cc.convert(country_iso, to='latitude')[0],
-                              cc.convert(country_iso, to='longitude')[0]],
-                    radius=20,
-                    popup=folium.Popup(popup_content, max_width=300),
-                    color='red',
-                    fill=True,
-                    fill_opacity=0.7
-                ).add_to(m)
-            except Exception as e:
-                st.warning(f"Could not process {summary['country']}: {str(e)}")
-                continue
-
-        return m
-    except Exception as e:
-        st.error(f"Error creating map: {str(e)}")
-        return None
-
 def split_text(text, max_length=512):
     """Split text into chunks of maximum token length while preserving word boundaries."""
     words = text.split()
@@ -181,94 +98,99 @@ def clean_arabic_text(text):
     return ' '.join(cleaned_words)
 
 def classify_emotion(text, classifier):
-    """Classify emotion for complete text with precise token handling."""
-    # Ensure text is properly formatted
-    if not text or not isinstance(text, str):
-        return "LABEL_2"
-
-    # Split into manageable chunks
-    words = text.split()
-    chunks = []
-    current_chunk = []
-    current_length = 0
-
-    # Create proper-sized chunks
-    for word in words:
-        word_tokens = len(classifier.tokenizer.encode(word))
-        if current_length + word_tokens > 512:
-            if current_chunk:
-                chunks.append(' '.join(current_chunk))
-            current_chunk = [word]
-            current_length = word_tokens
-        else:
-            current_chunk.append(word)
-            current_length += word_tokens
-
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
-
-    if not chunks:
+    """Classify emotion for complete text with proper token handling."""
+    try:
+        words = text.split()
+        chunks = []
+        current_chunk = []
+        current_length = 0
+
+        for word in words:
+            word_tokens = len(classifier.tokenizer.encode(word))
+            if current_length + word_tokens > 512:
+                if current_chunk:
+                    chunks.append(' '.join(current_chunk))
+                current_chunk = [word]
+                current_length = word_tokens
+            else:
+                current_chunk.append(word)
+                current_length += word_tokens
+
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+
+        if not chunks:
+            chunks = [text]
+
+        all_scores = []
+        for chunk in chunks:
+            try:
+                inputs = classifier.tokenizer(
+                    chunk,
+                    truncation=True,
+                    max_length=512,
+                    return_tensors="pt"
+                )
+                result = classifier(chunk, truncation=True, max_length=512)
+                scores = result[0]
+                all_scores.append(scores)
+            except Exception as chunk_error:
+                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
+                continue
+
+        if all_scores:
+            label_scores = {}
+            count = len(all_scores)
+
+            for scores in all_scores:
+                for score in scores:
+                    label = score['label']
+                    if label not in label_scores:
+                        label_scores[label] = 0
+                    label_scores[label] += score['score']
+
+            avg_scores = {label: score/count for label, score in label_scores.items()}
+            final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
+            return final_emotion
+
         return "LABEL_2"
-
-    # Process chunks with proper output handling
-    all_scores = []
-    for chunk in chunks:
-        # Direct classification with proper output structure
-        result = classifier(chunk, return_all_scores=True)[0]
-        all_scores.append(result)
-
-    # Calculate final emotion
-    label_scores = {}
-    count = len(all_scores)
-
-    for scores in all_scores:
-        for score_dict in scores:
-            label = score_dict['label']
-            if label not in label_scores:
-                label_scores[label] = 0
-            label_scores[label] += score_dict['score']
-
-    avg_scores = {label: score/count for label, score in label_scores.items()}
-    final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
-
-    return final_emotion
+
+    except Exception as e:
+        st.warning(f"Error in emotion classification: {str(e)}")
+        return "LABEL_2"
 
 def get_embedding_for_text(text, tokenizer, model):
     """Get embedding for complete text."""
-    # First tokenize to get exact count
-    tokens = tokenizer.tokenize(text)
-
-    # Process in chunks of exactly 510 tokens (512 - 2 for CLS and SEP)
-    chunk_size = 510
+    chunks = split_text(text)
     chunk_embeddings = []
 
-    for i in range(0, len(tokens), chunk_size):
-        chunk = tokens[i:i + chunk_size]
-        # Convert tokens back to text
-        chunk_text = tokenizer.convert_tokens_to_string(chunk)
-        # Now encode with special tokens
-        encoded = tokenizer(
-            chunk_text,
-            return_tensors='pt',
-            max_length=512,
-            truncation=True,
-            padding='max_length'
-        )
-
-        # Move to device
-        encoded = {k: v.to(model.device) for k, v in encoded.items()}
-
-        # Get embedding
-        with torch.no_grad():
-            output = model(**encoded)
-        embedding = output[0][:, 0, :].cpu().numpy()
-        chunk_embeddings.append(embedding[0])
-
-    # Combine all chunk embeddings
+    for chunk in chunks:
+        try:
+            inputs = tokenizer(
+                chunk,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512
+            )
+            inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+            with torch.no_grad():
+                outputs = model(**inputs)
+
+            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+            chunk_embeddings.append(embedding[0])
+        except Exception as e:
+            st.warning(f"Error processing chunk: {str(e)}")
+            continue
 
     if chunk_embeddings:
-        return np.mean(chunk_embeddings, axis=0)
+        weights = np.array([len(chunk.split()) for chunk in chunks])
+        weights = weights / weights.sum()
+        weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
+        return weighted_embedding
     return np.zeros(model.config.hidden_size)
-
+
 def format_topics(topic_model, topic_counts):
     """Format topics for display."""
     formatted_topics = []
@@ -301,26 +223,31 @@ def format_emotions(emotion_counts):
             'count': count
         })
    return formatted_emotions
-
-def get_optimized_topic_model(bert_model):
-    """Configure BERTopic for better CPU performance"""
-    return BERTopic(
-        embedding_model=bert_model,
-        language="arabic",
-        calculate_probabilities=False,
-        verbose=False,
-        n_gram_range=(1, 1),
-        min_topic_size=5,
-        nr_topics="auto",
-        low_memory=True
-    )
-
 
 def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
     """Process the data and generate summaries with flexible topic configuration."""
     summaries = []
 
-    topic_model = get_optimized_topic_model(bert_model)
+    topic_model_params = {
+        "language": "arabic",
+        "calculate_probabilities": True,
+        "min_topic_size": 3,
+        "n_gram_range": (1, 1),
+        "top_n_words": 15,
+        "verbose": True,
+    }
+    st.write(f"Total documents: {len(df)}")
+    st.write(f"Topic strategy: {topic_strategy}")
+    st.write(f"Min topic size: {min_topic_size}")
+
+    if topic_strategy == "Manual":
+        topic_model_params["nr_topics"] = n_topics
+    else:
+        topic_model_params["nr_topics"] = "auto"
+
+    topic_model = BERTopic(
+        embedding_model=bert_model,
+        **topic_model_params)
 
     vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
                                  min_df=1,
@@ -334,58 +261,58 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
 
-        # Get embeddings while keeping all content
         embeddings = []
         for i, text in enumerate(texts):
-            # Tokenize the full text first
-            full_tokens = bert_tokenizer.tokenize(text)
-            chunk_embeddings = []
-
-            # Create chunks of 510 tokens (leaving room for special tokens)
-            for start_idx in range(0, len(full_tokens), 510):
-                end_idx = start_idx + 510
-                chunk_tokens = full_tokens[start_idx:end_idx]
-                chunk_text = bert_tokenizer.convert_tokens_to_string(chunk_tokens)
-
-                # Get embedding for this chunk
-                chunk_embedding = get_embedding_for_text(chunk_text, bert_tokenizer, bert_model)
-                chunk_embeddings.append(chunk_embedding)
-
-            # Combine embeddings for full poem representation
-            full_embedding = np.mean(chunk_embeddings, axis=0) if chunk_embeddings else np.zeros(bert_model.config.hidden_size)
-            embeddings.append(full_embedding)
-
+            try:
+                embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
+                if embedding is not None and not np.isnan(embedding).any():
+                    embeddings.append(embedding)
+                else:
+                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
+                    continue
+            except Exception as e:
+                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
+                continue
             progress = (i + 1) / len(texts) * 0.4
             progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
-
+
+        if len(embeddings) != len(texts):
+            texts = texts[:len(embeddings)]
         embeddings = np.array(embeddings)
 
-        # Process emotions with tuple output handling
         for i, text in enumerate(texts):
-            result = emotion_classifier(text)
-            emotion = result[0]  # Access first element of tuple
+            emotion = classify_emotion(text, emotion_classifier)
             all_emotions.append(emotion)
             progress = 0.4 + ((i + 1) / len(texts) * 0.3)
             progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
-
-        if len(texts) < min_topic_size:
-            st.info(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
+
+        try:
+            if len(texts) < min_topic_size:
+                st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
+                continue
+
+            topics, probs = topic_model.fit_transform(texts, embeddings)
+            topic_counts = Counter(topics)
+
+            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
+            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
+
+            summaries.append({
+                'country': country,
+                'total_poems': len(texts),
+                'top_topics': top_topics,
+                'top_emotions': top_emotions
+            })
+            progress_bar.progress(1.0, text="Processing complete!")
+
+        except Exception as e:
+            st.warning(f"Could not generate topics for {country}: {str(e)}")
             continue
-
-        topics, _ = topic_model.fit_transform(texts, embeddings)
-        topic_counts = Counter(topics)
-
-        top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
-        top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
-
-        summaries.append({
-            'country': country,
-            'total_poems': len(texts),
-            'top_topics': top_topics,
-            'top_emotions': top_emotions
-        })
-        progress_bar.progress(1.0, text="Processing complete!")
-
+
     return summaries, topic_model
 
 try:
@@ -470,7 +397,7 @@ if uploaded_file is not None:
         if summaries:
             st.success("Analysis complete!")
 
-            tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Theme Map"])
+            tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
 
             with tab1:
                 for summary in summaries:
@@ -503,10 +430,7 @@ if uploaded_file is not None:
                         words = topic_model.get_topic(row['Topic'])
                         topic_name = " | ".join([word for word, _ in words[:5]])
                         st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
-            with tab3:
-                st.subheader("Thematic Distribution Map")
-                theme_map = create_theme_map(summaries, topic_model)
-                folium_static(theme_map)
+
     except Exception as e:
         st.error(f"Error processing file: {str(e)}")
@@ -520,3 +444,4 @@ else:
     })
     st.dataframe(example_df)
 
+
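
Review note (not part of the commit): a minimal sketch of the pipeline wiring this commit settles on, using plain transformers with no folium or batching. The checkpoint IDs and the return_all_scores flag are taken from the diff above; the sample verse is illustrative, and the sketch assumes the checkpoints can be downloaded from the Hugging Face Hub.

    from transformers import (
        AutoModel,
        AutoModelForSequenceClassification,
        AutoTokenizer,
        pipeline,
    )

    # Same checkpoints as the new load_models()
    tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
    emotion_model = AutoModelForSequenceClassification.from_pretrained(
        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment"
    )
    emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")

    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=emotion_tokenizer,
        return_all_scores=True,  # per-label scores, as the new classify_emotion expects
    )

    # With return_all_scores=True, each input yields a list of {label, score}
    # dicts; classify_emotion averages these across its 512-token chunks.
    print(emotion_classifier("قفا نبك من ذكرى حبيب ومنزل")[0])

The length-weighted chunk averaging that replaces np.mean in get_embedding_for_text can be sanity-checked in isolation; a toy numpy illustration with stand-in vectors (not real CLS embeddings):

    import numpy as np

    chunks = ["five words in this chunk", "short"]  # 5 words vs. 1 word
    chunk_embeddings = [np.ones(4), np.zeros(4)]    # stand-ins for per-chunk CLS vectors

    weights = np.array([len(c.split()) for c in chunks], dtype=float)
    weights /= weights.sum()                        # -> [5/6, 1/6]
    print(np.average(chunk_embeddings, axis=0, weights=weights))  # ~0.833 in each slot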