kambris committed on
Commit
cebfb12
·
verified ·
1 Parent(s): 4c9a0ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -54
app.py CHANGED
@@ -253,7 +253,7 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
253
  topic_model_params["nr_topics"] = "auto"
254
 
255
  topic_model = BERTopic(
256
- embedding_model=None, # Set to None since we're providing embeddings
257
  **topic_model_params
258
  )
259
 
@@ -264,49 +264,37 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
264
  )
265
  topic_model.vectorizer_model = vectorizer
266
 
267
- progress_placeholder = st.empty()
268
- progress_bar = progress_placeholder.progress(0)
269
- status_message = st.empty()
270
-
271
  for country, group in df.groupby('country'):
272
- gc.collect()
273
- torch.cuda.empty_cache() if torch.cuda.is_available() else None
274
 
275
- status_message.text(f"Processing poems for {country}...")
276
  texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
277
  all_emotions = []
278
- embeddings_list = []
279
 
280
- total_texts = len(texts)
 
281
  for i, text in enumerate(texts):
282
  try:
283
- embedding = cache_embeddings(text, bert_tokenizer, bert_model)
284
  if embedding is not None and not np.isnan(embedding).any():
285
- # Ensure embedding is 2D
286
- if len(embedding.shape) == 1:
287
- embedding = embedding.reshape(1, -1)
288
- embeddings_list.append(embedding)
289
 
290
- if i % max(1, total_texts // 100) == 0:
291
- progress = (i + 1) / total_texts * 0.4
292
- progress_bar.progress(progress)
293
- status_message.text(f"Generated embeddings for {i+1}/{total_texts} poems in {country}...")
294
-
295
  except Exception as e:
296
  st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
297
  continue
298
 
 
 
 
299
  # Process emotions
300
  for i, text in enumerate(texts):
301
  try:
302
- emotion = cache_emotion_classification(text, emotion_classifier)
303
  all_emotions.append(emotion)
304
-
305
- if i % max(1, total_texts // 100) == 0:
306
- progress = 0.4 + ((i + 1) / total_texts * 0.3)
307
- progress_bar.progress(progress)
308
- status_message.text(f"Classified emotions for {i+1}/{total_texts} poems in {country}...")
309
-
310
  except Exception as e:
311
  st.warning(f"Error classifying emotion for poem {i+1} in {country}: {str(e)}")
312
  continue
@@ -316,37 +304,31 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
316
  st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
317
  continue
318
 
319
- if embeddings_list:
320
- # Stack all embeddings into a single 2D array
321
- embeddings = np.vstack(embeddings_list)
322
-
323
- topics, probs = topic_model.fit_transform(texts, embeddings)
324
- topic_counts = Counter(topics)
325
-
326
- top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
327
- top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
328
-
329
- summaries.append({
330
- 'country': country,
331
- 'total_poems': len(texts),
332
- 'top_topics': top_topics,
333
- 'top_emotions': top_emotions
334
- })
335
- progress_bar.progress(1.0, text="Processing complete!")
336
- else:
337
- st.warning(f"No valid embeddings generated for {country}")
338
-
339
  except Exception as e:
340
  st.warning(f"Could not generate topics for {country}: {str(e)}")
341
  continue
342
-
343
- progress_placeholder.empty()
344
- status_message.empty()
345
- progress_placeholder = st.empty()
346
- progress_bar = progress_placeholder.progress(0)
347
- status_message = st.empty()
348
-
349
  return summaries, topic_model
 
350
  try:
351
  bert_tokenizer, bert_model, emotion_classifier = load_models()
352
  st.success("Models loaded successfully!")
 
253
  topic_model_params["nr_topics"] = "auto"
254
 
255
  topic_model = BERTopic(
256
+ embedding_model=None, # Changed from bert_model to None
257
  **topic_model_params
258
  )
259
 
 
264
  )
265
  topic_model.vectorizer_model = vectorizer
266
 
 
 
 
 
267
  for country, group in df.groupby('country'):
268
+ progress_text = f"Processing poems for {country}..."
269
+ progress_bar = st.progress(0, text=progress_text)
270
 
 
271
  texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
272
  all_emotions = []
 
273
 
274
+ # Generate embeddings
275
+ embeddings = []
276
  for i, text in enumerate(texts):
277
  try:
278
+ embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
279
  if embedding is not None and not np.isnan(embedding).any():
280
+ embeddings.append(embedding)
 
 
 
281
 
282
+ progress = (i + 1) / len(texts) * 0.4
283
+ progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
 
 
 
284
  except Exception as e:
285
  st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
286
  continue
287
 
288
+ # Convert embeddings to numpy array
289
+ embeddings = np.array(embeddings)
290
+
291
  # Process emotions
292
  for i, text in enumerate(texts):
293
  try:
294
+ emotion = classify_emotion(text, emotion_classifier)
295
  all_emotions.append(emotion)
296
+ progress = 0.4 + ((i + 1) / len(texts) * 0.3)
297
+ progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
 
 
 
 
298
  except Exception as e:
299
  st.warning(f"Error classifying emotion for poem {i+1} in {country}: {str(e)}")
300
  continue
 
304
  st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
305
  continue
306
 
307
+ # Ensure texts and embeddings match
308
+ if len(embeddings) != len(texts):
309
+ texts = texts[:len(embeddings)]
310
+
311
+ # Fit and transform the topic model
312
+ topics, probs = topic_model.fit_transform(texts, embeddings)
313
+ topic_counts = Counter(topics)
314
+
315
+ top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
316
+ top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
317
+
318
+ summaries.append({
319
+ 'country': country,
320
+ 'total_poems': len(texts),
321
+ 'top_topics': top_topics,
322
+ 'top_emotions': top_emotions
323
+ })
324
+ progress_bar.progress(1.0, text="Processing complete!")
325
+
 
326
  except Exception as e:
327
  st.warning(f"Could not generate topics for {country}: {str(e)}")
328
  continue
329
+
 
 
 
 
 
 
330
  return summaries, topic_model
331
+
332
  try:
333
  bert_tokenizer, bert_model, emotion_classifier = load_models()
334
  st.success("Models loaded successfully!")