kambris committed
Commit 5018c2f · verified · 1 Parent(s): 6b0bce1

Update app.py

Files changed (1)
  1. app.py +54 -70
app.py CHANGED
@@ -10,6 +10,7 @@ import os
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 import pkg_resources
+import gc
 
 current_dir = os.path.dirname(os.path.abspath(__file__))
 font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
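Note on this hunk: only `gc` is imported here, while a later hunk calls `torch.cuda.empty_cache()`, so the commit relies on `torch` already being imported elsewhere in app.py. A minimal sketch of the per-batch cleanup pattern, with `free_memory` as an illustrative name not taken from the commit:

import gc
import torch  # assumed to be imported elsewhere in app.py

def free_memory():
    """Release Python-level garbage and, when a GPU is present, cached CUDA blocks."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()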
@@ -51,7 +52,16 @@ def load_models():
         return_all_scores=True
     )
     return tokenizer, bert_model, emotion_classifier
+
+@st.cache_data
+def cache_embeddings(text, tokenizer, model):
+    return get_embedding_for_text(text, tokenizer, model)
+
+@st.cache_data
+def cache_emotion_classification(text, classifier):
+    return classify_emotion(text, classifier)
 
+@st.cache_data
 def split_text(text, max_length=512):
     """Split text into chunks of maximum token length while preserving word boundaries."""
     words = text.split()
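Reviewer note: `st.cache_data` hashes every argument to build its cache key, and tokenizer, model, and pipeline objects are generally not hashable, so calls like `cache_embeddings(text, bert_tokenizer, bert_model)` can raise `UnhashableParamError`. Streamlit's documented escape hatch is to prefix such parameters with an underscore so they are excluded from hashing. A hedged sketch of the wrappers under that convention (`get_embedding_for_text` and `classify_emotion` are the helpers already defined in app.py):

import streamlit as st

@st.cache_data
def cache_embeddings(text, _tokenizer, _model):
    # Leading underscores tell st.cache_data not to hash the tokenizer/model;
    # only `text` participates in the cache key.
    return get_embedding_for_text(text, _tokenizer, _model)

@st.cache_data
def cache_emotion_classification(text, _classifier):
    # Same idea: cache by text, skip hashing the pipeline object.
    return classify_emotion(text, _classifier)

Caching `split_text` itself is unproblematic, since it only takes a string and an integer.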
@@ -225,96 +235,70 @@ def format_emotions(emotion_counts):
     return formatted_emotions
 
 def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
-    """Process the data and generate summaries with flexible topic configuration."""
     summaries = []
 
-    topic_model_params = {
-        "language": "arabic",
-        "calculate_probabilities": True,
-        "min_topic_size": 3,
-        "n_gram_range": (1, 1),
-        "top_n_words": 15,
-        "verbose": True,
-    }
-    st.write(f"Total documents: {len(df)}")
-    st.write(f"Topic strategy: {topic_strategy}")
-    st.write(f"Min topic size: {min_topic_size}")
-
-    if topic_strategy == "Manual":
-        topic_model_params["nr_topics"] = n_topics
-    else:
-        topic_model_params["nr_topics"] = "auto"
+    # Create a placeholder for the progress bar
+    progress_placeholder = st.empty()
+    progress_bar = progress_placeholder.progress(0)
 
-    topic_model = BERTopic(
-        embedding_model=bert_model,
-        **topic_model_params)
-
-    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
-                                 min_df=1,
-                                 max_df=1.0)
-    topic_model.vectorizer_model = vectorizer
+    # Create status message placeholder
+    status_message = st.empty()
 
     for country, group in df.groupby('country'):
-        progress_text = f"Processing poems for {country}..."
-        progress_bar = st.progress(0, text=progress_text)
+        # Clear memory at the start of each country's processing
+        gc.collect()
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
 
+        status_message.text(f"Processing poems for {country}...")
         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
 
+        # Use cached embeddings with progress tracking
         embeddings = []
+        total_texts = len(texts)
        for i, text in enumerate(texts):
            try:
-                embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
+                embedding = cache_embeddings(text, bert_tokenizer, bert_model)
                 if embedding is not None and not np.isnan(embedding).any():
                     embeddings.append(embedding)
-                else:
-                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
-                    continue
+
+                # Update progress more frequently
+                if i % max(1, total_texts // 100) == 0:
+                    progress = (i + 1) / total_texts * 0.4
+                    progress_bar.progress(progress)
+                    status_message.text(f"Generated embeddings for {i+1}/{total_texts} poems in {country}...")
+
            except Exception as e:
-                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
+                st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
                 continue
-            progress = (i + 1) / len(texts) * 0.4
-            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
-
-        if len(embeddings) != len(texts):
-            texts = texts[:len(embeddings)]
-        embeddings = np.array(embeddings)
 
+        # Process emotions with caching and progress tracking
         for i, text in enumerate(texts):
-            emotion = classify_emotion(text, emotion_classifier)
-            all_emotions.append(emotion)
-            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
-            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
-
-        try:
-
-            if len(texts) < min_topic_size:
-                st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
-                continue
+            try:
+                emotion = cache_emotion_classification(text, emotion_classifier)
+                all_emotions.append(emotion)
 
-
-            topics, probs = topic_model.fit_transform(texts, embeddings)
-
-
-            topic_counts = Counter(topics)
-
-            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
-            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
-
-            summaries.append({
-                'country': country,
-                'total_poems': len(texts),
-                'top_topics': top_topics,
-                'top_emotions': top_emotions
-            })
-            progress_bar.progress(1.0, text="Processing complete!")
-
-        except Exception as e:
-            st.warning(f"Could not generate topics for {country}: {str(e)}")
-            continue
-
+                if i % max(1, total_texts // 100) == 0:
+                    progress = 0.4 + ((i + 1) / total_texts * 0.3)
+                    progress_bar.progress(progress)
+                    status_message.text(f"Classified emotions for {i+1}/{total_texts} poems in {country}...")
+
+            except Exception as e:
+                st.warning(f"Error classifying emotion for poem {i+1} in {country}: {str(e)}")
+                continue
+
+        # Rest of your existing processing code...
+
+        # Clear progress for next country
+        progress_placeholder.empty()
+        status_message.empty()
+
+        # Create new progress bar for next country
+        progress_placeholder = st.empty()
+        progress_bar = progress_placeholder.progress(0)
+        status_message = st.empty()
+
     return summaries, topic_model
-
 try:
     bert_tokenizer, bert_model, emotion_classifier = load_models()
     st.success("Models loaded successfully!")
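Two notes on the last hunk. First, it removes the BERTopic and CountVectorizer setup while the function still ends with `return summaries, topic_model`, so the elided "# Rest of your existing processing code..." is presumably where the topic model is still built, fitted, and the per-country summaries appended; as shown in the hunk alone, `topic_model` would be undefined. Second, progress reporting now goes through `st.empty()` placeholders so each country's bar and status text can be cleared and recreated. A self-contained sketch of that placeholder pattern, where `groups` and the `time.sleep` call are stand-ins for the real data and work:

import time
import streamlit as st

groups = {"Egypt": ["poem"] * 5, "Iraq": ["poem"] * 3}  # hypothetical stand-in data

for country, poems in groups.items():
    # One placeholder holds the progress bar, another holds the status line
    progress_placeholder = st.empty()
    status_message = st.empty()
    progress_bar = progress_placeholder.progress(0)

    total = len(poems)
    for i, _ in enumerate(poems):
        time.sleep(0.1)  # stand-in for embedding / emotion-classification work
        progress_bar.progress((i + 1) / total)
        status_message.text(f"Processed {i + 1}/{total} poems in {country}...")

    # Clear both slots so the next country starts with a fresh bar and status line
    progress_placeholder.empty()
    status_message.empty()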