kambris committed on
Commit
f427760
·
verified ·
1 Parent(s): b77a329

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -12
app.py CHANGED
@@ -233,7 +233,7 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
233
  "n_gram_range": (1, 1),
234
  "top_n_words": 15,
235
  "verbose": True,
236
- }}
237
  st.write(f"Total documents: {len(df)}")
238
  st.write(f"Topic strategy: {topic_strategy}")
239
  st.write(f"Min topic size: {min_topic_size}")
@@ -243,10 +243,14 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
243
  else:
244
  topic_model_params["nr_topics"] = "auto"
245
 
246
- topic_model = BERTopic(**topic_model_params)
 
 
247
 
248
  # Create vectorizer with stop words
249
- vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS))
 
 
250
  topic_model.vectorizer_model = vectorizer
251
 
252
  for country, group in df.groupby('country'):
@@ -258,11 +262,21 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
258
 
259
  embeddings = []
260
  for i, text in enumerate(texts):
261
- embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
262
- embeddings.append(embedding)
 
 
 
 
 
 
 
 
263
  progress = (i + 1) / len(texts) * 0.4
264
  progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
265
 
 
 
266
  embeddings = np.array(embeddings)
267
 
268
  for i, text in enumerate(texts):
@@ -272,16 +286,25 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
272
  progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
273
 
274
  try:
 
 
 
 
 
 
275
  topics, probs = topic_model.fit_transform(texts, embeddings)
276
- st.write(f"Number of unique topics: {len(set(topics))}")
277
- st.write(f"Topic distribution: {Counter(topics)}")
278
- topic_counts = Counter(topics)
279
- if -1 in topic_counts:
280
- del topic_counts[-1]
 
 
 
281
 
282
  top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
283
  top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
284
-
285
  summaries.append({
286
  'country': country,
287
  'total_poems': len(texts),
@@ -295,7 +318,6 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
295
  continue
296
 
297
  return summaries, topic_model
298
-
299
  # Load models
300
  try:
301
  bert_tokenizer, bert_model, emotion_classifier = load_models()
 
233
  "n_gram_range": (1, 1),
234
  "top_n_words": 15,
235
  "verbose": True,
236
+ }
237
  st.write(f"Total documents: {len(df)}")
238
  st.write(f"Topic strategy: {topic_strategy}")
239
  st.write(f"Min topic size: {min_topic_size}")
 
243
  else:
244
  topic_model_params["nr_topics"] = "auto"
245
 
246
+ topic_model = BERTopic(
247
+ embedding_model=bert_model,
248
+ **topic_model_params)
249
 
250
  # Create vectorizer with stop words
251
+ vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
252
+ min_df=2,
253
+ max_df=0.95)
254
  topic_model.vectorizer_model = vectorizer
255
 
256
  for country, group in df.groupby('country'):
 
262
 
263
  embeddings = []
264
  for i, text in enumerate(texts):
265
+ try:
266
+ embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
267
+ if embedding is not None and not np.isnan(embedding).any():
268
+ embeddings.append(embedding)
269
+ else:
270
+ st.warning(f"Invalid embedding generated for text {i+1} in {country}")
271
+ continue
272
+ except Exception as e:
273
+ st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
274
+ continue
275
  progress = (i + 1) / len(texts) * 0.4
276
  progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
277
 
278
+ if len(embeddings) != len(texts):
279
+ texts = texts[:len(embeddings)]
280
  embeddings = np.array(embeddings)
281
 
282
  for i, text in enumerate(texts):
 
286
  progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
287
 
288
  try:
289
+
290
+ if len(texts) < min_topic_size:
291
+ st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
292
+ continue
293
+
294
+
295
  topics, probs = topic_model.fit_transform(texts, embeddings)
296
+
297
+
298
+ valid_topics = [t for t in topics if t != -1]
299
+ if not valid_topics:
300
+ st.warning(f"No valid topics generated for {country}")
301
+ continue
302
+
303
+ topic_counts = Counter(valid_topics)
304
 
305
  top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
306
  top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
307
+
308
  summaries.append({
309
  'country': country,
310
  'total_poems': len(texts),
 
318
  continue
319
 
320
  return summaries, topic_model
 
321
  # Load models
322
  try:
323
  bert_tokenizer, bert_model, emotion_classifier = load_models()