Spaces:

kambris
/

SoLProject

Runtime error

App Files Community

kambris committed on Dec 7, 2024

Commit

cebfb12

verified ·

1 Parent(s): 4c9a0ea

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -54

app.py CHANGED Viewed

@@ -253,7 +253,7 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
         topic_model_params["nr_topics"] = "auto"
     topic_model = BERTopic(
-        embedding_model=None,  # Set to None since we're providing embeddings
         **topic_model_params
     )
@@ -264,49 +264,37 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
     )
     topic_model.vectorizer_model = vectorizer
-    progress_placeholder = st.empty()
-    progress_bar = progress_placeholder.progress(0)
-    status_message = st.empty()
     for country, group in df.groupby('country'):
-        gc.collect()
-        torch.cuda.empty_cache() if torch.cuda.is_available() else None
-        status_message.text(f"Processing poems for {country}...")
         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
-        embeddings_list = []
-        total_texts = len(texts)
         for i, text in enumerate(texts):
             try:
-                embedding = cache_embeddings(text, bert_tokenizer, bert_model)
                 if embedding is not None and not np.isnan(embedding).any():
-                    # Ensure embedding is 2D
-                    if len(embedding.shape) == 1:
-                        embedding = embedding.reshape(1, -1)
-                    embeddings_list.append(embedding)
-                if i % max(1, total_texts // 100) == 0:
-                    progress = (i + 1) / total_texts * 0.4
-                    progress_bar.progress(progress)
-                    status_message.text(f"Generated embeddings for {i+1}/{total_texts} poems in {country}...")
             except Exception as e:
                 st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
                 continue
         # Process emotions
         for i, text in enumerate(texts):
             try:
-                emotion = cache_emotion_classification(text, emotion_classifier)
                 all_emotions.append(emotion)
-                if i % max(1, total_texts // 100) == 0:
-                    progress = 0.4 + ((i + 1) / total_texts * 0.3)
-                    progress_bar.progress(progress)
-                    status_message.text(f"Classified emotions for {i+1}/{total_texts} poems in {country}...")
             except Exception as e:
                 st.warning(f"Error classifying emotion for poem {i+1} in {country}: {str(e)}")
                 continue
@@ -316,37 +304,31 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
                 st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
                 continue
-            if embeddings_list:
-                # Stack all embeddings into a single 2D array
-                embeddings = np.vstack(embeddings_list)
-                topics, probs = topic_model.fit_transform(texts, embeddings)
-                topic_counts = Counter(topics)
-                top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
-                top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
-                summaries.append({
-                    'country': country,
-                    'total_poems': len(texts),
-                    'top_topics': top_topics,
-                    'top_emotions': top_emotions
-                })
-                progress_bar.progress(1.0, text="Processing complete!")
-            else:
-                st.warning(f"No valid embeddings generated for {country}")
         except Exception as e:
             st.warning(f"Could not generate topics for {country}: {str(e)}")
             continue
-        progress_placeholder.empty()
-        status_message.empty()
-        progress_placeholder = st.empty()
-        progress_bar = progress_placeholder.progress(0)
-        status_message = st.empty()
     return summaries, topic_model
 try:
     bert_tokenizer, bert_model, emotion_classifier = load_models()
     st.success("Models loaded successfully!")

         topic_model_params["nr_topics"] = "auto"
     topic_model = BERTopic(
+        embedding_model=None,  # Changed from bert_model to None
         **topic_model_params
     )
     )
     topic_model.vectorizer_model = vectorizer
     for country, group in df.groupby('country'):
+        progress_text = f"Processing poems for {country}..."
+        progress_bar = st.progress(0, text=progress_text)
         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
+        # Generate embeddings
+        embeddings = []
         for i, text in enumerate(texts):
             try:
+                embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
                 if embedding is not None and not np.isnan(embedding).any():
+                    embeddings.append(embedding)
+                progress = (i + 1) / len(texts) * 0.4
+                progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
             except Exception as e:
                 st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
                 continue
+        # Convert embeddings to numpy array
+        embeddings = np.array(embeddings)
         # Process emotions
         for i, text in enumerate(texts):
             try:
+                emotion = classify_emotion(text, emotion_classifier)
                 all_emotions.append(emotion)
+                progress = 0.4 + ((i + 1) / len(texts) * 0.3)
+                progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
             except Exception as e:
                 st.warning(f"Error classifying emotion for poem {i+1} in {country}: {str(e)}")
                 continue
                 st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
                 continue
+            # Ensure texts and embeddings match
+            if len(embeddings) != len(texts):
+                texts = texts[:len(embeddings)]
+            # Fit and transform the topic model
+            topics, probs = topic_model.fit_transform(texts, embeddings)
+            topic_counts = Counter(topics)
+            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
+            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
+            summaries.append({
+                'country': country,
+                'total_poems': len(texts),
+                'top_topics': top_topics,
+                'top_emotions': top_emotions
+            })
+            progress_bar.progress(1.0, text="Processing complete!")
         except Exception as e:
             st.warning(f"Could not generate topics for {country}: {str(e)}")
             continue
     return summaries, topic_model
 try:
     bert_tokenizer, bert_model, emotion_classifier = load_models()
     st.success("Models loaded successfully!")