Spaces:

kambris
/

SoLProject

Runtime error

App Files Community

kambris committed on Dec 7, 2024

Commit

4c9a0ea

verified ·

1 Parent(s): 89e32b2

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -37

app.py CHANGED Viewed

@@ -195,12 +195,13 @@ def get_embedding_for_text(text, tokenizer, model):
             continue
     if chunk_embeddings:
-        weights = np.array([len(chunk.split()) for chunk in chunks])
-        weights = weights / weights.sum()
-        weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
-        return weighted_embedding
-    return np.zeros(model.config.hidden_size)
 def format_topics(topic_model, topic_counts):
     """Format topics for display."""
     formatted_topics = []
@@ -252,41 +253,40 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
         topic_model_params["nr_topics"] = "auto"
     topic_model = BERTopic(
-        embedding_model=bert_model,
         **topic_model_params
     )
-    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
-                               min_df=1,
-                               max_df=1.0)
     topic_model.vectorizer_model = vectorizer
-    # Create a placeholder for the progress bar
     progress_placeholder = st.empty()
     progress_bar = progress_placeholder.progress(0)
-    # Create status message placeholder
     status_message = st.empty()
     for country, group in df.groupby('country'):
-        # Clear memory at the start of each country's processing
         gc.collect()
         torch.cuda.empty_cache() if torch.cuda.is_available() else None
         status_message.text(f"Processing poems for {country}...")
         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
-        # Use cached embeddings with progress tracking
-        embeddings = []
         total_texts = len(texts)
         for i, text in enumerate(texts):
             try:
                 embedding = cache_embeddings(text, bert_tokenizer, bert_model)
                 if embedding is not None and not np.isnan(embedding).any():
-                    embeddings.append(embedding)
-                # Update progress more frequently
                 if i % max(1, total_texts // 100) == 0:
                     progress = (i + 1) / total_texts * 0.4
                     progress_bar.progress(progress)
@@ -296,7 +296,7 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
                 st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
                 continue
-        # Process emotions with caching and progress tracking
         for i, text in enumerate(texts):
             try:
                 emotion = cache_emotion_classification(text, emotion_classifier)
@@ -316,30 +316,32 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
                 st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
                 continue
-            topics, probs = topic_model.fit_transform(texts, embeddings)
-            topic_counts = Counter(topics)
-            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
-            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
-            summaries.append({
-                'country': country,
-                'total_poems': len(texts),
-                'top_topics': top_topics,
-                'top_emotions': top_emotions
-            })
-            progress_bar.progress(1.0, text="Processing complete!")
         except Exception as e:
             st.warning(f"Could not generate topics for {country}: {str(e)}")
             continue
-        # Clear progress for next country
         progress_placeholder.empty()
         status_message.empty()
-        # Create new progress bar for next country
         progress_placeholder = st.empty()
         progress_bar = progress_placeholder.progress(0)
         status_message = st.empty()

             continue
     if chunk_embeddings:
+        # Convert to numpy array and ensure 2D shape
+        chunk_embeddings = np.array(chunk_embeddings)
+        if len(chunk_embeddings.shape) == 1:
+            chunk_embeddings = chunk_embeddings.reshape(1, -1)
+        return chunk_embeddings
+    return np.zeros((1, model.config.hidden_size))
 def format_topics(topic_model, topic_counts):
     """Format topics for display."""
     formatted_topics = []
         topic_model_params["nr_topics"] = "auto"
     topic_model = BERTopic(
+        embedding_model=None,  # Set to None since we're providing embeddings
         **topic_model_params
     )
+    vectorizer = CountVectorizer(
+        stop_words=list(ARABIC_STOP_WORDS),
+        min_df=1,
+        max_df=1.0
+    )
     topic_model.vectorizer_model = vectorizer
     progress_placeholder = st.empty()
     progress_bar = progress_placeholder.progress(0)
     status_message = st.empty()
     for country, group in df.groupby('country'):
         gc.collect()
         torch.cuda.empty_cache() if torch.cuda.is_available() else None
         status_message.text(f"Processing poems for {country}...")
         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
+        embeddings_list = []
         total_texts = len(texts)
         for i, text in enumerate(texts):
             try:
                 embedding = cache_embeddings(text, bert_tokenizer, bert_model)
                 if embedding is not None and not np.isnan(embedding).any():
+                    # Ensure embedding is 2D
+                    if len(embedding.shape) == 1:
+                        embedding = embedding.reshape(1, -1)
+                    embeddings_list.append(embedding)
                 if i % max(1, total_texts // 100) == 0:
                     progress = (i + 1) / total_texts * 0.4
                     progress_bar.progress(progress)
                 st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
                 continue
+        # Process emotions
         for i, text in enumerate(texts):
             try:
                 emotion = cache_emotion_classification(text, emotion_classifier)
                 st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
                 continue
+            if embeddings_list:
+                # Stack all embeddings into a single 2D array
+                embeddings = np.vstack(embeddings_list)
+                topics, probs = topic_model.fit_transform(texts, embeddings)
+                topic_counts = Counter(topics)
+                top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
+                top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
+                summaries.append({
+                    'country': country,
+                    'total_poems': len(texts),
+                    'top_topics': top_topics,
+                    'top_emotions': top_emotions
+                })
+                progress_bar.progress(1.0, text="Processing complete!")
+            else:
+                st.warning(f"No valid embeddings generated for {country}")
         except Exception as e:
             st.warning(f"Could not generate topics for {country}: {str(e)}")
             continue
         progress_placeholder.empty()
         status_message.empty()
         progress_placeholder = st.empty()
         progress_bar = progress_placeholder.progress(0)
         status_message = st.empty()