kambris committed on
Commit
f427760
·
verified ·
1 Parent(s): b77a329

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -12
app.py CHANGED
@@ -233,7 +233,7 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
233
  "n_gram_range": (1, 1),
234
  "top_n_words": 15,
235
  "verbose": True,
236
- }}
237
  st.write(f"Total documents: {len(df)}")
238
  st.write(f"Topic strategy: {topic_strategy}")
239
  st.write(f"Min topic size: {min_topic_size}")
@@ -243,10 +243,14 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
243
  else:
244
  topic_model_params["nr_topics"] = "auto"
245
 
246
- topic_model = BERTopic(**topic_model_params)
 
 
247
 
248
  # Create vectorizer with stop words
249
- vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS))
 
 
250
  topic_model.vectorizer_model = vectorizer
251
 
252
  for country, group in df.groupby('country'):
@@ -258,11 +262,21 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
258
 
259
  embeddings = []
260
  for i, text in enumerate(texts):
261
- embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
262
- embeddings.append(embedding)
 
 
 
 
 
 
 
 
263
  progress = (i + 1) / len(texts) * 0.4
264
  progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
265
 
 
 
266
  embeddings = np.array(embeddings)
267
 
268
  for i, text in enumerate(texts):
@@ -272,16 +286,25 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
272
  progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
273
 
274
  try:
 
 
 
 
 
 
275
  topics, probs = topic_model.fit_transform(texts, embeddings)
276
- st.write(f"Number of unique topics: {len(set(topics))}")
277
- st.write(f"Topic distribution: {Counter(topics)}")
278
- topic_counts = Counter(topics)
279
- if -1 in topic_counts:
280
- del topic_counts[-1]
 
 
 
281
 
282
  top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
283
  top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
284
-
285
  summaries.append({
286
  'country': country,
287
  'total_poems': len(texts),
@@ -295,7 +318,6 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
295
  continue
296
 
297
  return summaries, topic_model
298
-
299
  # Load models
300
  try:
301
  bert_tokenizer, bert_model, emotion_classifier = load_models()
 
233
  "n_gram_range": (1, 1),
234
  "top_n_words": 15,
235
  "verbose": True,
236
+ }
237
  st.write(f"Total documents: {len(df)}")
238
  st.write(f"Topic strategy: {topic_strategy}")
239
  st.write(f"Min topic size: {min_topic_size}")
 
243
  else:
244
  topic_model_params["nr_topics"] = "auto"
245
 
246
+ topic_model = BERTopic(
247
+ embedding_model=bert_model,
248
+ **topic_model_params)
249
 
250
  # Create vectorizer with stop words
251
+ vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
252
+ min_df=2,
253
+ max_df=0.95)
254
  topic_model.vectorizer_model = vectorizer
255
 
256
  for country, group in df.groupby('country'):
 
262
 
263
  embeddings = []
264
  for i, text in enumerate(texts):
265
+ try:
266
+ embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
267
+ if embedding is not None and not np.isnan(embedding).any():
268
+ embeddings.append(embedding)
269
+ else:
270
+ st.warning(f"Invalid embedding generated for text {i+1} in {country}")
271
+ continue
272
+ except Exception as e:
273
+ st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
274
+ continue
275
  progress = (i + 1) / len(texts) * 0.4
276
  progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
277
 
278
+ if len(embeddings) != len(texts):
279
+ texts = texts[:len(embeddings)]
280
  embeddings = np.array(embeddings)
281
 
282
  for i, text in enumerate(texts):
 
286
  progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
287
 
288
  try:
289
+
290
+ if len(texts) < min_topic_size:
291
+ st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
292
+ continue
293
+
294
+
295
  topics, probs = topic_model.fit_transform(texts, embeddings)
296
+
297
+
298
+ valid_topics = [t for t in topics if t != -1]
299
+ if not valid_topics:
300
+ st.warning(f"No valid topics generated for {country}")
301
+ continue
302
+
303
+ topic_counts = Counter(valid_topics)
304
 
305
  top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
306
  top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
307
+
308
  summaries.append({
309
  'country': country,
310
  'total_poems': len(texts),
 
318
  continue
319
 
320
  return summaries, topic_model
 
321
  # Load models
322
  try:
323
  bert_tokenizer, bert_model, emotion_classifier = load_models()