Spaces:

kambris
/

SoLProject

Runtime error

kambris committed on Nov 24, 2024

Commit

e2c8b5b

verified ·

1 Parent(s): a1fcd63

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -214,7 +214,6 @@ def process_and_summarize(df, top_n=50, topic_strategy="Auto", n_topics=None, mi
         "n_gram_range": (1, 3),
         "top_n_words": 15,
         "verbose": True,
-        "stop_words": ARABIC_STOP_WORDS
     }
     if topic_strategy == "Manual" and n_topics is not None:
@@ -223,6 +222,11 @@ def process_and_summarize(df, top_n=50, topic_strategy="Auto", n_topics=None, mi
         topic_model_params["nr_topics"] = "auto"
     topic_model = BERTopic(**topic_model_params)
     for country, group in df.groupby('country'):
         progress_text = f"Processing poems for {country}..."
@@ -319,10 +323,7 @@ def main():
                 if topic_strategy == "Manual":
                     # Calculate reasonable max topics based on dataset size
                     n_documents = len(df)
-                    if n_documents < 1000:
-                        max_topics = min(50, n_documents // 20)
-                    else:
-                        max_topics = min(500, int(np.log10(n_documents) * 100))
                     n_topics = st.slider(
                         "Number of Topics",

         "n_gram_range": (1, 3),
         "top_n_words": 15,
         "verbose": True,
     }
     if topic_strategy == "Manual" and n_topics is not None:
         topic_model_params["nr_topics"] = "auto"
     topic_model = BERTopic(**topic_model_params)
+        # Create vectorizer with stop words
+    from sklearn.feature_extraction.text import CountVectorizer
+    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS))
+    topic_model.vectorizer_model = vectorizer
     for country, group in df.groupby('country'):
         progress_text = f"Processing poems for {country}..."
                 if topic_strategy == "Manual":
                     # Calculate reasonable max topics based on dataset size
                     n_documents = len(df)
+                    max_topics = max(2, min(50, n_documents // 20))
                     n_topics = st.slider(
                         "Number of Topics",