kambris committed
Commit 5018c2f · verified · 1 Parent(s): 6b0bce1

Update app.py

Files changed (1)
  1. app.py +54 -70
app.py CHANGED
@@ -10,6 +10,7 @@ import os
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 import pkg_resources
+import gc
 
 current_dir = os.path.dirname(os.path.abspath(__file__))
 font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
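Note on this hunk: only `gc` is imported here, while a later hunk calls `torch.cuda.empty_cache()`, so the commit relies on `torch` already being imported elsewhere in app.py. A minimal sketch of the per-batch cleanup pattern, with `free_memory` as an illustrative name not taken from the commit:

import gc
import torch  # assumed to be imported elsewhere in app.py

def free_memory():
    """Release Python-level garbage and, when a GPU is present, cached CUDA blocks."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()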
@@ -51,7 +52,16 @@ def load_models():
         return_all_scores=True
     )
     return tokenizer, bert_model, emotion_classifier
+
+@st.cache_data
+def cache_embeddings(text, tokenizer, model):
+    return get_embedding_for_text(text, tokenizer, model)
+
+@st.cache_data
+def cache_emotion_classification(text, classifier):
+    return classify_emotion(text, classifier)
 
+@st.cache_data
 def split_text(text, max_length=512):
     """Split text into chunks of maximum token length while preserving word boundaries."""
     words = text.split()
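Reviewer note: `st.cache_data` hashes every argument to build its cache key, and tokenizer, model, and pipeline objects are generally not hashable, so calls like `cache_embeddings(text, bert_tokenizer, bert_model)` can raise `UnhashableParamError`. Streamlit's documented escape hatch is to prefix such parameters with an underscore so they are excluded from hashing. A hedged sketch of the wrappers under that convention (`get_embedding_for_text` and `classify_emotion` are the helpers already defined in app.py):

import streamlit as st

@st.cache_data
def cache_embeddings(text, _tokenizer, _model):
    # Leading underscores tell st.cache_data not to hash the tokenizer/model;
    # only `text` participates in the cache key.
    return get_embedding_for_text(text, _tokenizer, _model)

@st.cache_data
def cache_emotion_classification(text, _classifier):
    # Same idea: cache by text, skip hashing the pipeline object.
    return classify_emotion(text, _classifier)

Caching `split_text` itself is unproblematic, since it only takes a string and an integer.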
@@ -225,96 +235,70 @@ def format_emotions(emotion_counts):
     return formatted_emotions
 
 def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
-    """Process the data and generate summaries with flexible topic configuration."""
     summaries = []
 
-    topic_model_params = {
-        "language": "arabic",
-        "calculate_probabilities": True,
-        "min_topic_size": 3,
-        "n_gram_range": (1, 1),
-        "top_n_words": 15,
-        "verbose": True,
-    }
-    st.write(f"Total documents: {len(df)}")
-    st.write(f"Topic strategy: {topic_strategy}")
-    st.write(f"Min topic size: {min_topic_size}")
-
-    if topic_strategy == "Manual":
-        topic_model_params["nr_topics"] = n_topics
-    else:
-        topic_model_params["nr_topics"] = "auto"
+    # Create a placeholder for the progress bar
+    progress_placeholder = st.empty()
+    progress_bar = progress_placeholder.progress(0)
 
-    topic_model = BERTopic(
-        embedding_model=bert_model,
-        **topic_model_params)
-
-    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
-                                 min_df=1,
-                                 max_df=1.0)
-    topic_model.vectorizer_model = vectorizer
+    # Create status message placeholder
+    status_message = st.empty()
 
     for country, group in df.groupby('country'):
-        progress_text = f"Processing poems for {country}..."
-        progress_bar = st.progress(0, text=progress_text)
+        # Clear memory at the start of each country's processing
+        gc.collect()
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
 
+        status_message.text(f"Processing poems for {country}...")
         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
 
+        # Use cached embeddings with progress tracking
         embeddings = []
+        total_texts = len(texts)
        for i, text in enumerate(texts):
            try:
-                embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
+                embedding = cache_embeddings(text, bert_tokenizer, bert_model)
                 if embedding is not None and not np.isnan(embedding).any():
                     embeddings.append(embedding)
-                else:
-                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
-                    continue
+
+                # Update progress more frequently
+                if i % max(1, total_texts // 100) == 0:
+                    progress = (i + 1) / total_texts * 0.4
+                    progress_bar.progress(progress)
+                    status_message.text(f"Generated embeddings for {i+1}/{total_texts} poems in {country}...")
+
            except Exception as e:
-                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
+                st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
                 continue
-            progress = (i + 1) / len(texts) * 0.4
-            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
-
-        if len(embeddings) != len(texts):
-            texts = texts[:len(embeddings)]
-        embeddings = np.array(embeddings)
 
+        # Process emotions with caching and progress tracking
         for i, text in enumerate(texts):
-            emotion = classify_emotion(text, emotion_classifier)
-            all_emotions.append(emotion)
-            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
-            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
-
-        try:
-
-            if len(texts) < min_topic_size:
-                st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
-                continue
+            try:
+                emotion = cache_emotion_classification(text, emotion_classifier)
+                all_emotions.append(emotion)
 
-
-            topics, probs = topic_model.fit_transform(texts, embeddings)
-
-
-            topic_counts = Counter(topics)
-
-            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
-            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
-
-            summaries.append({
-                'country': country,
-                'total_poems': len(texts),
-                'top_topics': top_topics,
-                'top_emotions': top_emotions
-            })
-            progress_bar.progress(1.0, text="Processing complete!")
-
-        except Exception as e:
-            st.warning(f"Could not generate topics for {country}: {str(e)}")
-            continue
-
+                if i % max(1, total_texts // 100) == 0:
+                    progress = 0.4 + ((i + 1) / total_texts * 0.3)
+                    progress_bar.progress(progress)
+                    status_message.text(f"Classified emotions for {i+1}/{total_texts} poems in {country}...")
+
+            except Exception as e:
+                st.warning(f"Error classifying emotion for poem {i+1} in {country}: {str(e)}")
+                continue
+
+        # Rest of your existing processing code...
+
+        # Clear progress for next country
+        progress_placeholder.empty()
+        status_message.empty()
+
+        # Create new progress bar for next country
+        progress_placeholder = st.empty()
+        progress_bar = progress_placeholder.progress(0)
+        status_message = st.empty()
+
     return summaries, topic_model
-
 try:
     bert_tokenizer, bert_model, emotion_classifier = load_models()
     st.success("Models loaded successfully!")
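Two notes on the last hunk. First, it removes the BERTopic and CountVectorizer setup while the function still ends with `return summaries, topic_model`, so the elided "# Rest of your existing processing code..." is presumably where the topic model is still built, fitted, and the per-country summaries appended; as shown in the hunk alone, `topic_model` would be undefined. Second, progress reporting now goes through `st.empty()` placeholders so each country's bar and status text can be cleared and recreated. A self-contained sketch of that placeholder pattern, where `groups` and the `time.sleep` call are stand-ins for the real data and work:

import time
import streamlit as st

groups = {"Egypt": ["poem"] * 5, "Iraq": ["poem"] * 3}  # hypothetical stand-in data

for country, poems in groups.items():
    # One placeholder holds the progress bar, another holds the status line
    progress_placeholder = st.empty()
    status_message = st.empty()
    progress_bar = progress_placeholder.progress(0)

    total = len(poems)
    for i, _ in enumerate(poems):
        time.sleep(0.1)  # stand-in for embedding / emotion-classification work
        progress_bar.progress((i + 1) / total)
        status_message.text(f"Processed {i + 1}/{total} poems in {country}...")

    # Clear both slots so the next country starts with a fresh bar and status line
    progress_placeholder.empty()
    status_message.empty()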