kambris committed
Commit 5835cbd · verified · 1 Parent(s): 729733d

Update app.py

Files changed (1):
  app.py  +106 −56
app.py CHANGED

@@ -10,7 +10,9 @@ import os
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 import pkg_resources
-import gc
+import folium
+import country_converter as coco
+
 
 current_dir = os.path.dirname(os.path.abspath(__file__))
 font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
@@ -52,16 +54,7 @@ def load_models():
         return_all_scores=True
     )
     return tokenizer, bert_model, emotion_classifier
-
-@st.cache_data
-def cache_embeddings(text, _tokenizer, _model):
-    return get_embedding_for_text(text, _tokenizer, _model)
-
-@st.cache_data
-def cache_emotion_classification(text, _classifier):
-    return classify_emotion(text, _classifier)
 
-@st.cache_data
 def split_text(text, max_length=512):
     """Split text into chunks of maximum token length while preserving word boundaries."""
     words = text.split()
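Note on the removed caching: `st.cache_data` skips hashing for any parameter whose name starts with an underscore, which is how the deleted helpers passed unhashable tokenizer/model objects through; only `text` formed the cache key. If caching is restored later, a minimal sketch of that pattern, reusing this file's `get_embedding_for_text`:

```python
import streamlit as st

@st.cache_data
def cache_embeddings(text, _tokenizer, _model):
    # Leading underscores exclude the tokenizer/model from the cache key,
    # so cache hits are determined by `text` alone.
    return get_embedding_for_text(text, _tokenizer, _model)
```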
@@ -84,6 +77,62 @@ def split_text(text, max_length=512):
     chunks.append(' '.join(current_chunk))
 
     return chunks
+
+def get_country_coordinates():
+    """Returns dictionary of Arab country coordinates"""
+    return {
+        'Egypt': [26.8206, 30.8025],
+        'Saudi Arabia': [23.8859, 45.0792],
+        'UAE': [23.4241, 53.8478],
+        'Kuwait': [29.3117, 47.4818],
+        'Iraq': [33.2232, 43.6793],
+        'Syria': [34.8021, 38.9968],
+        'Lebanon': [33.8547, 35.8623],
+        'Jordan': [30.5852, 36.2384],
+        'Palestine': [31.9522, 35.2332],
+        'Yemen': [15.5527, 48.5164],
+        'Oman': [21.4735, 55.9754],
+        'Qatar': [25.3548, 51.1839],
+        'Bahrain': [26.0667, 50.5577],
+        'Sudan': [12.8628, 30.2176],
+        'Libya': [26.3351, 17.2283],
+        'Tunisia': [33.8869, 9.5375],
+        'Algeria': [28.0339, 1.6596],
+        'Morocco': [31.7917, -7.0926],
+        'Mauritania': [21.0079, -10.9408]
+    }
+def create_topic_map(summaries):
+    """Create an interactive map showing topic distribution"""
+    coordinates = get_country_coordinates()
+
+    # Create base map centered on Arab world
+    m = folium.Map(location=[25.0, 30.0], zoom_start=4)
+
+    for summary in summaries:
+        country = summary['country']
+        if country in coordinates:
+            # Get top topic
+            top_topic = summary['top_topics'][0]['topic'] if summary['top_topics'] else "No topics"
+            top_emotion = summary['top_emotions'][0]['emotion'] if summary['top_emotions'] else "No emotion"
+
+            # Create popup content
+            popup_content = f"""
+            <b>{country}</b><br>
+            Top Topic: {top_topic}<br>
+            Main Emotion: {top_emotion}<br>
+            Total Poems: {summary['total_poems']}
+            """
+
+            # Add marker
+            folium.CircleMarker(
+                location=coordinates[country],
+                radius=10,
+                popup=folium.Popup(popup_content, max_width=300),
+                color='red',
+                fill=True
+            ).add_to(m)
+
+    return m
 
 def create_arabic_wordcloud(text, title):
     wordcloud = WordCloud(
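The newly imported `country_converter` is not used in any hunk shown here, and the coordinate lookup matches country names as exact strings (including informal keys like 'UAE'). A hedged smoke-test sketch for the two new dependencies, runnable outside Streamlit; the output file name and the `name_short` target are illustrative choices:

```python
import folium
import country_converter as coco

# country_converter normalizes name variants, but its short names
# (e.g. "United Arab Emirates") differ from this app's key "UAE",
# so a mapping step would still be needed before the dict lookup.
print(coco.convert(names="Egypt", to="name_short"))

# Render one marker and inspect the map in a browser.
m = folium.Map(location=[25.0, 30.0], zoom_start=4)
folium.CircleMarker(location=[26.8206, 30.8025], radius=10,
                    popup="Egypt", color="red", fill=True).add_to(m)
m.save("topic_map_preview.html")
```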
@@ -170,9 +219,9 @@ def classify_emotion(text, classifier):
     return "LABEL_2"
 
 def get_embedding_for_text(text, tokenizer, model):
+    """Get embedding for complete text."""
     chunks = split_text(text)
     chunk_embeddings = []
-    embedding_size = model.config.hidden_size
 
     for chunk in chunks:
         try:
@@ -189,16 +238,18 @@ def get_embedding_for_text(text, tokenizer, model):
             outputs = model(**inputs)
 
             embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
-            chunk_embeddings.append(embedding.reshape(-1))
+            chunk_embeddings.append(embedding[0])
         except Exception as e:
+            st.warning(f"Error processing chunk: {str(e)}")
             continue
 
     if chunk_embeddings:
-        # Ensure consistent shape
-        final_embedding = np.mean(chunk_embeddings, axis=0)
-        return final_embedding.reshape(-1)
-    return np.zeros(embedding_size)
-
+        weights = np.array([len(chunk.split()) for chunk in chunks])
+        weights = weights / weights.sum()
+        weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
+        return weighted_embedding
+    return np.zeros(model.config.hidden_size)
+
 def format_topics(topic_model, topic_counts):
     """Format topics for display."""
    formatted_topics = []
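Review note on the new weighted mean: `weights` is computed over every chunk in `chunks`, while `chunk_embeddings` only holds chunks that survived the `try` block, so a single failed chunk makes `np.average` raise a length mismatch. A defensive sketch that keeps the two aligned; `weighted_chunk_mean` and `embed_fn` are hypothetical names:

```python
import numpy as np

def weighted_chunk_mean(chunks, embed_fn, hidden_size):
    """Length-weighted mean of chunk embeddings, keeping each weight
    paired with a chunk that actually produced an embedding."""
    pairs = []
    for chunk in chunks:
        try:
            pairs.append((len(chunk.split()), embed_fn(chunk)))
        except Exception:
            continue  # a failed chunk drops its weight as well
    if not pairs:
        return np.zeros(hidden_size)
    weights = np.array([w for w, _ in pairs], dtype=float)
    embeddings = np.stack([emb for _, emb in pairs])
    return np.average(embeddings, axis=0, weights=weights / weights.sum())
```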
@@ -233,17 +284,20 @@ def format_emotions(emotion_counts):
     return formatted_emotions
 
 def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
+    """Process the data and generate summaries with flexible topic configuration."""
     summaries = []
-    embedding_size = bert_model.config.hidden_size
 
     topic_model_params = {
         "language": "arabic",
         "calculate_probabilities": True,
-        "min_topic_size": min_topic_size,
+        "min_topic_size": 3,
         "n_gram_range": (1, 1),
         "top_n_words": 15,
         "verbose": True,
     }
+    st.write(f"Total documents: {len(df)}")
+    st.write(f"Topic strategy: {topic_strategy}")
+    st.write(f"Min topic size: {min_topic_size}")
 
     if topic_strategy == "Manual":
         topic_model_params["nr_topics"] = n_topics
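Review note: the params dict now hardcodes `"min_topic_size": 3`, so the function's `min_topic_size` argument only affects the per-country document check and the diagnostics printed above. If the caller's value is meant to reach BERTopic, the dict would presumably forward it:

```python
topic_model_params = {
    "language": "arabic",
    "calculate_probabilities": True,
    "min_topic_size": min_topic_size,  # forward the argument instead of hardcoding 3
    "n_gram_range": (1, 1),
    "top_n_words": 15,
    "verbose": True,
}
```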
@@ -251,15 +305,12 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
         topic_model_params["nr_topics"] = "auto"
 
     topic_model = BERTopic(
-        embedding_model=None,
-        **topic_model_params
-    )
+        embedding_model=bert_model,
+        **topic_model_params)
 
-    vectorizer = CountVectorizer(
-        stop_words=list(ARABIC_STOP_WORDS),
-        min_df=1,
-        max_df=1.0
-    )
+    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
+                                 min_df=1,
+                                 max_df=1.0)
     topic_model.vectorizer_model = vectorizer
 
     for country, group in df.groupby('country'):
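Review note: `embedding_model=bert_model` hands BERTopic a raw `transformers` model rather than the sentence-transformers-style encoder its backends expect. Because precomputed embeddings are passed to `fit_transform` below, fitting never calls that model, but a later `transform` on unseen text would. A minimal sketch of the precomputed-embeddings pattern this code relies on, assuming `texts` and a matching 2-D `embeddings` array:

```python
import numpy as np
from bertopic import BERTopic

topic_model = BERTopic(language="arabic", calculate_probabilities=True)
# Supplying embeddings makes BERTopic skip its own embedding step.
topics, probs = topic_model.fit_transform(texts, np.asarray(embeddings))
```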
@@ -268,48 +319,42 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
 
         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
-        embeddings = []
 
-        # Generate embeddings
+        embeddings = []
         for i, text in enumerate(texts):
             try:
                 embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
-                if embedding is not None and embedding.shape[0] == embedding_size:
+                if embedding is not None and not np.isnan(embedding).any():
                     embeddings.append(embedding)
-                progress = (i + 1) / len(texts) * 0.4
-                progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
+                else:
+                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
+                    continue
             except Exception as e:
-                st.warning(f"Error processing poem {i+1} in {country}: {str(e)}")
+                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
                 continue
+            progress = (i + 1) / len(texts) * 0.4
+            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
 
-        # Convert to numpy array and ensure 2D shape
-        if embeddings:
-            embeddings = np.vstack(embeddings)
-        else:
-            st.warning(f"No valid embeddings generated for {country}")
-            continue
-
-        # Process emotions
-        for i, text in enumerate(texts[:len(embeddings)]):
-            try:
-                emotion = classify_emotion(text, emotion_classifier)
-                all_emotions.append(emotion)
-                progress = 0.4 + ((i + 1) / len(texts) * 0.3)
-                progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
-            except Exception as e:
-                st.warning(f"Error classifying emotion for poem {i+1} in {country}: {str(e)}")
-                continue
+        if len(embeddings) != len(texts):
+            texts = texts[:len(embeddings)]
+        embeddings = np.array(embeddings)
+
+        for i, text in enumerate(texts):
+            emotion = classify_emotion(text, emotion_classifier)
+            all_emotions.append(emotion)
+            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
+            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
 
         try:
+
             if len(texts) < min_topic_size:
                 st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
                 continue
+
 
-            # Ensure texts and embeddings match
-            texts = texts[:len(embeddings)]
-
-            # Fit and transform the topic model
             topics, probs = topic_model.fit_transform(texts, embeddings)
+
+
             topic_counts = Counter(topics)
 
             top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
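Review note on `texts = texts[:len(embeddings)]` above: truncation only realigns the two lists if every failure happened at the tail; a failure in the middle silently pairs poems with the wrong embeddings. Collecting pairs avoids that; a sketch using this file's own helpers, with `pairs` as a hypothetical local:

```python
# Keep each poem attached to its own embedding so a mid-list failure
# cannot shift the alignment between texts and embeddings.
pairs = []
for i, text in enumerate(texts):
    try:
        emb = get_embedding_for_text(text, bert_tokenizer, bert_model)
        if emb is not None and not np.isnan(emb).any():
            pairs.append((text, emb))
    except Exception as e:
        st.warning(f"Skipping poem {i+1}: {str(e)}")
texts = [t for t, _ in pairs]
embeddings = np.array([emb for _, emb in pairs])
```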
@@ -329,7 +374,6 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
 
     return summaries, topic_model
 
-
 try:
     bert_tokenizer, bert_model, emotion_classifier = load_models()
     st.success("Models loaded successfully!")
@@ -412,7 +456,7 @@ if uploaded_file is not None:
         if summaries:
             st.success("Analysis complete!")
 
-            tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
+            tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Topic Map"])
 
             with tab1:
                 for summary in summaries:
@@ -445,6 +489,12 @@ if uploaded_file is not None:
                     words = topic_model.get_topic(row['Topic'])
                     topic_name = " | ".join([word for word, _ in words[:5]])
                     st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
+
+            with tab3:
+                st.subheader("Topic Distribution Map")
+                topic_map = create_topic_map(summaries)
+                # Display the map
+                st.components.v1.html(topic_map._repr_html_(), height=600)
 
         except Exception as e:
             st.error(f"Error processing file: {str(e)}")
 