kambris committed
Commit 6b0bce1 · verified · 1 Parent(s): b449fa6

Update app.py

Files changed (1): app.py +154 -229
app.py CHANGED
@@ -10,10 +10,6 @@ import os
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 import pkg_resources
-import folium
-from folium.plugins import HeatMap
-import country_converter as coco
-from streamlit_folium import folium_static
 
 current_dir = os.path.dirname(os.path.abspath(__file__))
 font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
@@ -43,98 +39,19 @@ st.set_page_config(
 
 @st.cache_resource
 def load_models():
-    """Load and cache the models"""
-    # + Added use_fast=True for faster tokenization
-    tokenizer = AutoTokenizer.from_pretrained(
-        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
-        use_fast=True
-    )
-
-    # + Added torchscript and low_cpu_mem_usage
-    bert_model = AutoModel.from_pretrained(
-        "aubmindlab/bert-base-arabertv2",
-        torchscript=True,
-        low_cpu_mem_usage=True
-    )
-
-    # + Added optimizations for emotion model
-    emotion_model = AutoModelForSequenceClassification.from_pretrained(
-        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
-        torchscript=True,
-        low_cpu_mem_usage=True
-    )
-
-    # ~ Changed pipeline configuration to use batching
+    """Load and cache the models to prevent reloading"""
+    tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
+    bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
+    emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
+    emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
     emotion_classifier = pipeline(
         "sentiment-analysis",
         model=emotion_model,
-        tokenizer=tokenizer,
-        batch_size=32,
-        device=-1  # + Added to force CPU usage
+        tokenizer=emotion_tokenizer,
+        return_all_scores=True
     )
-
     return tokenizer, bert_model, emotion_classifier
 
-# + Added new batch processing function
-def process_texts_in_batches(texts, batch_size=32):
-    """Process texts in batches for better CPU utilization"""
-    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
-    results = []
-
-    for batch in batches:
-        batch_results = emotion_classifier(batch, truncation=True, max_length=512)
-        results.extend(batch_results)
-
-    return results
-
-# + Added caching decorator for embeddings
-@st.cache_data
-def get_cached_embeddings(text, tokenizer, model):
-    """Cache embeddings to avoid recomputation"""
-    return get_embedding_for_text(text, tokenizer, model)
-
-def create_theme_map(summaries, topic_model):
-    """Create an interactive map showing theme distributions across countries"""
-    try:
-        # Create a base map centered on the Arab world
-        m = folium.Map(location=[25, 45], zoom_start=4)
-
-        # Convert country names to coordinates
-        cc = coco.CountryConverter()
-
-        for summary in summaries:
-            try:
-                # Get country coordinates
-                country_iso = cc.convert(names=[summary['country']], to='ISO2')
-                country_data = cc.convert(names=[summary['country']], to='name_short')
-
-                # Create popup content with theme information
-                popup_content = f"""
-                <h4>{summary['country']}</h4>
-                <b>Top Themes:</b><br>
-                {'<br>'.join([f"• {topic['topic']}: {topic['count']}"
-                              for topic in summary['top_topics'][:5]])}
-                """
-
-                # Add marker for each country
-                folium.CircleMarker(
-                    location=[cc.convert(country_iso, to='latitude')[0],
-                              cc.convert(country_iso, to='longitude')[0]],
-                    radius=20,
-                    popup=folium.Popup(popup_content, max_width=300),
-                    color='red',
-                    fill=True,
-                    fill_opacity=0.7
-                ).add_to(m)
-            except Exception as e:
-                st.warning(f"Could not process {summary['country']}: {str(e)}")
-                continue
-
-        return m
-    except Exception as e:
-        st.error(f"Error creating map: {str(e)}")
-        return None
-
 def split_text(text, max_length=512):
     """Split text into chunks of maximum token length while preserving word boundaries."""
     words = text.split()
@@ -181,94 +98,99 @@ def clean_arabic_text(text):
     return ' '.join(cleaned_words)
 
 def classify_emotion(text, classifier):
-    """Classify emotion for complete text with precise token handling."""
-    # Ensure text is properly formatted
-    if not text or not isinstance(text, str):
-        return "LABEL_2"
-
-    # Split into manageable chunks
-    words = text.split()
-    chunks = []
-    current_chunk = []
-    current_length = 0
-
-    # Create proper-sized chunks
-    for word in words:
-        word_tokens = len(classifier.tokenizer.encode(word))
-        if current_length + word_tokens > 512:
-            if current_chunk:
-                chunks.append(' '.join(current_chunk))
-            current_chunk = [word]
-            current_length = word_tokens
-        else:
-            current_chunk.append(word)
-            current_length += word_tokens
-
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
-
-    if not chunks:
+    """Classify emotion for complete text with proper token handling."""
+    try:
+        words = text.split()
+        chunks = []
+        current_chunk = []
+        current_length = 0
+
+        for word in words:
+            word_tokens = len(classifier.tokenizer.encode(word))
+            if current_length + word_tokens > 512:
+                if current_chunk:
+                    chunks.append(' '.join(current_chunk))
+                current_chunk = [word]
+                current_length = word_tokens
+            else:
+                current_chunk.append(word)
+                current_length += word_tokens
+
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+
+        if not chunks:
+            chunks = [text]
+
+        all_scores = []
+        for chunk in chunks:
+            try:
+                inputs = classifier.tokenizer(
+                    chunk,
+                    truncation=True,
+                    max_length=512,
+                    return_tensors="pt"
+                )
+                result = classifier(chunk, truncation=True, max_length=512)
+                scores = result[0]
+                all_scores.append(scores)
+            except Exception as chunk_error:
+                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
+                continue
+
+        if all_scores:
+            label_scores = {}
+            count = len(all_scores)
+
+            for scores in all_scores:
+                for score in scores:
+                    label = score['label']
+                    if label not in label_scores:
+                        label_scores[label] = 0
+                    label_scores[label] += score['score']
+
+            avg_scores = {label: score/count for label, score in label_scores.items()}
+            final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
+            return final_emotion
+
         return "LABEL_2"
-
-    # Process chunks with proper output handling
-    all_scores = []
-    for chunk in chunks:
-        # Direct classification with proper output structure
-        result = classifier(chunk, return_all_scores=True)[0]
-        all_scores.append(result)
-
-    # Calculate final emotion
-    label_scores = {}
-    count = len(all_scores)
-
-    for scores in all_scores:
-        for score_dict in scores:
-            label = score_dict['label']
-            if label not in label_scores:
-                label_scores[label] = 0
-            label_scores[label] += score_dict['score']
-
-    avg_scores = {label: score/count for label, score in label_scores.items()}
-    final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
-
-    return final_emotion
+
+    except Exception as e:
+        st.warning(f"Error in emotion classification: {str(e)}")
+        return "LABEL_2"
 
 def get_embedding_for_text(text, tokenizer, model):
     """Get embedding for complete text."""
-    # First tokenize to get exact count
-    tokens = tokenizer.tokenize(text)
-
-    # Process in chunks of exactly 510 tokens (512 - 2 for CLS and SEP)
-    chunk_size = 510
+    chunks = split_text(text)
     chunk_embeddings = []
 
-    for i in range(0, len(tokens), chunk_size):
-        chunk = tokens[i:i + chunk_size]
-        # Convert tokens back to text
-        chunk_text = tokenizer.convert_tokens_to_string(chunk)
-        # Now encode with special tokens
-        encoded = tokenizer(
-            chunk_text,
-            return_tensors='pt',
-            max_length=512,
-            truncation=True,
-            padding='max_length'
-        )
-
-        # Move to device
-        encoded = {k: v.to(model.device) for k, v in encoded.items()}
-
-        # Get embedding
-        with torch.no_grad():
-            output = model(**encoded)
-        embedding = output[0][:, 0, :].cpu().numpy()
-        chunk_embeddings.append(embedding[0])
-
-    # Combine all chunk embeddings
+    for chunk in chunks:
+        try:
+            inputs = tokenizer(
+                chunk,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512
+            )
+            inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+            with torch.no_grad():
+                outputs = model(**inputs)
+
+            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+            chunk_embeddings.append(embedding[0])
+        except Exception as e:
+            st.warning(f"Error processing chunk: {str(e)}")
+            continue
 
     if chunk_embeddings:
-        return np.mean(chunk_embeddings, axis=0)
+        weights = np.array([len(chunk.split()) for chunk in chunks])
+        weights = weights / weights.sum()
+        weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
+        return weighted_embedding
     return np.zeros(model.config.hidden_size)
-
+
 def format_topics(topic_model, topic_counts):
     """Format topics for display."""
     formatted_topics = []
@@ -301,26 +223,31 @@ def format_emotions(emotion_counts):
             'count': count
         })
    return formatted_emotions
-
-def get_optimized_topic_model(bert_model):
-    """Configure BERTopic for better CPU performance"""
-    return BERTopic(
-        embedding_model=bert_model,
-        language="arabic",
-        calculate_probabilities=False,
-        verbose=False,
-        n_gram_range=(1, 1),
-        min_topic_size=5,
-        nr_topics="auto",
-        low_memory=True
-    )
-
 
 def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
     """Process the data and generate summaries with flexible topic configuration."""
     summaries = []
 
-    topic_model = get_optimized_topic_model(bert_model)
+    topic_model_params = {
+        "language": "arabic",
+        "calculate_probabilities": True,
+        "min_topic_size": 3,
+        "n_gram_range": (1, 1),
+        "top_n_words": 15,
+        "verbose": True,
+    }
+    st.write(f"Total documents: {len(df)}")
+    st.write(f"Topic strategy: {topic_strategy}")
+    st.write(f"Min topic size: {min_topic_size}")
+
+    if topic_strategy == "Manual":
+        topic_model_params["nr_topics"] = n_topics
+    else:
+        topic_model_params["nr_topics"] = "auto"
+
+    topic_model = BERTopic(
+        embedding_model=bert_model,
+        **topic_model_params)
 
     vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
                                  min_df=1,
@@ -334,58 +261,58 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
 
-        # Get embeddings while keeping all content
         embeddings = []
         for i, text in enumerate(texts):
-            # Tokenize the full text first
-            full_tokens = bert_tokenizer.tokenize(text)
-            chunk_embeddings = []
-
-            # Create chunks of 510 tokens (leaving room for special tokens)
-            for start_idx in range(0, len(full_tokens), 510):
-                end_idx = start_idx + 510
-                chunk_tokens = full_tokens[start_idx:end_idx]
-                chunk_text = bert_tokenizer.convert_tokens_to_string(chunk_tokens)
-
-                # Get embedding for this chunk
-                chunk_embedding = get_embedding_for_text(chunk_text, bert_tokenizer, bert_model)
-                chunk_embeddings.append(chunk_embedding)
-
-            # Combine embeddings for full poem representation
-            full_embedding = np.mean(chunk_embeddings, axis=0) if chunk_embeddings else np.zeros(bert_model.config.hidden_size)
-            embeddings.append(full_embedding)
-
+            try:
+                embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
+                if embedding is not None and not np.isnan(embedding).any():
+                    embeddings.append(embedding)
+                else:
+                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
+                    continue
+            except Exception as e:
+                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
+                continue
             progress = (i + 1) / len(texts) * 0.4
             progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
-
+
+        if len(embeddings) != len(texts):
+            texts = texts[:len(embeddings)]
         embeddings = np.array(embeddings)
 
-        # Process emotions with tuple output handling
         for i, text in enumerate(texts):
-            result = emotion_classifier(text)
-            emotion = result[0]  # Access first element of tuple
+            emotion = classify_emotion(text, emotion_classifier)
             all_emotions.append(emotion)
             progress = 0.4 + ((i + 1) / len(texts) * 0.3)
             progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
-
-        if len(texts) < min_topic_size:
-            st.info(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
+
+        try:
+            if len(texts) < min_topic_size:
+                st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
+                continue
+
+            topics, probs = topic_model.fit_transform(texts, embeddings)
+            topic_counts = Counter(topics)
+
+            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
+            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
+
+            summaries.append({
+                'country': country,
+                'total_poems': len(texts),
+                'top_topics': top_topics,
+                'top_emotions': top_emotions
+            })
+            progress_bar.progress(1.0, text="Processing complete!")
+
+        except Exception as e:
+            st.warning(f"Could not generate topics for {country}: {str(e)}")
             continue
-
-        topics, _ = topic_model.fit_transform(texts, embeddings)
-        topic_counts = Counter(topics)
-
-        top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
-        top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
-
-        summaries.append({
-            'country': country,
-            'total_poems': len(texts),
-            'top_topics': top_topics,
-            'top_emotions': top_emotions
-        })
-        progress_bar.progress(1.0, text="Processing complete!")
-
+
     return summaries, topic_model
 
 try:
@@ -470,7 +397,7 @@ if uploaded_file is not None:
         if summaries:
             st.success("Analysis complete!")
 
-            tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Theme Map"])
+            tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
 
             with tab1:
                 for summary in summaries:
@@ -503,10 +430,7 @@ if uploaded_file is not None:
                         words = topic_model.get_topic(row['Topic'])
                         topic_name = " | ".join([word for word, _ in words[:5]])
                         st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
-            with tab3:
-                st.subheader("Thematic Distribution Map")
-                theme_map = create_theme_map(summaries, topic_model)
-                folium_static(theme_map)
+
     except Exception as e:
         st.error(f"Error processing file: {str(e)}")
@@ -520,3 +444,4 @@ else:
     })
     st.dataframe(example_df)
 
+
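
Review note (not part of the commit): a minimal sketch of the pipeline wiring this commit settles on, using plain transformers with no folium or batching. The checkpoint IDs and the return_all_scores flag are taken from the diff above; the sample verse is illustrative, and the sketch assumes the checkpoints can be downloaded from the Hugging Face Hub.

    from transformers import (
        AutoModel,
        AutoModelForSequenceClassification,
        AutoTokenizer,
        pipeline,
    )

    # Same checkpoints as the new load_models()
    tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
    emotion_model = AutoModelForSequenceClassification.from_pretrained(
        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment"
    )
    emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")

    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=emotion_tokenizer,
        return_all_scores=True,  # per-label scores, as the new classify_emotion expects
    )

    # With return_all_scores=True, each input yields a list of {label, score}
    # dicts; classify_emotion averages these across its 512-token chunks.
    print(emotion_classifier("قفا نبك من ذكرى حبيب ومنزل")[0])

The length-weighted chunk averaging that replaces np.mean in get_embedding_for_text can be sanity-checked in isolation; a toy numpy illustration with stand-in vectors (not real CLS embeddings):

    import numpy as np

    chunks = ["five words in this chunk", "short"]  # 5 words vs. 1 word
    chunk_embeddings = [np.ones(4), np.zeros(4)]    # stand-ins for per-chunk CLS vectors

    weights = np.array([len(c.split()) for c in chunks], dtype=float)
    weights /= weights.sum()                        # -> [5/6, 1/6]
    print(np.average(chunk_embeddings, axis=0, weights=weights))  # ~0.833 in each slot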