Spaces:

kambris
/

SoLProject

Runtime error

App Files Files Community

kambris committed on Nov 24, 2024

Commit

7173364

verified ·

1 Parent(s): 00bf9b7

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -47

app.py CHANGED Viewed

@@ -7,6 +7,24 @@ import numpy as np
 from collections import Counter
 import os
 # Configure page
 st.set_page_config(
     page_title="Arabic Poem Analysis",
@@ -17,7 +35,6 @@ st.set_page_config(
 @st.cache_resource
 def load_models():
     """Load and cache the models to prevent reloading"""
-    # Use CAMeL-Lab's tokenizer for consistency with the emotion model
     tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
     bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
     emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
@@ -40,7 +57,7 @@ def split_text(text, max_length=512):
     for word in words:
         word_length = len(word.split())
         if current_length + word_length > max_length:
-            if current_chunk:  # Only append if there are words in the current chunk
                 chunks.append(' '.join(current_chunk))
             current_chunk = [word]
             current_length = word_length
@@ -48,25 +65,26 @@ def split_text(text, max_length=512):
             current_chunk.append(word)
             current_length += word_length
-    if current_chunk:  # Append the last chunk if it exists
         chunks.append(' '.join(current_chunk))
     return chunks
-# The beginning of the code remains the same until the classify_emotion function
 def classify_emotion(text, classifier):
     """Classify emotion for complete text with proper token handling."""
     try:
-        # Split text into manageable chunks
         words = text.split()
         chunks = []
         current_chunk = []
         current_length = 0
-        # Create chunks that respect the 512 token limit
         for word in words:
-            # Add word length plus 1 for space
             word_tokens = len(classifier.tokenizer.encode(word))
             if current_length + word_tokens > 512:
                 if current_chunk:
@@ -80,14 +98,12 @@ def classify_emotion(text, classifier):
         if current_chunk:
             chunks.append(' '.join(current_chunk))
-        # If no chunks were created, use the original text with truncation
         if not chunks:
             chunks = [text]
         all_scores = []
         for chunk in chunks:
             try:
-                # Ensure proper truncation
                 inputs = classifier.tokenizer(
                     chunk,
                     truncation=True,
@@ -101,13 +117,10 @@ def classify_emotion(text, classifier):
                 st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
                 continue
-        # Average scores across all chunks
         if all_scores:
-            # Create a dictionary to store summed scores for each label
             label_scores = {}
             count = len(all_scores)
-            # Sum up scores for each label
             for scores in all_scores:
                 for score in scores:
                     label = score['label']
@@ -115,19 +128,15 @@ def classify_emotion(text, classifier):
                         label_scores[label] = 0
                     label_scores[label] += score['score']
-            # Calculate averages
             avg_scores = {label: score/count for label, score in label_scores.items()}
-            # Get the label with highest average score
             final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
             return final_emotion
-        return "LABEL_2"  # Default to neutral if no valid results
     except Exception as e:
         st.warning(f"Error in emotion classification: {str(e)}")
-        return "LABEL_2"  # Default to neutral
 def get_embedding_for_text(text, tokenizer, model):
     """Get embedding for complete text."""
@@ -155,7 +164,6 @@ def get_embedding_for_text(text, tokenizer, model):
             continue
     if chunk_embeddings:
-        # Use weighted average based on chunk length
         weights = np.array([len(chunk.split()) for chunk in chunks])
         weights = weights / weights.sum()
         weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
@@ -170,7 +178,7 @@ def format_topics(topic_model, topic_counts):
             topic_label = "Miscellaneous"
         else:
             words = topic_model.get_topic(topic_num)
-            topic_label = " | ".join([word for word, _ in words[:5]])  # Show top 5 words per topic
         formatted_topics.append({
             'topic': topic_label,
@@ -180,7 +188,6 @@ def format_topics(topic_model, topic_counts):
 def format_emotions(emotion_counts):
     """Format emotions for display."""
-    # Define emotion labels mapping
     EMOTION_LABELS = {
         'LABEL_0': 'Negative',
         'LABEL_1': 'Positive',
@@ -196,29 +203,35 @@ def format_emotions(emotion_counts):
         })
     return formatted_emotions
-def process_and_summarize(df, top_n=50):
-    """Process the data and generate summaries."""
     summaries = []
-    # Initialize BERTopic with Arabic-specific settings
-    topic_model = BERTopic(
-        language="multilingual",
-        calculate_probabilities=True,
-        min_topic_size=2,  # Allow smaller topic groups
-        n_gram_range=(1, 3),  # Include up to trigrams
-        top_n_words=15,  # Show more words per topic
-        verbose=True
-    )
-    # Group by country
     for country, group in df.groupby('country'):
         progress_text = f"Processing poems for {country}..."
         progress_bar = st.progress(0, text=progress_text)
-        texts = group['poem'].dropna().tolist()
         all_emotions = []
-        # Generate embeddings with progress tracking
         embeddings = []
         for i, text in enumerate(texts):
             embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
@@ -228,7 +241,6 @@ def process_and_summarize(df, top_n=50):
         embeddings = np.array(embeddings)
-        # Process emotions with progress tracking
         for i, text in enumerate(texts):
             emotion = classify_emotion(text, emotion_classifier)
             all_emotions.append(emotion)
@@ -236,11 +248,13 @@ def process_and_summarize(df, top_n=50):
             progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
         try:
-            # Fit topic model
-            topics, _ = topic_model.fit_transform(texts, embeddings)
-            # Format results
-            top_topics = format_topics(topic_model, Counter(topics).most_common(top_n))
             top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
             summaries.append({
@@ -290,14 +304,61 @@ if uploaded_file is not None:
         df['country'] = df['country'].str.strip()
         df = df.dropna(subset=['country', 'poem'])
-        # Process data
-        top_n = st.number_input("Number of top topics/emotions to display:",
-                               min_value=1, max_value=100, value=10)
-        if st.button("Process Data"):
-            with st.spinner("Processing your data..."):
-                summaries, topic_model = process_and_summarize(df, top_n=top_n)
                 if summaries:
                     st.success("Analysis complete!")
@@ -341,4 +402,5 @@ else:
         'country': ['Egypt', 'Palestine'],
         'poem': ['قصيدة مصرية', 'قصيدة فلسطينية ']
     })
-    st.dataframe(example_df)

 from collections import Counter
 import os
+# Arabic stop words, matched by exact string comparison.
+# Used in two places in this file: clean_arabic_text() drops any whitespace-
+# separated token found in this set, and process_and_summarize() passes the set
+# as the "stop_words" entry of the topic-model parameter dict.
+# Several entries are repeated in the literal (e.g. 'و', 'في', 'من'); because
+# this is a set literal the duplicates collapse and have no effect.
+# NOTE(review): spellings mix hamza and bare-alef forms ('أو', 'إلى' vs 'انت',
+# 'اين') and carry no diacritics — confirm the input text is spelled the same
+# way, since no normalization happens before lookup.
+# NOTE(review): 'قوة', 'وقف', 'حاجة' and 'لازم' look like content words rather
+# than function words — confirm they are meant to be filtered.
+ARABIC_STOP_WORDS = {
+    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
+    'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك',
+    'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
+    'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
+    'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
+    'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
+    'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حين', 'قبل', 'بعد',
+    'و', 'أن', 'في', 'كل', 'لم', 'لن', 'له', 'من', 'هو', 'هي', 'قوة',
+    'كما', 'لها', 'منذ', 'وقد', 'ولا', 'نفس', 'ولم', 'حيث', 'هناك',
+    'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'وله', 'عند',
+    'أما', 'هذه', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه', 'وهي',
+    'وهو', 'تلك', 'كلم', 'لكن', 'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا',
+    'اول', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
+    'لازم', 'حاجة', 'علي', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'
+}
 # Configure page
 st.set_page_config(
     page_title="Arabic Poem Analysis",
 @st.cache_resource
 def load_models():
     """Load and cache the models to prevent reloading"""
     tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
     bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
     emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
     for word in words:
         word_length = len(word.split())
         if current_length + word_length > max_length:
+            if current_chunk:
                 chunks.append(' '.join(current_chunk))
             current_chunk = [word]
             current_length = word_length
             current_chunk.append(word)
             current_length += word_length
+    if current_chunk:
         chunks.append(' '.join(current_chunk))
     return chunks
+def clean_arabic_text(text):
+    """Clean Arabic text by removing stop words and normalizing."""
+    words = text.split()
+    cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
+    return ' '.join(cleaned_words)
 def classify_emotion(text, classifier):
     """Classify emotion for complete text with proper token handling."""
     try:
         words = text.split()
         chunks = []
         current_chunk = []
         current_length = 0
         for word in words:
             word_tokens = len(classifier.tokenizer.encode(word))
             if current_length + word_tokens > 512:
                 if current_chunk:
         if current_chunk:
             chunks.append(' '.join(current_chunk))
         if not chunks:
             chunks = [text]
         all_scores = []
         for chunk in chunks:
             try:
                 inputs = classifier.tokenizer(
                     chunk,
                     truncation=True,
                 st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
                 continue
         if all_scores:
             label_scores = {}
             count = len(all_scores)
             for scores in all_scores:
                 for score in scores:
                     label = score['label']
                         label_scores[label] = 0
                     label_scores[label] += score['score']
             avg_scores = {label: score/count for label, score in label_scores.items()}
             final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
             return final_emotion
+        return "LABEL_2"
     except Exception as e:
         st.warning(f"Error in emotion classification: {str(e)}")
+        return "LABEL_2"
 def get_embedding_for_text(text, tokenizer, model):
     """Get embedding for complete text."""
             continue
     if chunk_embeddings:
         weights = np.array([len(chunk.split()) for chunk in chunks])
         weights = weights / weights.sum()
         weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
             topic_label = "Miscellaneous"
         else:
             words = topic_model.get_topic(topic_num)
+            topic_label = " | ".join([word for word, _ in words[:5]])
         formatted_topics.append({
             'topic': topic_label,
 def format_emotions(emotion_counts):
     """Format emotions for display."""
     EMOTION_LABELS = {
         'LABEL_0': 'Negative',
         'LABEL_1': 'Positive',
         })
     return formatted_emotions
+def process_and_summarize(df, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=30):
+    """Process the data and generate summaries with flexible topic configuration."""
     summaries = []
+    topic_model_params = {
+        "language": "multilingual",
+        "calculate_probabilities": True,
+        "min_topic_size": min_topic_size,
+        "n_gram_range": (1, 3),
+        "top_n_words": 15,
+        "verbose": True,
+        "diversity": 0.5,
+        "stop_words": ARABIC_STOP_WORDS
+    }
+    if topic_strategy == "Manual" and n_topics is not None:
+        topic_model_params["nr_topics"] = n_topics
+    else:
+        topic_model_params["nr_topics"] = "auto"
+    topic_model = BERTopic(**topic_model_params)
     for country, group in df.groupby('country'):
         progress_text = f"Processing poems for {country}..."
         progress_bar = st.progress(0, text=progress_text)
+        texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
         embeddings = []
         for i, text in enumerate(texts):
             embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
         embeddings = np.array(embeddings)
         for i, text in enumerate(texts):
             emotion = classify_emotion(text, emotion_classifier)
             all_emotions.append(emotion)
             progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
         try:
+            topics, probs = topic_model.fit_transform(texts, embeddings)
+            topic_counts = Counter(topics)
+            if -1 in topic_counts:
+                del topic_counts[-1]
+            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
             top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
             summaries.append({
         df['country'] = df['country'].str.strip()
         df = df.dropna(subset=['country', 'poem'])
+        # Add topic modeling controls
+        st.subheader("Topic Modeling Settings")
+        col1, col2 = st.columns(2)
+        with col1:
+            topic_strategy = st.radio(
+                "Topic Number Strategy",
+                ["Auto", "Manual"],
+                help="Choose whether to let the model determine the optimal number of topics or set it manually"
+            )
+            if topic_strategy == "Manual":
+                # Calculate reasonable max topics based on dataset size
+                n_documents = len(df)
+                if n_documents < 1000:
+                    max_topics = min(50, n_documents // 20)
+                else:
+                    max_topics = min(500, int(np.log10(n_documents) * 100))
+                n_topics = st.slider(
+                    "Number of Topics",
+                    min_value=2,
+                    max_value=max_topics,
+                    value=min(20, max_topics),
+                    help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
+                )
+                st.info(f"""
+                    💡 For your dataset of {n_documents:,} documents:
+                    - Minimum topics: 2
+                    - Maximum topics: {max_topics}
+                    - Recommended range: {max(2, max_topics//5)}-{max_topics//2}
+                    """)
+        with col2:
+            top_n = st.number_input(
+                "Number of top topics/emotions to display:",
+                min_value=1,
+                max_value=100,
+                value=10
+            )
+            min_topic_size = st.slider(
+                "Minimum Topic Size",
+                min_value=10,
+                max_value=100,
+                value=30,
+                help="Minimum number of documents required to form a topic"
+            )
+if st.button("Process Data"):
+            with st.spinner("Processing your data..."):
+                summaries, topic_model = process_and_summarize(df, top_n=top_n, topic_strategy=topic_strategy, n_topics=n_topics, min_topic_size=min_topic_size)
                 if summaries:
                     st.success("Analysis complete!")
         'country': ['Egypt', 'Palestine'],
         'poem': ['قصيدة مصرية', 'قصيدة فلسطينية ']
     })
+    st.dataframe(example_df)