Spaces:

kambris
/

SoLProject

Runtime error

App Files Community

kambris committed on Nov 23, 2024

Commit

0156b72

verified ·

1 Parent(s): 79bbe0b

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -82

app.py CHANGED Viewed

@@ -21,40 +21,52 @@ EMOTION_LABELS = {
     'LABEL_2': 'Neutral'
 }
-def chunk_text(text, max_length=512):
-    """Split text into chunks of maximum token length."""
     tokens = bert_tokenizer.encode(text, add_special_tokens=False)
     chunks = []
-    for i in range(0, len(tokens), max_length - 2):  # -2 to account for [CLS] and [SEP] tokens
-        chunk = tokens[i:i + max_length - 2]
         # Add special tokens
-        chunk = [bert_tokenizer.cls_token_id] + chunk + [bert_tokenizer.sep_token_id]
-        chunks.append(chunk)
-    return chunks
 def get_embedding_for_text(text):
-    """Get embedding for a single text."""
-    chunks = chunk_text(text)
     chunk_embeddings = []
-    for chunk in chunks:
-        # Convert to tensor and add batch dimension
-        input_ids = torch.tensor([chunk]).to(bert_model.device)
-        attention_mask = torch.ones_like(input_ids)
         with torch.no_grad():
-            outputs = bert_model(input_ids, attention_mask=attention_mask)
-        # Get [CLS] token embedding for this chunk
-        chunk_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
-        chunk_embeddings.append(chunk_embedding[0])
     # Average embeddings from all chunks
     if chunk_embeddings:
         return np.mean(chunk_embeddings, axis=0)
-    return np.zeros(bert_model.config.hidden_size)  # fallback
 def generate_embeddings(texts):
     """Generate embeddings for a list of texts."""
@@ -66,22 +78,28 @@ def generate_embeddings(texts):
             embeddings.append(embedding)
         except Exception as e:
             st.warning(f"Error processing text: {str(e)}")
-            # Add zero embedding as fallback
             embeddings.append(np.zeros(bert_model.config.hidden_size))
     return np.array(embeddings)
 def classify_emotion(text):
-    """Classify emotion for a single text."""
     try:
-        chunks = chunk_text(text)
-        if not chunks:
-            return "unknown"
-        # Use first chunk for classification
-        chunk_text = bert_tokenizer.decode(chunks[0])
-        result = emotion_classifier(chunk_text)[0]
-        return result['label']
     except Exception as e:
         st.warning(f"Error in emotion classification: {str(e)}")
         return "unknown"
@@ -93,9 +111,7 @@ def format_topics(topic_model, topic_counts):
         if topic_num == -1:
             topic_label = "Miscellaneous"
         else:
-            # Get the top words for this topic
             words = topic_model.get_topic(topic_num)
-            # Take the top 3 words to form a topic label
             topic_label = " | ".join([word for word, _ in words[:3]])
         formatted_topics.append({
@@ -136,10 +152,11 @@ def process_and_summarize(uploaded_file, top_n=50):
     df['country'] = df['country'].str.strip()
     df = df.dropna(subset=['country', 'poem'])
-    # Initialize BERTopic with specific parameters
     topic_model = BERTopic(
         language="arabic",
         calculate_probabilities=True,
         verbose=True
     )
@@ -151,26 +168,23 @@ def process_and_summarize(uploaded_file, top_n=50):
         texts = group['poem'].dropna().tolist()
         batch_size = 10
         all_emotions = []
-        all_embeddings = []
         for i in range(0, len(texts), batch_size):
             batch_texts = texts[i:i + batch_size]
-            st.info(f"Generating embeddings for batch {i//batch_size + 1}...")
-            batch_embeddings = generate_embeddings(batch_texts)
-            all_embeddings.extend(batch_embeddings)
             st.info(f"Classifying emotions for batch {i//batch_size + 1}...")
             batch_emotions = [classify_emotion(text) for text in batch_texts]
             all_emotions.extend(batch_emotions)
         try:
-            embeddings = np.array(all_embeddings)
             st.info(f"Fitting topic model for {country}...")
             topics, _ = topic_model.fit_transform(texts, embeddings)
-            # Format topics and emotions with readable labels
             top_topics = format_topics(topic_model, Counter(topics).most_common(top_n))
             top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
@@ -186,46 +200,4 @@ def process_and_summarize(uploaded_file, top_n=50):
     return summaries, topic_model
-# Streamlit App Interface
-st.title("Arabic Poem Topic Modeling & Emotion Classification")
-st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
-uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
-if uploaded_file is not None:
-    try:
-        top_n = st.number_input("Select the number of top topics/emotions to display:",
-                               min_value=1, max_value=100, value=10)
-        summaries, topic_model = process_and_summarize(uploaded_file, top_n=top_n)
-        if summaries is not None:
-            st.success("Data successfully processed!")
-            # Display summary for each country
-            for summary in summaries:
-                st.write(f"### {summary['country']}")
-                st.write(f"Total Poems: {summary['total_poems']}")
-                st.write(f"\nTop {top_n} Topics:")
-                for topic in summary['top_topics']:
-                    st.write(f"• {topic['topic']}: {topic['count']} poems")
-                st.write(f"\nTop {top_n} Emotions:")
-                for emotion in summary['top_emotions']:
-                    st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
-                st.write("---")
-            # Display overall topics in a more readable format
-            st.write("### Global Topic Information:")
-            topic_info = topic_model.get_topic_info()
-            for _, row in topic_info.iterrows():
-                if row['Topic'] == -1:
-                    topic_name = "Miscellaneous"
-                else:
-                    words = topic_model.get_topic(row['Topic'])
-                    topic_name = " | ".join([word for word, _ in words[:3]])
-                st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
-    except Exception as e:
-        st.error(f"Error: {str(e)}")

     'LABEL_2': 'Neutral'
 }
+def chunk_long_text(text, max_length=512):
+    """
+    Split text into chunks respecting AraBERT's token limit.
+    Returns both tokenized chunks and decoded text chunks.
+    """
+    # Tokenize the entire text
     tokens = bert_tokenizer.encode(text, add_special_tokens=False)
     chunks = []
+    text_chunks = []
+    # Split into chunks of max_length-2 to account for [CLS] and [SEP]
+    for i in range(0, len(tokens), max_length-2):
+        chunk = tokens[i:i + max_length-2]
         # Add special tokens
+        full_chunk = [bert_tokenizer.cls_token_id] + chunk + [bert_tokenizer.sep_token_id]
+        chunks.append(full_chunk)
+        # Decode the chunk back to text (without special tokens)
+        text_chunks.append(bert_tokenizer.decode(chunk))
+    return chunks, text_chunks
 def get_embedding_for_text(text):
+    """Get embedding for a text, handling long sequences by averaging chunk embeddings."""
+    _, text_chunks = chunk_long_text(text)
     chunk_embeddings = []
+    for chunk in text_chunks:
+        # Encode chunk with padding and attention mask
+        inputs = bert_tokenizer(chunk,
+                              return_tensors="pt",
+                              padding=True,
+                              truncation=True,
+                              max_length=512)
+        inputs = {k: v.to(bert_model.device) for k, v in inputs.items()}
         with torch.no_grad():
+            outputs = bert_model(**inputs)
+        # Get [CLS] token embedding
+        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+        chunk_embeddings.append(embedding[0])
     # Average embeddings from all chunks
     if chunk_embeddings:
         return np.mean(chunk_embeddings, axis=0)
+    return np.zeros(bert_model.config.hidden_size)
 def generate_embeddings(texts):
     """Generate embeddings for a list of texts."""
             embeddings.append(embedding)
         except Exception as e:
             st.warning(f"Error processing text: {str(e)}")
             embeddings.append(np.zeros(bert_model.config.hidden_size))
     return np.array(embeddings)
 def classify_emotion(text):
+    """
+    Classify emotion for a text, handling long sequences by voting among chunks.
+    """
     try:
+        _, text_chunks = chunk_long_text(text)
+        chunk_emotions = []
+        for chunk in text_chunks:
+            result = emotion_classifier(chunk, max_length=512, truncation=True)[0]
+            chunk_emotions.append(result['label'])
+        # Use majority voting for final emotion
+        if chunk_emotions:
+            final_emotion = Counter(chunk_emotions).most_common(1)[0][0]
+            return final_emotion
+        return "unknown"
     except Exception as e:
         st.warning(f"Error in emotion classification: {str(e)}")
         return "unknown"
         if topic_num == -1:
             topic_label = "Miscellaneous"
         else:
             words = topic_model.get_topic(topic_num)
             topic_label = " | ".join([word for word, _ in words[:3]])
         formatted_topics.append({
     df['country'] = df['country'].str.strip()
     df = df.dropna(subset=['country', 'poem'])
+    # Initialize BERTopic with specific parameters for Arabic
     topic_model = BERTopic(
         language="arabic",
         calculate_probabilities=True,
+        min_topic_size=5,
         verbose=True
     )
         texts = group['poem'].dropna().tolist()
         batch_size = 10
         all_emotions = []
+        # Generate embeddings for all texts
+        st.info("Generating embeddings...")
+        embeddings = generate_embeddings(texts)
+        # Process emotions in batches
         for i in range(0, len(texts), batch_size):
             batch_texts = texts[i:i + batch_size]
             st.info(f"Classifying emotions for batch {i//batch_size + 1}...")
             batch_emotions = [classify_emotion(text) for text in batch_texts]
             all_emotions.extend(batch_emotions)
         try:
             st.info(f"Fitting topic model for {country}...")
             topics, _ = topic_model.fit_transform(texts, embeddings)
+            # Format results
             top_topics = format_topics(topic_model, Counter(topics).most_common(top_n))
             top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
     return summaries, topic_model
+# Streamlit interface remains the same...