Spaces:

kambris
/

SoLProject

Runtime error

App Files Community

kambris committed on Nov 23, 2024

Commit

b2576ed

verified ·

1 Parent(s): 0156b72

Update app.py

Browse files

Files changed (1) hide show

app.py +142 -76

app.py CHANGED Viewed

@@ -5,14 +5,31 @@ from bertopic import BERTopic
 import torch
 import numpy as np
 from collections import Counter
-# Load AraBERT tokenizer and model for embeddings
-bert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
-bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
-# Load AraBERT model for emotion classification
-emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
-emotion_classifier = pipeline("text-classification", model=emotion_model, tokenizer=bert_tokenizer)
 # Define emotion labels mapping
 EMOTION_LABELS = {
@@ -21,80 +38,67 @@ EMOTION_LABELS = {
     'LABEL_2': 'Neutral'
 }
-def chunk_long_text(text, max_length=512):
-    """
-    Split text into chunks respecting AraBERT's token limit.
-    Returns both tokenized chunks and decoded text chunks.
-    """
-    # Tokenize the entire text
-    tokens = bert_tokenizer.encode(text, add_special_tokens=False)
     chunks = []
     text_chunks = []
-    # Split into chunks of max_length-2 to account for [CLS] and [SEP]
     for i in range(0, len(tokens), max_length-2):
         chunk = tokens[i:i + max_length-2]
-        # Add special tokens
-        full_chunk = [bert_tokenizer.cls_token_id] + chunk + [bert_tokenizer.sep_token_id]
         chunks.append(full_chunk)
-        # Decode the chunk back to text (without special tokens)
-        text_chunks.append(bert_tokenizer.decode(chunk))
     return chunks, text_chunks
-def get_embedding_for_text(text):
-    """Get embedding for a text, handling long sequences by averaging chunk embeddings."""
-    _, text_chunks = chunk_long_text(text)
     chunk_embeddings = []
     for chunk in text_chunks:
-        # Encode chunk with padding and attention mask
-        inputs = bert_tokenizer(chunk,
-                              return_tensors="pt",
-                              padding=True,
-                              truncation=True,
-                              max_length=512)
-        inputs = {k: v.to(bert_model.device) for k, v in inputs.items()}
         with torch.no_grad():
-            outputs = bert_model(**inputs)
-        # Get [CLS] token embedding
         embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
         chunk_embeddings.append(embedding[0])
-    # Average embeddings from all chunks
     if chunk_embeddings:
         return np.mean(chunk_embeddings, axis=0)
-    return np.zeros(bert_model.config.hidden_size)
-def generate_embeddings(texts):
     """Generate embeddings for a list of texts."""
     embeddings = []
     for text in texts:
         try:
-            embedding = get_embedding_for_text(text)
             embeddings.append(embedding)
         except Exception as e:
             st.warning(f"Error processing text: {str(e)}")
-            embeddings.append(np.zeros(bert_model.config.hidden_size))
     return np.array(embeddings)
-def classify_emotion(text):
-    """
-    Classify emotion for a text, handling long sequences by voting among chunks.
-    """
     try:
-        _, text_chunks = chunk_long_text(text)
         chunk_emotions = []
         for chunk in text_chunks:
-            result = emotion_classifier(chunk, max_length=512, truncation=True)[0]
             chunk_emotions.append(result['label'])
-        # Use majority voting for final emotion
         if chunk_emotions:
             final_emotion = Counter(chunk_emotions).most_common(1)[0][0]
             return final_emotion
@@ -105,7 +109,7 @@ def classify_emotion(text):
         return "unknown"
 def format_topics(topic_model, topic_counts):
-    """Convert topic numbers to readable labels."""
     formatted_topics = []
     for topic_num, count in topic_counts:
         if topic_num == -1:
@@ -121,7 +125,7 @@ def format_topics(topic_model, topic_counts):
     return formatted_topics
 def format_emotions(emotion_counts):
-    """Convert emotion labels to readable text."""
     formatted_emotions = []
     for label, count in emotion_counts:
         emotion = EMOTION_LABELS.get(label, label)
@@ -131,28 +135,11 @@ def format_emotions(emotion_counts):
         })
     return formatted_emotions
-def process_and_summarize(uploaded_file, top_n=50):
-    # Determine the file type
-    if uploaded_file.name.endswith(".csv"):
-        df = pd.read_csv(uploaded_file)
-    elif uploaded_file.name.endswith(".xlsx"):
-        df = pd.read_excel(uploaded_file)
-    else:
-        st.error("Unsupported file format.")
-        return None, None
-    # Validate required columns
-    required_columns = ['country', 'poem']
-    missing_columns = [col for col in required_columns if col not in df.columns]
-    if missing_columns:
-        st.error(f"Missing columns: {', '.join(missing_columns)}")
-        return None, None
-    # Parse and preprocess the file
-    df['country'] = df['country'].str.strip()
-    df = df.dropna(subset=['country', 'poem'])
-    # Initialize BERTopic with specific parameters for Arabic
     topic_model = BERTopic(
         language="arabic",
         calculate_probabilities=True,
@@ -161,27 +148,28 @@ def process_and_summarize(uploaded_file, top_n=50):
     )
     # Group by country
-    summaries = []
     for country, group in df.groupby('country'):
-        st.info(f"Processing poems for {country}...")
         texts = group['poem'].dropna().tolist()
         batch_size = 10
         all_emotions = []
-        # Generate embeddings for all texts
-        st.info("Generating embeddings...")
-        embeddings = generate_embeddings(texts)
-        # Process emotions in batches
         for i in range(0, len(texts), batch_size):
             batch_texts = texts[i:i + batch_size]
-            st.info(f"Classifying emotions for batch {i//batch_size + 1}...")
-            batch_emotions = [classify_emotion(text) for text in batch_texts]
             all_emotions.extend(batch_emotions)
         try:
-            st.info(f"Fitting topic model for {country}...")
             topics, _ = topic_model.fit_transform(texts, embeddings)
             # Format results
@@ -194,10 +182,88 @@ def process_and_summarize(uploaded_file, top_n=50):
                 'top_topics': top_topics,
                 'top_emotions': top_emotions
             })
         except Exception as e:
             st.warning(f"Could not generate topics for {country}: {str(e)}")
             continue
     return summaries, topic_model
-# Streamlit interface remains the same...

 import torch
 import numpy as np
 from collections import Counter
+import os
+# Configure page
+st.set_page_config(
+    page_title="Arabic Poem Analysis",
+    page_icon="📚",
+    layout="wide"
+)
+@st.cache_resource
+def load_models():
+    """Load and cache the models to prevent reloading"""
+    bert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
+    bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
+    emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
+    emotion_classifier = pipeline("text-classification", model=emotion_model, tokenizer=bert_tokenizer)
+    return bert_tokenizer, bert_model, emotion_classifier
+# Load models
+try:
+    bert_tokenizer, bert_model, emotion_classifier = load_models()
+    st.success("Models loaded successfully!")
+except Exception as e:
+    st.error(f"Error loading models: {str(e)}")
+    st.stop()
 # Define emotion labels mapping
 EMOTION_LABELS = {
     'LABEL_2': 'Neutral'
 }
+def chunk_long_text(text, tokenizer, max_length=512):
+    """Split text into chunks respecting token limit."""
+    tokens = tokenizer.encode(text, add_special_tokens=False)
     chunks = []
     text_chunks = []
     for i in range(0, len(tokens), max_length-2):
         chunk = tokens[i:i + max_length-2]
+        full_chunk = [tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id]
         chunks.append(full_chunk)
+        text_chunks.append(tokenizer.decode(chunk))
     return chunks, text_chunks
+def get_embedding_for_text(text, tokenizer, model):
+    """Get embedding for a text, handling long sequences."""
+    _, text_chunks = chunk_long_text(text, tokenizer)
     chunk_embeddings = []
     for chunk in text_chunks:
+        inputs = tokenizer(chunk,
+                          return_tensors="pt",
+                          padding=True,
+                          truncation=True,
+                          max_length=512)
+        inputs = {k: v.to(model.device) for k, v in inputs.items()}
         with torch.no_grad():
+            outputs = model(**inputs)
         embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
         chunk_embeddings.append(embedding[0])
     if chunk_embeddings:
         return np.mean(chunk_embeddings, axis=0)
+    return np.zeros(model.config.hidden_size)
+def generate_embeddings(texts, tokenizer, model):
     """Generate embeddings for a list of texts."""
     embeddings = []
     for text in texts:
         try:
+            embedding = get_embedding_for_text(text, tokenizer, model)
             embeddings.append(embedding)
         except Exception as e:
             st.warning(f"Error processing text: {str(e)}")
+            embeddings.append(np.zeros(model.config.hidden_size))
     return np.array(embeddings)
+def classify_emotion(text, tokenizer, classifier):
+    """Classify emotion for a text using majority voting."""
     try:
+        _, text_chunks = chunk_long_text(text, tokenizer)
         chunk_emotions = []
         for chunk in text_chunks:
+            result = classifier(chunk, max_length=512, truncation=True)[0]
             chunk_emotions.append(result['label'])
         if chunk_emotions:
             final_emotion = Counter(chunk_emotions).most_common(1)[0][0]
             return final_emotion
         return "unknown"
 def format_topics(topic_model, topic_counts):
+    """Format topics for display."""
     formatted_topics = []
     for topic_num, count in topic_counts:
         if topic_num == -1:
     return formatted_topics
 def format_emotions(emotion_counts):
+    """Format emotions for display."""
     formatted_emotions = []
     for label, count in emotion_counts:
         emotion = EMOTION_LABELS.get(label, label)
         })
     return formatted_emotions
+def process_and_summarize(df, top_n=50):
+    """Process the data and generate summaries."""
+    summaries = []
+    # Initialize BERTopic
     topic_model = BERTopic(
         language="arabic",
         calculate_probabilities=True,
     )
     # Group by country
     for country, group in df.groupby('country'):
+        progress_text = f"Processing poems for {country}..."
+        progress_bar = st.progress(0, text=progress_text)
         texts = group['poem'].dropna().tolist()
         batch_size = 10
         all_emotions = []
+        # Generate embeddings
+        embeddings = generate_embeddings(texts, bert_tokenizer, bert_model)
+        progress_bar.progress(0.33, text="Generating embeddings...")
+        # Process emotions
         for i in range(0, len(texts), batch_size):
             batch_texts = texts[i:i + batch_size]
+            batch_emotions = [classify_emotion(text, bert_tokenizer, emotion_classifier)
+                            for text in batch_texts]
             all_emotions.extend(batch_emotions)
+        progress_bar.progress(0.66, text="Classifying emotions...")
         try:
+            # Fit topic model
             topics, _ = topic_model.fit_transform(texts, embeddings)
             # Format results
                 'top_topics': top_topics,
                 'top_emotions': top_emotions
             })
+            progress_bar.progress(1.0, text="Processing complete!")
         except Exception as e:
             st.warning(f"Could not generate topics for {country}: {str(e)}")
             continue
     return summaries, topic_model
+# Main app interface
+st.title("📚 Arabic Poem Analysis")
+st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
+# File upload
+uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
+if uploaded_file is not None:
+    try:
+        # Read the file
+        if uploaded_file.name.endswith('.csv'):
+            df = pd.read_csv(uploaded_file)
+        else:
+            df = pd.read_excel(uploaded_file)
+        # Validate columns
+        required_columns = ['country', 'poem']
+        if not all(col in df.columns for col in required_columns):
+            st.error("File must contain 'country' and 'poem' columns.")
+            st.stop()
+        # Clean data
+        df['country'] = df['country'].str.strip()
+        df = df.dropna(subset=['country', 'poem'])
+        # Process data
+        top_n = st.number_input("Number of top topics/emotions to display:",
+                               min_value=1, max_value=100, value=10)
+        if st.button("Process Data"):
+            with st.spinner("Processing your data..."):
+                summaries, topic_model = process_and_summarize(df, top_n=top_n)
+                if summaries:
+                    st.success("Analysis complete!")
+                    # Display results in tabs
+                    tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
+                    with tab1:
+                        for summary in summaries:
+                            with st.expander(f"📍 {summary['country']} ({summary['total_poems']} poems)"):
+                                col1, col2 = st.columns(2)
+                                with col1:
+                                    st.subheader("Top Topics")
+                                    for topic in summary['top_topics']:
+                                        st.write(f"• {topic['topic']}: {topic['count']} poems")
+                                with col2:
+                                    st.subheader("Emotions")
+                                    for emotion in summary['top_emotions']:
+                                        st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
+                    with tab2:
+                        st.subheader("Global Topic Distribution")
+                        topic_info = topic_model.get_topic_info()
+                        for _, row in topic_info.iterrows():
+                            if row['Topic'] == -1:
+                                topic_name = "Miscellaneous"
+                            else:
+                                words = topic_model.get_topic(row['Topic'])
+                                topic_name = " | ".join([word for word, _ in words[:3]])
+                            st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
+    except Exception as e:
+        st.error(f"Error processing file: {str(e)}")
+else:
+    st.info("👆 Upload a file to get started!")
+    # Example format
+    st.write("### Expected File Format:")
+    example_df = pd.DataFrame({
+        'country': ['Egypt', 'Saudi Arabia'],
+        'poem': ['قصيدة مصرية', 'قصيدة سعودية']
+    })
+    st.dataframe(example_df)