Spaces:

kambris
/

SoLProject

Runtime error

App Files Files Community

kambris committed on Nov 23, 2024

Commit

6bd6b44

verified ·

1 Parent(s): 1c5ddd8

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -36

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import pandas as pd
 from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
 from bertopic import BERTopic
 import torch
 from collections import Counter
 # Load AraBERT tokenizer and model for embeddings
@@ -15,27 +16,41 @@ emotion_classifier = pipeline("text-classification", model=emotion_model, tokeni
 # Function to generate embeddings using AraBERT
 def generate_embeddings(texts):
-    # Tokenize all the texts (poems)
-    inputs = bert_tokenizer(texts, return_tensors="pt", padding=True, truncation=False, max_length=512)
-    chunked_inputs = []
-    for input_ids in inputs['input_ids']:
-        # Split each long sequence into chunks of max 512 tokens
-        chunks = [input_ids[i:i + 512] for i in range(0, len(input_ids), 512)]
-        chunked_inputs.extend(chunks)
-    # Process each chunk and get embeddings
-    embeddings = []
-    for chunk in chunked_inputs:
-        input_tensor = torch.tensor(chunk).unsqueeze(0)  # Add batch dimension
         with torch.no_grad():
-            outputs = bert_model(input_tensor)
-        chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
-        embeddings.append(chunk_embedding)
-    # Combine all embeddings (you can take the average of embeddings for each poem)
-    final_embeddings = sum(embeddings) / len(embeddings)
-    return final_embeddings
 # Function to process the uploaded file and summarize by country
 def process_and_summarize(uploaded_file, top_n=50):
@@ -59,34 +74,41 @@ def process_and_summarize(uploaded_file, top_n=50):
     df['country'] = df['country'].str.strip()
     df = df.dropna(subset=['country', 'poem'])
     # Group by country
     summaries = []
-    topic_model = BERTopic()
     for country, group in df.groupby('country'):
         st.info(f"Processing poems for {country}...")
-        # Combine all poems for the country
         texts = group['poem'].dropna().tolist()
         # Classify emotions
         st.info(f"Classifying emotions for {country}...")
-        emotions = [emotion_classifier(text)[0]['label'] for text in texts]
         # Generate embeddings and fit topic model
         st.info(f"Generating embeddings and topics for {country}...")
         embeddings = generate_embeddings(texts)
-        topics, _ = topic_model.fit_transform(embeddings)
-        # Aggregate topics and emotions
-        top_topics = Counter(topics).most_common(top_n)
-        top_emotions = Counter(emotions).most_common(top_n)
-        summaries.append({
-            'country': country,
-            'total_poems': len(texts),
-            'top_topics': top_topics,
-            'top_emotions': top_emotions
-        })
     return summaries, topic_model
@@ -117,4 +139,4 @@ if uploaded_file is not None:
             st.write("### Global Topic Information:")
             st.write(topic_model.get_topic_info())
     except Exception as e:
-        st.error(f"Error: {e}")

 from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
 from bertopic import BERTopic
 import torch
+import numpy as np
 from collections import Counter
 # Load AraBERT tokenizer and model for embeddings
 # Function to generate embeddings using AraBERT
 def generate_embeddings(texts):
+    # Build one embedding vector per input text and return them stacked in a
+    # single NumPy array, in the same order as `texts`.
+    # `bert_tokenizer` and `bert_model` are defined outside this hunk.
+    all_embeddings = []
+    for text in texts:
+        # Tokenize one text at a time; truncation=True with max_length=512
+        # cuts anything longer, so tokens past that limit do not contribute
+        inputs = bert_tokenizer(
+            text,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=512
+        )
+        # Run the model without tracking gradients (inference only)
         with torch.no_grad():
+            outputs = bert_model(**inputs)
+        # Average the last hidden state over dim=1 to get one vector per text
+        # (presumably dim=1 is the token axis — confirm against the model output)
+        embedding = outputs.last_hidden_state.mean(dim=1).numpy()
+        all_embeddings.append(embedding[0])  # Remove batch dimension
+    return np.array(all_embeddings)
+# Function to classify the emotion of each text, shortening over-long texts first
+def classify_emotions(texts):
+    # Return one emotion label per text, in the same order as `texts`.
+    # `bert_tokenizer` and `emotion_classifier` are defined outside this hunk.
+    emotions = []
+    for text in texts:
+        # The length check counts tokens (bert_tokenizer.encode), but the
+        # split below slices the string into 512-character pieces
+        if len(bert_tokenizer.encode(text)) > 512:
+            chunks = [text[i:i + 512] for i in range(0, len(text), 512)]
+            # Only the first chunk is classified; the remaining chunks are unused.
+            # NOTE(review): this assumes the first 512 characters are representative
+            # of the text and fit the classifier's input limit — confirm
+            emotion = emotion_classifier(chunks[0])[0]['label']
+        else:
+            emotion = emotion_classifier(text)[0]['label']
+        emotions.append(emotion)
+    return emotions
 # Function to process the uploaded file and summarize by country
 def process_and_summarize(uploaded_file, top_n=50):
     df['country'] = df['country'].str.strip()
     df = df.dropna(subset=['country', 'poem'])
+    # Initialize BERTopic
+    topic_model = BERTopic(language="arabic")
     # Group by country
     summaries = []
     for country, group in df.groupby('country'):
         st.info(f"Processing poems for {country}...")
+        # Get texts for this country
         texts = group['poem'].dropna().tolist()
         # Classify emotions
         st.info(f"Classifying emotions for {country}...")
+        emotions = classify_emotions(texts)
         # Generate embeddings and fit topic model
         st.info(f"Generating embeddings and topics for {country}...")
         embeddings = generate_embeddings(texts)
+        try:
+            topics, _ = topic_model.fit_transform(texts, embeddings)
+            # Aggregate topics and emotions
+            top_topics = Counter(topics).most_common(top_n)
+            top_emotions = Counter(emotions).most_common(top_n)
+            summaries.append({
+                'country': country,
+                'total_poems': len(texts),
+                'top_topics': top_topics,
+                'top_emotions': top_emotions
+            })
+        except Exception as e:
+            st.warning(f"Could not generate topics for {country}: {str(e)}")
+            continue
     return summaries, topic_model
             st.write("### Global Topic Information:")
             st.write(topic_model.get_topic_info())
     except Exception as e:
+        st.error(f"Error: {e}")