Spaces:

kambris
/

SoLProject

Runtime error

App Files Community

kambris committed on Nov 23, 2024

Commit

3f0f6de

verified ·

1 Parent(s): 6f973fa

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -53

app.py CHANGED Viewed

@@ -1,50 +1,43 @@
 import streamlit as st
 import pandas as pd
-from transformers import T5Tokenizer, T5ForConditionalGeneration, BertTokenizer, BertModel
 from bertopic import BERTopic
 import torch
-import numpy as np
-# Initialize ARAT5 model and tokenizer for topic modeling
-tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/araT5-base")
-model = T5ForConditionalGeneration.from_pretrained("UBC-NLP/araT5-base")
-# Initialize BERT tokenizer and model for feature extraction
-bert_tokenizer = BertTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
-bert_model = BertModel.from_pretrained("aubmindlab/bert-base-arabertv2")
-# Function to get embeddings from ARAT5 for topic modeling
 def generate_embeddings(texts):
-    embeddings = []
-    for text in texts:
-        # Tokenize the text with truncation set to False
-        # We are using the BertTokenizer directly without using the pipeline
-        tokens = bert_tokenizer.encode(text, add_special_tokens=True, truncation=False, padding=False)
-        # Split the tokens into chunks of size 512 (maximum length)
-        chunked_texts = [tokens[i:i + 512] for i in range(0, len(tokens), 512)]
-        poem_embeddings = []
-        for chunk in chunked_texts:
-            # Convert the chunk to a tensor and prepare the input for BERT model
-            inputs = torch.tensor(chunk).unsqueeze(0)  # Adding batch dimension
-            with torch.no_grad():
-                outputs = bert_model(inputs)
-            # Get the embeddings from the last hidden state (mean of all token embeddings)
-            chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
-            poem_embeddings.append(chunk_embedding)
-        # Average the embeddings of all chunks (optional, can also concatenate them)
-        final_embedding = np.mean(np.array(poem_embeddings), axis=0)
-        embeddings.append(final_embedding)
     return embeddings
-# Function to process the CSV or Excel file
-def process_file(uploaded_file):
     # Determine the file type
     if uploaded_file.name.endswith(".csv"):
         df = pd.read_csv(uploaded_file)
@@ -52,39 +45,75 @@ def process_file(uploaded_file):
         df = pd.read_excel(uploaded_file)
     else:
         st.error("Unsupported file format.")
-        return None
     # Validate required columns
     required_columns = ['country', 'poem']
     missing_columns = [col for col in required_columns if col not in df.columns]
     if missing_columns:
         st.error(f"Missing columns: {', '.join(missing_columns)}")
-        return None
-    # Process the file
     df = df.dropna(subset=['country', 'poem'])
-    texts = df['poem'].dropna().tolist()
-    # Generate embeddings for all poems
-    embeddings = generate_embeddings(texts)
-    # Perform topic modeling with BERTopic
     topic_model = BERTopic()
-    topics, _ = topic_model.fit_transform(embeddings)
-    df['topic'] = topics
-    return df
-# Streamlit App
 st.title("Arabic Poem Topic Modeling & Emotion Classification")
 uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
 if uploaded_file is not None:
     try:
-        result_df = process_file(uploaded_file)
-        if result_df is not None:
-            st.write("Data successfully processed!")
-            st.write(result_df.head())
     except Exception as e:
         st.error(f"Error: {e}")

 import streamlit as st
 import pandas as pd
+from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
 from bertopic import BERTopic
 import torch
+from collections import Counter
+# Load AraBERT tokenizer and model for embeddings
+bert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
+bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
+# Load AraBERT model for emotion classification
+emotion_model = AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabertv2")
+emotion_classifier = pipeline("text-classification", model=emotion_model, tokenizer=bert_tokenizer)
+# Function to generate embeddings using AraBERT
 def generate_embeddings(texts):
+    # Tokenize the list of texts using the tokenizer
+    inputs = bert_tokenizer(texts, return_tensors="pt", padding=True, truncation=False, max_length=512)
+    # Split large sequences into chunks of size 512
+    chunked_inputs = []
+    for input_ids in inputs['input_ids']:
+        chunks = [input_ids[i:i + 512] for i in range(0, len(input_ids), 512)]
+        chunked_inputs.extend(chunks)
+    # Process each chunk and get embeddings
+    embeddings = []
+    for chunk in chunked_inputs:
+        input_tensor = torch.tensor(chunk).unsqueeze(0)  # Add batch dimension
+        with torch.no_grad():
+            outputs = bert_model(input_tensor)
+        chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
+        embeddings.append(chunk_embedding)
+    # Return the embeddings averaged across chunks
     return embeddings
+# Function to process the uploaded file and summarize by country
+def process_and_summarize(uploaded_file, top_n=50):
     # Determine the file type
     if uploaded_file.name.endswith(".csv"):
         df = pd.read_csv(uploaded_file)
         df = pd.read_excel(uploaded_file)
     else:
         st.error("Unsupported file format.")
+        return None, None
     # Validate required columns
     required_columns = ['country', 'poem']
     missing_columns = [col for col in required_columns if col not in df.columns]
     if missing_columns:
         st.error(f"Missing columns: {', '.join(missing_columns)}")
+        return None, None
+    # Parse and preprocess the file
+    df['country'] = df['country'].str.strip()
     df = df.dropna(subset=['country', 'poem'])
+    # Group by country
+    summaries = []
     topic_model = BERTopic()
+    for country, group in df.groupby('country'):
+        st.info(f"Processing poems for {country}...")
+        # Combine all poems for the country
+        texts = group['poem'].dropna().tolist()
+        # Classify emotions
+        st.info(f"Classifying emotions for {country}...")
+        emotions = [emotion_classifier(text)[0]['label'] for text in texts]
+        # Generate embeddings and fit topic model
+        st.info(f"Generating embeddings and topics for {country}...")
+        embeddings = generate_embeddings(texts)
+        topics, _ = topic_model.fit_transform(embeddings)
+        # Aggregate topics and emotions
+        top_topics = Counter(topics).most_common(top_n)
+        top_emotions = Counter(emotions).most_common(top_n)
+        summaries.append({
+            'country': country,
+            'total_poems': len(texts),
+            'top_topics': top_topics,
+            'top_emotions': top_emotions
+        })
+    return summaries, topic_model
+# Streamlit App Interface
 st.title("Arabic Poem Topic Modeling & Emotion Classification")
+st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
 uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
 if uploaded_file is not None:
     try:
+        top_n = st.number_input("Select the number of top topics/emotions to display:", min_value=1, max_value=100, value=50)
+        summaries, topic_model = process_and_summarize(uploaded_file, top_n=top_n)
+        if summaries is not None:
+            st.success("Data successfully processed!")
+            # Display summary for each country
+            for summary in summaries:
+                st.write(f"### {summary['country']}")
+                st.write(f"Total Poems: {summary['total_poems']}")
+                st.write(f"Top {top_n} Topics:")
+                st.write(summary['top_topics'])
+                st.write(f"Top {top_n} Emotions:")
+                st.write(summary['top_emotions'])
+            # Display overall topics
+            st.write("### Global Topic Information:")
+            st.write(topic_model.get_topic_info())
     except Exception as e:
         st.error(f"Error: {e}")