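"""Streamlit app for Arabic poetry analysis: per-country topic modeling with
BERTopic over AraBERT embeddings, plus sentiment classification with a
CAMeLBERT classifier."""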
import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from bertopic import BERTopic
import torch
import numpy as np
from collections import Counter
# AraBERT tokenizer and encoder for document embeddings
bert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")

# CAMeLBERT sentiment classifier; it must be paired with its own tokenizer
# rather than the AraBERT one, or the input ids won't match the model's vocabulary
emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
emotion_classifier = pipeline("text-classification", model=emotion_model, tokenizer=emotion_tokenizer)
# Map raw sentiment labels to readable names
EMOTION_LABELS = {
    'LABEL_0': 'Negative',
    'LABEL_1': 'Positive',
    'LABEL_2': 'Neutral'
}
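# NOTE: this LABEL_n -> name mapping is an assumption about the checkpoint's
# config; if the model already emits readable labels (e.g. "positive"), the
# .get(label, label) fallback in format_emotions() passes them through unchanged.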
def chunk_text(text, max_length=512):
    """Split text into chunks of at most `max_length` tokens."""
    tokens = bert_tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for i in range(0, len(tokens), max_length - 2):  # -2 leaves room for [CLS] and [SEP]
        chunk = tokens[i:i + max_length - 2]
        # Re-add the special tokens around each chunk
        chunk = [bert_tokenizer.cls_token_id] + chunk + [bert_tokenizer.sep_token_id]
        chunks.append(chunk)
    return chunks
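# Example: a 1,100-token poem yields three chunks (510 + 510 + 80 body tokens),
# each wrapped in [CLS] ... [SEP] so it fits BERT's 512-token input limit.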
def get_embedding_for_text(text):
    """Get an embedding for a single text."""
    chunks = chunk_text(text)
    chunk_embeddings = []
    for chunk in chunks:
        # Convert to a tensor and add a batch dimension
        input_ids = torch.tensor([chunk]).to(bert_model.device)
        attention_mask = torch.ones_like(input_ids)
        with torch.no_grad():
            outputs = bert_model(input_ids, attention_mask=attention_mask)
        # Take the [CLS] token embedding for this chunk
        chunk_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        chunk_embeddings.append(chunk_embedding[0])
    # Average the embeddings from all chunks
    if chunk_embeddings:
        return np.mean(chunk_embeddings, axis=0)
    return np.zeros(bert_model.config.hidden_size)  # fallback for empty input
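# Design note: mean-pooling the per-chunk [CLS] vectors gives one fixed-size
# vector per poem (768 dims for BERT-base checkpoints such as AraBERT),
# regardless of how long the poem is.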
def generate_embeddings(texts):
    """Generate embeddings for a list of texts."""
    embeddings = []
    for text in texts:
        try:
            embedding = get_embedding_for_text(text)
            embeddings.append(embedding)
        except Exception as e:
            st.warning(f"Error processing text: {str(e)}")
            # Add a zero embedding as a fallback
            embeddings.append(np.zeros(bert_model.config.hidden_size))
    return np.array(embeddings)
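# The zero-vector fallback keeps the embedding matrix row-aligned with `texts`;
# BERTopic will typically push such degenerate points into the -1 outlier topic.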
def classify_emotion(text):
    """Classify the sentiment of a single text."""
    try:
        chunks = chunk_text(text)
        if not chunks:
            return "unknown"
        # Use only the first chunk; decode it back to a string for the pipeline
        first_chunk = bert_tokenizer.decode(chunks[0], skip_special_tokens=True)
        # truncation=True guards against the classifier's tokenizer producing
        # more than 512 tokens from the decoded text
        result = emotion_classifier(first_chunk, truncation=True)[0]
        return result['label']
    except Exception as e:
        st.warning(f"Error in emotion classification: {str(e)}")
        return "unknown"
def format_topics(topic_model, topic_counts):
    """Convert topic numbers to readable labels."""
    formatted_topics = []
    for topic_num, count in topic_counts:
        if topic_num == -1:
            topic_label = "Miscellaneous"
        else:
            # Join the topic's top 3 words into a label
            words = topic_model.get_topic(topic_num)
            topic_label = " | ".join([word for word, _ in words[:3]])
        formatted_topics.append({
            'topic': topic_label,
            'count': count
        })
    return formatted_topics
def format_emotions(emotion_counts):
    """Convert raw emotion labels to readable text."""
    formatted_emotions = []
    for label, count in emotion_counts:
        emotion = EMOTION_LABELS.get(label, label)
        formatted_emotions.append({
            'emotion': emotion,
            'count': count
        })
    return formatted_emotions
def process_and_summarize(uploaded_file, top_n=50):
    """Read the uploaded file and build per-country topic/emotion summaries."""
    # Determine the file type
    if uploaded_file.name.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    elif uploaded_file.name.endswith(".xlsx"):
        df = pd.read_excel(uploaded_file)
    else:
        st.error("Unsupported file format.")
        return None, None
    # Validate required columns
    required_columns = ['country', 'poem']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        st.error(f"Missing columns: {', '.join(missing_columns)}")
        return None, None
    # Clean and filter the data
    df['country'] = df['country'].str.strip()
    df = df.dropna(subset=['country', 'poem'])
    # Initialize BERTopic
    topic_model = BERTopic(
        language="arabic",
        calculate_probabilities=True,
        verbose=True
    )
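    # calculate_probabilities=True asks BERTopic for per-document topic
    # distributions; since precomputed embeddings are passed to fit_transform
    # below, the default embedding model implied by `language` is not used.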
    # Group by country and summarize each group
    summaries = []
    for country, group in df.groupby('country'):
        st.info(f"Processing poems for {country}...")
        texts = group['poem'].dropna().tolist()
        batch_size = 10
        all_emotions = []
        all_embeddings = []
        # Process poems in batches so progress can be reported
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            st.info(f"Generating embeddings for batch {i//batch_size + 1}...")
            batch_embeddings = generate_embeddings(batch_texts)
            all_embeddings.extend(batch_embeddings)
            st.info(f"Classifying emotions for batch {i//batch_size + 1}...")
            batch_emotions = [classify_emotion(text) for text in batch_texts]
            all_emotions.extend(batch_emotions)
        try:
            embeddings = np.array(all_embeddings)
            st.info(f"Fitting topic model for {country}...")
            topics, _ = topic_model.fit_transform(texts, embeddings)
            # Format topics and emotions with readable labels
            top_topics = format_topics(topic_model, Counter(topics).most_common(top_n))
            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
            summaries.append({
                'country': country,
                'total_poems': len(texts),
                'top_topics': top_topics,
                'top_emotions': top_emotions
            })
        except Exception as e:
            st.warning(f"Could not generate topics for {country}: {str(e)}")
            continue
    return summaries, topic_model
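# Note: the same BERTopic instance is re-fitted for each country, so after the
# loop it holds only the last country's model; fit_transform also needs enough
# documents to form clusters, hence the try/except around it.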
# Streamlit app interface
st.title("Arabic Poem Topic Modeling & Emotion Classification")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])

if uploaded_file is not None:
    try:
        top_n = st.number_input("Select the number of top topics/emotions to display:",
                                min_value=1, max_value=100, value=10)
        summaries, topic_model = process_and_summarize(uploaded_file, top_n=top_n)
        if summaries is not None:
            st.success("Data successfully processed!")
            # Display the summary for each country
            for summary in summaries:
                st.write(f"### {summary['country']}")
                st.write(f"Total Poems: {summary['total_poems']}")
                st.write(f"\nTop {top_n} Topics:")
                for topic in summary['top_topics']:
                    st.write(f"• {topic['topic']}: {topic['count']} poems")
                st.write(f"\nTop {top_n} Emotions:")
                for emotion in summary['top_emotions']:
                    st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
                st.write("---")
            # Display the fitted model's topics in a readable format
            # (the model is re-fitted per country, so this reflects the last country processed)
            st.write("### Topic Information:")
            topic_info = topic_model.get_topic_info()
            for _, row in topic_info.iterrows():
                if row['Topic'] == -1:
                    topic_name = "Miscellaneous"
                else:
                    words = topic_model.get_topic(row['Topic'])
                    topic_name = " | ".join([word for word, _ in words[:3]])
                st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
    except Exception as e:
        st.error(f"Error: {str(e)}")