import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from bertopic import BERTopic
import torch
from collections import Counter

# Load the AraBERT tokenizer and base model for embeddings
bert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")

# Load AraBERT for emotion classification.
# NOTE: this base checkpoint has no fine-tuned classification head, so the head is
# randomly initialised and the predicted labels are not meaningful emotions.
# Swap in a checkpoint fine-tuned for Arabic emotion/sentiment classification for real use.
emotion_model = AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabertv2")
emotion_classifier = pipeline("text-classification", model=emotion_model, tokenizer=bert_tokenizer)

# Generate mean-pooled sentence embeddings with AraBERT
def generate_embeddings(texts):
    inputs = bert_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Mean-pool the last hidden state over the token dimension
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings

# Process the uploaded file and summarize topics/emotions per country
def process_and_summarize(uploaded_file, top_n=50):
    # Read the file according to its extension
    if uploaded_file.name.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    elif uploaded_file.name.endswith(".xlsx"):
        df = pd.read_excel(uploaded_file)
    else:
        st.error("Unsupported file format.")
        return None, None

    # Validate required columns
    required_columns = ['country', 'poem']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        st.error(f"Missing columns: {', '.join(missing_columns)}")
        return None, None

    # Clean and preprocess
    df['country'] = df['country'].str.strip()
    df = df.dropna(subset=['country', 'poem'])

    # Summarize each country separately
    summaries = []
    # NOTE: a single BERTopic instance is re-fitted for every country, so its state
    # (and get_topic_info() below) reflects only the last country processed.
    topic_model = BERTopic()
    for country, group in df.groupby('country'):
        st.info(f"Processing poems for {country}...")
        # Collect the country's poems
        texts = group['poem'].dropna().tolist()

        # Classify emotions (truncate long poems to the model's 512-token limit)
        st.info(f"Classifying emotions for {country}...")
        emotions = [emotion_classifier(text, truncation=True, max_length=512)[0]['label'] for text in texts]

        # Generate embeddings and fit the topic model
        st.info(f"Generating embeddings and topics for {country}...")
        embeddings = generate_embeddings(texts)
        # BERTopic expects the documents as the first argument and precomputed
        # embeddings as the second; fitting may fail for countries with very few poems.
        topics, _ = topic_model.fit_transform(texts, embeddings)

        # Aggregate the most frequent topics and emotions
        top_topics = Counter(topics).most_common(top_n)
        top_emotions = Counter(emotions).most_common(top_n)

        summaries.append({
            'country': country,
            'total_poems': len(texts),
            'top_topics': top_topics,
            'top_emotions': top_emotions
        })
    return summaries, topic_model

# Streamlit app interface
st.title("Arabic Poem Topic Modeling & Emotion Classification")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")

uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])

if uploaded_file is not None:
    try:
        top_n = st.number_input("Select the number of top topics/emotions to display:", min_value=1, max_value=100, value=50)
        summaries, topic_model = process_and_summarize(uploaded_file, top_n=top_n)
        if summaries is not None:
            st.success("Data successfully processed!")

            # Display the per-country summaries
            for summary in summaries:
                st.write(f"### {summary['country']}")
                st.write(f"Total Poems: {summary['total_poems']}")
                st.write(f"Top {top_n} Topics:")
                st.write(summary['top_topics'])
                st.write(f"Top {top_n} Emotions:")
                st.write(summary['top_emotions'])

            # Topic details from the most recently fitted model (the last country)
            st.write("### Topic Information (last fitted country):")
            st.write(topic_model.get_topic_info())
    except Exception as e:
        st.error(f"Error: {e}")
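
# --- Optional sketch: cache model loading across Streamlit reruns -------------
# A minimal sketch, assuming Streamlit >= 1.18 (which provides st.cache_resource).
# Streamlit re-executes the whole script on every widget interaction, so the
# from_pretrained() calls at the top run again on each rerun; a cached loader
# keeps the tokenizer/model in memory for the session. The name
# load_arabert_models is illustrative and is not referenced by the app above.
@st.cache_resource
def load_arabert_models(model_name: str = "aubmindlab/bert-base-arabertv2"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model

# Possible usage: replace the top-level assignments with
#   bert_tokenizer, bert_model = load_arabert_models()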