import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import torch
import numpy as np
from collections import Counter
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pkg_resources
import folium
from folium.plugins import HeatMap
import country_converter as coco
from streamlit_folium import folium_static

current_dir = os.path.dirname(os.path.abspath(__file__))
font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")

ARABIC_STOP_WORDS = {
    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم', 'أو', 'و',
    'ف', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك', 'تلك', 'هؤلاء', 'هم',
    'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم', 'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل',
    'بعض', 'غير', 'حول', 'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى',
    'كيف', 'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي', 'الان', 'بين', 'فوق', 'تحت',
    'امام', 'خلف', 'حين', 'قبل', 'بعد', 'أن', 'له', 'قوة', 'كما', 'لها', 'منذ', 'وقد',
    'ولا', 'نفس', 'ولم', 'حيث', 'هناك', 'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل',
    'وله', 'أما', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه', 'وهي', 'وهو', 'كلم', 'لكن',
    'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا', 'اول', 'انها', 'جميع', 'ايضا', 'لازم', 'حاجة', 'علي',
    'يجب', 'صار', 'صارت', 'ضد'
}

st.set_page_config(
    page_title="Arabic Poem Analysis",
    page_icon="📚",
    layout="wide"
)


@st.cache_resource
def load_models():
    """Load and cache the models."""
    # + Added use_fast=True for faster tokenization
    tokenizer = AutoTokenizer.from_pretrained(
        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
        use_fast=True
    )
    # + Added torchscript and low_cpu_mem_usage
    bert_model = AutoModel.from_pretrained(
        "aubmindlab/bert-base-arabertv2",
        torchscript=True,
        low_cpu_mem_usage=True
    )
    # + Added optimizations for emotion model
    emotion_model = AutoModelForSequenceClassification.from_pretrained(
        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
        torchscript=True,
        low_cpu_mem_usage=True
    )
    # ~ Changed pipeline configuration to use batching
    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=tokenizer,
        batch_size=32,
        device=-1  # + Added to force CPU usage
    )
    return tokenizer, bert_model, emotion_classifier


# + Added new batch processing function
def process_texts_in_batches(texts, batch_size=32):
    """Process texts in batches for better CPU utilization.

    Relies on the module-level `emotion_classifier` created by `load_models()`.
    """
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    results = []
    for batch in batches:
        batch_results = emotion_classifier(batch, truncation=True, max_length=512)
        results.extend(batch_results)
    return results


# + Added caching decorator for embeddings
@st.cache_data
def get_cached_embeddings(text, _tokenizer, _model):
    """Cache embeddings to avoid recomputation.

    The tokenizer and model arguments are underscore-prefixed so Streamlit
    does not try to hash them.
    """
    return get_embedding_for_text(text, _tokenizer, _model)


def create_theme_map(summaries, topic_model):
    """Create an interactive map showing theme distributions across countries."""
    try:
        # Create a base map centered on the Arab world
        m = folium.Map(location=[25, 45], zoom_start=4)

        # Convert country names to coordinates
        cc = coco.CountryConverter()

        for summary in summaries:
            try:
                # Get country coordinates
                country_iso = cc.convert(names=[summary['country']], to='ISO2')
                country_data = cc.convert(names=[summary['country']], to='name_short')
                # Create popup content with theme information
                popup_content = f"""
                <b>{summary['country']}</b><br>
                <b>Top Themes:</b><br>
                {'<br>'.join([f"• {topic['topic']}: {topic['count']}" for topic in summary['top_topics'][:5]])}
                """

                # Add marker for each country
                # Note: coordinate lookup via country_converter may fail for some
                # countries; failures are caught below and reported as warnings.
                folium.CircleMarker(
                    location=[cc.convert(country_iso, to='latitude')[0],
                              cc.convert(country_iso, to='longitude')[0]],
                    radius=20,
                    popup=folium.Popup(popup_content, max_width=300),
                    color='red',
                    fill=True,
                    fill_opacity=0.7
                ).add_to(m)
            except Exception as e:
                st.warning(f"Could not process {summary['country']}: {str(e)}")
                continue

        return m
    except Exception as e:
        st.error(f"Error creating map: {str(e)}")
        return None


def split_text(text, max_length=512):
    """Split text into chunks of maximum length while preserving word boundaries."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        word_length = len(word.split())
        if current_length + word_length > max_length:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


def create_arabic_wordcloud(text, title):
    wordcloud = WordCloud(
        width=1200,
        height=600,
        background_color='white',
        font_path=font_path,
        max_words=200,
        stopwords=ARABIC_STOP_WORDS
    ).generate(text)

    fig, ax = plt.subplots(figsize=(15, 8))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16, pad=20)
    return fig


def clean_arabic_text(text):
    """Clean Arabic text by removing stop words and normalizing."""
    words = text.split()
    cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
    return ' '.join(cleaned_words)


def classify_emotion(text, classifier):
    """Classify emotion for complete text with precise token handling."""
    # Ensure text is properly formatted
    if not text or not isinstance(text, str):
        return "LABEL_2"

    # Split into manageable chunks
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    # Create proper-sized chunks
    for word in words:
        word_tokens = len(classifier.tokenizer.encode(word))
        if current_length + word_tokens > 512:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_tokens
        else:
            current_chunk.append(word)
            current_length += word_tokens

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    if not chunks:
        return "LABEL_2"

    # Process chunks with proper output handling
    all_scores = []
    for chunk in chunks:
        # Direct classification with proper output structure
        result = classifier(chunk, return_all_scores=True)[0]
        all_scores.append(result)

    # Calculate final emotion by averaging scores across chunks
    label_scores = {}
    count = len(all_scores)

    for scores in all_scores:
        for score_dict in scores:
            label = score_dict['label']
            if label not in label_scores:
                label_scores[label] = 0
            label_scores[label] += score_dict['score']

    avg_scores = {label: score / count for label, score in label_scores.items()}
    final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
    return final_emotion


def get_embedding_for_text(text, tokenizer, model):
    """Get embedding for complete text."""
    # Get the raw tokens first
    tokens = tokenizer.tokenize(text)

    # Process in chunks of exactly 510 tokens (512 - 2 special tokens)
    chunk_size = 510
    chunks = []

    for i in range(0, len(tokens), chunk_size):
        chunk = tokens[i:i + chunk_size]
        token_ids = tokenizer.convert_tokens_to_ids(chunk)
        # Add special tokens manually
        token_ids = [tokenizer.cls_token_id] + token_ids + [tokenizer.sep_token_id]
        # Create attention mask
        attention_mask = [1] * len(token_ids)

        # Pad if needed
        padding_length = 512 - len(token_ids)
        if padding_length > 0:
            token_ids = token_ids + ([tokenizer.pad_token_id] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)

        chunks.append({
            'input_ids': torch.tensor([token_ids]),
            'attention_mask': torch.tensor([attention_mask])
        })

    # Get embeddings: use the [CLS] vector of each chunk and average across chunks
    chunk_embeddings = []
    for chunk in chunks:
        chunk = {k: v.to(model.device) for k, v in chunk.items()}
        with torch.no_grad():
            outputs = model(**chunk)[0]
        embedding = outputs[:, 0, :].cpu().numpy()
        chunk_embeddings.append(embedding[0])

    if chunk_embeddings:
        return np.mean(chunk_embeddings, axis=0)
    return np.zeros(model.config.hidden_size)


def format_topics(topic_model, topic_counts):
    """Format topics for display."""
    formatted_topics = []
    for topic_num, count in topic_counts:
        if topic_num == -1:
            topic_label = "Miscellaneous"
        else:
            words = topic_model.get_topic(topic_num)
            topic_label = " | ".join([word for word, _ in words[:5]])
        formatted_topics.append({
            'topic': topic_label,
            'count': count
        })
    return formatted_topics


def format_emotions(emotion_counts):
    """Format emotions for display."""
    EMOTION_LABELS = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Positive',
        'LABEL_2': 'Neutral'
    }
    formatted_emotions = []
    for label, count in emotion_counts:
        emotion = EMOTION_LABELS.get(label, label)
        formatted_emotions.append({
            'emotion': emotion,
            'count': count
        })
    return formatted_emotions


def get_optimized_topic_model(bert_model, nr_topics="auto"):
    """Configure BERTopic for better CPU performance."""
    return BERTopic(
        embedding_model=bert_model,
        language="arabic",
        calculate_probabilities=False,
        verbose=False,
        n_gram_range=(1, 1),
        min_topic_size=5,
        nr_topics=nr_topics,
        low_memory=True
    )


def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier,
                          top_n=50, topic_strategy="Auto", n_topics=None,
                          min_topic_size=3):
    """Process the data and generate summaries with flexible topic configuration."""
    summaries = []
    # Pass the manually selected topic count through to BERTopic when requested
    topic_model = get_optimized_topic_model(
        bert_model,
        nr_topics=n_topics if topic_strategy == "Manual" and n_topics else "auto"
    )
    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS), min_df=1, max_df=1.0)
    topic_model.vectorizer_model = vectorizer

    for country, group in df.groupby('country'):
        progress_text = f"Processing poems for {country}..."
        progress_bar = st.progress(0, text=progress_text)

        texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
        all_emotions = []

        # Get embeddings with proper output handling
        embeddings = []
        for i, text in enumerate(texts):
            embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
            embeddings.append(embedding)
            progress = (i + 1) / len(texts) * 0.4
            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")

        embeddings = np.array(embeddings)

        # Process emotions with the chunk-aware helper so poems longer than 512 tokens are handled
        for i, text in enumerate(texts):
            emotion = classify_emotion(text, emotion_classifier)
            all_emotions.append(emotion)
            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")

        if len(texts) < min_topic_size:
            st.info(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
            continue

        topics, _ = topic_model.fit_transform(texts, embeddings)

        topic_counts = Counter(topics)
        top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
        top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))

        summaries.append({
            'country': country,
            'total_poems': len(texts),
            'top_topics': top_topics,
            'top_emotions': top_emotions
        })
        progress_bar.progress(1.0, text="Processing complete!")

    return summaries, topic_model


try:
    bert_tokenizer, bert_model, emotion_classifier = load_models()
    st.success("Models loaded successfully!")
except Exception as e:
    st.error(f"Error loading models: {str(e)}")
    st.stop()

# Main app interface
st.title("📚 Arabic Poem Analysis")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")

uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])

if uploaded_file is not None:
    try:
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)

        required_columns = ['country', 'poem']
        if not all(col in df.columns for col in required_columns):
            st.error("File must contain 'country' and 'poem' columns.")
            st.stop()

        df['country'] = df['country'].str.strip()
        df = df.dropna(subset=['country', 'poem'])

        st.subheader("Topic Modeling Settings")
        col1, col2 = st.columns(2)

        with col1:
            topic_strategy = st.radio(
                "Topic Number Strategy",
                ["Auto", "Manual"],
                help="Choose whether to let the model determine the optimal number of topics or set it manually"
            )

            if topic_strategy == "Manual":
                n_documents = len(df)
                max_topics = 500
                min_topics = 5
                default_topics = 20

                n_topics = st.slider(
                    "Number of Topics",
                    min_value=min_topics,
                    max_value=max_topics,
                    value=default_topics,
                    help=f"Select the desired number of topics (maximum {max_topics})"
                )

                st.info(f"""
                💡 For your dataset of {n_documents:,} documents:
                - Available topic range: {min_topics}-{max_topics}
                - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
                """)

        with col2:
            top_n = st.number_input(
                "Number of top topics/emotions to display:",
                min_value=1,
                max_value=100,
                value=10
            )

        if st.button("Process Data"):
            with st.spinner("Processing your data..."):
                summaries, topic_model = process_and_summarize(
                    df,
                    bert_tokenizer,
                    bert_model,
                    emotion_classifier,
                    top_n=top_n,
                    topic_strategy=topic_strategy,
                    n_topics=n_topics if topic_strategy == "Manual" else None,
                    min_topic_size=3
                )

            if summaries:
                st.success("Analysis complete!")
                tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Theme Map"])

                with tab1:
                    for summary in summaries:
                        with st.expander(f"📍 {summary['country']} ({summary['total_poems']} poems)"):
                            col1, col2 = st.columns(2)

                            with col1:
                                st.subheader("Top Topics")
                                for topic in summary['top_topics']:
                                    st.write(f"• {topic['topic']}: {topic['count']} poems")

                            with col2:
                                st.subheader("Emotions")
                                for emotion in summary['top_emotions']:
                                    st.write(f"• {emotion['emotion']}: {emotion['count']} poems")

                            st.subheader("Word Cloud Visualization")
                            country_poems = df[df['country'] == summary['country']]['poem']
                            combined_text = ' '.join(country_poems)
                            wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
                            st.pyplot(wordcloud_fig)

                with tab2:
                    st.subheader("Global Topic Distribution")
                    topic_info = topic_model.get_topic_info()
                    for _, row in topic_info.iterrows():
                        if row['Topic'] == -1:
                            topic_name = "Miscellaneous"
                        else:
                            words = topic_model.get_topic(row['Topic'])
                            topic_name = " | ".join([word for word, _ in words[:5]])
                        st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")

                with tab3:
                    st.subheader("Thematic Distribution Map")
                    theme_map = create_theme_map(summaries, topic_model)
                    # create_theme_map returns None on failure, so only render a valid map
                    if theme_map is not None:
                        folium_static(theme_map)

    except Exception as e:
        st.error(f"Error processing file: {str(e)}")
else:
    st.info("👆 Upload a file to get started!")
    st.write("### Expected File Format:")
    example_df = pd.DataFrame({
        'country': ['Egypt', 'Palestine'],
        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
    })
    st.dataframe(example_df)
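
# Usage sketch (an assumption, not part of the original script): if this file is
# saved as app.py and the imported libraries are installed (streamlit, transformers,
# torch, bertopic, scikit-learn, wordcloud, matplotlib, folium, streamlit-folium,
# country_converter, pandas, plus openpyxl for .xlsx uploads), the app is launched with:
#
#     streamlit run app.py
#
# The CAMeL-Lab and aubmindlab model weights are downloaded from the Hugging Face Hub
# on first run, so an internet connection is needed the first time the app starts.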