# Hugging Face Space: kambris/SoLProject
# Captured from the Space's file viewer (status shown: "Runtime error"; file size: 20,267 bytes).

import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import torch
import numpy as np
from collections import Counter
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pkg_resources
import folium
from folium.plugins import HeatMap
import country_converter as coco
from streamlit_folium import folium_static

# Resolve the bundled Arabic font relative to this file so the path does not
# depend on the process's working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")

# Words filtered out by clean_arabic_text(), the word cloud and the topic
# vectorizer. Several entries are repeated below; that is harmless because a
# set literal collapses duplicates.
ARABIC_STOP_WORDS = {
    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
    'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك',
    'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
    'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
    'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
    'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
    'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حين', 'قبل', 'بعد',
    'و', 'أن', 'في', 'كل', 'لم', 'لن', 'له', 'من', 'هو', 'هي', 'قوة',
    'كما', 'لها', 'منذ', 'وقد', 'ولا', 'نفس', 'ولم', 'حيث', 'هناك',
    'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'وله', 'عند',
    'أما', 'هذه', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه', 'وهي',
    'وهو', 'تلك', 'كلم', 'لكن', 'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا',
    'اول', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
    'لازم', 'حاجة', 'علي', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'
    }

# Page-level Streamlit settings: browser tab title, icon and wide layout.
st.set_page_config(
    page_title="Arabic Poem Analysis",
    page_icon="📚",
    layout="wide"
)

@st.cache_resource
def load_models():
    """Load the embedding and sentiment models once and cache them.

    Returns:
        tuple: ``(tokenizer, bert_model, emotion_classifier)``.

        * ``tokenizer`` / ``bert_model`` -- the AraBERT tokenizer and encoder
          used together to compute poem embeddings.
        * ``emotion_classifier`` -- a CPU ``sentiment-analysis`` pipeline built
          on the CAMeLBERT sentiment checkpoint with its own tokenizer.
    """
    embedding_checkpoint = "aubmindlab/bert-base-arabertv2"
    sentiment_checkpoint = "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment"

    # The tokenizer handed back to callers is fed straight into ``bert_model``,
    # so it must come from the same checkpoint; token ids produced with another
    # model's vocabulary would index the wrong embedding rows.
    tokenizer = AutoTokenizer.from_pretrained(embedding_checkpoint, use_fast=True)

    # The sentiment pipeline keeps the tokenizer that matches its own model.
    sentiment_tokenizer = AutoTokenizer.from_pretrained(
        sentiment_checkpoint,
        use_fast=True
    )

    # ``torchscript=True`` is deliberately not passed: it makes the models
    # return plain tuples, which breaks both ``outputs.last_hidden_state`` in
    # the embedding code and the pipeline's ``["logits"]`` lookup.
    bert_model = AutoModel.from_pretrained(
        embedding_checkpoint,
        low_cpu_mem_usage=True
    )

    emotion_model = AutoModelForSequenceClassification.from_pretrained(
        sentiment_checkpoint,
        low_cpu_mem_usage=True
    )

    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=sentiment_tokenizer,
        batch_size=32,
        device=-1  # force CPU execution
    )

    return tokenizer, bert_model, emotion_classifier

# + Added new batch processing function
def process_texts_in_batches(texts, batch_size=32, classifier=None):
    """Classify ``texts`` in consecutive batches and return the flat results.

    Args:
        texts: Sliceable sequence of strings to classify.
        batch_size: Number of texts handed to the classifier per call.
        classifier: Callable invoked as
            ``classifier(batch, truncation=True, max_length=512)`` and
            returning one result per text. Defaults to the module-level
            ``emotion_classifier``.

    Returns:
        list: Classifier results in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if classifier is None:
        # Resolved at call time so the function can be defined before the
        # module-level pipeline has been created.
        classifier = emotion_classifier

    results = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        results.extend(classifier(batch, truncation=True, max_length=512))
    return results

# + Added caching decorator for embeddings
@st.cache_data
def _cached_embedding(text, _tokenizer, _model):
    """Compute the embedding for ``text``; cached by Streamlit on ``text`` only.

    The leading underscores tell Streamlit not to hash the tokenizer and model,
    which are not hashable by its cache.
    """
    return get_embedding_for_text(text, _tokenizer, _model)


def get_cached_embeddings(text, tokenizer, model):
    """Return the embedding for ``text``, reusing a cached value when available.

    Args:
        text: Text to embed; this is the only value that keys the cache.
        tokenizer: Tokenizer passed through to ``get_embedding_for_text``.
        model: Encoder passed through to ``get_embedding_for_text``.

    Returns:
        Whatever ``get_embedding_for_text`` returns for these arguments.

    Note:
        Because the tokenizer and model are excluded from the cache key, a
        cached embedding is reused even if a different model is passed later.
    """
    return _cached_embedding(text, tokenizer, model)

def create_theme_map(summaries, topic_model):
    """Build a Folium map with one marker per country listing its top themes.

    Args:
        summaries: Iterable of dicts with a ``'country'`` name and a
            ``'top_topics'`` list of ``{'topic', 'count'}`` dicts.
        topic_model: Unused; kept so existing callers keep working.

    Returns:
        folium.Map with a circle marker for every country that could be
        located, or ``None`` if the map itself could not be created. Countries
        that cannot be located are reported with ``st.warning`` and skipped.
    """
    import html  # local import: only needed to escape popup text

    # country_converter only translates between naming/classification schemes,
    # so marker positions come from this table of approximate country
    # centroids (latitude, longitude) keyed by ISO 3166-1 alpha-2 code.
    arab_country_centroids = {
        'DZ': (28.0, 1.7), 'BH': (26.0, 50.55), 'KM': (-11.9, 43.9),
        'DJ': (11.8, 42.6), 'EG': (26.8, 30.8), 'IQ': (33.2, 43.7),
        'JO': (30.6, 36.2), 'KW': (29.3, 47.5), 'LB': (33.9, 35.9),
        'LY': (26.3, 17.2), 'MR': (21.0, -10.9), 'MA': (31.8, -7.1),
        'OM': (21.5, 55.9), 'PS': (31.9, 35.2), 'QA': (25.4, 51.2),
        'SA': (23.9, 45.1), 'SO': (5.2, 46.2), 'SD': (12.9, 30.2),
        'SY': (34.8, 39.0), 'TN': (33.9, 9.5), 'AE': (23.4, 53.8),
        'YE': (15.6, 48.5),
    }

    try:
        # Base map centered on the Arab world.
        m = folium.Map(location=[25, 45], zoom_start=4)
        cc = coco.CountryConverter()

        for summary in summaries:
            country = summary['country']
            try:
                iso2 = cc.convert(names=country, to='ISO2')
                # An ambiguous name may yield several codes; use the first.
                if isinstance(iso2, (list, tuple)):
                    iso2 = iso2[0] if iso2 else None

                coordinates = arab_country_centroids.get(iso2)
                if coordinates is None:
                    st.warning(f"Could not process {country}: no map coordinates available")
                    continue

                # Country names and topic words originate from the uploaded
                # file, so escape them before embedding them in popup HTML.
                theme_lines = '<br>'.join(
                    f"• {html.escape(str(topic['topic']))}: {html.escape(str(topic['count']))}"
                    for topic in summary['top_topics'][:5]
                )
                popup_content = f"""
                    <h4>{html.escape(str(country))}</h4>
                    <b>Top Themes:</b><br>
                    {theme_lines}
                """

                folium.CircleMarker(
                    location=list(coordinates),
                    radius=20,
                    popup=folium.Popup(popup_content, max_width=300),
                    color='red',
                    fill=True,
                    fill_opacity=0.7
                ).add_to(m)
            except Exception as e:
                # Best effort: one bad country must not prevent the others
                # from being drawn.
                st.warning(f"Could not process {country}: {str(e)}")
                continue

        return m
    except Exception as e:
        st.error(f"Error creating map: {str(e)}")
        return None

def split_text(text, max_length=512):
    """Break ``text`` into space-joined chunks of at most ``max_length`` words.

    Words are the whitespace-separated pieces of ``text``; each one counts as a
    single unit, so the limit is a word count rather than a model token count.
    A limit below one still yields one word per chunk. Text without any words
    produces an empty list.
    """
    tokens = text.split()
    step = max(max_length, 1)
    return [
        ' '.join(tokens[start:start + step])
        for start in range(0, len(tokens), step)
    ]

def create_arabic_wordcloud(text, title):
    """Render ``text`` as a word cloud and return the Matplotlib figure.

    The cloud uses the bundled Arabic font and ignores ARABIC_STOP_WORDS;
    ``title`` is drawn above the image.
    """
    cloud_options = {
        'width': 1200,
        'height': 600,
        'background_color': 'white',
        'font_path': font_path,
        'max_words': 200,
        'stopwords': ARABIC_STOP_WORDS,
    }
    cloud = WordCloud(**cloud_options).generate(text)

    figure, axes = plt.subplots(figsize=(15, 8))
    axes.imshow(cloud, interpolation='bilinear')
    axes.axis('off')
    axes.set_title(title, fontsize=16, pad=20)
    return figure

def clean_arabic_text(text):
    """Return ``text`` without stop words and single-character tokens.

    The text is split on whitespace and the surviving words are re-joined with
    single spaces.
    """
    kept = []
    for token in text.split():
        if len(token) <= 1 or token in ARABIC_STOP_WORDS:
            continue
        kept.append(token)
    return ' '.join(kept)

def classify_emotion(text, classifier):
    """Return the dominant sentiment label for ``text``.

    The text is split on word boundaries into chunks that fit the model's
    512-token window, every chunk is classified, and the label with the
    highest score averaged over the classified chunks is returned.

    Args:
        text: Text to classify.
        classifier: Callable pipeline exposing a ``tokenizer`` attribute; it is
            invoked as ``classifier(chunk, truncation=True, max_length=512)``.

    Returns:
        str: The winning label, or ``"LABEL_2"`` when nothing could be
        classified or an unexpected error occurred.
    """
    max_tokens = 512
    # Leave room for the two special tokens added around each sequence
    # (assumes a BERT-style tokenizer with [CLS]/[SEP]).
    content_budget = max_tokens - 2

    try:
        tokenizer = classifier.tokenizer

        chunks = []
        current_chunk = []
        current_length = 0
        for word in text.split():
            # Count only the word's own sub-tokens; counting the special tokens
            # for every word would triple-count short words.
            word_tokens = len(tokenizer.encode(word, add_special_tokens=False))
            if current_chunk and current_length + word_tokens > content_budget:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0
            current_chunk.append(word)
            current_length += word_tokens

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        if not chunks:
            chunks = [text]

        label_totals = {}
        classified_chunks = 0
        for chunk in chunks:
            try:
                result = classifier(chunk, truncation=True, max_length=max_tokens)

                # Normalise the pipeline output to a flat list of
                # {'label', 'score'} dicts: it may be a single dict, a list of
                # dicts, or a list wrapping such a list.
                entries = result
                if isinstance(entries, dict):
                    entries = [entries]
                elif entries and isinstance(entries[0], (list, tuple)):
                    entries = entries[0]

                for entry in entries:
                    label = entry['label']
                    label_totals[label] = label_totals.get(label, 0) + entry['score']
                classified_chunks += 1
            except Exception as chunk_error:
                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
                continue

        if classified_chunks and label_totals:
            avg_scores = {
                label: total / classified_chunks
                for label, total in label_totals.items()
            }
            return max(avg_scores.items(), key=lambda item: item[1])[0]

        return "LABEL_2"

    except Exception as e:
        st.warning(f"Error in emotion classification: {str(e)}")
        return "LABEL_2"

def get_embedding_for_text(text, tokenizer, model):
    """Return one embedding vector for ``text``.

    The text is split with ``split_text``; each chunk is encoded and the vector
    at the first token position of the last hidden state is taken as the chunk
    embedding. Chunk embeddings are averaged, weighted by chunk word count.

    Args:
        text: Text to embed.
        tokenizer: Tokenizer callable returning PyTorch tensors.
        model: Encoder exposing ``device`` and ``config.hidden_size``.

    Returns:
        numpy.ndarray: The weighted average embedding, or a zero vector of
        length ``model.config.hidden_size`` when no chunk could be embedded.
    """
    chunk_embeddings = []
    chunk_weights = []

    for chunk in split_text(text):
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)

            # Position 0 holds the last hidden state whether the model returns
            # an output object or a plain tuple.
            last_hidden_state = outputs[0]
            embedding = last_hidden_state[:, 0, :].cpu().numpy()
            chunk_embeddings.append(embedding[0])
            # Record the weight only for chunks that actually produced an
            # embedding so both lists always have the same length.
            chunk_weights.append(len(chunk.split()))
        except Exception as e:
            st.warning(f"Error processing chunk: {str(e)}")
            continue

    if chunk_embeddings:
        weights = np.array(chunk_weights)
        weights = weights / weights.sum()
        return np.average(chunk_embeddings, axis=0, weights=weights)
    return np.zeros(model.config.hidden_size)

def format_topics(topic_model, topic_counts):
    """Turn ``(topic_id, count)`` pairs into display dicts.

    Topic ``-1`` is labelled "Miscellaneous"; every other topic is labelled
    with its first five words from ``topic_model.get_topic`` joined by " | ".
    Each result has the keys ``'topic'`` and ``'count'``.
    """
    def describe(topic_id):
        if topic_id == -1:
            return "Miscellaneous"
        leading_terms = topic_model.get_topic(topic_id)[:5]
        return " | ".join([term for term, _ in leading_terms])

    return [
        {'topic': describe(topic_id), 'count': occurrences}
        for topic_id, occurrences in topic_counts
    ]

def format_emotions(emotion_counts):
    """Turn ``(label, count)`` pairs into display dicts.

    The generic labels ``LABEL_0``/``LABEL_1``/``LABEL_2`` are shown as
    Negative/Positive/Neutral; any other label is passed through unchanged.
    Each result has the keys ``'emotion'`` and ``'count'``.
    """
    display_names = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Positive',
        'LABEL_2': 'Neutral'
    }
    return [
        {'emotion': display_names.get(raw_label, raw_label), 'count': occurrences}
        for raw_label, occurrences in emotion_counts
    ]

def get_optimized_topic_model(bert_model):
    """Create the BERTopic instance used by the app.

    ``bert_model`` is passed as the embedding model; the remaining settings
    disable probability calculation and verbose output, restrict terms to
    unigrams, require at least five documents per topic, let BERTopic choose
    the number of topics and turn on its low-memory mode.
    """
    settings = {
        'embedding_model': bert_model,
        'language': "arabic",
        'calculate_probabilities': False,
        'verbose': False,
        'n_gram_range': (1, 1),
        'min_topic_size': 5,
        'nr_topics': "auto",
        'low_memory': True,
    }
    return BERTopic(**settings)

    
def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
    """Compute per-country topic and emotion summaries for the poems in ``df``.

    Args:
        df: DataFrame with ``'country'`` and ``'poem'`` columns.
        bert_tokenizer: Tokenizer passed to ``get_embedding_for_text``.
        bert_model: Encoder passed to ``get_embedding_for_text`` and to the
            topic model factory.
        emotion_classifier: Pipeline passed to ``classify_emotion``.
        top_n: Maximum number of topics/emotions reported per country.
        topic_strategy: ``"Manual"`` applies ``n_topics`` as the topic model's
            target topic count; any other value keeps the factory default.
        n_topics: Target number of topics for the manual strategy.
        min_topic_size: Minimum number of usable poems a country needs before
            topics are generated for it.

    Returns:
        tuple: ``(summaries, topic_model)``. ``summaries`` is a list of dicts
        with ``'country'``, ``'total_poems'``, ``'top_topics'`` and
        ``'top_emotions'``. The same ``topic_model`` is refitted for every
        country, so after the call it reflects the last country fitted.
    """
    summaries = []

    topic_model = get_optimized_topic_model(bert_model)
    topic_model.vectorizer_model = CountVectorizer(
        stop_words=list(ARABIC_STOP_WORDS),
        min_df=1,
        max_df=1.0
    )
    # Honour the user's manual topic count instead of silently ignoring it.
    if topic_strategy == "Manual" and n_topics:
        topic_model.nr_topics = n_topics

    for country, group in df.groupby('country'):
        progress_text = f"Processing poems for {country}..."
        progress_bar = st.progress(0, text=progress_text)

        # str() guards against non-text cells (e.g. numbers) in the poem column.
        candidate_texts = [clean_arabic_text(str(poem)) for poem in group['poem'].dropna()]

        # Keep each text paired with its embedding: only texts whose embedding
        # succeeded are retained, so the two lists stay aligned even when a
        # failure happens in the middle of the group.
        texts = []
        embeddings = []
        for i, text in enumerate(candidate_texts):
            try:
                embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
                if embedding is not None and not np.isnan(embedding).any():
                    texts.append(text)
                    embeddings.append(embedding)
                else:
                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
                    continue
            except Exception as e:
                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
                continue
            progress = (i + 1) / len(candidate_texts) * 0.4
            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(candidate_texts)} poems...")

        # Check the size requirement before the emotion pass so no work is
        # spent on countries that will be skipped anyway.
        if len(texts) < min_topic_size:
            st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
            continue

        embeddings = np.array(embeddings)

        all_emotions = []
        for i, text in enumerate(texts):
            all_emotions.append(classify_emotion(text, emotion_classifier))
            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")

        try:
            topics, probs = topic_model.fit_transform(texts, embeddings)

            topic_counts = Counter(topics)

            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))

            summaries.append({
                'country': country,
                'total_poems': len(texts),
                'top_topics': top_topics,
                'top_emotions': top_emotions
            })
            progress_bar.progress(1.0, text="Processing complete!")

        except Exception as e:
            # Best effort: a failure for one country must not abort the rest.
            st.warning(f"Could not generate topics for {country}: {str(e)}")
            continue

    return summaries, topic_model

# Load the (cached) models up front; without them nothing else on the page can
# work, so report the failure and halt this script run.
try:
    bert_tokenizer, bert_model, emotion_classifier = load_models()
    st.success("Models loaded successfully!")
except Exception as e:
    st.error(f"Error loading models: {str(e)}")
    st.stop()

# Main app interface
st.title("📚 Arabic Poem Analysis")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")

# Remains None until the user has picked a CSV or Excel file.
uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])

if uploaded_file is not None:
    try:
        # Pick the pandas reader from the uploaded file's extension.
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)

        required_columns = ['country', 'poem']
        if not all(col in df.columns for col in required_columns):
            st.error("File must contain 'country' and 'poem' columns.")
            st.stop()

        # Drop incomplete rows first, then coerce both columns to text so that
        # numeric or otherwise non-string cells cannot break the string
        # operations performed on them below.
        df = df.dropna(subset=['country', 'poem']).copy()
        df['country'] = df['country'].astype(str).str.strip()
        df['poem'] = df['poem'].astype(str)

        st.subheader("Topic Modeling Settings")
        col1, col2 = st.columns(2)

        with col1:
            topic_strategy = st.radio(
                "Topic Number Strategy",
                ["Auto", "Manual"],
                help="Choose whether to let the model determine the optimal number of topics or set it manually"
            )

            # ``n_topics`` only exists in manual mode; it is read further down
            # only when the strategy is "Manual".
            if topic_strategy == "Manual":
                n_documents = len(df)
                max_topics = 500
                min_topics = 5
                default_topics = 20

                n_topics = st.slider(
                    "Number of Topics",
                    min_value=min_topics,
                    max_value=max_topics,
                    value=default_topics,
                    help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
                )

                st.info(f"""
                    💡 For your dataset of {n_documents:,} documents:
                    - Available topic range: {min_topics}-{max_topics}
                    - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
                    """)

        with col2:
            top_n = st.number_input(
                "Number of top topics/emotions to display:", 
                min_value=1, 
                max_value=100, 
                value=10
            )

        if st.button("Process Data"):
            with st.spinner("Processing your data..."):
                summaries, topic_model = process_and_summarize(
                    df,
                    bert_tokenizer,
                    bert_model,
                    emotion_classifier,
                    top_n=top_n,
                    topic_strategy=topic_strategy,
                    n_topics=n_topics if topic_strategy == "Manual" else None,
                    min_topic_size=3
                )

                if summaries:
                    st.success("Analysis complete!")

                    tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Theme Map"])

                    with tab1:
                        for summary in summaries:
                            with st.expander(f"📍 {summary['country']} ({summary['total_poems']} poems)"):
                                col1, col2 = st.columns(2)

                                with col1:
                                    st.subheader("Top Topics")
                                    for topic in summary['top_topics']:
                                        st.write(f"• {topic['topic']}: {topic['count']} poems")

                                with col2:
                                    st.subheader("Emotions")
                                    for emotion in summary['top_emotions']:
                                        st.write(f"• {emotion['emotion']}: {emotion['count']} poems")

                                st.subheader("Word Cloud Visualization")
                                country_poems = df[df['country'] == summary['country']]['poem']
                                combined_text = ' '.join(country_poems)
                                wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
                                st.pyplot(wordcloud_fig)
                                # Release the figure once rendered; otherwise
                                # every rerun leaves one open figure per
                                # country behind in pyplot's registry.
                                plt.close(wordcloud_fig)

                    with tab2:
                        st.subheader("Global Topic Distribution")
                        topic_info = topic_model.get_topic_info()
                        for _, row in topic_info.iterrows():
                            if row['Topic'] == -1:
                                topic_name = "Miscellaneous"
                            else:
                                words = topic_model.get_topic(row['Topic'])
                                topic_name = " | ".join([word for word, _ in words[:5]])
                            st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
                    with tab3:
                        st.subheader("Thematic Distribution Map")
                        theme_map = create_theme_map(summaries, topic_model)
                        # create_theme_map returns None when the map could not
                        # be built; rendering None would raise.
                        if theme_map is not None:
                            folium_static(theme_map)
                        else:
                            st.info("The theme map could not be generated for this dataset.")
    except Exception as e:
        st.error(f"Error processing file: {str(e)}")

else:
    st.info("👆 Upload a file to get started!")

    # Show a tiny sample of the expected two-column layout.
    st.write("### Expected File Format:")
    example_df = pd.DataFrame({
        'country': ['Egypt', 'Palestine'],
        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
    })
    st.dataframe(example_df)