import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import torch
import numpy as np
from collections import Counter
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pkg_resources
import folium
from folium.plugins import HeatMap
import country_converter as coco
from streamlit_folium import folium_static

current_dir = os.path.dirname(os.path.abspath(__file__))
font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")

ARABIC_STOP_WORDS = {
    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم', 'أو', 'و',
    'ف', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا', 'هذه', 'ذلك', 'تلك', 'هؤلاء', 'هم',
    'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم', 'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل',
    'بعض', 'غير', 'حول', 'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى',
    'كيف', 'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي', 'الان', 'بين', 'فوق', 'تحت',
    'امام', 'خلف', 'حين', 'قبل', 'بعد', 'أن', 'له', 'قوة', 'كما', 'لها', 'منذ', 'وقد',
    'ولا', 'نفس', 'ولم', 'حيث', 'هناك', 'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل',
    'وله', 'أما', 'وأن', 'وكل', 'وقال', 'لدي', 'وكان', 'فيه', 'وهي', 'وهو', 'كلم', 'لكن',
    'وفي', 'وقف', 'ولقد', 'ومن', 'وهذا', 'اول', 'انها', 'جميع', 'ايضا', 'لازم', 'حاجة', 'علي',
    'يجب', 'صار', 'صارت', 'ضد'
}

st.set_page_config(
    page_title="Arabic Poem Analysis",
    page_icon="📚",
    layout="wide"
)


@st.cache_resource
def load_models():
    """Load and cache the models."""
    # + Added use_fast=True for faster tokenization
    tokenizer = AutoTokenizer.from_pretrained(
        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
        use_fast=True
    )
    # + Added torchscript and low_cpu_mem_usage
    bert_model = AutoModel.from_pretrained(
        "aubmindlab/bert-base-arabertv2",
        torchscript=True,
        low_cpu_mem_usage=True
    )
    # + Added optimizations for emotion model
    emotion_model = AutoModelForSequenceClassification.from_pretrained(
        "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment",
        torchscript=True,
        low_cpu_mem_usage=True
    )
    # ~ Changed pipeline configuration to use batching
    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=tokenizer,
        batch_size=32,
        device=-1  # + Added to force CPU usage
    )
    return tokenizer, bert_model, emotion_classifier


# + Added new batch processing function
def process_texts_in_batches(texts, batch_size=32):
    """Process texts in batches for better CPU utilization.

    Relies on the module-level `emotion_classifier` created by `load_models()`.
    """
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    results = []
    for batch in batches:
        batch_results = emotion_classifier(batch, truncation=True, max_length=512)
        results.extend(batch_results)
    return results


# + Added caching decorator for embeddings
@st.cache_data
def get_cached_embeddings(text, _tokenizer, _model):
    """Cache embeddings to avoid recomputation.

    The tokenizer and model arguments are underscore-prefixed so Streamlit
    does not try to hash them.
    """
    return get_embedding_for_text(text, _tokenizer, _model)


def create_theme_map(summaries, topic_model):
    """Create an interactive map showing theme distributions across countries."""
    try:
        # Create a base map centered on the Arab world
        m = folium.Map(location=[25, 45], zoom_start=4)

        # Convert country names to coordinates
        cc = coco.CountryConverter()

        for summary in summaries:
            try:
                # Get country coordinates
                country_iso = cc.convert(names=[summary['country']], to='ISO2')
                country_data = cc.convert(names=[summary['country']], to='name_short')
                # Create popup content with theme information
                popup_content = f"""
                <b>{summary['country']}</b><br>
                <b>Top Themes:</b><br>
                {'<br>'.join([f"• {topic['topic']}: {topic['count']}" for topic in summary['top_topics'][:5]])}
                """

                # Add marker for each country
                # Note: coordinate lookup via country_converter may fail for some
                # countries; failures are caught below and reported as warnings.
                folium.CircleMarker(
                    location=[cc.convert(country_iso, to='latitude')[0],
                              cc.convert(country_iso, to='longitude')[0]],
                    radius=20,
                    popup=folium.Popup(popup_content, max_width=300),
                    color='red',
                    fill=True,
                    fill_opacity=0.7
                ).add_to(m)
            except Exception as e:
                st.warning(f"Could not process {summary['country']}: {str(e)}")
                continue

        return m
    except Exception as e:
        st.error(f"Error creating map: {str(e)}")
        return None


def split_text(text, max_length=512):
    """Split text into chunks of maximum length while preserving word boundaries."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        word_length = len(word.split())
        if current_length + word_length > max_length:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


def create_arabic_wordcloud(text, title):
    wordcloud = WordCloud(
        width=1200,
        height=600,
        background_color='white',
        font_path=font_path,
        max_words=200,
        stopwords=ARABIC_STOP_WORDS
    ).generate(text)

    fig, ax = plt.subplots(figsize=(15, 8))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16, pad=20)
    return fig


def clean_arabic_text(text):
    """Clean Arabic text by removing stop words and normalizing."""
    words = text.split()
    cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
    return ' '.join(cleaned_words)


def classify_emotion(text, classifier):
    """Classify emotion for complete text with precise token handling."""
    # Ensure text is properly formatted
    if not text or not isinstance(text, str):
        return "LABEL_2"

    # Split into manageable chunks
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    # Create proper-sized chunks
    for word in words:
        word_tokens = len(classifier.tokenizer.encode(word))
        if current_length + word_tokens > 512:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_tokens
        else:
            current_chunk.append(word)
            current_length += word_tokens

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    if not chunks:
        return "LABEL_2"

    # Process chunks with proper output handling
    all_scores = []
    for chunk in chunks:
        # Direct classification with proper output structure
        result = classifier(chunk, return_all_scores=True)[0]
        all_scores.append(result)

    # Calculate final emotion by averaging scores across chunks
    label_scores = {}
    count = len(all_scores)

    for scores in all_scores:
        for score_dict in scores:
            label = score_dict['label']
            if label not in label_scores:
                label_scores[label] = 0
            label_scores[label] += score_dict['score']

    avg_scores = {label: score / count for label, score in label_scores.items()}
    final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
    return final_emotion


def get_embedding_for_text(text, tokenizer, model):
    """Get embedding for complete text."""
    # Get the raw tokens first
    tokens = tokenizer.tokenize(text)

    # Process in chunks of exactly 510 tokens (512 - 2 special tokens)
    chunk_size = 510
    chunks = []

    for i in range(0, len(tokens), chunk_size):
        chunk = tokens[i:i + chunk_size]
        token_ids = tokenizer.convert_tokens_to_ids(chunk)
        # Add special tokens manually
        token_ids = [tokenizer.cls_token_id] + token_ids + [tokenizer.sep_token_id]
        # Create attention mask
        attention_mask = [1] * len(token_ids)

        # Pad if needed
        padding_length = 512 - len(token_ids)
        if padding_length > 0:
            token_ids = token_ids + ([tokenizer.pad_token_id] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)

        chunks.append({
            'input_ids': torch.tensor([token_ids]),
            'attention_mask': torch.tensor([attention_mask])
        })

    # Get embeddings: use the [CLS] vector of each chunk and average across chunks
    chunk_embeddings = []
    for chunk in chunks:
        chunk = {k: v.to(model.device) for k, v in chunk.items()}
        with torch.no_grad():
            outputs = model(**chunk)[0]
        embedding = outputs[:, 0, :].cpu().numpy()
        chunk_embeddings.append(embedding[0])

    if chunk_embeddings:
        return np.mean(chunk_embeddings, axis=0)
    return np.zeros(model.config.hidden_size)


def format_topics(topic_model, topic_counts):
    """Format topics for display."""
    formatted_topics = []
    for topic_num, count in topic_counts:
        if topic_num == -1:
            topic_label = "Miscellaneous"
        else:
            words = topic_model.get_topic(topic_num)
            topic_label = " | ".join([word for word, _ in words[:5]])
        formatted_topics.append({
            'topic': topic_label,
            'count': count
        })
    return formatted_topics


def format_emotions(emotion_counts):
    """Format emotions for display."""
    EMOTION_LABELS = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Positive',
        'LABEL_2': 'Neutral'
    }
    formatted_emotions = []
    for label, count in emotion_counts:
        emotion = EMOTION_LABELS.get(label, label)
        formatted_emotions.append({
            'emotion': emotion,
            'count': count
        })
    return formatted_emotions


def get_optimized_topic_model(bert_model, nr_topics="auto"):
    """Configure BERTopic for better CPU performance."""
    return BERTopic(
        embedding_model=bert_model,
        language="arabic",
        calculate_probabilities=False,
        verbose=False,
        n_gram_range=(1, 1),
        min_topic_size=5,
        nr_topics=nr_topics,
        low_memory=True
    )


def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier,
                          top_n=50, topic_strategy="Auto", n_topics=None,
                          min_topic_size=3):
    """Process the data and generate summaries with flexible topic configuration."""
    summaries = []
    # Pass the manually selected topic count through to BERTopic when requested
    topic_model = get_optimized_topic_model(
        bert_model,
        nr_topics=n_topics if topic_strategy == "Manual" and n_topics else "auto"
    )
    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS), min_df=1, max_df=1.0)
    topic_model.vectorizer_model = vectorizer

    for country, group in df.groupby('country'):
        progress_text = f"Processing poems for {country}..."
        progress_bar = st.progress(0, text=progress_text)

        texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
        all_emotions = []

        # Get embeddings with proper output handling
        embeddings = []
        for i, text in enumerate(texts):
            embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
            embeddings.append(embedding)
            progress = (i + 1) / len(texts) * 0.4
            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")

        embeddings = np.array(embeddings)

        # Process emotions with the chunk-aware helper so poems longer than 512 tokens are handled
        for i, text in enumerate(texts):
            emotion = classify_emotion(text, emotion_classifier)
            all_emotions.append(emotion)
            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")

        if len(texts) < min_topic_size:
            st.info(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
            continue

        topics, _ = topic_model.fit_transform(texts, embeddings)

        topic_counts = Counter(topics)
        top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
        top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))

        summaries.append({
            'country': country,
            'total_poems': len(texts),
            'top_topics': top_topics,
            'top_emotions': top_emotions
        })
        progress_bar.progress(1.0, text="Processing complete!")

    return summaries, topic_model


try:
    bert_tokenizer, bert_model, emotion_classifier = load_models()
    st.success("Models loaded successfully!")
except Exception as e:
    st.error(f"Error loading models: {str(e)}")
    st.stop()

# Main app interface
st.title("📚 Arabic Poem Analysis")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")

uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])

if uploaded_file is not None:
    try:
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)

        required_columns = ['country', 'poem']
        if not all(col in df.columns for col in required_columns):
            st.error("File must contain 'country' and 'poem' columns.")
            st.stop()

        df['country'] = df['country'].str.strip()
        df = df.dropna(subset=['country', 'poem'])

        st.subheader("Topic Modeling Settings")
        col1, col2 = st.columns(2)

        with col1:
            topic_strategy = st.radio(
                "Topic Number Strategy",
                ["Auto", "Manual"],
                help="Choose whether to let the model determine the optimal number of topics or set it manually"
            )

            if topic_strategy == "Manual":
                n_documents = len(df)
                max_topics = 500
                min_topics = 5
                default_topics = 20

                n_topics = st.slider(
                    "Number of Topics",
                    min_value=min_topics,
                    max_value=max_topics,
                    value=default_topics,
                    help=f"Select the desired number of topics (maximum {max_topics})"
                )

                st.info(f"""
                💡 For your dataset of {n_documents:,} documents:
                - Available topic range: {min_topics}-{max_topics}
                - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
                """)

        with col2:
            top_n = st.number_input(
                "Number of top topics/emotions to display:",
                min_value=1,
                max_value=100,
                value=10
            )

        if st.button("Process Data"):
            with st.spinner("Processing your data..."):
                summaries, topic_model = process_and_summarize(
                    df,
                    bert_tokenizer,
                    bert_model,
                    emotion_classifier,
                    top_n=top_n,
                    topic_strategy=topic_strategy,
                    n_topics=n_topics if topic_strategy == "Manual" else None,
                    min_topic_size=3
                )

            if summaries:
                st.success("Analysis complete!")
                tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Theme Map"])

                with tab1:
                    for summary in summaries:
                        with st.expander(f"📍 {summary['country']} ({summary['total_poems']} poems)"):
                            col1, col2 = st.columns(2)

                            with col1:
                                st.subheader("Top Topics")
                                for topic in summary['top_topics']:
                                    st.write(f"• {topic['topic']}: {topic['count']} poems")

                            with col2:
                                st.subheader("Emotions")
                                for emotion in summary['top_emotions']:
                                    st.write(f"• {emotion['emotion']}: {emotion['count']} poems")

                            st.subheader("Word Cloud Visualization")
                            country_poems = df[df['country'] == summary['country']]['poem']
                            combined_text = ' '.join(country_poems)
                            wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
                            st.pyplot(wordcloud_fig)

                with tab2:
                    st.subheader("Global Topic Distribution")
                    topic_info = topic_model.get_topic_info()
                    for _, row in topic_info.iterrows():
                        if row['Topic'] == -1:
                            topic_name = "Miscellaneous"
                        else:
                            words = topic_model.get_topic(row['Topic'])
                            topic_name = " | ".join([word for word, _ in words[:5]])
                        st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")

                with tab3:
                    st.subheader("Thematic Distribution Map")
                    theme_map = create_theme_map(summaries, topic_model)
                    # create_theme_map returns None on failure, so only render a valid map
                    if theme_map is not None:
                        folium_static(theme_map)

    except Exception as e:
        st.error(f"Error processing file: {str(e)}")
else:
    st.info("👆 Upload a file to get started!")
    st.write("### Expected File Format:")
    example_df = pd.DataFrame({
        'country': ['Egypt', 'Palestine'],
        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
    })
    st.dataframe(example_df)
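
# Usage sketch (an assumption, not part of the original script): if this file is
# saved as app.py and the imported libraries are installed (streamlit, transformers,
# torch, bertopic, scikit-learn, wordcloud, matplotlib, folium, streamlit-folium,
# country_converter, pandas, plus openpyxl for .xlsx uploads), the app is launched with:
#
#     streamlit run app.py
#
# The CAMeL-Lab and aubmindlab model weights are downloaded from the Hugging Face Hub
# on first run, so an internet connection is needed the first time the app starts.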