import streamlit as st
import pandas as pd
import numpy as np
import torch
import networkx as nx
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import savgol_filter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud
import spacy

st.set_page_config(page_title="Advanced Political Speech Analysis", page_icon="🗣️", layout="wide")

# Advanced NLP libraries
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
    AutoModelForTokenClassification,
    RobertaTokenizer,
    RobertaForSequenceClassification
)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textstat import flesch_reading_ease, flesch_kincaid_grade

# Download necessary NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Load spaCy model (requires separate installation)
try:
    nlp = spacy.load('en_core_web_lg')
except OSError:
    st.error(
        "Please install spaCy and the en_core_web_lg model:\n"
        "pip install spacy\n"
        "python -m spacy download en_core_web_lg"
    )
    st.stop()  # the rest of the app needs `nlp`, so stop here if the model is missing

# Constants and configurations
MORAL_FOUNDATIONS = {
    'care': 'Care/Harm',
    'fairness': 'Fairness/Cheating',
    'loyalty': 'Loyalty/Betrayal',
    'authority': 'Authority/Subversion',
    'sanctity': 'Sanctity/Degradation'
}

RHETORICAL_DEVICES = {
    'analogy': ['like', 'as', 'similar to'],
    'repetition': ['repetitive', 'recurring'],
    'metaphor': ['as if', 'like', 'represents'],
    'hyperbole': ['always', 'never', 'absolute'],
    'rhetorical_question': ['?']
}


class SpeechAnalyzer:
    def __init__(self):
        # Load the moral foundations classifier
        self.moral_model_path = "MMADS/MoralFoundationsClassifier"
        self.moral_tokenizer = RobertaTokenizer.from_pretrained(self.moral_model_path)
        self.moral_model = RobertaForSequenceClassification.from_pretrained(self.moral_model_path)
        # Define label names directly
        self.label_names = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']

        # Other pipelines
        self.sentiment_pipeline = pipeline("sentiment-analysis")
        self.ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        self.ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        self.ner_pipeline = pipeline("ner", model=self.ner_model, tokenizer=self.ner_tokenizer)

    def split_text(self, text, max_length=256, overlap=50):
        """Split long text into overlapping segments of roughly `max_length` words."""
        words = text.split()
        segments = []
        current_segment = []
        current_length = 0

        for word in words:
            if current_length + 1 > max_length:
                segments.append(' '.join(current_segment))
                # Carry the last `overlap` words into the next segment for context
                current_segment = current_segment[-overlap:] + [word]
                current_length = len(current_segment)
            else:
                current_segment.append(word)
                current_length += 1

        if current_segment:
            segments.append(' '.join(current_segment))
        return segments

    def analyze_moral_foundations(self, text):
        """Analyze moral foundations using the RoBERTa-based classifier."""
        segments = self.split_text(text)
        foundation_scores = {
            'care': [], 'fairness': [], 'loyalty': [], 'authority': [], 'sanctity': []
        }

        for segment in segments:
            inputs = self.moral_tokenizer(segment, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():
                outputs = self.moral_model(**inputs)
            probabilities = torch.softmax(outputs.logits, dim=1)

            for idx, label in enumerate(self.label_names):
                foundation = label.lower()
                if foundation in foundation_scores:
                    foundation_scores[foundation].append(probabilities[0][idx].item())

        # Average the scores across segments
        aggregated_scores = {
            foundation: np.mean(scores) if scores else 0.0
            for foundation, scores in foundation_scores.items()
        }
        return aggregated_scores

    def analyze_emotional_trajectory(self, text, window_size=5):
        """Enhanced emotional trajectory analysis using sentence-level processing."""
        segments = self.split_text(text, max_length=256)
        sentiment_scores = []

        for segment in segments:
            # Split the segment into sentences with NLTK
            sentences = nltk.sent_tokenize(segment)

            # Process sentences in batches
            batch_size = 64
            segment_scores = []
            for i in range(0, len(sentences), batch_size):
                batch = sentences[i:i + batch_size]
                results = self.sentiment_pipeline(batch)
                # Use signed scores so negative sentences pull the average down
                batch_scores = [
                    result['score'] if result['label'] == 'POSITIVE' else -result['score']
                    for result in results
                ]
                segment_scores.extend(batch_scores)

            avg_score = np.mean(segment_scores) if segment_scores else 0
            sentiment_scores.append(avg_score)

        # Normalize scores to the [-1, 1] range
        if sentiment_scores:
            min_score = min(sentiment_scores)
            max_score = max(sentiment_scores)
            score_range = max_score - min_score
            if score_range > 0:
                sentiment_scores = [(s - min_score) / score_range * 2 - 1 for s in sentiment_scores]

        return sentiment_scores

    def detect_named_entities(self, text):
        """Detect named entities in the text."""
        entities = self.ner_pipeline(text)
        return entities

    def extract_key_phrases(self, text, top_n=10):
        """Extract key phrases using TF-IDF."""
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform([text])
        feature_names = vectorizer.get_feature_names_out()
        # Get top phrases by TF-IDF score
        sorted_idx = tfidf_matrix.toarray()[0].argsort()[::-1]
        top_phrases = [feature_names[i] for i in sorted_idx[:top_n]]
        return top_phrases

    def calculate_readability(self, text):
        """Calculate readability metrics."""
        return {
            'Flesch Reading Ease': flesch_reading_ease(text),
            'Flesch-Kincaid Grade Level': flesch_kincaid_grade(text)
        }

    def detect_rhetorical_devices(self, text):
        """Count occurrences of simple rhetorical-device markers."""
        devices_found = {}
        for device, markers in RHETORICAL_DEVICES.items():
            count = sum(text.lower().count(marker) for marker in markers)
            if count > 0:
                devices_found[device] = count
        return devices_found

    def create_semantic_network(self, text, top_n=20, window_size=10, chunk_size=10000):
        """Create a semantic network graph with weighted co-occurrence edges."""
        # Process text in chunks to keep spaCy memory usage bounded
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

        # Collect nouns and their frequencies across all chunks
        all_nouns = []
        noun_freq = nltk.FreqDist()
        for chunk in chunks:
            doc = nlp(chunk)
            chunk_nouns = [token.text.lower() for token in doc if token.pos_ == 'NOUN']
            all_nouns.extend(chunk_nouns)
            noun_freq.update(chunk_nouns)

        # Get top nouns across all chunks
        top_nouns = [noun for noun, freq in noun_freq.most_common(top_n)]

        # Create graph and co-occurrence matrix
        G = nx.Graph()
        cooc_matrix = np.zeros((len(top_nouns), len(top_nouns)))
        noun_to_idx = {noun: idx for idx, noun in enumerate(top_nouns)}

        # Count co-occurrences within a sliding window, chunk by chunk
        for chunk in chunks:
            doc = nlp(chunk)
            words = [token.text.lower() for token in doc]
            for i in range(len(words)):
                window_words = words[max(0, i - window_size):min(len(words), i + window_size)]
                for noun1 in top_nouns:
                    if noun1 in window_words:
                        for noun2 in top_nouns:
                            if noun1 != noun2 and noun2 in window_words:
                                idx1, idx2 = noun_to_idx[noun1], noun_to_idx[noun2]
                                cooc_matrix[idx1][idx2] += 1
                                cooc_matrix[idx2][idx1] += 1

        # Build network nodes sized by frequency
        for noun in top_nouns:
            G.add_node(noun, size=noun_freq[noun])

        # Add weighted edges
        max_weight = np.max(cooc_matrix) if cooc_matrix.size else 0
        if max_weight > 0:  # Prevent division by zero
            for i in range(len(top_nouns)):
                for j in range(i + 1, len(top_nouns)):
                    weight = cooc_matrix[i][j]
                    if weight > 0:
                        G.add_edge(top_nouns[i], top_nouns[j], weight=weight, width=3 * (weight / max_weight))

        # Calculate and store a layout for plotting
        pos = nx.spring_layout(G, k=1, iterations=50)
        for node in G.nodes():
            G.nodes[node]['pos'] = pos[node]

        return G


def main():
    st.title("🗣️ Advanced Political Speech Analysis Toolkit")

    # Initialize analyzer
    analyzer = SpeechAnalyzer()

    # File upload
    uploaded_file = st.file_uploader("Upload Political Speech", type=['txt', 'docx', 'pdf'])

    if uploaded_file is not None:
        # Read the uploaded file into plain text
        if uploaded_file.name.endswith('.txt'):
            text = uploaded_file.getvalue().decode('utf-8')
        elif uploaded_file.name.endswith('.docx'):
            import docx
            doc = docx.Document(uploaded_file)
            text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
        elif uploaded_file.name.endswith('.pdf'):
            import PyPDF2
            pdf_reader = PyPDF2.PdfReader(uploaded_file)
            # extract_text() can return None for image-only pages
            text = ' '.join([(page.extract_text() or '') for page in pdf_reader.pages])

        # Create tabs for the different analyses
        progress_bar = st.progress(0)
        status_text = st.empty()

        tab1, tab2, tab3, tab4, tab5 = st.tabs([
            "Moral Foundations",
            "Emotional Analysis",
            "Linguistic Insights",
            "Semantic Network",
            "Advanced NLP"
        ])

        with tab1:
            status_text.text('Analyzing Moral Foundations...')
            progress_bar.progress(20)

            st.subheader("Moral Foundations Analysis")
            moral_scores = analyzer.analyze_moral_foundations(text)

            # Plotly bar chart
            moral_df = pd.DataFrame.from_dict(moral_scores, orient='index', columns=['Score'])
            moral_df.index.name = 'Moral Foundation'
            moral_df = moral_df.reset_index()

            fig = px.bar(
                moral_df,
                x='Moral Foundation',
                y='Score',
                title='Moral Foundations Breakdown',
                color='Moral Foundation'
            )
            st.plotly_chart(fig)

            # Detailed insights
            for foundation, score in moral_scores.items():
                st.write(f"**{MORAL_FOUNDATIONS[foundation]}**: {score:.2%}")

        with tab2:
            status_text.text('Processing Emotional Trajectory...')
            progress_bar.progress(40)

            st.subheader("Speech Trajectory Analysis")
            col1, col2 = st.columns(2)

            # Create consistent segments for both analyses
            segments = analyzer.split_text(text, max_length=256)
            num_segments = len(segments)
            segment_labels = [f"{i+1}" for i in range(num_segments)]

            with col1:
                st.write("### Emotional Flow")
                sentiment_scores = analyzer.analyze_emotional_trajectory(text)

                trajectory_fig = go.Figure(data=go.Scatter(
                    x=segment_labels,
                    y=sentiment_scores,
                    mode='lines+markers',
                    line=dict(color='#1f77b4', width=3),
                    marker=dict(size=8, color='#1f77b4', symbol='circle')
                ))
                trajectory_fig.update_layout(
                    title='Emotional Flow Throughout the Speech',
                    xaxis_title='Speech Segments',
                    yaxis_title='Relative Emotional Tone',
                    yaxis=dict(
                        ticktext=['Most Negative', 'Neutral', 'Most Positive'],
                        tickvals=[-1, 0, 1],
                        range=[-1.1, 1.1],
                        gridcolor='lightgray'
                    ),
                    hovermode='x unified',
                    showlegend=False,
                    plot_bgcolor='white'
                )
                st.plotly_chart(trajectory_fig)

            with col2:
                st.write("### Moral Foundations Flow")
                moral_trajectories = {foundation: [] for foundation in MORAL_FOUNDATIONS}
                for segment in segments:
                    segment_scores = analyzer.analyze_moral_foundations(segment)
                    for foundation, score in segment_scores.items():
                        moral_trajectories[foundation].append(score)

                moral_fig = go.Figure()
                for foundation, scores in moral_trajectories.items():
                    moral_fig.add_trace(go.Scatter(
                        x=segment_labels,
                        y=scores,
                        name=MORAL_FOUNDATIONS[foundation],
                        mode='lines+markers'
                    ))

                moral_fig.update_layout(
                    title='Moral Foundations Flow',
                    xaxis_title='Speech Segments',
                    yaxis_title='Foundation Strength',
                    hovermode='x unified',
                    plot_bgcolor='white',
                    showlegend=True,
                    legend=dict(yanchor="top", y=0.99, xanchor="left", x=1.05)
                )
                st.plotly_chart(moral_fig)

        with tab3:
            status_text.text('Analyzing Linguistic Features...')
            progress_bar.progress(60)

            st.subheader("Linguistic Analysis")
            readability = analyzer.calculate_readability(text)

            # Readability metrics with context
            col1, col2 = st.columns(2)
            with col1:
                score = readability['Flesch Reading Ease']
                interpretation = "Complex" if score < 50 else "Standard" if score < 70 else "Easy"
                st.metric(
                    label="Reading Ease",
                    value=f"{score:.1f}/100",
                    delta=interpretation,
                    delta_color="normal"
                )
            with col2:
                grade = readability['Flesch-Kincaid Grade Level']
                st.metric(
                    label="Education Level",
                    value=f"Grade {grade:.1f}",
                    delta="Years of Education",
                    delta_color="normal"
                )

            # Enhanced key phrases display
            st.subheader("Key Topics and Themes")
            key_phrases = analyzer.extract_key_phrases(text)

            # Lay the phrases out across three columns
            cols = st.columns(3)
            for idx, phrase in enumerate(key_phrases):
                col_idx = idx % 3
                cols[col_idx].markdown(
                    f"""{phrase}""",
                    unsafe_allow_html=True
                )

        with tab4:
            status_text.text('Building Semantic Network...')
            progress_bar.progress(80)

            st.subheader("Semantic Network")
            semantic_graph = analyzer.create_semantic_network(text)
            network_fig = go.Figure()

            # Add edges with visual encoding of co-occurrence strength
            edge_weights = [d['weight'] for _, _, d in semantic_graph.edges(data=True)]
            max_weight = max(edge_weights) if edge_weights else 1
            for edge in semantic_graph.edges():
                x0, y0 = semantic_graph.nodes[edge[0]]['pos']
                x1, y1 = semantic_graph.nodes[edge[1]]['pos']
                weight = semantic_graph.edges[edge]['weight']

                # Normalize weight for visual encoding
                normalized_weight = weight / max_weight
                # Enhanced width scaling (more pronounced differences)
                width = 2 + (normalized_weight * 8)
                # Color gradient from light to dark based on weight
                color = f'rgba(31, 119, 180, {0.3 + normalized_weight * 0.7})'

                network_fig.add_trace(go.Scatter(
                    x=[x0, x1, None],
                    y=[y0, y1, None],
                    mode='lines',
                    line=dict(width=width, color=color),
                    hoverinfo='text',
                    hovertext=f'Relationship strength: {weight:.2f}'
                ))

            # Enhanced nodes with better visibility
            for node in semantic_graph.nodes():
                x, y = semantic_graph.nodes[node]['pos']
                size = semantic_graph.nodes[node]['size']

                network_fig.add_trace(go.Scatter(
                    x=[x],
                    y=[y],
                    mode='markers+text',
                    marker=dict(
                        size=15 + size / 2,  # Increased base size
                        color='#ffffff',
                        line=dict(width=2, color='#1f77b4'),
                        symbol='circle'
                    ),
                    text=[node],
                    textposition="top center",
                    textfont=dict(size=12, color='black'),
                    hoverinfo='text',
                    hovertext=f'Term: {node}<br>Frequency: {size}'
                ))
            network_fig.update_layout(
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20, l=20, r=20, t=20),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                plot_bgcolor='white',
                width=800,
                height=600
            )
            st.plotly_chart(network_fig, use_container_width=True)

        with tab5:
            status_text.text('Extracting Named Entities...')
            progress_bar.progress(100)

            st.subheader("Named Entity Recognition")
            named_entities = analyzer.detect_named_entities(text)

            # Process entities
            entities_df = pd.DataFrame(named_entities)

            # Map entity types to friendly names
            type_mapping = {
                'B-PER': 'Person', 'I-PER': 'Person',
                'B-ORG': 'Organization', 'I-ORG': 'Organization',
                'B-LOC': 'Location', 'I-LOC': 'Location',
                'B-MISC': 'Other', 'I-MISC': 'Other'
            }

            # Clean and transform the data
            display_df = pd.DataFrame({
                'Term': entities_df['word'],
                'Category': entities_df['entity'].map(type_mapping),
                'Confidence': entities_df['score'].apply(lambda x: f"{x*100:.1f}%")
            })

            # Group similar entities by category
            grouped_df = display_df.groupby('Category').agg({
                'Term': lambda x: ', '.join(set(x)),
                'Confidence': 'count'
            }).reset_index()

            # Display results in an organized way
            for category in grouped_df['Category'].unique():
                category_data = grouped_df[grouped_df['Category'] == category]
                st.write(f"### {category}")
                st.markdown(f"**Found**: {category_data['Term'].iloc[0]}")


if __name__ == "__main__":
    main()
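
# A minimal sketch of how this app might be run locally. The filename and the
# exact dependency list below are assumptions, not part of the original script;
# adjust them to your environment:
#
#   pip install streamlit torch transformers spacy nltk textstat plotly networkx \
#       scikit-learn scipy pandas numpy matplotlib seaborn wordcloud python-docx PyPDF2
#   python -m spacy download en_core_web_lg
#   streamlit run speech_analysis_app.py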