import streamlit as st
import pandas as pd
import numpy as np
import torch
import networkx as nx
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import savgol_filter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud
import spacy

st.set_page_config(page_title="Advanced Political Speech Analysis", page_icon="🗣️", layout="wide")

# Advanced NLP libraries
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
    AutoModelForTokenClassification,
    RobertaTokenizer,
    RobertaForSequenceClassification
)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textstat import flesch_reading_ease, flesch_kincaid_grade

# Download necessary NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Load spaCy model (requires separate installation)
try:
    nlp = spacy.load('en_core_web_lg')
except OSError:
    st.error(
        "Please install spaCy and the en_core_web_lg model:\n"
        "pip install spacy\n"
        "python -m spacy download en_core_web_lg"
    )
    st.stop()  # the rest of the app needs `nlp`, so stop here if the model is missing

# Constants and configurations
MORAL_FOUNDATIONS = {
    'care': 'Care/Harm',
    'fairness': 'Fairness/Cheating',
    'loyalty': 'Loyalty/Betrayal',
    'authority': 'Authority/Subversion',
    'sanctity': 'Sanctity/Degradation'
}

RHETORICAL_DEVICES = {
    'analogy': ['like', 'as', 'similar to'],
    'repetition': ['repetitive', 'recurring'],
    'metaphor': ['as if', 'like', 'represents'],
    'hyperbole': ['always', 'never', 'absolute'],
    'rhetorical_question': ['?']
}


class SpeechAnalyzer:
    def __init__(self):
        # Load the moral foundations classifier
        self.moral_model_path = "MMADS/MoralFoundationsClassifier"
        self.moral_tokenizer = RobertaTokenizer.from_pretrained(self.moral_model_path)
        self.moral_model = RobertaForSequenceClassification.from_pretrained(self.moral_model_path)
        # Define label names directly
        self.label_names = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']

        # Other pipelines
        self.sentiment_pipeline = pipeline("sentiment-analysis")
        self.ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        self.ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        self.ner_pipeline = pipeline("ner", model=self.ner_model, tokenizer=self.ner_tokenizer)

    def split_text(self, text, max_length=256, overlap=50):
        """Split long text into overlapping segments of roughly `max_length` words."""
        words = text.split()
        segments = []
        current_segment = []
        current_length = 0

        for word in words:
            if current_length + 1 > max_length:
                segments.append(' '.join(current_segment))
                # Carry the last `overlap` words into the next segment for context
                current_segment = current_segment[-overlap:] + [word]
                current_length = len(current_segment)
            else:
                current_segment.append(word)
                current_length += 1

        if current_segment:
            segments.append(' '.join(current_segment))
        return segments

    def analyze_moral_foundations(self, text):
        """Analyze moral foundations using the RoBERTa-based classifier."""
        segments = self.split_text(text)
        foundation_scores = {
            'care': [], 'fairness': [], 'loyalty': [], 'authority': [], 'sanctity': []
        }

        for segment in segments:
            inputs = self.moral_tokenizer(segment, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():
                outputs = self.moral_model(**inputs)
            probabilities = torch.softmax(outputs.logits, dim=1)

            for idx, label in enumerate(self.label_names):
                foundation = label.lower()
                if foundation in foundation_scores:
                    foundation_scores[foundation].append(probabilities[0][idx].item())

        # Average the scores across segments
        aggregated_scores = {
            foundation: np.mean(scores) if scores else 0.0
            for foundation, scores in foundation_scores.items()
        }
        return aggregated_scores

    def analyze_emotional_trajectory(self, text, window_size=5):
        """Enhanced emotional trajectory analysis using sentence-level processing."""
        segments = self.split_text(text, max_length=256)
        sentiment_scores = []

        for segment in segments:
            # Split the segment into sentences with NLTK
            sentences = nltk.sent_tokenize(segment)

            # Process sentences in batches
            batch_size = 64
            segment_scores = []
            for i in range(0, len(sentences), batch_size):
                batch = sentences[i:i + batch_size]
                results = self.sentiment_pipeline(batch)
                # Use signed scores so negative sentences pull the average down
                batch_scores = [
                    result['score'] if result['label'] == 'POSITIVE' else -result['score']
                    for result in results
                ]
                segment_scores.extend(batch_scores)

            avg_score = np.mean(segment_scores) if segment_scores else 0
            sentiment_scores.append(avg_score)

        # Normalize scores to the [-1, 1] range
        if sentiment_scores:
            min_score = min(sentiment_scores)
            max_score = max(sentiment_scores)
            score_range = max_score - min_score
            if score_range > 0:
                sentiment_scores = [(s - min_score) / score_range * 2 - 1 for s in sentiment_scores]

        return sentiment_scores

    def detect_named_entities(self, text):
        """Detect named entities in the text."""
        entities = self.ner_pipeline(text)
        return entities

    def extract_key_phrases(self, text, top_n=10):
        """Extract key phrases using TF-IDF."""
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform([text])
        feature_names = vectorizer.get_feature_names_out()
        # Get top phrases by TF-IDF score
        sorted_idx = tfidf_matrix.toarray()[0].argsort()[::-1]
        top_phrases = [feature_names[i] for i in sorted_idx[:top_n]]
        return top_phrases

    def calculate_readability(self, text):
        """Calculate readability metrics."""
        return {
            'Flesch Reading Ease': flesch_reading_ease(text),
            'Flesch-Kincaid Grade Level': flesch_kincaid_grade(text)
        }

    def detect_rhetorical_devices(self, text):
        """Count occurrences of simple rhetorical-device markers."""
        devices_found = {}
        for device, markers in RHETORICAL_DEVICES.items():
            count = sum(text.lower().count(marker) for marker in markers)
            if count > 0:
                devices_found[device] = count
        return devices_found

    def create_semantic_network(self, text, top_n=20, window_size=10, chunk_size=10000):
        """Create a semantic network graph with weighted co-occurrence edges."""
        # Process text in chunks to keep spaCy memory usage bounded
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

        # Collect nouns and their frequencies across all chunks
        all_nouns = []
        noun_freq = nltk.FreqDist()
        for chunk in chunks:
            doc = nlp(chunk)
            chunk_nouns = [token.text.lower() for token in doc if token.pos_ == 'NOUN']
            all_nouns.extend(chunk_nouns)
            noun_freq.update(chunk_nouns)

        # Get top nouns across all chunks
        top_nouns = [noun for noun, freq in noun_freq.most_common(top_n)]

        # Create graph and co-occurrence matrix
        G = nx.Graph()
        cooc_matrix = np.zeros((len(top_nouns), len(top_nouns)))
        noun_to_idx = {noun: idx for idx, noun in enumerate(top_nouns)}

        # Count co-occurrences within a sliding window, chunk by chunk
        for chunk in chunks:
            doc = nlp(chunk)
            words = [token.text.lower() for token in doc]
            for i in range(len(words)):
                window_words = words[max(0, i - window_size):min(len(words), i + window_size)]
                for noun1 in top_nouns:
                    if noun1 in window_words:
                        for noun2 in top_nouns:
                            if noun1 != noun2 and noun2 in window_words:
                                idx1, idx2 = noun_to_idx[noun1], noun_to_idx[noun2]
                                cooc_matrix[idx1][idx2] += 1
                                cooc_matrix[idx2][idx1] += 1

        # Build network nodes sized by frequency
        for noun in top_nouns:
            G.add_node(noun, size=noun_freq[noun])

        # Add weighted edges
        max_weight = np.max(cooc_matrix) if cooc_matrix.size else 0
        if max_weight > 0:  # Prevent division by zero
            for i in range(len(top_nouns)):
                for j in range(i + 1, len(top_nouns)):
                    weight = cooc_matrix[i][j]
                    if weight > 0:
                        G.add_edge(top_nouns[i], top_nouns[j], weight=weight, width=3 * (weight / max_weight))

        # Calculate and store a layout for plotting
        pos = nx.spring_layout(G, k=1, iterations=50)
        for node in G.nodes():
            G.nodes[node]['pos'] = pos[node]

        return G


def main():
    st.title("🗣️ Advanced Political Speech Analysis Toolkit")

    # Initialize analyzer
    analyzer = SpeechAnalyzer()

    # File upload
    uploaded_file = st.file_uploader("Upload Political Speech", type=['txt', 'docx', 'pdf'])

    if uploaded_file is not None:
        # Read the uploaded file into plain text
        if uploaded_file.name.endswith('.txt'):
            text = uploaded_file.getvalue().decode('utf-8')
        elif uploaded_file.name.endswith('.docx'):
            import docx
            doc = docx.Document(uploaded_file)
            text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
        elif uploaded_file.name.endswith('.pdf'):
            import PyPDF2
            pdf_reader = PyPDF2.PdfReader(uploaded_file)
            # extract_text() can return None for image-only pages
            text = ' '.join([(page.extract_text() or '') for page in pdf_reader.pages])

        # Create tabs for the different analyses
        progress_bar = st.progress(0)
        status_text = st.empty()

        tab1, tab2, tab3, tab4, tab5 = st.tabs([
            "Moral Foundations",
            "Emotional Analysis",
            "Linguistic Insights",
            "Semantic Network",
            "Advanced NLP"
        ])

        with tab1:
            status_text.text('Analyzing Moral Foundations...')
            progress_bar.progress(20)

            st.subheader("Moral Foundations Analysis")
            moral_scores = analyzer.analyze_moral_foundations(text)

            # Plotly bar chart
            moral_df = pd.DataFrame.from_dict(moral_scores, orient='index', columns=['Score'])
            moral_df.index.name = 'Moral Foundation'
            moral_df = moral_df.reset_index()

            fig = px.bar(
                moral_df,
                x='Moral Foundation',
                y='Score',
                title='Moral Foundations Breakdown',
                color='Moral Foundation'
            )
            st.plotly_chart(fig)

            # Detailed insights
            for foundation, score in moral_scores.items():
                st.write(f"**{MORAL_FOUNDATIONS[foundation]}**: {score:.2%}")

        with tab2:
            status_text.text('Processing Emotional Trajectory...')
            progress_bar.progress(40)

            st.subheader("Speech Trajectory Analysis")
            col1, col2 = st.columns(2)

            # Create consistent segments for both analyses
            segments = analyzer.split_text(text, max_length=256)
            num_segments = len(segments)
            segment_labels = [f"{i+1}" for i in range(num_segments)]

            with col1:
                st.write("### Emotional Flow")
                sentiment_scores = analyzer.analyze_emotional_trajectory(text)

                trajectory_fig = go.Figure(data=go.Scatter(
                    x=segment_labels,
                    y=sentiment_scores,
                    mode='lines+markers',
                    line=dict(color='#1f77b4', width=3),
                    marker=dict(size=8, color='#1f77b4', symbol='circle')
                ))
                trajectory_fig.update_layout(
                    title='Emotional Flow Throughout the Speech',
                    xaxis_title='Speech Segments',
                    yaxis_title='Relative Emotional Tone',
                    yaxis=dict(
                        ticktext=['Most Negative', 'Neutral', 'Most Positive'],
                        tickvals=[-1, 0, 1],
                        range=[-1.1, 1.1],
                        gridcolor='lightgray'
                    ),
                    hovermode='x unified',
                    showlegend=False,
                    plot_bgcolor='white'
                )
                st.plotly_chart(trajectory_fig)

            with col2:
                st.write("### Moral Foundations Flow")
                moral_trajectories = {foundation: [] for foundation in MORAL_FOUNDATIONS}
                for segment in segments:
                    segment_scores = analyzer.analyze_moral_foundations(segment)
                    for foundation, score in segment_scores.items():
                        moral_trajectories[foundation].append(score)

                moral_fig = go.Figure()
                for foundation, scores in moral_trajectories.items():
                    moral_fig.add_trace(go.Scatter(
                        x=segment_labels,
                        y=scores,
                        name=MORAL_FOUNDATIONS[foundation],
                        mode='lines+markers'
                    ))

                moral_fig.update_layout(
                    title='Moral Foundations Flow',
                    xaxis_title='Speech Segments',
                    yaxis_title='Foundation Strength',
                    hovermode='x unified',
                    plot_bgcolor='white',
                    showlegend=True,
                    legend=dict(yanchor="top", y=0.99, xanchor="left", x=1.05)
                )
                st.plotly_chart(moral_fig)

        with tab3:
            status_text.text('Analyzing Linguistic Features...')
            progress_bar.progress(60)

            st.subheader("Linguistic Analysis")
            readability = analyzer.calculate_readability(text)

            # Readability metrics with context
            col1, col2 = st.columns(2)
            with col1:
                score = readability['Flesch Reading Ease']
                interpretation = "Complex" if score < 50 else "Standard" if score < 70 else "Easy"
                st.metric(
                    label="Reading Ease",
                    value=f"{score:.1f}/100",
                    delta=interpretation,
                    delta_color="normal"
                )
            with col2:
                grade = readability['Flesch-Kincaid Grade Level']
                st.metric(
                    label="Education Level",
                    value=f"Grade {grade:.1f}",
                    delta="Years of Education",
                    delta_color="normal"
                )

            # Enhanced key phrases display
            st.subheader("Key Topics and Themes")
            key_phrases = analyzer.extract_key_phrases(text)

            # Lay the phrases out across three columns
            cols = st.columns(3)
            for idx, phrase in enumerate(key_phrases):
                col_idx = idx % 3
                cols[col_idx].markdown(
                    f"""{phrase}""",
                    unsafe_allow_html=True
                )

        with tab4:
            status_text.text('Building Semantic Network...')
            progress_bar.progress(80)

            st.subheader("Semantic Network")
            semantic_graph = analyzer.create_semantic_network(text)
            network_fig = go.Figure()

            # Add edges with visual encoding of co-occurrence strength
            edge_weights = [d['weight'] for _, _, d in semantic_graph.edges(data=True)]
            max_weight = max(edge_weights) if edge_weights else 1
            for edge in semantic_graph.edges():
                x0, y0 = semantic_graph.nodes[edge[0]]['pos']
                x1, y1 = semantic_graph.nodes[edge[1]]['pos']
                weight = semantic_graph.edges[edge]['weight']

                # Normalize weight for visual encoding
                normalized_weight = weight / max_weight
                # Enhanced width scaling (more pronounced differences)
                width = 2 + (normalized_weight * 8)
                # Color gradient from light to dark based on weight
                color = f'rgba(31, 119, 180, {0.3 + normalized_weight * 0.7})'

                network_fig.add_trace(go.Scatter(
                    x=[x0, x1, None],
                    y=[y0, y1, None],
                    mode='lines',
                    line=dict(width=width, color=color),
                    hoverinfo='text',
                    hovertext=f'Relationship strength: {weight:.2f}'
                ))

            # Enhanced nodes with better visibility
            for node in semantic_graph.nodes():
                x, y = semantic_graph.nodes[node]['pos']
                size = semantic_graph.nodes[node]['size']

                network_fig.add_trace(go.Scatter(
                    x=[x],
                    y=[y],
                    mode='markers+text',
                    marker=dict(
                        size=15 + size / 2,  # Increased base size
                        color='#ffffff',
                        line=dict(width=2, color='#1f77b4'),
                        symbol='circle'
                    ),
                    text=[node],
                    textposition="top center",
                    textfont=dict(size=12, color='black'),
                    hoverinfo='text',
                    hovertext=f'Term: {node}<br>Frequency: {size}'
                ))
            network_fig.update_layout(
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20, l=20, r=20, t=20),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                plot_bgcolor='white',
                width=800,
                height=600
            )
            st.plotly_chart(network_fig, use_container_width=True)

        with tab5:
            status_text.text('Extracting Named Entities...')
            progress_bar.progress(100)

            st.subheader("Named Entity Recognition")
            named_entities = analyzer.detect_named_entities(text)

            # Process entities
            entities_df = pd.DataFrame(named_entities)

            # Map entity types to friendly names
            type_mapping = {
                'B-PER': 'Person', 'I-PER': 'Person',
                'B-ORG': 'Organization', 'I-ORG': 'Organization',
                'B-LOC': 'Location', 'I-LOC': 'Location',
                'B-MISC': 'Other', 'I-MISC': 'Other'
            }

            # Clean and transform the data
            display_df = pd.DataFrame({
                'Term': entities_df['word'],
                'Category': entities_df['entity'].map(type_mapping),
                'Confidence': entities_df['score'].apply(lambda x: f"{x*100:.1f}%")
            })

            # Group similar entities by category
            grouped_df = display_df.groupby('Category').agg({
                'Term': lambda x: ', '.join(set(x)),
                'Confidence': 'count'
            }).reset_index()

            # Display results in an organized way
            for category in grouped_df['Category'].unique():
                category_data = grouped_df[grouped_df['Category'] == category]
                st.write(f"### {category}")
                st.markdown(f"**Found**: {category_data['Term'].iloc[0]}")


if __name__ == "__main__":
    main()
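
# A minimal sketch of how this app might be run locally. The filename and the
# exact dependency list below are assumptions, not part of the original script;
# adjust them to your environment:
#
#   pip install streamlit torch transformers spacy nltk textstat plotly networkx \
#       scikit-learn scipy pandas numpy matplotlib seaborn wordcloud python-docx PyPDF2
#   python -m spacy download en_core_web_lg
#   streamlit run speech_analysis_app.py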