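"""Advanced Political Speech Analysis: a Streamlit app that scores a speech for
moral foundations, emotional trajectory, readability, rhetorical devices, and
named entities, and renders a semantic co-occurrence network.

Run with: streamlit run app.py
"""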
import streamlit as st
import pandas as pd
import numpy as np
import torch
import networkx as nx
import plotly.express as px
import plotly.graph_objs as go
from scipy.signal import savgol_filter
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import spacy
# set_page_config must be the first Streamlit command in the script
st.set_page_config(page_title="Advanced Political Speech Analysis", page_icon="🗣️", layout="wide")
# Advanced NLP Libraries
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
pipeline,
AutoModelForTokenClassification,
RobertaTokenizer,
RobertaForSequenceClassification
)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textstat import flesch_reading_ease, flesch_kincaid_grade
# Download necessary NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
# Load spaCy model (requires separate installation)
try:
    nlp = spacy.load('en_core_web_lg')
except OSError:
    st.error("Please install spaCy and the en_core_web_lg model:\n"
             "pip install spacy\n"
             "python -m spacy download en_core_web_lg")
    st.stop()  # halt the app; everything downstream needs the spaCy model
# Constants and Configurations
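# The five foundations from Moral Foundations Theory (Graham, Haidt, and colleagues)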
MORAL_FOUNDATIONS = {
'care': 'Care/Harm',
'fairness': 'Fairness/Cheating',
'loyalty': 'Loyalty/Betrayal',
'authority': 'Authority/Subversion',
'sanctity': 'Sanctity/Degradation'
}
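# Surface-level lexical markers used as rough cues for each rhetorical device;
# these are heuristics, not a trained classifier, so treat counts as indicative.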
RHETORICAL_DEVICES = {
'analogy': ['like', 'as', 'similar to'],
'repetition': ['repetitive', 'recurring'],
'metaphor': ['as if', 'like', 'represents'],
'hyperbole': ['always', 'never', 'absolute'],
'rhetorical_question': ['?']
}
class SpeechAnalyzer:
def __init__(self):
# Load MoralFoundations model
self.moral_model_path = "MMADS/MoralFoundationsClassifier"
self.moral_tokenizer = RobertaTokenizer.from_pretrained(self.moral_model_path)
self.moral_model = RobertaForSequenceClassification.from_pretrained(self.moral_model_path)
        # Define label names directly; the order must match the classifier head's outputs
        self.label_names = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']
        # Other pipelines remain the same; the sentiment model is pinned
        # explicitly (the pipeline's usual default) for reproducibility
        self.sentiment_pipeline = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english"
        )
        self.ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        self.ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        # aggregation_strategy="simple" merges word pieces into whole entities and
        # yields the 'entity_group' and 'word' keys that the UI below relies on
        self.ner_pipeline = pipeline("ner", model=self.ner_model, tokenizer=self.ner_tokenizer,
                                     aggregation_strategy="simple")
    def split_text(self, text, max_length=512, overlap=50):
        """Split long text into overlapping word-level segments"""
        words = text.split()
        segments = []
        current_segment = []
        for word in words:
            if len(current_segment) + 1 > max_length:
                segments.append(' '.join(current_segment))
                # Carry the last `overlap` words into the next segment for context
                current_segment = current_segment[-overlap:] + [word]
            else:
                current_segment.append(word)
        if current_segment:
            segments.append(' '.join(current_segment))
        return segments
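    # Example (hypothetical numbers): a 600-word speech with max_length=512 and
    # overlap=50 yields one 512-word segment plus a 138-word segment sharing a
    # 50-word overlap, so sentences at the boundary are seen whole at least once.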
def analyze_moral_foundations(self, text):
"""Analyze moral foundations using the RoBERTa-based classifier"""
segments = self.split_text(text)
foundation_scores = {
'care': [], 'fairness': [], 'loyalty': [],
'authority': [], 'sanctity': []
}
for segment in segments:
inputs = self.moral_tokenizer(segment, return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
outputs = self.moral_model(**inputs)
probabilities = torch.softmax(outputs.logits, dim=1)
for idx, label in enumerate(self.label_names):
foundation = label.lower()
if foundation in foundation_scores:
foundation_scores[foundation].append(probabilities[0][idx].item())
# Average the scores across segments
aggregated_scores = {
foundation: np.mean(scores) for foundation, scores in foundation_scores.items()
}
return aggregated_scores
    def analyze_emotional_trajectory(self, text, window_size=5):
        """Perform emotional trajectory analysis"""
        segments = self.split_text(text, max_length=256)
        sentiment_scores = []
        for segment in segments:
            result = self.sentiment_pipeline(segment)[0]
            # Signed confidence: +score for POSITIVE, -score for NEGATIVE,
            # giving a graded trajectory rather than a binary one
            score = result['score'] if result['label'] == 'POSITIVE' else -result['score']
            sentiment_scores.append(score)
        # Savitzky-Golay smoothing needs an odd window no longer than the series
        smoothed_scores = (savgol_filter(sentiment_scores, window_length=window_size, polyorder=2)
                           if len(sentiment_scores) > window_size else sentiment_scores)
        return smoothed_scores
    def detect_named_entities(self, text):
        """Detect named entities, chunking so long speeches fit the model's 512-token limit"""
        entities = []
        for segment in self.split_text(text, max_length=256):
            entities.extend(self.ner_pipeline(segment))
        return entities
def extract_key_phrases(self, text, top_n=10):
"""Extract key phrases using TF-IDF"""
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
tfidf_matrix = vectorizer.fit_transform([text])
feature_names = vectorizer.get_feature_names_out()
# Get top phrases by TF-IDF score
sorted_idx = tfidf_matrix.toarray()[0].argsort()[::-1]
top_phrases = [feature_names[i] for i in sorted_idx[:top_n]]
return top_phrases
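    # Note: with a single document, IDF is constant across terms, so this
    # ranking effectively reduces to term frequency over unigrams and bigrams.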
    def calculate_readability(self, text):
        """Calculate readability metrics"""
        return {
            # Higher score = easier to read (60-70 is roughly plain English)
            'Flesch Reading Ease': flesch_reading_ease(text),
            # The same signal expressed as a US school grade level
            'Flesch-Kincaid Grade Level': flesch_kincaid_grade(text)
        }
    def detect_rhetorical_devices(self, text):
        """Detect rhetorical devices by counting marker occurrences"""
        devices_found = {}
        lowered = text.lower()
        for device, markers in RHETORICAL_DEVICES.items():
            count = 0
            for marker in markers:
                if marker.isalpha() or ' ' in marker:
                    # Whole-word match so 'as' does not fire inside words like 'class'
                    count += len(re.findall(r'\b' + re.escape(marker) + r'\b', lowered))
                else:
                    count += lowered.count(marker)  # punctuation markers such as '?'
            if count > 0:
                devices_found[device] = count
        return devices_found
def create_semantic_network(self, text, top_n=20, window_size=10):
"""Create semantic network graph with weighted edges"""
doc = nlp(text)
# Create graph
G = nx.Graph()
# Extract top nouns and their relationships
nouns = [token.text.lower() for token in doc if token.pos_ == 'NOUN']
noun_freq = nltk.FreqDist(nouns)
top_nouns = [noun for noun, freq in noun_freq.most_common(top_n)]
# Create co-occurrence matrix
cooc_matrix = np.zeros((len(top_nouns), len(top_nouns)))
noun_to_idx = {noun: idx for idx, noun in enumerate(top_nouns)}
# Calculate co-occurrences within window_size
words = [token.text.lower() for token in doc]
for i in range(len(words)):
window_words = words[max(0, i-window_size):min(len(words), i+window_size)]
for noun1 in top_nouns:
if noun1 in window_words:
for noun2 in top_nouns:
if noun1 != noun2 and noun2 in window_words:
idx1, idx2 = noun_to_idx[noun1], noun_to_idx[noun2]
cooc_matrix[idx1][idx2] += 1
cooc_matrix[idx2][idx1] += 1
# Add nodes and weighted edges
for noun in top_nouns:
G.add_node(noun, size=noun_freq[noun])
        # Add edges with weights based on co-occurrence
        max_weight = cooc_matrix.max() if cooc_matrix.max() > 0 else 1  # guard against division by zero
        for i in range(len(top_nouns)):
            for j in range(i+1, len(top_nouns)):
                weight = cooc_matrix[i][j]
                if weight > 0:
                    G.add_edge(top_nouns[i], top_nouns[j],
                               weight=weight,
                               width=3 * (weight/max_weight))
        # Calculate a reproducible layout (fixed seed) with weighted edges
        pos = nx.spring_layout(G, k=1, iterations=50, seed=42)
# Store positions and attributes in graph
for node in G.nodes():
G.nodes[node]['pos'] = pos[node]
return G
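# Streamlit reruns the whole script on every interaction, so cache the heavyweight
# model bundle and load the weights only once; this sketch assumes a Streamlit
# version (>= 1.18) that provides st.cache_resource.
@st.cache_resource
def load_analyzer():
    """Build and cache a single SpeechAnalyzer instance across reruns."""
    return SpeechAnalyzer()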
def main():
st.title("🗣️ Advanced Political Speech Analysis Toolkit")
    # Initialize analyzer (cached across Streamlit reruns)
    analyzer = load_analyzer()
# File upload
uploaded_file = st.file_uploader("Upload Political Speech", type=['txt', 'docx', 'pdf'])
if uploaded_file is not None:
        # Extract raw text according to the uploaded file type
        if uploaded_file.name.endswith('.txt'):
            text = uploaded_file.getvalue().decode('utf-8')
        elif uploaded_file.name.endswith('.docx'):
            import docx
            doc = docx.Document(uploaded_file)
            text = '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        elif uploaded_file.name.endswith('.pdf'):
            import PyPDF2
            pdf_reader = PyPDF2.PdfReader(uploaded_file)
            # extract_text() can return None for image-only pages
            text = ' '.join(page.extract_text() or '' for page in pdf_reader.pages)
# Create tabs for different analyses
tab1, tab2, tab3, tab4, tab5 = st.tabs([
"Moral Foundations",
"Emotional Analysis",
"Linguistic Insights",
"Semantic Network",
"Advanced NLP"
])
with tab1:
st.subheader("Moral Foundations Analysis")
moral_scores = analyzer.analyze_moral_foundations(text)
# Plotly bar chart
moral_df = pd.DataFrame.from_dict(moral_scores, orient='index', columns=['Score'])
moral_df.index.name = 'Moral Foundation'
moral_df = moral_df.reset_index()
fig = px.bar(
moral_df,
x='Moral Foundation',
y='Score',
title='Moral Foundations Breakdown',
color='Moral Foundation'
)
st.plotly_chart(fig)
# Detailed insights
for foundation, score in moral_scores.items():
st.write(f"**{MORAL_FOUNDATIONS[foundation]}**: {score:.2%}")
with tab2:
st.subheader("Emotional Trajectory")
emotional_trajectory = analyzer.analyze_emotional_trajectory(text)
# Scale values to a -1 to 1 range
scaled_trajectory = np.array(emotional_trajectory)
scaled_trajectory = np.clip(scaled_trajectory, -1, 1)
# Create segment labels for x-axis
num_segments = len(scaled_trajectory)
segment_labels = [f"Segment {i+1}" for i in range(num_segments)]
trajectory_fig = go.Figure(data=go.Scatter(
x=segment_labels,
y=scaled_trajectory,
mode='lines+markers',
name='Emotional Intensity',
line=dict(
color='#1f77b4',
width=3
),
marker=dict(
size=8,
color='#1f77b4'
)
))
trajectory_fig.update_layout(
title='Speech Emotional Flow',
xaxis_title='Speech Progression',
yaxis_title='Sentiment',
yaxis=dict(
ticktext=['Very Negative', 'Neutral', 'Very Positive'],
tickvals=[-1, 0, 1],
range=[-1, 1]
),
hovermode='x unified',
showlegend=False
)
st.plotly_chart(trajectory_fig)
with tab3:
st.subheader("Linguistic Complexity")
readability = analyzer.calculate_readability(text)
col1, col2 = st.columns(2)
with col1:
st.metric("Flesch Reading Ease", f"{readability['Flesch Reading Ease']:.2f}")
with col2:
st.metric("Flesch-Kincaid Grade Level", f"{readability['Flesch-Kincaid Grade Level']:.2f}")
# Key Phrases
st.subheader("Key Phrases")
key_phrases = analyzer.extract_key_phrases(text)
st.write(", ".join(key_phrases))
with tab4:
st.subheader("Semantic Network")
semantic_graph = analyzer.create_semantic_network(text)
network_fig = go.Figure()
            # Add edges with enhanced visual encoding; compute the maximum
            # once, outside the loop, and guard the no-edge case
            edge_weights = [d['weight'] for _, _, d in semantic_graph.edges(data=True)]
            max_weight = max(edge_weights) if edge_weights else 1
            for edge in semantic_graph.edges():
                x0, y0 = semantic_graph.nodes[edge[0]]['pos']
                x1, y1 = semantic_graph.nodes[edge[1]]['pos']
                weight = semantic_graph.edges[edge]['weight']
                # Normalize weight for visual encoding
                normalized_weight = weight / max_weight
                # Enhanced width scaling (more pronounced differences)
                width = 2 + (normalized_weight * 8)
                # Color gradient from light to dark based on weight
                color = f'rgba(31, 119, 180, {0.3 + normalized_weight * 0.7})'
network_fig.add_trace(go.Scatter(
x=[x0, x1, None],
y=[y0, y1, None],
mode='lines',
line=dict(
width=width,
color=color
),
hoverinfo='text',
hovertext=f'Relationship strength: {weight:.2f}'
))
# Enhanced nodes with better visibility
for node in semantic_graph.nodes():
x, y = semantic_graph.nodes[node]['pos']
size = semantic_graph.nodes[node]['size']
network_fig.add_trace(go.Scatter(
x=[x],
y=[y],
mode='markers+text',
marker=dict(
size=15 + size/2, # Increased base size
color='#ffffff',
line=dict(width=2, color='#1f77b4'),
symbol='circle'
),
text=[node],
textposition="top center",
textfont=dict(size=12, color='black'),
hoverinfo='text',
hovertext=f'Term: {node}<br>Frequency: {size}'
))
network_fig.update_layout(
showlegend=False,
hovermode='closest',
margin=dict(b=20, l=20, r=20, t=20),
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
plot_bgcolor='white',
width=800,
height=600
)
st.plotly_chart(network_fig, use_container_width=True)
with tab5:
st.subheader("Advanced NLP Analysis")
# Named Entities with clear explanations
st.write("### Key People, Organizations, and Places")
named_entities = analyzer.detect_named_entities(text)
# Create intuitive mapping of entity types
entity_type_mapping = {
'PER': 'Person',
'ORG': 'Organization',
'LOC': 'Location',
'GPE': 'Country/City',
'MISC': 'Miscellaneous'
}
            # Transform the entities into a dataframe; unmapped types fall back to 'Other'
            entities_df = pd.DataFrame(named_entities)
            entities_df['entity_type'] = entities_df['entity_group'].map(entity_type_mapping).fillna('Other')
            entities_df['confidence'] = entities_df['score'].apply(lambda x: f"{x*100:.1f}%")
# Display enhanced table
display_df = entities_df[['word', 'entity_type', 'confidence']].rename(columns={
'word': 'Name/Term',
'entity_type': 'Type',
'confidence': 'Confidence Level'
})
st.dataframe(
display_df,
column_config={
"Name/Term": st.column_config.TextColumn(
help="The identified name or term from the text"
),
"Type": st.column_config.TextColumn(
help="Category of the identified term"
),
"Confidence Level": st.column_config.TextColumn(
help="How certain the AI is about this identification"
)
},
hide_index=True
)
# Enhanced Rhetorical Devices section
st.write("### Persuasive Language Techniques")
rhetorical_devices = analyzer.detect_rhetorical_devices(text)
            # Lay the metrics out across two alternating columns
            cols = st.columns(2)
            # Define friendly names and descriptions
            device_explanations = {
                'analogy': 'Comparisons (using "like" or "as")',
                'repetition': 'Repeated phrases for emphasis',
                'metaphor': 'Symbolic comparisons',
                'hyperbole': 'Dramatic exaggerations',
                'rhetorical_question': 'Questions asked for effect'
            }
            for idx, (device, count) in enumerate(rhetorical_devices.items()):
                with cols[idx % 2]:
                    st.metric(
                        label=device_explanations[device],
                        value=f"{count} times"
                    )
if __name__ == "__main__":
main()