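"""Advanced Political Speech Analysis: a Streamlit app that scores a speech for
moral foundations, emotional trajectory, readability, rhetorical devices, and
named entities, and renders a semantic co-occurrence network.

Run with: streamlit run app.py
"""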
import streamlit as st
import pandas as pd
import numpy as np
import torch
import networkx as nx
import plotly.express as px
import plotly.graph_objs as go
from scipy.signal import savgol_filter
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import spacy
# set_page_config must be the first Streamlit command in the script
st.set_page_config(page_title="Advanced Political Speech Analysis", page_icon="🗣️", layout="wide")
# Advanced NLP Libraries
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
pipeline,
AutoModelForTokenClassification,
RobertaTokenizer,
RobertaForSequenceClassification
)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textstat import flesch_reading_ease, flesch_kincaid_grade
# Download necessary NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
# Load spaCy model (requires separate installation)
try:
    nlp = spacy.load('en_core_web_lg')
except OSError:
    st.error("Please install spaCy and the en_core_web_lg model:\n"
             "pip install spacy\n"
             "python -m spacy download en_core_web_lg")
    st.stop()  # halt the app; everything downstream needs the spaCy model
# Constants and Configurations
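# The five foundations from Moral Foundations Theory (Graham, Haidt, and colleagues)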
MORAL_FOUNDATIONS = {
'care': 'Care/Harm',
'fairness': 'Fairness/Cheating',
'loyalty': 'Loyalty/Betrayal',
'authority': 'Authority/Subversion',
'sanctity': 'Sanctity/Degradation'
}
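# Surface-level lexical markers used as rough cues for each rhetorical device;
# these are heuristics, not a trained classifier, so treat counts as indicative.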
RHETORICAL_DEVICES = {
'analogy': ['like', 'as', 'similar to'],
'repetition': ['repetitive', 'recurring'],
'metaphor': ['as if', 'like', 'represents'],
'hyperbole': ['always', 'never', 'absolute'],
'rhetorical_question': ['?']
}
class SpeechAnalyzer:
def __init__(self):
# Load MoralFoundations model
self.moral_model_path = "MMADS/MoralFoundationsClassifier"
self.moral_tokenizer = RobertaTokenizer.from_pretrained(self.moral_model_path)
self.moral_model = RobertaForSequenceClassification.from_pretrained(self.moral_model_path)
        # Define label names directly; the order must match the classifier head's outputs
        self.label_names = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']
        # Other pipelines remain the same; the sentiment model is pinned
        # explicitly (the pipeline's usual default) for reproducibility
        self.sentiment_pipeline = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english"
        )
        self.ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        self.ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        # aggregation_strategy="simple" merges word pieces into whole entities and
        # yields the 'entity_group' and 'word' keys that the UI below relies on
        self.ner_pipeline = pipeline("ner", model=self.ner_model, tokenizer=self.ner_tokenizer,
                                     aggregation_strategy="simple")
    def split_text(self, text, max_length=512, overlap=50):
        """Split long text into overlapping word-level segments"""
        words = text.split()
        segments = []
        current_segment = []
        for word in words:
            if len(current_segment) + 1 > max_length:
                segments.append(' '.join(current_segment))
                # Carry the last `overlap` words into the next segment for context
                current_segment = current_segment[-overlap:] + [word]
            else:
                current_segment.append(word)
        if current_segment:
            segments.append(' '.join(current_segment))
        return segments
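    # Example (hypothetical numbers): a 600-word speech with max_length=512 and
    # overlap=50 yields one 512-word segment plus a 138-word segment sharing a
    # 50-word overlap, so sentences at the boundary are seen whole at least once.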
def analyze_moral_foundations(self, text):
"""Analyze moral foundations using the RoBERTa-based classifier"""
segments = self.split_text(text)
foundation_scores = {
'care': [], 'fairness': [], 'loyalty': [],
'authority': [], 'sanctity': []
}
for segment in segments:
inputs = self.moral_tokenizer(segment, return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
outputs = self.moral_model(**inputs)
probabilities = torch.softmax(outputs.logits, dim=1)
for idx, label in enumerate(self.label_names):
foundation = label.lower()
if foundation in foundation_scores:
foundation_scores[foundation].append(probabilities[0][idx].item())
# Average the scores across segments
aggregated_scores = {
foundation: np.mean(scores) for foundation, scores in foundation_scores.items()
}
return aggregated_scores
    def analyze_emotional_trajectory(self, text, window_size=5):
        """Perform emotional trajectory analysis"""
        segments = self.split_text(text, max_length=256)
        sentiment_scores = []
        for segment in segments:
            result = self.sentiment_pipeline(segment)[0]
            # Signed confidence: +score for POSITIVE, -score for NEGATIVE,
            # giving a graded trajectory rather than a binary one
            score = result['score'] if result['label'] == 'POSITIVE' else -result['score']
            sentiment_scores.append(score)
        # Savitzky-Golay smoothing needs an odd window no longer than the series
        smoothed_scores = (savgol_filter(sentiment_scores, window_length=window_size, polyorder=2)
                           if len(sentiment_scores) > window_size else sentiment_scores)
        return smoothed_scores
    def detect_named_entities(self, text):
        """Detect named entities, chunking so long speeches fit the model's 512-token limit"""
        entities = []
        for segment in self.split_text(text, max_length=256):
            entities.extend(self.ner_pipeline(segment))
        return entities
def extract_key_phrases(self, text, top_n=10):
"""Extract key phrases using TF-IDF"""
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
tfidf_matrix = vectorizer.fit_transform([text])
feature_names = vectorizer.get_feature_names_out()
# Get top phrases by TF-IDF score
sorted_idx = tfidf_matrix.toarray()[0].argsort()[::-1]
top_phrases = [feature_names[i] for i in sorted_idx[:top_n]]
return top_phrases
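    # Note: with a single document, IDF is constant across terms, so this
    # ranking effectively reduces to term frequency over unigrams and bigrams.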
    def calculate_readability(self, text):
        """Calculate readability metrics"""
        return {
            # Higher score = easier to read (60-70 is roughly plain English)
            'Flesch Reading Ease': flesch_reading_ease(text),
            # The same signal expressed as a US school grade level
            'Flesch-Kincaid Grade Level': flesch_kincaid_grade(text)
        }
    def detect_rhetorical_devices(self, text):
        """Detect rhetorical devices by counting marker occurrences"""
        devices_found = {}
        lowered = text.lower()
        for device, markers in RHETORICAL_DEVICES.items():
            count = 0
            for marker in markers:
                if marker.isalpha() or ' ' in marker:
                    # Whole-word match so 'as' does not fire inside words like 'class'
                    count += len(re.findall(r'\b' + re.escape(marker) + r'\b', lowered))
                else:
                    count += lowered.count(marker)  # punctuation markers such as '?'
            if count > 0:
                devices_found[device] = count
        return devices_found
def create_semantic_network(self, text, top_n=20, window_size=10):
"""Create semantic network graph with weighted edges"""
doc = nlp(text)
# Create graph
G = nx.Graph()
# Extract top nouns and their relationships
nouns = [token.text.lower() for token in doc if token.pos_ == 'NOUN']
noun_freq = nltk.FreqDist(nouns)
top_nouns = [noun for noun, freq in noun_freq.most_common(top_n)]
# Create co-occurrence matrix
cooc_matrix = np.zeros((len(top_nouns), len(top_nouns)))
noun_to_idx = {noun: idx for idx, noun in enumerate(top_nouns)}
# Calculate co-occurrences within window_size
words = [token.text.lower() for token in doc]
for i in range(len(words)):
window_words = words[max(0, i-window_size):min(len(words), i+window_size)]
for noun1 in top_nouns:
if noun1 in window_words:
for noun2 in top_nouns:
if noun1 != noun2 and noun2 in window_words:
idx1, idx2 = noun_to_idx[noun1], noun_to_idx[noun2]
cooc_matrix[idx1][idx2] += 1
cooc_matrix[idx2][idx1] += 1
# Add nodes and weighted edges
for noun in top_nouns:
G.add_node(noun, size=noun_freq[noun])
        # Add edges with weights based on co-occurrence
        max_weight = cooc_matrix.max() if cooc_matrix.max() > 0 else 1  # guard against division by zero
        for i in range(len(top_nouns)):
            for j in range(i+1, len(top_nouns)):
                weight = cooc_matrix[i][j]
                if weight > 0:
                    G.add_edge(top_nouns[i], top_nouns[j],
                               weight=weight,
                               width=3 * (weight/max_weight))
        # Calculate a reproducible layout (fixed seed) with weighted edges
        pos = nx.spring_layout(G, k=1, iterations=50, seed=42)
# Store positions and attributes in graph
for node in G.nodes():
G.nodes[node]['pos'] = pos[node]
return G
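# Streamlit reruns the whole script on every interaction, so cache the heavyweight
# model bundle and load the weights only once; this sketch assumes a Streamlit
# version (>= 1.18) that provides st.cache_resource.
@st.cache_resource
def load_analyzer():
    """Build and cache a single SpeechAnalyzer instance across reruns."""
    return SpeechAnalyzer()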
def main():
st.title("🗣️ Advanced Political Speech Analysis Toolkit")
    # Initialize analyzer (cached across Streamlit reruns)
    analyzer = load_analyzer()
# File upload
uploaded_file = st.file_uploader("Upload Political Speech", type=['txt', 'docx', 'pdf'])
if uploaded_file is not None:
        # Extract raw text according to the uploaded file type
        if uploaded_file.name.endswith('.txt'):
            text = uploaded_file.getvalue().decode('utf-8')
        elif uploaded_file.name.endswith('.docx'):
            import docx
            doc = docx.Document(uploaded_file)
            text = '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        elif uploaded_file.name.endswith('.pdf'):
            import PyPDF2
            pdf_reader = PyPDF2.PdfReader(uploaded_file)
            # extract_text() can return None for image-only pages
            text = ' '.join(page.extract_text() or '' for page in pdf_reader.pages)
# Create tabs for different analyses
tab1, tab2, tab3, tab4, tab5 = st.tabs([
"Moral Foundations",
"Emotional Analysis",
"Linguistic Insights",
"Semantic Network",
"Advanced NLP"
])
with tab1:
st.subheader("Moral Foundations Analysis")
moral_scores = analyzer.analyze_moral_foundations(text)
# Plotly bar chart
moral_df = pd.DataFrame.from_dict(moral_scores, orient='index', columns=['Score'])
moral_df.index.name = 'Moral Foundation'
moral_df = moral_df.reset_index()
fig = px.bar(
moral_df,
x='Moral Foundation',
y='Score',
title='Moral Foundations Breakdown',
color='Moral Foundation'
)
st.plotly_chart(fig)
# Detailed insights
for foundation, score in moral_scores.items():
st.write(f"**{MORAL_FOUNDATIONS[foundation]}**: {score:.2%}")
with tab2:
st.subheader("Emotional Trajectory")
emotional_trajectory = analyzer.analyze_emotional_trajectory(text)
# Scale values to a -1 to 1 range
scaled_trajectory = np.array(emotional_trajectory)
scaled_trajectory = np.clip(scaled_trajectory, -1, 1)
# Create segment labels for x-axis
num_segments = len(scaled_trajectory)
segment_labels = [f"Segment {i+1}" for i in range(num_segments)]
trajectory_fig = go.Figure(data=go.Scatter(
x=segment_labels,
y=scaled_trajectory,
mode='lines+markers',
name='Emotional Intensity',
line=dict(
color='#1f77b4',
width=3
),
marker=dict(
size=8,
color='#1f77b4'
)
))
trajectory_fig.update_layout(
title='Speech Emotional Flow',
xaxis_title='Speech Progression',
yaxis_title='Sentiment',
yaxis=dict(
ticktext=['Very Negative', 'Neutral', 'Very Positive'],
tickvals=[-1, 0, 1],
range=[-1, 1]
),
hovermode='x unified',
showlegend=False
)
st.plotly_chart(trajectory_fig)
with tab3:
st.subheader("Linguistic Complexity")
readability = analyzer.calculate_readability(text)
col1, col2 = st.columns(2)
with col1:
st.metric("Flesch Reading Ease", f"{readability['Flesch Reading Ease']:.2f}")
with col2:
st.metric("Flesch-Kincaid Grade Level", f"{readability['Flesch-Kincaid Grade Level']:.2f}")
# Key Phrases
st.subheader("Key Phrases")
key_phrases = analyzer.extract_key_phrases(text)
st.write(", ".join(key_phrases))
with tab4:
st.subheader("Semantic Network")
semantic_graph = analyzer.create_semantic_network(text)
network_fig = go.Figure()
            # Add edges with enhanced visual encoding; compute the maximum
            # once, outside the loop, and guard the no-edge case
            edge_weights = [d['weight'] for _, _, d in semantic_graph.edges(data=True)]
            max_weight = max(edge_weights) if edge_weights else 1
            for edge in semantic_graph.edges():
                x0, y0 = semantic_graph.nodes[edge[0]]['pos']
                x1, y1 = semantic_graph.nodes[edge[1]]['pos']
                weight = semantic_graph.edges[edge]['weight']
                # Normalize weight for visual encoding
                normalized_weight = weight / max_weight
                # Enhanced width scaling (more pronounced differences)
                width = 2 + (normalized_weight * 8)
                # Color gradient from light to dark based on weight
                color = f'rgba(31, 119, 180, {0.3 + normalized_weight * 0.7})'
network_fig.add_trace(go.Scatter(
x=[x0, x1, None],
y=[y0, y1, None],
mode='lines',
line=dict(
width=width,
color=color
),
hoverinfo='text',
hovertext=f'Relationship strength: {weight:.2f}'
))
# Enhanced nodes with better visibility
for node in semantic_graph.nodes():
x, y = semantic_graph.nodes[node]['pos']
size = semantic_graph.nodes[node]['size']
network_fig.add_trace(go.Scatter(
x=[x],
y=[y],
mode='markers+text',
marker=dict(
size=15 + size/2, # Increased base size
color='#ffffff',
line=dict(width=2, color='#1f77b4'),
symbol='circle'
),
text=[node],
textposition="top center",
textfont=dict(size=12, color='black'),
hoverinfo='text',
hovertext=f'Term: {node}<br>Frequency: {size}'
))
network_fig.update_layout(
showlegend=False,
hovermode='closest',
margin=dict(b=20, l=20, r=20, t=20),
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
plot_bgcolor='white',
width=800,
height=600
)
st.plotly_chart(network_fig, use_container_width=True)
with tab5:
st.subheader("Advanced NLP Analysis")
# Named Entities with clear explanations
st.write("### Key People, Organizations, and Places")
named_entities = analyzer.detect_named_entities(text)
# Create intuitive mapping of entity types
entity_type_mapping = {
'PER': 'Person',
'ORG': 'Organization',
'LOC': 'Location',
'GPE': 'Country/City',
'MISC': 'Miscellaneous'
}
            # Transform the entities into a dataframe; unmapped types fall back to 'Other'
            entities_df = pd.DataFrame(named_entities)
            entities_df['entity_type'] = entities_df['entity_group'].map(entity_type_mapping).fillna('Other')
            entities_df['confidence'] = entities_df['score'].apply(lambda x: f"{x*100:.1f}%")
# Display enhanced table
display_df = entities_df[['word', 'entity_type', 'confidence']].rename(columns={
'word': 'Name/Term',
'entity_type': 'Type',
'confidence': 'Confidence Level'
})
st.dataframe(
display_df,
column_config={
"Name/Term": st.column_config.TextColumn(
help="The identified name or term from the text"
),
"Type": st.column_config.TextColumn(
help="Category of the identified term"
),
"Confidence Level": st.column_config.TextColumn(
help="How certain the AI is about this identification"
)
},
hide_index=True
)
# Enhanced Rhetorical Devices section
st.write("### Persuasive Language Techniques")
rhetorical_devices = analyzer.detect_rhetorical_devices(text)
            # Lay the metrics out across two alternating columns
            cols = st.columns(2)
            # Define friendly names and descriptions
            device_explanations = {
                'analogy': 'Comparisons (using "like" or "as")',
                'repetition': 'Repeated phrases for emphasis',
                'metaphor': 'Symbolic comparisons',
                'hyperbole': 'Dramatic exaggerations',
                'rhetorical_question': 'Questions asked for effect'
            }
            for idx, (device, count) in enumerate(rhetorical_devices.items()):
                with cols[idx % 2]:
                    st.metric(
                        label=device_explanations[device],
                        value=f"{count} times"
                    )
if __name__ == "__main__":
main()