kambris committed
Commit e4d0f85 · verified · 1 Parent(s): 98644f4

Update app.py

Files changed (1):
  app.py +3 -48
app.py CHANGED
@@ -15,7 +15,6 @@ import spacy
 
 st.set_page_config(page_title="Advanced Political Speech Analysis", page_icon="🗣️", layout="wide")
 
-# Advanced NLP Libraries
 from transformers import (
     AutoTokenizer,
     AutoModelForSequenceClassification,
@@ -29,13 +28,11 @@ from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from textstat import flesch_reading_ease, flesch_kincaid_grade
 
-# Download necessary NLTK resources
 nltk.download('punkt', quiet=True)
 nltk.download('averaged_perceptron_tagger', quiet=True)
 nltk.download('stopwords', quiet=True)
 nltk.download('punkt_tab', quiet=True)
 
-# Load spaCy model (requires separate installation)
 try:
     nlp = spacy.load('en_core_web_lg')
 except:
@@ -43,7 +40,6 @@ except:
             "pip install spacy\n"
             "python -m spacy download en_core_web_lg")
 
-# Constants and Configurations
 MORAL_FOUNDATIONS = {
     'care': 'Care/Harm',
     'fairness': 'Fairness/Cheating',
@@ -62,21 +58,17 @@ RHETORICAL_DEVICES = {
 
 class SpeechAnalyzer:
     def __init__(self):
-        # Load MoralFoundations model
         self.moral_model_path = "MMADS/MoralFoundationsClassifier"
         self.moral_tokenizer = RobertaTokenizer.from_pretrained(self.moral_model_path)
         self.moral_model = RobertaForSequenceClassification.from_pretrained(self.moral_model_path)
 
-        # Define label names directly
         self.label_names = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']
 
-        # Other pipelines remain the same
         self.sentiment_pipeline = pipeline("sentiment-analysis")
         self.ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
         self.ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
         self.ner_pipeline = pipeline("ner", model=self.ner_model, tokenizer=self.ner_tokenizer)
 
-        # Add emotion classifier
         self.emotion_classifier = pipeline("text-classification",
                                            model="j-hartmann/emotion-english-distilroberta-base")
 
@@ -91,7 +83,6 @@ class SpeechAnalyzer:
         for word in words:
             if current_length + len(word.split()) > max_length:
                 segments.append(' '.join(current_segment))
-                # Use the overlap parameter from the method arguments
                 current_segment = current_segment[-overlap:] + [word]
                 current_length = len(' '.join(current_segment).split())
             else:
@@ -125,7 +116,6 @@ class SpeechAnalyzer:
                 if foundation in foundation_scores:
                     foundation_scores[foundation].append(probabilities[0][idx].item())
 
-        # Average the scores across segments
         aggregated_scores = {
             foundation: np.mean(scores) for foundation, scores in foundation_scores.items()
         }
@@ -139,7 +129,6 @@ class SpeechAnalyzer:
         basic_emotions = []
 
         for segment in segments:
-            # Get sentiment scores with truncation
             sentiment_result = self.sentiment_pipeline(segment, truncation=True, max_length=512)
             score = sentiment_result[0]['score']
             if sentiment_result[0]['label'] == 'POSITIVE':
@@ -148,7 +137,6 @@ class SpeechAnalyzer:
                 score = 0.5 - (score * 0.5)
             sentiment_scores.append(score)
 
-            # Get emotion classification with truncation
             emotion_result = self.emotion_classifier(segment, truncation=True, max_length=512)
             emotion = emotion_result[0]['label']
             basic_emotions.append(emotion)
@@ -167,7 +155,6 @@ class SpeechAnalyzer:
         tfidf_matrix = vectorizer.fit_transform([text])
         feature_names = vectorizer.get_feature_names_out()
 
-        # Get top phrases by TF-IDF score
         sorted_idx = tfidf_matrix.toarray()[0].argsort()[::-1]
         top_phrases = [feature_names[i] for i in sorted_idx[:top_n]]
 
@@ -191,29 +178,23 @@ class SpeechAnalyzer:
 
     def create_semantic_network(self, text, top_n=20, window_size=10, chunk_size=10000):
         """Create semantic network graph with weighted edges"""
-        # Process text in chunks
         chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
 
-        # Initialize collections for aggregating results
         all_nouns = []
         noun_freq = nltk.FreqDist()
 
-        # Process each chunk
         for chunk in chunks:
             doc = nlp(chunk)
             chunk_nouns = [token.text.lower() for token in doc if token.pos_ == 'NOUN']
             all_nouns.extend(chunk_nouns)
             noun_freq.update(chunk_nouns)
 
-        # Get top nouns across all chunks
         top_nouns = [noun for noun, freq in noun_freq.most_common(top_n)]
 
-        # Create graph and co-occurrence matrix
         G = nx.Graph()
         cooc_matrix = np.zeros((len(top_nouns), len(top_nouns)))
         noun_to_idx = {noun: idx for idx, noun in enumerate(top_nouns)}
 
-        # Process co-occurrences in chunks
         for chunk in chunks:
             doc = nlp(chunk)
             words = [token.text.lower() for token in doc]
@@ -228,11 +209,9 @@ class SpeechAnalyzer:
                         cooc_matrix[idx1][idx2] += 1
                         cooc_matrix[idx2][idx1] += 1
 
-        # Build network
         for noun in top_nouns:
             G.add_node(noun, size=noun_freq[noun])
 
-        # Add weighted edges
         max_weight = np.max(cooc_matrix)
         if max_weight > 0:  # Prevent division by zero
             for i in range(len(top_nouns)):
@@ -243,7 +222,6 @@ class SpeechAnalyzer:
                                weight=weight,
                                width=3 * (weight/max_weight))
 
-        # Calculate and store layout
         pos = nx.spring_layout(G, k=1, iterations=50)
         for node in G.nodes():
             G.nodes[node]['pos'] = pos[node]
@@ -269,14 +247,11 @@ def process_all_analyses(text, _analyzer):
 def main():
     st.title("🗣️ Political Text Analysis Toolkit")
 
-    # Initialize analyzer
     analyzer = SpeechAnalyzer()
 
-    # File upload
     uploaded_file = st.file_uploader("Upload Political Speech", type=['txt', 'docx', 'pdf'])
 
     if uploaded_file is not None:
-        # Read file (similar to previous implementation)
         if uploaded_file.name.endswith('.txt'):
             text = uploaded_file.getvalue().decode('utf-8')
         elif uploaded_file.name.endswith('.docx'):
@@ -288,7 +263,6 @@ def main():
             pdf_reader = PyPDF2.PdfReader(uploaded_file)
             text = ' '.join([page.extract_text() for page in pdf_reader.pages])
 
-        # Create tabs for different analyses
         progress_bar = st.progress(0)
         status_text = st.empty()
         tab1, tab2, tab3, tab4, tab5 = st.tabs([
@@ -305,7 +279,6 @@ def main():
             st.subheader("Moral Foundations Analysis")
             moral_scores = analyzer.analyze_moral_foundations(text)
 
-            # Plotly bar chart
             moral_df = pd.DataFrame.from_dict(moral_scores, orient='index', columns=['Score'])
             moral_df.index.name = 'Moral Foundation'
             moral_df = moral_df.reset_index()
@@ -319,7 +292,6 @@ def main():
             )
             st.plotly_chart(fig)
 
-            # Detailed insights
             for foundation, score in moral_scores.items():
                 st.write(f"**{MORAL_FOUNDATIONS[foundation]}**: {score:.2%}")
 
@@ -328,13 +300,10 @@ def main():
             progress_bar.progress(40)
             st.subheader("Speech Trajectory Analysis")
 
-            # Get cached data
             segments, segment_labels, sentiment_scores, basic_emotions, moral_trajectories = process_all_analyses(text, analyzer)
 
-            # Create unified figure
             unified_fig = go.Figure()
 
-            # Add traces for each analysis type
             viz_options = st.multiselect(
                 "Select analyses to display:",
                 ["Sentiment Flow", "Moral Foundations Flow", "Basic Emotions Flow"],
@@ -371,7 +340,6 @@ def main():
                     'Emotion': basic_emotions
                 })
 
-                # Create color mapping for emotions
                 emotion_colors = {
                     'joy': '#FFD700',  # Gold
                     'sadness': '#4169E1',  # Royal Blue
@@ -383,11 +351,11 @@
 
                 unified_fig.add_trace(go.Bar(
                     x=segment_labels,
-                    y=[1] * len(basic_emotions),  # Full height bars
-                    name=f'Emotions Found: {", ".join(sorted(set(basic_emotions)))}',  # Shows all unique emotions
+                    y=[1] * len(basic_emotions),
+                    name=f'Emotions Found: {", ".join(sorted(set(basic_emotions)))}',
                     marker=dict(
                         color=[emotion_colors.get(e.lower(), '#808080') for e in basic_emotions],
-                        line=dict(width=1, color='#000000')  # Adds border for better visibility
+                        line=dict(width=1, color='#000000')
                     ),
                     opacity=0.8,
                     hovertemplate="Segment %{x}<br>Emotion: %{text}<extra></extra>",
@@ -403,7 +371,6 @@ def main():
             st.subheader("Linguistic Analysis")
             readability = analyzer.calculate_readability(text)
 
-            # Readability metrics with context
             col1, col2 = st.columns(2)
             with col1:
                 score = readability['Flesch Reading Ease']
@@ -424,11 +391,9 @@ def main():
                     delta_color="normal"
                 )
 
-            # Enhanced key phrases display
             st.subheader("Key Topics and Themes")
             key_phrases = analyzer.extract_key_phrases(text)
 
-            # Create columns for better phrase organization
             cols = st.columns(3)
             for idx, phrase in enumerate(key_phrases):
                 col_idx = idx % 3
@@ -452,20 +417,16 @@ def main():
 
             network_fig = go.Figure()
 
-            # Add edges with enhanced visual encoding
             for edge in semantic_graph.edges():
                 x0, y0 = semantic_graph.nodes[edge[0]]['pos']
                 x1, y1 = semantic_graph.nodes[edge[1]]['pos']
                 weight = semantic_graph.edges[edge]['weight']
                 max_weight = max(d['weight'] for _, _, d in semantic_graph.edges(data=True))
 
-                # Normalize weight for visual encoding
                 normalized_weight = weight / max_weight
 
-                # Enhanced width scaling (more pronounced differences)
                 width = 2 + (normalized_weight * 8)
 
-                # Color gradient from light to dark based on weight
                 color = f'rgba(31, 119, 180, {0.3 + normalized_weight * 0.7})'
 
                 network_fig.add_trace(go.Scatter(
@@ -480,7 +441,6 @@ def main():
                     hovertext=f'Relationship strength: {weight:.2f}'
                 ))
 
-            # Enhanced nodes with better visibility
             for node in semantic_graph.nodes():
                 x, y = semantic_graph.nodes[node]['pos']
                 size = semantic_graph.nodes[node]['size']
@@ -521,10 +481,8 @@ def main():
             st.subheader("Named Entity Recognition")
             named_entities = analyzer.detect_named_entities(text)
 
-            # Process entities
             entities_df = pd.DataFrame(named_entities)
 
-            # Map entity types to friendly names
             type_mapping = {
                 'B-PER': 'Person',
                 'I-PER': 'Person',
@@ -536,20 +494,17 @@ def main():
                 'I-MISC': 'Other'
             }
 
-            # Clean and transform the data
             display_df = pd.DataFrame({
                 'Term': entities_df['word'],
                 'Category': entities_df['entity'].map(type_mapping),
                 'Confidence': entities_df['score'].apply(lambda x: f"{x*100:.1f}%")
             })
 
-            # Group similar entities
             grouped_df = display_df.groupby('Category').agg({
                 'Term': lambda x: ', '.join(set(x)),
                 'Confidence': 'count'
             }).reset_index()
 
-            # Display results in an organized way
             for category in grouped_df['Category'].unique():
                 category_data = grouped_df[grouped_df['Category'] == category]
                 st.write(f"### {category}")
 