kambris committed
Commit e4d0f85 · verified · 1 Parent(s): 98644f4

Update app.py

Files changed (1):
  app.py +3 -48
app.py CHANGED
@@ -15,7 +15,6 @@ import spacy
 
 st.set_page_config(page_title="Advanced Political Speech Analysis", page_icon="🗣️", layout="wide")
 
-# Advanced NLP Libraries
 from transformers import (
     AutoTokenizer,
     AutoModelForSequenceClassification,
@@ -29,13 +28,11 @@ from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from textstat import flesch_reading_ease, flesch_kincaid_grade
 
-# Download necessary NLTK resources
 nltk.download('punkt', quiet=True)
 nltk.download('averaged_perceptron_tagger', quiet=True)
 nltk.download('stopwords', quiet=True)
 nltk.download('punkt_tab', quiet=True)
 
-# Load spaCy model (requires separate installation)
 try:
     nlp = spacy.load('en_core_web_lg')
 except:
@@ -43,7 +40,6 @@ except:
             "pip install spacy\n"
             "python -m spacy download en_core_web_lg")
 
-# Constants and Configurations
 MORAL_FOUNDATIONS = {
     'care': 'Care/Harm',
     'fairness': 'Fairness/Cheating',
@@ -62,21 +58,17 @@ RHETORICAL_DEVICES = {
 
 class SpeechAnalyzer:
     def __init__(self):
-        # Load MoralFoundations model
         self.moral_model_path = "MMADS/MoralFoundationsClassifier"
         self.moral_tokenizer = RobertaTokenizer.from_pretrained(self.moral_model_path)
         self.moral_model = RobertaForSequenceClassification.from_pretrained(self.moral_model_path)
 
-        # Define label names directly
         self.label_names = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']
 
-        # Other pipelines remain the same
         self.sentiment_pipeline = pipeline("sentiment-analysis")
         self.ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
         self.ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
         self.ner_pipeline = pipeline("ner", model=self.ner_model, tokenizer=self.ner_tokenizer)
 
-        # Add emotion classifier
         self.emotion_classifier = pipeline("text-classification",
                                            model="j-hartmann/emotion-english-distilroberta-base")
 
@@ -91,7 +83,6 @@ class SpeechAnalyzer:
         for word in words:
             if current_length + len(word.split()) > max_length:
                 segments.append(' '.join(current_segment))
-                # Use the overlap parameter from the method arguments
                 current_segment = current_segment[-overlap:] + [word]
                 current_length = len(' '.join(current_segment).split())
             else:
@@ -125,7 +116,6 @@ class SpeechAnalyzer:
                 if foundation in foundation_scores:
                     foundation_scores[foundation].append(probabilities[0][idx].item())
 
-        # Average the scores across segments
         aggregated_scores = {
             foundation: np.mean(scores) for foundation, scores in foundation_scores.items()
         }
@@ -139,7 +129,6 @@ class SpeechAnalyzer:
         basic_emotions = []
 
         for segment in segments:
-            # Get sentiment scores with truncation
             sentiment_result = self.sentiment_pipeline(segment, truncation=True, max_length=512)
             score = sentiment_result[0]['score']
             if sentiment_result[0]['label'] == 'POSITIVE':
@@ -148,7 +137,6 @@ class SpeechAnalyzer:
                 score = 0.5 - (score * 0.5)
             sentiment_scores.append(score)
 
-            # Get emotion classification with truncation
             emotion_result = self.emotion_classifier(segment, truncation=True, max_length=512)
             emotion = emotion_result[0]['label']
             basic_emotions.append(emotion)
@@ -167,7 +155,6 @@ class SpeechAnalyzer:
         tfidf_matrix = vectorizer.fit_transform([text])
         feature_names = vectorizer.get_feature_names_out()
 
-        # Get top phrases by TF-IDF score
         sorted_idx = tfidf_matrix.toarray()[0].argsort()[::-1]
         top_phrases = [feature_names[i] for i in sorted_idx[:top_n]]
 
@@ -191,29 +178,23 @@ class SpeechAnalyzer:
 
     def create_semantic_network(self, text, top_n=20, window_size=10, chunk_size=10000):
         """Create semantic network graph with weighted edges"""
-        # Process text in chunks
         chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
 
-        # Initialize collections for aggregating results
         all_nouns = []
         noun_freq = nltk.FreqDist()
 
-        # Process each chunk
         for chunk in chunks:
             doc = nlp(chunk)
             chunk_nouns = [token.text.lower() for token in doc if token.pos_ == 'NOUN']
             all_nouns.extend(chunk_nouns)
             noun_freq.update(chunk_nouns)
 
-        # Get top nouns across all chunks
         top_nouns = [noun for noun, freq in noun_freq.most_common(top_n)]
 
-        # Create graph and co-occurrence matrix
         G = nx.Graph()
         cooc_matrix = np.zeros((len(top_nouns), len(top_nouns)))
         noun_to_idx = {noun: idx for idx, noun in enumerate(top_nouns)}
 
-        # Process co-occurrences in chunks
         for chunk in chunks:
             doc = nlp(chunk)
             words = [token.text.lower() for token in doc]
@@ -228,11 +209,9 @@ class SpeechAnalyzer:
                         cooc_matrix[idx1][idx2] += 1
                         cooc_matrix[idx2][idx1] += 1
 
-        # Build network
         for noun in top_nouns:
             G.add_node(noun, size=noun_freq[noun])
 
-        # Add weighted edges
         max_weight = np.max(cooc_matrix)
         if max_weight > 0:  # Prevent division by zero
             for i in range(len(top_nouns)):
@@ -243,7 +222,6 @@ class SpeechAnalyzer:
                                weight=weight,
                                width=3 * (weight/max_weight))
 
-        # Calculate and store layout
         pos = nx.spring_layout(G, k=1, iterations=50)
         for node in G.nodes():
             G.nodes[node]['pos'] = pos[node]
@@ -269,14 +247,11 @@ def process_all_analyses(text, _analyzer):
 def main():
     st.title("🗣️ Political Text Analysis Toolkit")
 
-    # Initialize analyzer
     analyzer = SpeechAnalyzer()
 
-    # File upload
     uploaded_file = st.file_uploader("Upload Political Speech", type=['txt', 'docx', 'pdf'])
 
     if uploaded_file is not None:
-        # Read file (similar to previous implementation)
         if uploaded_file.name.endswith('.txt'):
             text = uploaded_file.getvalue().decode('utf-8')
         elif uploaded_file.name.endswith('.docx'):
@@ -288,7 +263,6 @@ def main():
             pdf_reader = PyPDF2.PdfReader(uploaded_file)
             text = ' '.join([page.extract_text() for page in pdf_reader.pages])
 
-        # Create tabs for different analyses
         progress_bar = st.progress(0)
         status_text = st.empty()
         tab1, tab2, tab3, tab4, tab5 = st.tabs([
@@ -305,7 +279,6 @@ def main():
             st.subheader("Moral Foundations Analysis")
             moral_scores = analyzer.analyze_moral_foundations(text)
 
-            # Plotly bar chart
             moral_df = pd.DataFrame.from_dict(moral_scores, orient='index', columns=['Score'])
             moral_df.index.name = 'Moral Foundation'
             moral_df = moral_df.reset_index()
@@ -319,7 +292,6 @@ def main():
             )
             st.plotly_chart(fig)
 
-            # Detailed insights
             for foundation, score in moral_scores.items():
                 st.write(f"**{MORAL_FOUNDATIONS[foundation]}**: {score:.2%}")
 
@@ -328,13 +300,10 @@ def main():
             progress_bar.progress(40)
             st.subheader("Speech Trajectory Analysis")
 
-            # Get cached data
             segments, segment_labels, sentiment_scores, basic_emotions, moral_trajectories = process_all_analyses(text, analyzer)
 
-            # Create unified figure
             unified_fig = go.Figure()
 
-            # Add traces for each analysis type
             viz_options = st.multiselect(
                 "Select analyses to display:",
                 ["Sentiment Flow", "Moral Foundations Flow", "Basic Emotions Flow"],
@@ -371,7 +340,6 @@ def main():
                     'Emotion': basic_emotions
                 })
 
-                # Create color mapping for emotions
                 emotion_colors = {
                     'joy': '#FFD700',  # Gold
                     'sadness': '#4169E1',  # Royal Blue
@@ -383,11 +351,11 @@
 
                 unified_fig.add_trace(go.Bar(
                     x=segment_labels,
-                    y=[1] * len(basic_emotions),  # Full height bars
-                    name=f'Emotions Found: {", ".join(sorted(set(basic_emotions)))}',  # Shows all unique emotions
+                    y=[1] * len(basic_emotions),
+                    name=f'Emotions Found: {", ".join(sorted(set(basic_emotions)))}',
                     marker=dict(
                         color=[emotion_colors.get(e.lower(), '#808080') for e in basic_emotions],
-                        line=dict(width=1, color='#000000')  # Adds border for better visibility
+                        line=dict(width=1, color='#000000')
                     ),
                     opacity=0.8,
                     hovertemplate="Segment %{x}<br>Emotion: %{text}<extra></extra>",
@@ -403,7 +371,6 @@ def main():
             st.subheader("Linguistic Analysis")
             readability = analyzer.calculate_readability(text)
 
-            # Readability metrics with context
             col1, col2 = st.columns(2)
             with col1:
                 score = readability['Flesch Reading Ease']
@@ -424,11 +391,9 @@ def main():
                     delta_color="normal"
                 )
 
-            # Enhanced key phrases display
             st.subheader("Key Topics and Themes")
             key_phrases = analyzer.extract_key_phrases(text)
 
-            # Create columns for better phrase organization
             cols = st.columns(3)
             for idx, phrase in enumerate(key_phrases):
                 col_idx = idx % 3
@@ -452,20 +417,16 @@ def main():
 
             network_fig = go.Figure()
 
-            # Add edges with enhanced visual encoding
             for edge in semantic_graph.edges():
                 x0, y0 = semantic_graph.nodes[edge[0]]['pos']
                 x1, y1 = semantic_graph.nodes[edge[1]]['pos']
                 weight = semantic_graph.edges[edge]['weight']
                 max_weight = max(d['weight'] for _, _, d in semantic_graph.edges(data=True))
 
-                # Normalize weight for visual encoding
                 normalized_weight = weight / max_weight
 
-                # Enhanced width scaling (more pronounced differences)
                 width = 2 + (normalized_weight * 8)
 
-                # Color gradient from light to dark based on weight
                 color = f'rgba(31, 119, 180, {0.3 + normalized_weight * 0.7})'
 
                 network_fig.add_trace(go.Scatter(
@@ -480,7 +441,6 @@ def main():
                     hovertext=f'Relationship strength: {weight:.2f}'
                 ))
 
-            # Enhanced nodes with better visibility
             for node in semantic_graph.nodes():
                 x, y = semantic_graph.nodes[node]['pos']
                 size = semantic_graph.nodes[node]['size']
@@ -521,10 +481,8 @@ def main():
             st.subheader("Named Entity Recognition")
             named_entities = analyzer.detect_named_entities(text)
 
-            # Process entities
             entities_df = pd.DataFrame(named_entities)
 
-            # Map entity types to friendly names
             type_mapping = {
                 'B-PER': 'Person',
                 'I-PER': 'Person',
@@ -536,20 +494,17 @@ def main():
                 'I-MISC': 'Other'
             }
 
-            # Clean and transform the data
             display_df = pd.DataFrame({
                 'Term': entities_df['word'],
                 'Category': entities_df['entity'].map(type_mapping),
                 'Confidence': entities_df['score'].apply(lambda x: f"{x*100:.1f}%")
             })
 
-            # Group similar entities
             grouped_df = display_df.groupby('Category').agg({
                 'Term': lambda x: ', '.join(set(x)),
                 'Confidence': 'count'
             }).reset_index()
 
-            # Display results in an organized way
             for category in grouped_df['Category'].unique():
                 category_data = grouped_df[grouped_df['Category'] == category]
                 st.write(f"### {category}")
 