kambris committed (verified)
Commit 2b884bf · 1 Parent(s): 7204906

Update app.py

Files changed (1):
  1. app.py +273 -562

app.py CHANGED
@@ -1,607 +1,318 @@
  import streamlit as st
  import pandas as pd
- from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
- from sklearn.feature_extraction.text import CountVectorizer
- from bertopic import BERTopic
- import torch
  import numpy as np
- from collections import Counter
- import os
- from wordcloud import WordCloud
  import matplotlib.pyplot as plt
- import pkg_resources
- import folium
- import country_converter as coco
- import time
- import gc
-
- def clear_memory():
-     if torch.cuda.is_available():
-         torch.cuda.empty_cache()
-     gc.collect()
-
- current_dir = os.path.dirname(os.path.abspath(__file__))
- font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
-
- ARABIC_STOP_WORDS = {
-     'في', 'من', 'إلى', 'على', 'علي', 'عن', 'مع', 'خلال', 'حتي', 'حتى', 'إذا',
-     'ثم', 'أو', 'و', 'ل', 'ب', 'ك', 'لل', 'ال', 'هذا',
-     'هذه', 'ذلك', 'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'هنا', 'نحن',
-     'انت', 'انتم', 'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل',
-     'بعض', 'غير', 'حول', 'عند', 'قد', 'لقد', 'لم', 'لن', 'لو',
-     'ما', 'ماذا', 'متى', 'كيف', 'اين', 'لماذا', 'الذي', 'التي',
-     'الذين', 'اللاتي', 'اللواتي', 'الان', 'بين', 'فوق', 'تحت',
-     'امام', 'خلف', 'حين', 'قبل', 'بعد', 'أن', 'له', 'كما', 'لها',
-     'منذ', 'نفس', 'حيث', 'هناك', 'جدا', 'ذات', 'ضمن', 'انه', 'لدى',
-     'عليه', 'مثل', 'أما', 'لدي', 'فيه', 'كلم', 'لكن', 'ايضا', 'لازم',
-     'يجب', 'صار', 'صارت', 'ضد', 'يا', 'لا', 'اما',
-     'بها', 'ان', 'به', 'الي', 'لما', 'انا', 'اليك', 'لي', 'لك', 'اذا', 'بلا', 'او', 'لديك', 'لديه', 'اني', 'كنت', 'ليس', 'ايها', 'قلت',
-
-     'وثم', 'وأو', 'ول', 'وب', 'وك', 'ولل', 'وال',
-     'وهذا', 'وهذه', 'وذلك', 'وتلك', 'وهؤلاء', 'وهم', 'وهن', 'وهو', 'وهي', 'ونحن',
-     'وانت', 'وانتم', 'وكان', 'وكانت', 'ويكون', 'وتكون', 'واي', 'وكل',
-     'وبعض', 'وغير', 'وحول', 'وعند', 'وقد', 'ولقد', 'ولم', 'ولن', 'ولو',
-     'وما', 'وماذا', 'ومتى', 'وكيف', 'واين', 'ولماذا', 'والذي', 'والتي',
-     'والذين', 'واللاتي', 'واللواتي', 'والان', 'وبين', 'وفوق', 'وهنا', 'وتحت',
-     'وامام', 'وخلف', 'وحين', 'وقبل', 'وبعد', 'وأن', 'وله', 'وكما', 'ولها',
-     'ومنذ', 'ونفس', 'وحيث', 'وهناك', 'وجدا', 'وذات', 'وضمن', 'وانه', 'ولدى',
-     'وعليه', 'ومثل', 'وأما', 'وفيه', 'وكلم', 'ولكن', 'وايضا', 'ولازم',
-     'ويجب', 'وصار', 'وصارت', 'وضد', 'ويا', 'ولا', 'واما',
-     'وبها', 'وان', 'وبه', 'والي', 'ولما', 'وانا', 'واليك', 'ولي', 'ولك', 'وقلت',
-
-     'وفي', 'ومن', 'وعلى', 'وعلي', 'وعن', 'ومع', 'وحتى', 'وإذا',
-     'وهذا', 'وهذه', 'وذلك', 'وتلك', 'وهو', 'وهي', 'ونحن',
-     'وكان', 'وكانت', 'وكل', 'وبعض', 'وحول', 'وعند', 'وقد',
-     'ولقد', 'ولم', 'ولن', 'وما', 'وكيف', 'واين', 'والذي',
-     'وبين', 'وقبل', 'وبعد', 'وله', 'ولها', 'وهناك', 'وانه',
-     'منه', 'الا', 'فيها', 'فلا', 'وكم', 'يكن', 'عليك', 'منها', 'فما', 'لهم', 'يكن', 'واني', 'هل', 'فهل', 'بي', 'نحو', 'كي', 'سوف', 'كنا', 'لنا', 'معا', 'كلما', 'وإذا', 'منه', 'عنه', 'إذ', 'كم', 'بل', 'فيها', 'هكذا', 'لهم', 'ولدى', 'وعليه', 'ومثل',
-
-     'واحد', 'اثنان', 'ثلاثة', 'أربعة', 'خمسة', 'ستة', 'سبعة',
-     'ثمانية', 'تسعة', 'عشرة',
-
-     'الأول', 'الثاني', 'الثالث', 'الرابع', 'الخامس', 'السادس',
-     'السابع', 'الثامن', 'التاسع', 'العاشر'
  }

- COUNTRY_MAPPING = {
-     'مصر': 'Egypt',
-     'السعودية': 'Saudi Arabia',
-     'الإمارات': 'UAE',
-     'الكويت': 'Kuwait',
-     'العراق': 'Iraq',
-     'سوريا': 'Syria',
-     'لبنان': 'Lebanon',
-     'الأردن': 'Jordan',
-     'فلسطين': 'Palestine',
-     'اليمن': 'Yemen',
-     'عمان': 'Oman',
-     'قطر': 'Qatar',
-     'البحرين': 'Bahrain',
-     'السودان': 'Sudan',
-     'ليبيا': 'Libya',
-     'تونس': 'Tunisia',
-     'الجزائر': 'Algeria',
-     'المغرب': 'Morocco',
-     'موريتانيا': 'Mauritania'
- }
-
- st.set_page_config(
-     page_title="Contemporary Arabic Poetry Analysis",
-     page_icon="📚",
-     layout="wide"
- )
-
- @st.cache_resource
- def load_models():
-     """Load and cache the models to prevent reloading"""
-     tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
-     bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
-     emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
-     emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
-     emotion_classifier = pipeline(
-         "sentiment-analysis",
-         model=emotion_model,
-         tokenizer=emotion_tokenizer,
-         return_all_scores=True
-     )
-     return tokenizer, bert_model, emotion_classifier
-
- def split_text(text, max_length=512):
-     """Split text into chunks of maximum token length while preserving word boundaries."""
-     words = text.split()
-     chunks = []
-     current_chunk = []
-     current_length = 0
-
-     for word in words:
-         word_length = len(word.split())
-         if current_length + word_length > max_length:
-             if current_chunk:
-                 chunks.append(' '.join(current_chunk))
-             current_chunk = [word]
-             current_length = word_length
-         else:
-             current_chunk.append(word)
-             current_length += word_length
-
-     if current_chunk:
-         chunks.append(' '.join(current_chunk))
-
-     return chunks
-
- def get_country_coordinates():
-     """Returns dictionary of Arab country coordinates"""
-     return {
-         'Egypt': [26.8206, 30.8025],
-         'Saudi Arabia': [23.8859, 45.0792],
-         'UAE': [23.4241, 53.8478],
-         'Kuwait': [29.3117, 47.4818],
-         'Iraq': [33.2232, 43.6793],
-         'Syria': [34.8021, 38.9968],
-         'Lebanon': [33.8547, 35.8623],
-         'Jordan': [30.5852, 36.2384],
-         'Palestine': [31.9522, 35.2332],
-         'Yemen': [15.5527, 48.5164],
-         'Oman': [21.4735, 55.9754],
-         'Qatar': [25.3548, 51.1839],
-         'Bahrain': [26.0667, 50.5577],
-         'Sudan': [12.8628, 30.2176],
-         'Libya': [26.3351, 17.2283],
-         'Tunisia': [33.8869, 9.5375],
-         'Algeria': [28.0339, 1.6596],
-         'Morocco': [31.7917, -7.0926],
-         'Mauritania': [21.0079, -10.9408]
-     }
-
- def create_topic_map(summaries):
-     # Debug print to check incoming data
-     print("DEBUG - First summary emotions:", summaries[0]['top_emotions'])
-
-     coordinates = get_country_coordinates()
-     m = folium.Map(location=[27.0, 42.0], zoom_start=5)
-
-     sentiment_colors = {
-         'LABEL_1': 'green',  # Positive
-         'LABEL_0': 'red',    # Negative
-         'LABEL_2': 'blue'    # Neutral
-     }
-
-     for summary in summaries:
-         country_en = COUNTRY_MAPPING.get(summary['country'])
-         if country_en and country_en in coordinates:
-             REVERSE_EMOTION_LABELS = {
-                 'positive': 'LABEL_1',
-                 'negative': 'LABEL_0',
-                 'neutral': 'LABEL_2'
-             }
-
-             dominant_emotion = summary['top_emotions'][0]['emotion'] if summary['top_emotions'] else "neutral"
-             dominant_label = REVERSE_EMOTION_LABELS.get(dominant_emotion, 'LABEL_2')
-             circle_color = sentiment_colors.get(dominant_label, 'gray')
-
-             # Debug print
-             print(f"DEBUG - Country: {country_en}, Emotion: {dominant_emotion}, Label: {dominant_label}, Color: {circle_color}")
-
-             popup_content = f"""
-             <b>{country_en}</b><br>
-             <b>Sentiment Distribution:</b><br>
-             {'<br>'.join(f"• {e['emotion']}: {e['count']}" for e in summary['top_emotions'][:3])}<br>
-             <b>Top Topic:</b><br>
-             {summary['top_topics'][0]['topic'] if summary['top_topics'] else 'No topics'}<br>
-             Total Poems: {summary['total_poems']}
-             """
-
-             folium.CircleMarker(
-                 location=coordinates[country_en],
-                 radius=10,
-                 popup=folium.Popup(popup_content, max_width=300),
-                 color=circle_color,
-                 fill=True
-             ).add_to(m)
-
-     legend_html = """
-     <div style="position: fixed; bottom: 50px; left: 50px; z-index: 1000; background-color: white; padding: 10px; border: 2px solid grey; border-radius: 5px">
-         <p><b>Sentiment:</b></p>
-         <p><span style="color: green;">●</span> Positive</p>
-         <p><span style="color: red;">●</span> Negative</p>
-         <p><span style="color: blue;">●</span> Neutral</p>
-     </div>
-     """
-     m.get_root().html.add_child(folium.Element(legend_html))
-
-     return m
-
- def create_arabic_wordcloud(text, title):
-     wordcloud = WordCloud(
-         width=1200,
-         height=600,
-         background_color='white',
-         font_path=font_path,
-         max_words=200,
-         stopwords=ARABIC_STOP_WORDS
-     ).generate(text)
-
-     fig, ax = plt.subplots(figsize=(15, 8))
-     ax.imshow(wordcloud, interpolation='bilinear')
-     ax.axis('off')
-     ax.set_title(title, fontsize=16, pad=20)
-     return fig
-
- def clean_arabic_text(text):
-     """Clean Arabic text by removing stop words and normalizing."""
-     words = text.split()
-     cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
-     return ' '.join(cleaned_words)
-
- def classify_emotion(text, classifier):
-     """Classify emotion for complete text with proper token handling."""
-     try:
-         words = text.split()
-         chunks = []
-         current_chunk = []
-         current_length = 0
-
-         for word in words:
-             word_tokens = len(classifier.tokenizer.encode(word))
-             if current_length + word_tokens > 512:
-                 if current_chunk:
-                     chunks.append(' '.join(current_chunk))
-                 current_chunk = [word]
-                 current_length = word_tokens
-             else:
-                 current_chunk.append(word)
-                 current_length += word_tokens
-
-         if current_chunk:
-             chunks.append(' '.join(current_chunk))
-
-         if not chunks:
-             chunks = [text]
-
-         all_scores = []
-         for chunk in chunks:
-             try:
-                 inputs = classifier.tokenizer(
-                     chunk,
-                     truncation=True,
-                     max_length=512,
-                     return_tensors="pt"
-                 )
-                 result = classifier(chunk, truncation=True, max_length=512)
-                 scores = result[0]
-                 all_scores.append(scores)
-             except Exception as chunk_error:
-                 st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
-                 continue
-
-         if all_scores:
-             label_scores = {}
-             count = len(all_scores)
-
-             for scores in all_scores:
-                 for score in scores:
-                     label = score['label']
-                     if label not in label_scores:
-                         label_scores[label] = 0
-                     label_scores[label] += score['score']
-
-             avg_scores = {label: score/count for label, score in label_scores.items()}
-             final_emotion = max(avg_scores.items(), key=lambda x: x[1])[0]
-             return final_emotion
-
-         return "LABEL_2"
-
-     except Exception as e:
-         st.warning(f"Error in emotion classification: {str(e)}")
-         return "LABEL_2"
-
- def get_embedding_for_text(text, tokenizer, model):
-     """Get embedding for complete text."""
-     chunks = split_text(text)
-     chunk_embeddings = []
-
-     for chunk in chunks:
-         try:
-             inputs = tokenizer(
-                 chunk,
-                 return_tensors="pt",
-                 padding=True,
-                 truncation=True,
-                 max_length=512
-             )
-             inputs = {k: v.to(model.device) for k, v in inputs.items()}
-
-             with torch.no_grad():
-                 outputs = model(**inputs)
-
-             embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
-             chunk_embeddings.append(embedding[0])
-         except Exception as e:
-             st.warning(f"Error processing chunk: {str(e)}")
-             continue
-
-     if chunk_embeddings:
-         weights = np.array([len(chunk.split()) for chunk in chunks])
-         weights = weights / weights.sum()
-         weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
-         return weighted_embedding
-     return np.zeros(model.config.hidden_size)
-
- def format_topics(topic_model, topic_counts):
-     """Format topics for display."""
-     formatted_topics = []
-     for topic_num, count in topic_counts:
-         if topic_num == -1:
-             topic_label = "Miscellaneous"
-         else:
-             words = topic_model.get_topic(topic_num)
-             topic_label = " | ".join([word for word, _ in words[:5]])
-
-         formatted_topics.append({
-             'topic': topic_label,
-             'count': count
-         })
-     return formatted_topics
-
- def format_emotions(emotion_counts):
-     """Format emotions for display."""
-     EMOTION_LABELS = {
-         'LABEL_0': 'Negative',
-         'LABEL_1': 'Positive',
-         'LABEL_2': 'Neutral'
-     }
-
-     formatted_emotions = []
-     for label, count in emotion_counts:
-         emotion = EMOTION_LABELS.get(label, label)
-         formatted_emotions.append({
-             'emotion': emotion,
-             'count': count
-         })
-     return formatted_emotions
-
- def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
-     """Process the data and generate summaries with flexible topic configuration."""
-     summaries = []
-
-     topic_model_params = {
-         "language": "arabic",
-         "calculate_probabilities": True,
-         "min_topic_size": 3,
-         "n_gram_range": (1, 1),
-         "top_n_words": 15,
-         "verbose": True,
-     }
-     st.write(f"Total documents: {len(df)}")
-     st.write(f"Topic strategy: {topic_strategy}")
-     st.write(f"Min topic size: {min_topic_size}")
-
-     if topic_strategy == "Manual":
-         topic_model_params["nr_topics"] = n_topics
-     else:
-         topic_model_params["nr_topics"] = "auto"
-
-     topic_model = BERTopic(
-         embedding_model=bert_model,
-         **topic_model_params)
-
-     vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
-                                  min_df=1,
-                                  max_df=1.0)
-     topic_model.vectorizer_model = vectorizer
-
-     for country, group in df.groupby('country'):
-         progress_text = f"Processing poems for {country}..."
-         progress_bar = st.progress(0, text=progress_text)
-
-         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
-         all_emotions = []
-
-         embeddings = []
-
-         clear_memory()
-
-         for i, text in enumerate(texts):
-             try:
-                 embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
-                 if embedding is not None and not np.isnan(embedding).any():
-                     embeddings.append(embedding)
-                 else:
-                     st.warning(f"Invalid embedding generated for text {i+1} in {country}")
-                     continue
-             except Exception as e:
-                 st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
-                 continue
-             if i % 10 == 0:
-                 clear_memory()
-
-             progress = (i + 1) / len(texts) * 0.4
-             progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
-
-         if len(embeddings) != len(texts):
-             texts = texts[:len(embeddings)]
-         embeddings = np.array(embeddings)
-
-         clear_memory()
-
-         for i, text in enumerate(texts):
-             emotion = classify_emotion(text, emotion_classifier)
-             all_emotions.append(emotion)
-             if i % 10 == 0:
-                 clear_memory()
-             progress = 0.4 + ((i + 1) / len(texts) * 0.3)
-             progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
-
-         try:
-             if len(texts) < min_topic_size:
-                 st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
-                 continue
-
-             topics, probs = topic_model.fit_transform(texts, embeddings)
-
-             topic_counts = Counter(topics)
-
-             top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
-             top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
-
-             summaries.append({
-                 'country': country,
-                 'total_poems': len(texts),
-                 'top_topics': top_topics,
-                 'top_emotions': top_emotions
-             })
-             progress_bar.progress(1.0, text="Processing complete!")
-
-         except Exception as e:
-             st.warning(f"Could not generate topics for {country}: {str(e)}")
-             continue
-
-     return summaries, topic_model
-
- try:
-     bert_tokenizer, bert_model, emotion_classifier = load_models()
-     st.success("Models loaded successfully!")
- except Exception as e:
-     st.error(f"Error loading models: {str(e)}")
-     st.stop()
-
- # Main app interface
- st.title("📚 Contemporary Arabic Poetry Analysis")
- st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
-
- uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
-
- if uploaded_file is not None:
-     try:
-         if uploaded_file.name.endswith('.csv'):
-             df = pd.read_csv(uploaded_file)
-         else:
-             df = pd.read_excel(uploaded_file)
-
-         required_columns = ['country', 'poem']
-         if not all(col in df.columns for col in required_columns):
-             st.error("File must contain 'country' and 'poem' columns.")
-             st.stop()
-
-         df['country'] = df['country'].str.strip()
-         df = df.dropna(subset=['country', 'poem'])
-         sampled_df = df.groupby('country').apply(lambda x: x.head(20)).reset_index(drop=True)
-
-         st.subheader("Topic Modeling Settings")
-         col1, col2 = st.columns(2)
-
-         with col1:
-             topic_strategy = st.radio(
-                 "Topic Number Strategy",
-                 ["Auto", "Manual"],
-                 help="Choose whether to let the model determine the optimal number of topics or set it manually"
-             )
-
-             if topic_strategy == "Manual":
-                 n_documents = len(df)
-                 max_topics = 500
-                 min_topics = 5
-                 default_topics = 20
-
-                 n_topics = st.slider(
-                     "Number of Topics",
-                     min_value=min_topics,
-                     max_value=max_topics,
-                     value=default_topics,
-                     help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
-                 )
-
-                 st.info(f"""
-                 💡 For your dataset of {n_documents:,} documents:
-                 - Available topic range: {min_topics}-{max_topics}
-                 - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
-                 """)
-
-         with col2:
-             top_n = st.number_input(
-                 "Number of top topics/emotions to display:",
-                 min_value=1,
-                 max_value=100,
-                 value=10
-             )
-
-         if st.button("Process Data"):
-             with st.spinner("Processing your data..."):
-                 summaries, topic_model = process_and_summarize(
-                     sampled_df,
-                     bert_tokenizer,
-                     bert_model,
-                     emotion_classifier,
-                     top_n=top_n,
-                     topic_strategy=topic_strategy,
-                     n_topics=n_topics if topic_strategy == "Manual" else None,
-                     min_topic_size=3
-                 )
-
-             if summaries:
-                 st.success("Analysis complete!")
-
-                 tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Topic Map"])
-
-                 with tab1:
-                     for summary in summaries:
-                         with st.expander(f"📍 {summary['country']} ({summary['total_poems']} poems)"):
-                             col1, col2 = st.columns(2)
-
-                             with col1:
-                                 st.subheader("Top Topics")
-                                 for topic in summary['top_topics']:
-                                     st.write(f"• {topic['topic']}: {topic['count']} poems")
-
-                             with col2:
-                                 st.subheader("Emotions")
-                                 for emotion in summary['top_emotions']:
-                                     st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
-
-                             st.subheader("Word Cloud Visualization")
-                             country_poems = df[df['country'] == summary['country']]['poem']
-                             combined_text = ' '.join(country_poems)
-                             wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
-                             st.pyplot(wordcloud_fig)
-
-                 with tab2:
-                     st.subheader("Global Topic Distribution")
-                     topic_info = topic_model.get_topic_info()
-                     for _, row in topic_info.iterrows():
-                         if row['Topic'] == -1:
-                             topic_name = "Miscellaneous"
-                         else:
-                             words = topic_model.get_topic(row['Topic'])
-                             topic_name = " | ".join([word for word, _ in words[:5]])
-                         st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
-
-                 with tab3:
-                     st.subheader("Topic and Sentiment Distribution Map")
-                     topic_map = create_topic_map(summaries)
-                     st.components.v1.html(topic_map._repr_html_(), height=600)
-
-     except Exception as e:
-         st.error(f"Error processing file: {str(e)}")
-
- else:
-     st.info("👆 Upload a file to get started!")
-
-     st.write("### Expected File Format:")
-     example_df = pd.DataFrame({
-         'country': ['Egypt', 'Palestine'],
-         'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
-     })
-     st.dataframe(example_df)
-
  import streamlit as st
  import pandas as pd
  import numpy as np
+ import torch
+ import networkx as nx
+ import plotly.express as px
+ import plotly.graph_objs as go
  import matplotlib.pyplot as plt
+ import seaborn as sns
+ from scipy.signal import savgol_filter
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ from wordcloud import WordCloud
+ import spacy
+
+ # Advanced NLP Libraries
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSequenceClassification,
+     pipeline,
+     AutoModelForTokenClassification
+ )
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ from textstat import flesch_reading_ease, flesch_kincaid_grade
+
+ # Download necessary NLTK resources
+ nltk.download('punkt', quiet=True)
+ nltk.download('stopwords', quiet=True)
+
+ # Load spaCy model (requires separate installation)
+ try:
+     nlp = spacy.load('en_core_web_lg')
+ except:
+     st.error("Please install spaCy and en_core_web_lg model: \n"
+              "pip install spacy\n"
+              "python -m spacy download en_core_web_lg")
+
+ # Constants and Configurations
+ MORAL_FOUNDATIONS = {
+     'care': 'Care/Harm',
+     'fairness': 'Fairness/Cheating',
+     'loyalty': 'Loyalty/Betrayal',
+     'authority': 'Authority/Subversion',
+     'sanctity': 'Sanctity/Degradation'
+ }
+
+ RHETORICAL_DEVICES = {
+     'analogy': ['like', 'as', 'similar to'],
+     'repetition': ['repetitive', 'recurring'],
+     'metaphor': ['as if', 'like', 'represents'],
+     'hyperbole': ['always', 'never', 'absolute'],
+     'rhetorical_question': ['?']
  }

+ class SpeechAnalyzer:
+     def __init__(self):
+         # Load models
+         self.moralbert_tokenizer = AutoTokenizer.from_pretrained("minyoungchang/moralbert")
+         self.moralbert_model = AutoModelForSequenceClassification.from_pretrained("minyoungchang/moralbert")
+         self.sentiment_pipeline = pipeline("sentiment-analysis")
+
+         # Named Entity Recognition
+         self.ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
+         self.ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
+         self.ner_pipeline = pipeline("ner", model=self.ner_model, tokenizer=self.ner_tokenizer)
+
+     def split_text(self, text, max_length=512, overlap=50):
+         """Split long text into overlapping segments"""
+         words = text.split()
+         segments = []
+         current_segment = []
+         current_length = 0
+
+         for word in words:
+             if current_length + len(word.split()) > max_length:
+                 segments.append(' '.join(current_segment))
+                 current_segment = current_segment[-overlap:] + [word]
+                 current_length = len(' '.join(current_segment).split())
+             else:
+                 current_segment.append(word)
+                 current_length = len(' '.join(current_segment).split())
+
+         if current_segment:
+             segments.append(' '.join(current_segment))
+
+         return segments
+
+     def analyze_moral_foundations(self, text):
+         """Analyze moral foundations in text"""
+         segments = self.split_text(text)
+
+         foundation_scores = {
+             'care': [], 'fairness': [], 'loyalty': [],
+             'authority': [], 'sanctity': []
+         }
+
+         for segment in segments:
+             inputs = self.moralbert_tokenizer(segment, return_tensors="pt", truncation=True, max_length=512)
+
+             with torch.no_grad():
+                 outputs = self.moralbert_model(**inputs)
+
+             probabilities = torch.softmax(outputs.logits, dim=1)
+
+             for foundation in foundation_scores.keys():
+                 foundation_scores[foundation].append(probabilities[0][1].item())
+
+         aggregated_scores = {
+             foundation: np.mean(scores) for foundation, scores in foundation_scores.items()
+         }
+
+         return aggregated_scores
+
+     def analyze_emotional_trajectory(self, text, window_size=5):
+         """Perform emotional trajectory analysis"""
+         segments = self.split_text(text, max_length=256)
+
+         sentiment_scores = []
+         for segment in segments:
+             result = self.sentiment_pipeline(segment)[0]
+             score = 1 if result['label'] == 'POSITIVE' else -1
+             sentiment_scores.append(score)
+
+         smoothed_scores = (savgol_filter(sentiment_scores, window_length=window_size, polyorder=2)
+                            if len(sentiment_scores) > window_size else sentiment_scores)
+
+         return smoothed_scores
+
+     def detect_named_entities(self, text):
+         """Detect named entities in the text"""
+         entities = self.ner_pipeline(text)
+         return entities
+
+     def extract_key_phrases(self, text, top_n=10):
+         """Extract key phrases using TF-IDF"""
+         vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
+         tfidf_matrix = vectorizer.fit_transform([text])
+         feature_names = vectorizer.get_feature_names_out()
+
+         # Get top phrases by TF-IDF score
+         sorted_idx = tfidf_matrix.toarray()[0].argsort()[::-1]
+         top_phrases = [feature_names[i] for i in sorted_idx[:top_n]]
+
+         return top_phrases
+
+     def calculate_readability(self, text):
+         """Calculate readability metrics"""
+         return {
+             'Flesch Reading Ease': flesch_reading_ease(text),
+             'Flesch-Kincaid Grade Level': flesch_kincaid_grade(text)
+         }
+
+     def detect_rhetorical_devices(self, text):
+         """Detect rhetorical devices"""
+         devices_found = {}
+         for device, markers in RHETORICAL_DEVICES.items():
+             count = sum(text.lower().count(marker) for marker in markers)
+             if count > 0:
+                 devices_found[device] = count
+         return devices_found
+
+     def create_semantic_network(self, text, top_n=20):
+         """Create semantic network graph"""
+         # Use spaCy for advanced parsing
+         doc = nlp(text)
+
+         # Create graph
+         G = nx.Graph()
+
+         # Extract top nouns and their relationships
+         nouns = [token.text for token in doc if token.pos_ == 'NOUN']
+         noun_freq = nltk.FreqDist(nouns)
+         top_nouns = [noun for noun, _ in noun_freq.most_common(top_n)]
+
+         # Add nodes and edges
+         for noun in top_nouns:
+             G.add_node(noun)
+
+         # Connect related nouns
+         for i in range(len(top_nouns)):
+             for j in range(i+1, len(top_nouns)):
+                 if top_nouns[i] in text and top_nouns[j] in text:
+                     G.add_edge(top_nouns[i], top_nouns[j])
+
+         return G
+
+ def main():
+     st.set_page_config(page_title="Advanced Political Speech Analysis", page_icon="🗣️", layout="wide")
+     st.title("🗣️ Advanced Political Speech Analysis Toolkit")
+
+     # Initialize analyzer
+     analyzer = SpeechAnalyzer()
+
+     # File upload
+     uploaded_file = st.file_uploader("Upload Political Speech", type=['txt', 'docx', 'pdf'])
+
+     if uploaded_file is not None:
+         # Read file (similar to previous implementation)
+         if uploaded_file.name.endswith('.txt'):
+             text = uploaded_file.getvalue().decode('utf-8')
+         elif uploaded_file.name.endswith('.docx'):
+             import docx
+             doc = docx.Document(uploaded_file)
+             text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
+         elif uploaded_file.name.endswith('.pdf'):
+             import PyPDF2
+             pdf_reader = PyPDF2.PdfReader(uploaded_file)
+             text = ' '.join([page.extract_text() for page in pdf_reader.pages])
+
+         # Create tabs for different analyses
+         tab1, tab2, tab3, tab4, tab5 = st.tabs([
+             "Moral Foundations",
+             "Emotional Analysis",
+             "Linguistic Insights",
+             "Semantic Network",
+             "Advanced NLP"
+         ])
+
+         with tab1:
+             st.subheader("Moral Foundations Analysis")
+             moral_scores = analyzer.analyze_moral_foundations(text)
+
+             # Plotly bar chart
+             moral_df = pd.DataFrame.from_dict(moral_scores, orient='index', columns=['Score'])
+             moral_df.index.name = 'Moral Foundation'
+             moral_df = moral_df.reset_index()
+
+             fig = px.bar(
+                 moral_df,
+                 x='Moral Foundation',
+                 y='Score',
+                 title='Moral Foundations Breakdown',
+                 color='Moral Foundation'
+             )
+             st.plotly_chart(fig)
+
+             # Detailed insights
+             for foundation, score in moral_scores.items():
+                 st.write(f"**{MORAL_FOUNDATIONS[foundation]}**: {score:.2%}")
+
+         with tab2:
+             st.subheader("Emotional Trajectory")
+             emotional_trajectory = analyzer.analyze_emotional_trajectory(text)
+
+             # Plotly line chart
+             trajectory_fig = go.Figure(data=go.Scatter(
+                 y=emotional_trajectory,
+                 mode='lines+markers',
+                 name='Emotional Intensity'
+             ))
+             trajectory_fig.update_layout(
+                 title='Speech Emotional Trajectory',
+                 xaxis_title='Speech Segments',
+                 yaxis_title='Emotional Intensity'
+             )
+             st.plotly_chart(trajectory_fig)
+
+         with tab3:
+             st.subheader("Linguistic Complexity")
+             readability = analyzer.calculate_readability(text)
+
+             col1, col2 = st.columns(2)
+             with col1:
+                 st.metric("Flesch Reading Ease", f"{readability['Flesch Reading Ease']:.2f}")
+             with col2:
+                 st.metric("Flesch-Kincaid Grade Level", f"{readability['Flesch-Kincaid Grade Level']:.2f}")
+
+             # Key Phrases
+             st.subheader("Key Phrases")
+             key_phrases = analyzer.extract_key_phrases(text)
+             st.write(", ".join(key_phrases))
+
+         with tab4:
+             st.subheader("Semantic Network")
+             semantic_graph = analyzer.create_semantic_network(text)
+
+             # Convert NetworkX graph to Plotly
+             edge_x = []
+             edge_y = []
+             for edge in semantic_graph.edges():
+                 x0, y0 = semantic_graph.nodes[edge[0]].get('pos', (0,0))
+                 x1, y1 = semantic_graph.nodes[edge[1]].get('pos', (0,0))
+                 edge_x.append(x0)
+                 edge_x.append(x1)
+                 edge_y.append(y0)
+                 edge_y.append(y1)
+
+             # Plotly network visualization
+             network_fig = go.Figure()
+             network_fig.add_trace(go.Scatter(
+                 x=edge_x,
+                 y=edge_y,
+                 mode='lines',
+                 line=dict(width=0.5, color='#888'),
+                 hoverinfo='none'
+             ))
+
+             st.plotly_chart(network_fig)
+
+         with tab5:
+             st.subheader("Advanced NLP Analysis")
+
+             # Named Entities
+             st.write("### Named Entities")
+             named_entities = analyzer.detect_named_entities(text)
+             entities_df = pd.DataFrame(named_entities)
+             st.dataframe(entities_df)
+
+             # Rhetorical Devices
+             st.write("### Rhetorical Devices")
+             rhetorical_devices = analyzer.detect_rhetorical_devices(text)
+             for device, count in rhetorical_devices.items():
+                 st.write(f"**{device.capitalize()}**: {count} instances")
+
+ if __name__ == "__main__":
+     main()