Spaces:

dhanikitkat
/

sentiment_emotion

Sleeping

File size: 16,577 Bytes

import streamlit as st
import pandas as pd
from transformers import pipeline
import base64
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from PIL import ImageFont
import os


nltk.download('punkt')
nltk.download('stopwords')

# Load pipelines
sentiment_pipe = pipeline("text-classification", model="dhanikitkat/indo_smsa-1.5G_sentiment_analysis") 
emotion_pipe = pipeline("text-classification", model="azizp128/prediksi-emosi-indobert")

def load_slank_formal(file):
    if file.name.endswith('.txt'):
        df = pd.read_csv(file, sep=';', header=None, names=['Slank', 'Formal'])
    else:
        st.error("Format file tidak didukung. Harap unggah file TXT.")
        return None
    df.columns = ['Slank', 'Formal']
    return df

def replace_slank_to_formal(sentence, slank_formal_df):
    words = re.findall(r'[\w\',./:-]+|[.,]+|[^\x00-\x7F]+', sentence)
    for i, word in enumerate(words):
        replacement = slank_formal_df.loc[slank_formal_df['Slank'] == word.lower(), 'Formal'].values
        if replacement.size > 0:
            words[i] = str(replacement[0])
    return ' '.join(words)

def preprocess_text(text, slank_formal_df):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'([^\w\s\U0001F000-\U0001F9FF])\1+', r'\1', text)
    text = re.sub(r'([\U0001F600-\U0001F64F\U0001F900-\U0001F9FF\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F])', r' \1 ', text)
    text = re.sub(r'([.,])', r' \1 ', text)
    text = re.sub(r'[&%]', lambda x: f' {x.group()} ', text)
    text = re.sub(r'(\w)\1{1,}', r'\1\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\b(\w+)\b\s*-\s*\b\1\b', r'\1-\1', text)
    text = re.sub(r'(?<=\d)\s*\.\s*(?=\d)', '.', text)
    text = re.sub(r'(?<=\d)\s*,\s*(?=\d)', ',', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = replace_slank_to_formal(text, slank_formal_df)
    tokens = word_tokenize(text)
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

def generate_wordcloud(text, font_path, colormap, title):
    # Create a circular mask for Full HD resolution
    x, y = np.ogrid[:1400, :1400]  # Adjusted for 1400x1400 resolution
    mask = (x - 700) ** 2 + (y - 700) ** 2 > 630 ** 2  # Adjusted mask size for 1400x1400 resolution
    mask = 255 * mask.astype(int)

    # Remove Indonesian stopwords
    indo_stopwords = set(stopwords.words('indonesian'))
    words = text.split()
    words = [word for word in words if word.lower() not in indo_stopwords]
    text = ' '.join(words)

    wordcloud = WordCloud(
        width=1400,
        height=1400,
        background_color='white',
        font_path=font_path,
        prefer_horizontal=1.0,
        colormap=colormap,
        max_words=100,
        mask=mask
    ).generate(text)
    
    # Configure plot settings for high-quality output
    plt.figure(figsize=(14, 14))  # Adjusted figure size for 1400x1400 resolution
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=20, pad=20)  # Title directly in matplotlib plot

    # Save word cloud to file with high DPI for better quality
    plt.savefig(f"{title}.png", dpi=300, bbox_inches='tight', pad_inches=0.1)

    # Display word cloud in Streamlit
    st.image(f"{title}.png", use_column_width=True)

    # Add download link for word cloud
    st.markdown(get_image_download_link(f"{title}.png"), unsafe_allow_html=True)

def analyze_sentiment(text):
    result = sentiment_pipe(text)[0]
    return result['label'].lower(), result['score']

def analyze_emotion(text):
    result = emotion_pipe(text)[0]
    return result['label'].lower(), result['score']

def get_download_link(df, filename):
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}.csv">Download CSV</a>'
    return href

def get_word_freq_download_link(word_freq_df):
    csv = word_freq_df.to_csv(index=True)
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="word_frequency.csv">Download Word Frequency CSV</a>'
    return href

def get_example_download_link(file_path, link_text):
    with open(file_path, "rb") as file:
        b64 = base64.b64encode(file.read()).decode()
    return f'<a href="data:file/txt;base64,{b64}" download="{os.path.basename(file_path)}">{link_text}</a>'

def get_image_download_link(image_path):
    with open(image_path, "rb") as image_file:
        b64 = base64.b64encode(image_file.read()).decode()
    href = f'<a href="data:file/png;base64,{b64}" download="{image_path}">Download {image_path}</a>'
    return href

def combined_analysis(text, slank_formal_df):
    texts = text.split('\n')
    results = []
    for text in texts:
        if text.strip():
            cleaned_text = preprocess_text(text, slank_formal_df)
            sentiment_result = sentiment_pipe(cleaned_text)[0]
            emotion_result = emotion_pipe(cleaned_text)[0]
            results.append((text, cleaned_text, sentiment_result['label'].lower(), sentiment_result['score'], emotion_result['label'].lower(), emotion_result['score']))
    df = pd.DataFrame(results, columns=['Content', 'Cleaned Content', 'Sentiment', 'Score Sentiment', 'Emotion', 'Score Emotion'])
    
    # Define custom CSS to adjust the height
    st.markdown(
        """
        <style>
        .chart-container {
            display: flex;
            justify-content: center;
        }
        .user-select-none.svg-container {
            height: 360px !important;
        }
        .average-score {
            text-align: center;
        }
        </style>
        """, 
        unsafe_allow_html=True
    )

        # Sentiment pie chart
    sentiment_counts = df['Sentiment'].value_counts()
    sentiment_colors = {
        'positive': px.colors.qualitative.Set3[0],
        'negative': px.colors.qualitative.Set3[3],
        'neutral': px.colors.qualitative.Set3[1]
    }

    fig_sentiment = px.pie(
        sentiment_counts,
        values=sentiment_counts.values,
        names=sentiment_counts.index,
        title='Sentiment Distribution',
        width=400,
        height=400,
        color=sentiment_counts.index,
        color_discrete_map=sentiment_colors
    )

    # Calculate sentiment average
    sentiment_average = df['Score Sentiment'].mean()

    # Add average sentiment score as an annotation
    fig_sentiment.add_annotation(
        text=f"Average Sentiment Score: {sentiment_average:.4f}",
        xref="paper", yref="paper",
        x=0.5, y=-0.2,
        showarrow=False,
        font=dict(size=18)
    )

    st.markdown('<div class="chart-container">', unsafe_allow_html=True)
    st.plotly_chart(fig_sentiment, use_container_width=True)
    st.markdown('</div>', unsafe_allow_html=True)

    # Emotion pie chart
     # Sentiment pie chart
    emotion_counts = df['Emotion'].value_counts()
    emotion_colors = {
        'marah': px.colors.qualitative.Safe[9],
        'sedih': px.colors.qualitative.Safe[1],
        'senang': px.colors.qualitative.Safe[0],
        'cinta': px.colors.qualitative.Safe[2],
        'jijik': px.colors.qualitative.Safe[6],
        'takut': px.colors.qualitative.Safe[7],
    }
    fig_emotion = px.pie(
        emotion_counts, 
        values=emotion_counts.values, 
        names=emotion_counts.index, 
        title='Emotion Distribution', 
        width=400, 
        height=400, 
        color=emotion_counts.index, 
        color_discrete_map=emotion_colors
    )

    # Calculate emotion average
    emotion_average = df['Score Emotion'].mean()

    # Add average emotion score as an annotation
    fig_emotion.add_annotation(
        text=f"Average Emotion Score: {emotion_average:.4f}",
        xref="paper", yref="paper",
        x=0.5, y=-0.2,
        showarrow=False,
        font=dict(size=18)
    )

    st.markdown('<div class="chart-container">', unsafe_allow_html=True)
    st.plotly_chart(fig_emotion, use_container_width=True)
    st.markdown('</div>', unsafe_allow_html=True)

    # Generate word clouds
    font_path = os.path.join('assets', 'Poppins-Regular.ttf')
    
    # Ensure `df` is your DataFrame and 'Cleaned Content', 'Sentiment', and 'Emotion' columns exist
    overall_text = ' '.join(df['Cleaned Content'].dropna())
    generate_wordcloud(overall_text, font_path, 'hsv_r', 'Overall Word Cloud')

    positive_happy_text = ' '.join(df[(df['Sentiment'] == 'positive') & (df['Emotion'] == 'senang')]['Cleaned Content'].dropna())
    generate_wordcloud(positive_happy_text, font_path, 'gist_rainbow_r', 'Positive Sentiment & Happy Emotion Word Cloud')

    negative_angry_sad_text = ' '.join(df[(df['Sentiment'] == 'negative') & (df['Emotion'].isin(['marah', 'sedih']))]['Cleaned Content'].dropna())
    generate_wordcloud(negative_angry_sad_text, font_path, 'inferno', 'Negative Sentiment & Angry or Sad Emotion Word Cloud')

    # Word frequency
    word_freq = pd.Series(' '.join(df['Cleaned Content'].dropna()).split()).value_counts()
    st.write("Word Frequency:")
    st.write(word_freq)

    # Download link for word frequency
    word_freq_df = word_freq.reset_index()
    word_freq_df.columns = ['Word', 'Frequency']
    st.markdown(get_word_freq_download_link(word_freq_df), unsafe_allow_html=True)

    return df

def process_file(file, slank_formal_df):
    if file.name.endswith('.xlsx'):
        df = pd.read_excel(file)
    elif file.name.endswith('.csv'):
        df = pd.read_csv(file)
    else:
        st.error("Format file tidak didukung. Harap unggah file CSV atau XLSX.")
        return None

    results = []
    for index, row in df.iterrows():
        if pd.notna(row['content']) and isinstance(row['content'], str):
            cleaned_text = preprocess_text(row['content'], slank_formal_df)
            sentiment, score_sentiment = analyze_sentiment(cleaned_text)
            emotion, score_emotion = analyze_emotion(cleaned_text)
            results.append((row['content'], cleaned_text, sentiment, score_sentiment, emotion, score_emotion))
        else:
            results.append((row['content'], None, None, None, None, None))
    
    df['Cleaned Content'] = [r[1] for r in results]
    df['Sentiment'] = [r[2] for r in results]
    df['Score Sentiment'] = [r[3] for r in results]
    df['Emotion'] = [r[4] for r in results]
    df['Score Emotion'] = [r[5] for r in results]

    # Define custom CSS to adjust the height
    st.markdown(
        """
        <style>
        .chart-container {
            display: flex;
            justify-content: center;
        }
        .user-select-none.svg-container {
            height: 360px !important;
        }
        .average-score {
            text-align: center;
        }
        </style>
        """, 
        unsafe_allow_html=True
    )

        # Sentiment pie chart
    sentiment_counts = df['Sentiment'].value_counts()
    sentiment_colors = {
        'positive': px.colors.qualitative.Set3[0],
        'negative': px.colors.qualitative.Set3[3],
        'neutral': px.colors.qualitative.Set3[1]
    }

    fig_sentiment = px.pie(
        sentiment_counts,
        values=sentiment_counts.values,
        names=sentiment_counts.index,
        title='Sentiment Distribution',
        width=400,
        height=400,
        color=sentiment_counts.index,
        color_discrete_map=sentiment_colors
    )

    # Calculate sentiment average
    sentiment_average = df['Score Sentiment'].mean()

    # Add average sentiment score as an annotation
    fig_sentiment.add_annotation(
        text=f"Average Sentiment Score: {sentiment_average:.4f}",
        xref="paper", yref="paper",
        x=0.5, y=-0.2,
        showarrow=False,
        font=dict(size=18)
    )

    st.markdown('<div class="chart-container">', unsafe_allow_html=True)
    st.plotly_chart(fig_sentiment, use_container_width=True)
    st.markdown('</div>', unsafe_allow_html=True)

    # Emotion pie chart
     # Sentiment pie chart
    emotion_counts = df['Emotion'].value_counts()
    emotion_colors = {
        'marah': px.colors.qualitative.Safe[9],
        'sedih': px.colors.qualitative.Safe[1],
        'senang': px.colors.qualitative.Safe[0],
        'cinta': px.colors.qualitative.Safe[2],
        'jijik': px.colors.qualitative.Safe[6],
        'takut': px.colors.qualitative.Safe[7],
    }
    fig_emotion = px.pie(
        emotion_counts, 
        values=emotion_counts.values, 
        names=emotion_counts.index, 
        title='Emotion Distribution', 
        width=400, 
        height=400, 
        color=emotion_counts.index, 
        color_discrete_map=emotion_colors
    )

    # Calculate emotion average
    emotion_average = df['Score Emotion'].mean()

    # Add average emotion score as an annotation
    fig_emotion.add_annotation(
        text=f"Average Emotion Score: {emotion_average:.4f}",
        xref="paper", yref="paper",
        x=0.5, y=-0.2,
        showarrow=False,
        font=dict(size=18)
    )

    st.markdown('<div class="chart-container">', unsafe_allow_html=True)
    st.plotly_chart(fig_emotion, use_container_width=True)
    st.markdown('</div>', unsafe_allow_html=True)

    # Generate word clouds
    font_path = os.path.join('assets', 'Poppins-Regular.ttf')
    
    # Ensure `df` is your DataFrame and 'Cleaned Content', 'Sentiment', and 'Emotion' columns exist
    overall_text = ' '.join(df['Cleaned Content'].dropna())
    generate_wordcloud(overall_text, font_path, 'hsv_r', 'Overall Word Cloud')

    positive_happy_text = ' '.join(df[(df['Sentiment'] == 'positive') & (df['Emotion'] == 'senang')]['Cleaned Content'].dropna())
    generate_wordcloud(positive_happy_text, font_path, 'gist_rainbow_r', 'Positive Sentiment & Happy Emotion Word Cloud')

    negative_angry_sad_text = ' '.join(df[(df['Sentiment'] == 'negative') & (df['Emotion'].isin(['marah', 'sedih']))]['Cleaned Content'].dropna())
    generate_wordcloud(negative_angry_sad_text, font_path, 'inferno', 'Negative Sentiment & Angry or Sad Emotion Word Cloud')

    # Word frequency
    word_freq = pd.Series(' '.join(df['Cleaned Content'].dropna()).split()).value_counts()
    st.write("Word Frequency:")
    st.write(word_freq)

    # Download link for word frequency
    word_freq_df = word_freq.reset_index()
    word_freq_df.columns = ['Word', 'Frequency']
    st.markdown(get_word_freq_download_link(word_freq_df), unsafe_allow_html=True)

    return df

def main():
    st.title("Aplikasi Analisis Sentimen dan Prediksi Emosi")

    # Add download link for example slank template
    slank_template_path = "assets/contoh template data slank.txt"
    st.markdown(get_example_download_link(slank_template_path, "Download Contoh Template Data Slank (TXT)"), unsafe_allow_html=True)

    slank_file = st.file_uploader("Upload file slank dengan baris pertama Slank;Formal (TXT)", type=["txt"])
    if slank_file is not None:
        df_slank_formal = load_slank_formal(slank_file)
        if df_slank_formal is None:
            st.stop()
    else:
        st.warning("Harap upload file slank terlebih dahulu.")
        st.stop()

    menu = st.sidebar.selectbox("Pilih Metode", ["Analisis Langsung", "Import dari File"])

    if menu == "Analisis Langsung":
        user_input = st.text_area("Masukkan teks yang ingin dianalisis (pisahkan dengan enter):")
        if st.button("Analisis"):
            df = combined_analysis(user_input, df_slank_formal)
            st.write("Hasil Analisis:")
            st.write(df)
            st.markdown(get_download_link(df, "analisis_sentimen_emosi"), unsafe_allow_html=True)
            
    elif menu == "Import dari File":
        # Add download link for example content template
        content_template_path = "assets/contoh template data content.xlsx"
        st.markdown(get_example_download_link(content_template_path, "Download Contoh Template Data Content (XLSX)"), unsafe_allow_html=True)
        
        uploaded_file = st.file_uploader("Upload file CSV atau XLSX", type=["csv", "xlsx"])
        if uploaded_file is not None:
            df = process_file(uploaded_file, df_slank_formal)
            st.write("Hasil Analisis:")
            st.write(df)
            st.markdown(get_download_link(df, "analisis_sentimen_emosi"), unsafe_allow_html=True)

if __name__ == '__main__':
    main()