# Source: Hugging Face Space "dhanikitkat/sentiment_emotion" (status: Sleeping)
# File size: 10,019 Bytes

import base64
import html
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from PIL import ImageFont
from transformers import pipeline
from wordcloud import WordCloud


@st.cache_resource
def _load_models():
    """Fetch tokenizer data and build both classifiers once per server process.

    Streamlit re-executes this script on every widget interaction; without
    caching, each rerun would rebuild both pipelines and query the NLTK
    download index again.

    Returns:
        Tuple ``(sentiment_pipeline, emotion_pipeline)``.
    """
    # word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look
    # it up as 'punkt_tab' rather than 'punkt', so request both; a resource
    # that is already up to date is not downloaded again.
    for resource in ('punkt', 'punkt_tab'):
        nltk.download(resource, quiet=True)

    sentiment = pipeline("text-classification", model="ayameRushia/bert-base-indonesian-1.5G-sentiment-analysis-smsa")
    emotion = pipeline("text-classification", model="azizp128/prediksi-emosi-indobert")
    return sentiment, emotion


# Load pipelines (module-level names used by the analysis functions below)
sentiment_pipe, emotion_pipe = _load_models()

def load_slank_formal(file):
    """Read the uploaded slang dictionary into a two-column DataFrame.

    The file is parsed as ';'-separated text without a header row; the first
    field is the slang word and the second its formal replacement.

    Args:
        file: Uploaded file object exposing ``name`` and readable by
            ``pandas.read_csv``.

    Returns:
        DataFrame with string columns ``'Slank'`` and ``'Formal'``, or ``None``
        (after showing an error in the UI) when the file is not a ``.txt``.
    """
    # Compare case-insensitively so e.g. "KAMUS.TXT" is accepted as well.
    if not file.name.lower().endswith('.txt'):
        st.error("Format file tidak didukung. Harap unggah file TXT.")
        return None

    # dtype=str keeps numeric-looking entries as text instead of ints/floats.
    df = pd.read_csv(file, sep=';', header=None, names=['Slank', 'Formal'], dtype=str)

    # Lookups compare against word.lower(), so keys must be lower-case and
    # free of stray whitespace to ever match.
    df['Slank'] = df['Slank'].str.strip().str.lower()
    df['Formal'] = df['Formal'].str.strip()

    # Rows lacking either side would later be rendered as the literal "nan".
    return df.dropna(subset=['Slank', 'Formal']).reset_index(drop=True)

def replace_slank_to_formal(sentence, slank_formal_df):
    """Replace every slang token in *sentence* with its formal equivalent.

    Args:
        sentence: Text to normalise.
        slank_formal_df: DataFrame with columns ``'Slank'`` (matched against
            the lower-cased token) and ``'Formal'`` (the replacement).

    Returns:
        The tokens of *sentence* joined by single spaces, with matching tokens
        substituted. When a slang word is listed more than once, the first
        listed replacement is used.
    """
    # Build the lookup once instead of scanning the whole DataFrame per token.
    # setdefault keeps the first occurrence of a duplicated slang word.
    formal_by_slang = {}
    for slang, formal in zip(slank_formal_df['Slank'], slank_formal_df['Formal']):
        formal_by_slang.setdefault(slang, formal)

    tokens = re.findall(r'[\w\',./:-]+|[.,]+|[^\x00-\x7F]+', sentence)
    normalised = []
    for token in tokens:
        key = token.lower()
        if key in formal_by_slang:
            normalised.append(str(formal_by_slang[key]))
        else:
            normalised.append(token)
    return ' '.join(normalised)

def preprocess_text(text, slank_formal_df):
    """Clean a raw text and return it as a space-separated token string.

    Steps, in order: lower-case, strip URLs, strip @mentions and '#' signs,
    strip remaining punctuation, swap slang for formal words, tokenize.

    Args:
        text: Raw input string.
        slank_formal_df: Slang dictionary passed through to
            ``replace_slank_to_formal``.

    Returns:
        The cleaned tokens joined with single spaces.
    """
    removals = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # URLs
        (r'\@\w+|\#', 0),                            # mentions and hash signs
        (r'[^\w\s]', 0),                             # any other punctuation
    )
    cleaned = text.lower()
    for pattern, flags in removals:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    formalised = replace_slank_to_formal(cleaned, slank_formal_df)
    return ' '.join(word_tokenize(formalised))

def generate_wordcloud(text, font_path, title, colormap):
    """Render a word cloud for *text* in the app and offer it as a PNG download.

    Args:
        text: Space-separated words to visualise.
        font_path: Path of the TrueType font used to draw the words.
        title: Plot title; also used as the PNG file name (``"<title>.png"``).
        colormap: Matplotlib colormap name used to colour the words.

    Returns:
        None. When *text* contains no usable words, an info message is shown
        instead of the plot and no file is written.
    """
    cloud = WordCloud(
        width=600,
        height=600,
        background_color='white',
        font_path=font_path,
        prefer_horizontal=1.0,
        colormap=colormap,
        max_words=100
    )
    try:
        wordcloud = cloud.generate(text)
    except ValueError:
        # WordCloud raises ValueError when it finds no words to plot, which
        # happens whenever a filtered subset (e.g. positive & happy) is empty.
        st.info(f"Tidak ada kata untuk ditampilkan pada '{title}'.")
        return

    # Draw on an explicit figure and close it afterwards so figures do not
    # accumulate across Streamlit reruns.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title(title, fontsize=20)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)
    plt.close(fig)

    # Save word cloud to file
    image_path = f"{title}.png"
    wordcloud.to_file(image_path)

    # Add download link for word cloud
    st.markdown(get_image_download_link(image_path), unsafe_allow_html=True)

def get_image_download_link(image_path):
    """Build an HTML download link that embeds a PNG file as a data URI.

    Args:
        image_path: Path of the PNG file to embed.

    Returns:
        An ``<a>`` element whose ``href`` is the base64-encoded image and whose
        ``download`` attribute and label are the file's base name.

    Raises:
        OSError: If *image_path* cannot be opened for reading.
    """
    with open(image_path, "rb") as image_file:
        b64 = base64.b64encode(image_file.read()).decode()

    # File names are derived from plot titles that contain '&', so escape the
    # name before placing it inside the attribute and the link text.
    filename = html.escape(os.path.basename(image_path), quote=True)
    return f'<a href="data:image/png;base64,{b64}" download="{filename}">Download {filename}</a>'


def _analyze_text(text, slank_formal_df):
    """Clean *text* and classify it.

    Returns:
        Tuple ``(cleaned_text, sentiment, score_sentiment, emotion,
        score_emotion)``.
    """
    cleaned_text = preprocess_text(text, slank_formal_df)
    sentiment, score_sentiment = analyze_sentiment(cleaned_text)
    emotion, score_emotion = analyze_emotion(cleaned_text)
    return cleaned_text, sentiment, score_sentiment, emotion, score_emotion


def _show_label_pie(labels, title):
    """Show a pie chart of how often each value occurs in the Series *labels*."""
    counts = labels.value_counts()
    fig = px.pie(counts, values=counts.values, names=counts.index, title=title)
    st.plotly_chart(fig, use_container_width=True)


def _show_wordcloud(cleaned_texts, font_path, title, colormap):
    """Show a word cloud for a Series of cleaned texts, or a notice if it is empty."""
    text = ' '.join(cleaned_texts.dropna())
    if not text.strip():
        # An empty subset has nothing to draw; say so instead of failing.
        st.info(f"Tidak ada teks untuk '{title}'.")
        return
    generate_wordcloud(text, font_path, title, colormap)


def _render_analysis_report(df):
    """Show the pie charts, word clouds and word-frequency table for *df*.

    Args:
        df: DataFrame holding at least the columns ``'Cleaned Content'``,
            ``'Sentiment'`` and ``'Emotion'``.
    """
    cleaned = df['Cleaned Content'].dropna()
    if not ' '.join(cleaned).strip():
        st.warning("Tidak ada teks yang dapat dianalisis.")
        return

    # Sentiment and emotion pie charts
    _show_label_pie(df['Sentiment'], 'Sentiment Distribution')
    _show_label_pie(df['Emotion'], 'Emotion Distribution')

    # Generate word clouds
    font_path = os.path.join('assets', 'Poppins-Regular.ttf')

    # Overall word cloud
    _show_wordcloud(cleaned, font_path, 'Overall Word Cloud', 'viridis')

    # Positive sentiment and happy emotion word cloud
    positive_happy = (df['Sentiment'] == 'positive') & (df['Emotion'] == 'senang')
    _show_wordcloud(
        df.loc[positive_happy, 'Cleaned Content'],
        font_path,
        'Positive Sentiment & Happy Emotion Word Cloud',
        'Greens',
    )

    # Negative sentiment and angry or sad emotion word cloud
    negative_angry_sad = (df['Sentiment'] == 'negative') & df['Emotion'].isin(['marah', 'sedih'])
    _show_wordcloud(
        df.loc[negative_angry_sad, 'Cleaned Content'],
        font_path,
        'Negative Sentiment & Angry or Sad Emotion Word Cloud',
        'Reds',
    )

    # Word frequency
    word_freq = pd.Series(' '.join(cleaned).split()).value_counts()
    st.write("Word Frequency:")
    st.write(word_freq)

    # Download link for word frequency
    word_freq_df = word_freq.reset_index()
    word_freq_df.columns = ['Word', 'Frequency']
    st.markdown(get_word_freq_download_link(word_freq_df), unsafe_allow_html=True)


def combined_analysis(text, slank_formal_df):
    """Analyse free text line by line and render the report.

    Args:
        text: User input; every non-blank line is analysed as one document.
        slank_formal_df: Slang dictionary used during preprocessing.

    Returns:
        DataFrame with columns ``'Content'``, ``'Cleaned Content'``,
        ``'Sentiment'``, ``'Score Sentiment'``, ``'Emotion'`` and
        ``'Score Emotion'``; it has no rows when *text* has no non-blank line.
    """
    results = []
    # splitlines() also copes with '\r\n' line endings.
    for line in text.splitlines():
        if line.strip():
            results.append((line, *_analyze_text(line, slank_formal_df)))
    df = pd.DataFrame(results, columns=['Content', 'Cleaned Content', 'Sentiment', 'Score Sentiment', 'Emotion', 'Score Emotion'])

    _render_analysis_report(df)
    return df


def process_file(file, slank_formal_df):
    """Analyse the ``content`` column of an uploaded CSV/XLSX file and render the report.

    Args:
        file: Uploaded file object exposing ``name`` and readable by pandas.
        slank_formal_df: Slang dictionary used during preprocessing.

    Returns:
        The loaded DataFrame extended with ``'Cleaned Content'``,
        ``'Sentiment'``, ``'Score Sentiment'``, ``'Emotion'`` and
        ``'Score Emotion'`` (all ``None`` for rows whose content is not a
        string), or ``None`` after showing an error when the format is
        unsupported or the ``content`` column is missing.
    """
    filename = file.name.lower()
    if filename.endswith('.xlsx'):
        df = pd.read_excel(file)
    elif filename.endswith('.csv'):
        df = pd.read_csv(file)
    else:
        st.error("Format file tidak didukung. Harap unggah file CSV atau XLSX.")
        return None

    # Report a missing column in the UI rather than failing with a KeyError.
    if 'content' not in df.columns:
        st.error("Kolom 'content' tidak ditemukan pada file yang diunggah.")
        return None

    results = []
    for content in df['content']:
        # Missing cells are NaN (a float), so the str check also covers them.
        if isinstance(content, str):
            results.append(_analyze_text(content, slank_formal_df))
        else:
            results.append((None, None, None, None, None))

    new_columns = ['Cleaned Content', 'Sentiment', 'Score Sentiment', 'Emotion', 'Score Emotion']
    for position, column in enumerate(new_columns):
        df[column] = [row[position] for row in results]

    _render_analysis_report(df)
    return df


def _top_label(pipe, text):
    """Run a text-classification pipeline and return ``(label, score)``.

    Over-long inputs are truncated to the model's position limit; without
    truncation the pipeline fails on texts with more tokens than the model
    accepts.
    """
    tokenizer_kwargs = {'truncation': True}
    # Only pass an explicit max_length when the model config states a limit;
    # otherwise the tokenizer's own default applies.
    position_limit = getattr(pipe.model.config, 'max_position_embeddings', None)
    if position_limit:
        tokenizer_kwargs['max_length'] = min(position_limit, pipe.tokenizer.model_max_length)

    result = pipe(text, **tokenizer_kwargs)[0]
    return result['label'].lower(), result['score']


def analyze_sentiment(text):
    """Return the lower-cased sentiment label and its score for *text*."""
    return _top_label(sentiment_pipe, text)


def analyze_emotion(text):
    """Return the lower-cased emotion label and its score for *text*."""
    return _top_label(emotion_pipe, text)

def get_download_link(df, filename):
    """Build an HTML link that downloads *df* as a CSV file.

    Args:
        df: DataFrame to export (written without its index).
        filename: Download name without extension; ``.csv`` is appended.

    Returns:
        An ``<a>`` element embedding the CSV as a base64 data URI.
    """
    csv_bytes = df.to_csv(index=False).encode()
    b64 = base64.b64encode(csv_bytes).decode()
    # Escape the name so it cannot break out of the attribute.
    download_name = html.escape(f"{filename}.csv", quote=True)
    # text/csv is the registered media type for CSV data.
    return f'<a href="data:text/csv;base64,{b64}" download="{download_name}">Download CSV</a>'

def get_word_freq_download_link(word_freq_df):
    """Build an HTML link that downloads the word-frequency table as CSV.

    Args:
        word_freq_df: DataFrame of words and their counts.

    Returns:
        An ``<a>`` element embedding the CSV as a base64 data URI, downloaded
        as ``word_frequency.csv``.
    """
    # The positional index carries no information, so leave it out of the
    # file instead of emitting an unnamed leading column.
    csv_bytes = word_freq_df.to_csv(index=False).encode()
    b64 = base64.b64encode(csv_bytes).decode()
    return f'<a href="data:text/csv;base64,{b64}" download="word_frequency.csv">Download Word Frequency CSV</a>'

def _show_results(df):
    """Display the analysis table together with its CSV download link."""
    st.write("Hasil Analisis:")
    st.write(df)
    st.markdown(get_download_link(df, "analisis_sentimen_emosi"), unsafe_allow_html=True)


def main():
    """Streamlit entry point: load the slang dictionary, then analyse text or a file."""
    st.title("Aplikasi Analisis Sentimen dan Prediksi Emosi")

    # NOTE(review): the uploader accepts CSV, but load_slank_formal only
    # handles .txt and shows an error for anything else - confirm which is
    # intended.
    slank_file = st.file_uploader("Upload file slank (CSV atau TXT)", type=["csv", "txt"])
    if slank_file is None:
        st.warning("Harap upload file slank terlebih dahulu.")
        st.stop()

    df_slank_formal = load_slank_formal(slank_file)
    if df_slank_formal is None:
        st.stop()

    menu = st.sidebar.selectbox("Pilih Metode", ["Analisis Langsung", "Import dari File"])

    if menu == "Analisis Langsung":
        user_input = st.text_area("Masukkan teks yang ingin dianalisis (pisahkan dengan enter):")
        if st.button("Analisis"):
            if user_input.strip():
                _show_results(combined_analysis(user_input, df_slank_formal))
            else:
                # Nothing to analyse; ask for input instead of running the models.
                st.warning("Harap masukkan teks terlebih dahulu.")

    elif menu == "Import dari File":
        uploaded_file = st.file_uploader("Upload file CSV atau XLSX", type=["csv", "xlsx"])
        if uploaded_file is not None:
            df = process_file(uploaded_file, df_slank_formal)
            # process_file returns None after reporting an error in the UI.
            if df is not None:
                _show_results(df)


if __name__ == '__main__':
    main()