# dhanikitkat's picture
# update os
# 9790e56
# raw
# history blame
# 10 kB
import streamlit as st
import pandas as pd
from transformers import pipeline
import base64
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from PIL import ImageFont
import os
# Fetch the NLTK 'punkt' resource at import time; preprocess_text() calls
# word_tokenize(), which presumably relies on it (verify against the NLTK
# version in use).
nltk.download('punkt')
# Load the two Hugging Face text-classification pipelines once at module
# level; analyze_sentiment()/analyze_emotion() and combined_analysis() read
# these globals.
# NOTE(review): module-level code may be re-executed by Streamlit on each
# interaction - consider caching these loaders; confirm against the Streamlit
# version deployed.
sentiment_pipe = pipeline("text-classification", model="ayameRushia/bert-base-indonesian-1.5G-sentiment-analysis-smsa")
emotion_pipe = pipeline("text-classification", model="azizp128/prediksi-emosi-indobert")
def load_slank_formal(file):
    """Load the slang-to-formal dictionary from an uploaded ``.txt`` file.

    The file is parsed as semicolon-separated text without a header row; the
    first field becomes the ``Slank`` column and the second ``Formal``.

    Args:
        file: File-like object exposing a ``name`` attribute (for example a
            Streamlit upload).

    Returns:
        A DataFrame with the columns ``Slank`` and ``Formal``, or ``None``
        (after reporting the problem through ``st.error``) when the file name
        does not end in ``.txt``.
    """
    # Compare the extension case-insensitively so "KAMUS.TXT" is accepted too.
    if not file.name.lower().endswith('.txt'):
        st.error("Format file tidak didukung. Harap unggah file TXT.")
        return None
    # names= already labels both columns, so no separate rename is required.
    return pd.read_csv(file, sep=';', header=None, names=['Slank', 'Formal'])
def replace_slank_to_formal(sentence, slank_formal_df):
    """Replace slang tokens in *sentence* with their formal equivalents.

    The sentence is split into tokens with a regular expression; each token is
    lower-cased and looked up in the ``Slank`` column of *slank_formal_df*.
    Matching tokens are replaced by the corresponding ``Formal`` value
    (converted with ``str``); all other tokens are kept unchanged.

    Args:
        sentence: Text to normalise.
        slank_formal_df: DataFrame with ``Slank`` and ``Formal`` columns.

    Returns:
        The tokens joined back together with single spaces.
    """
    # Build the lookup table once instead of scanning the whole DataFrame for
    # every token. setdefault keeps the first row of a duplicated slang entry,
    # which is the row a boolean-mask lookup followed by [0] would select.
    formal_by_slang = {}
    for slang, formal in zip(slank_formal_df['Slank'].values,
                             slank_formal_df['Formal'].values):
        formal_by_slang.setdefault(slang, formal)

    tokens = re.findall(r'[\w\',./:-]+|[.,]+|[^\x00-\x7F]+', sentence)
    normalised = []
    for token in tokens:
        key = token.lower()
        if key in formal_by_slang:
            normalised.append(str(formal_by_slang[key]))
        else:
            normalised.append(token)
    return ' '.join(normalised)
def preprocess_text(text, slank_formal_df):
    """Clean one piece of raw text before it is passed to the classifiers.

    Steps, in order: lower-case the text, remove URLs, remove ``@mentions``
    and ``#`` characters, remove every character that is neither a word
    character nor whitespace, replace slang through
    ``replace_slank_to_formal``, then tokenise with ``word_tokenize``.

    Args:
        text: Raw input string.
        slank_formal_df: DataFrame with ``Slank`` and ``Formal`` columns.

    Returns:
        The resulting tokens joined with single spaces.
    """
    lowered = text.lower()
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)
    # Only the '#' sign itself is removed; the tag word that follows is kept.
    without_mentions = re.sub(r'\@\w+|\#', '', without_urls)
    without_punctuation = re.sub(r'[^\w\s]', '', without_mentions)
    normalised = replace_slank_to_formal(without_punctuation, slank_formal_df)
    return ' '.join(word_tokenize(normalised))
def generate_wordcloud(text, font_path, title, colormap):
    """Render a word cloud in the Streamlit page and offer it as a PNG download.

    Args:
        text: Space-separated words to draw.
        font_path: Path of the font file handed to ``WordCloud``.
        title: Figure title; also used as the PNG file name (``"<title>.png"``,
            written to the current working directory).
        colormap: Matplotlib colormap name handed to ``WordCloud``.

    Returns:
        None. When *text* yields no drawable words an informational message is
        shown instead of the figure and no file is written.
    """
    cloud_builder = WordCloud(
        width=600,
        height=600,
        background_color='white',
        font_path=font_path,
        prefer_horizontal=1.0,
        colormap=colormap,
        max_words=100,
    )
    try:
        wordcloud = cloud_builder.generate(text)
    except ValueError:
        # WordCloud raises ValueError when the text contains no usable words
        # (e.g. an empty sentiment/emotion subset); report it instead of
        # crashing the whole page.
        st.info(f"Tidak ada kata untuk ditampilkan pada '{title}'.")
        return

    # Draw on an explicit figure so it can be handed to Streamlit and then
    # closed; figures created through pyplot otherwise stay open.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title(title, fontsize=20)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)
    plt.close(fig)

    # Save the word cloud to a file and add a download link for it.
    image_path = f"{title}.png"
    wordcloud.to_file(image_path)
    st.markdown(get_image_download_link(image_path), unsafe_allow_html=True)
def get_image_download_link(image_path):
    """Return an HTML anchor that downloads the image stored at *image_path*.

    The file is read in binary mode and embedded in the link as a base64
    ``data:`` URI; *image_path* is used both as the download file name and in
    the link text.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes).decode()
    return f'<a href="data:file/png;base64,{encoded}" download="{image_path}">Download {image_path}</a>'
def _render_analysis_report(df):
    """Draw the charts, word clouds and word-frequency table for analysed rows.

    Shared by ``combined_analysis`` and ``process_file`` so both entry points
    produce the same report.

    Args:
        df: DataFrame carrying the ``Cleaned Content``, ``Sentiment`` and
            ``Emotion`` columns.
    """
    cleaned = df['Cleaned Content']
    if cleaned.dropna().empty:
        # Nothing was analysed (blank input or no usable rows): there is
        # nothing to chart and the word cloud cannot be built from no words.
        st.warning("Tidak ada teks yang dapat dianalisis.")
        return

    # Sentiment pie chart
    sentiment_counts = df['Sentiment'].value_counts()
    fig_sentiment = px.pie(sentiment_counts, values=sentiment_counts.values, names=sentiment_counts.index, title='Sentiment Distribution')
    st.plotly_chart(fig_sentiment, use_container_width=True)

    # Emotion pie chart
    emotion_counts = df['Emotion'].value_counts()
    fig_emotion = px.pie(emotion_counts, values=emotion_counts.values, names=emotion_counts.index, title='Emotion Distribution')
    st.plotly_chart(fig_emotion, use_container_width=True)

    # Word clouds: overall, positive & happy, negative & angry/sad.
    font_path = os.path.join('assets', 'Poppins-Regular.ttf')
    is_positive_happy = (df['Sentiment'] == 'positive') & (df['Emotion'] == 'senang')
    is_negative_angry_sad = (df['Sentiment'] == 'negative') & (df['Emotion'].isin(['marah', 'sedih']))
    cloud_specs = [
        ('Overall Word Cloud', 'viridis', cleaned),
        ('Positive Sentiment & Happy Emotion Word Cloud', 'Greens', cleaned[is_positive_happy]),
        ('Negative Sentiment & Angry or Sad Emotion Word Cloud', 'Reds', cleaned[is_negative_angry_sad]),
    ]
    for title, colormap, subset in cloud_specs:
        cloud_text = ' '.join(subset.dropna())
        if cloud_text.strip():
            generate_wordcloud(cloud_text, font_path, title, colormap)
        else:
            # A subset can legitimately be empty (e.g. no negative rows).
            st.info(f"Tidak ada teks untuk '{title}'.")

    # Word frequency table
    word_freq = pd.Series(' '.join(cleaned.dropna()).split()).value_counts()
    st.write("Word Frequency:")
    st.write(word_freq)

    # Download link for word frequency
    word_freq_df = word_freq.reset_index()
    word_freq_df.columns = ['Word', 'Frequency']
    st.markdown(get_word_freq_download_link(word_freq_df), unsafe_allow_html=True)


def combined_analysis(text, slank_formal_df):
    """Analyse free text line by line and render the report.

    Each non-blank line of *text* is preprocessed and classified for
    sentiment and emotion.

    Args:
        text: Input text; lines are separated by ``"\\n"``.
        slank_formal_df: DataFrame with ``Slank`` and ``Formal`` columns.

    Returns:
        A DataFrame with the columns ``Content``, ``Cleaned Content``,
        ``Sentiment``, ``Score Sentiment``, ``Emotion`` and ``Score Emotion``
        (empty when *text* has no non-blank line).
    """
    rows = []
    for line in text.split('\n'):
        if not line.strip():
            continue
        cleaned_text = preprocess_text(line, slank_formal_df)
        sentiment, score_sentiment = analyze_sentiment(cleaned_text)
        emotion, score_emotion = analyze_emotion(cleaned_text)
        rows.append((line, cleaned_text, sentiment, score_sentiment, emotion, score_emotion))

    df = pd.DataFrame(rows, columns=['Content', 'Cleaned Content', 'Sentiment', 'Score Sentiment', 'Emotion', 'Score Emotion'])
    _render_analysis_report(df)
    return df


def process_file(file, slank_formal_df):
    """Analyse the ``content`` column of an uploaded CSV/XLSX file.

    Args:
        file: File-like object exposing a ``name`` attribute.
        slank_formal_df: DataFrame with ``Slank`` and ``Formal`` columns.

    Returns:
        The loaded DataFrame extended with ``Cleaned Content``, ``Sentiment``,
        ``Score Sentiment``, ``Emotion`` and ``Score Emotion`` (``None`` in
        those columns for rows whose content is not a string), or ``None``
        after an ``st.error`` message when the format is unsupported or the
        ``content`` column is missing.
    """
    file_name = file.name.lower()
    if file_name.endswith('.xlsx'):
        df = pd.read_excel(file)
    elif file_name.endswith('.csv'):
        df = pd.read_csv(file)
    else:
        st.error("Format file tidak didukung. Harap unggah file CSV atau XLSX.")
        return None

    if 'content' not in df.columns:
        # Report the problem instead of failing with a KeyError below.
        st.error("Kolom 'content' tidak ditemukan pada file.")
        return None

    results = []
    for content in df['content']:
        # Missing values (NaN/None) are not str, so one check covers both.
        if isinstance(content, str):
            cleaned_text = preprocess_text(content, slank_formal_df)
            sentiment, score_sentiment = analyze_sentiment(cleaned_text)
            emotion, score_emotion = analyze_emotion(cleaned_text)
            results.append((cleaned_text, sentiment, score_sentiment, emotion, score_emotion))
        else:
            results.append((None, None, None, None, None))

    new_columns = ['Cleaned Content', 'Sentiment', 'Score Sentiment', 'Emotion', 'Score Emotion']
    for position, column in enumerate(new_columns):
        df[column] = [result[position] for result in results]

    _render_analysis_report(df)
    return df
def analyze_sentiment(text):
    """Classify *text* with the sentiment pipeline.

    Returns:
        A ``(label, score)`` tuple taken from the first prediction, with the
        label lower-cased.
    """
    top_prediction = sentiment_pipe(text)[0]
    label = top_prediction['label'].lower()
    return label, top_prediction['score']
def analyze_emotion(text):
    """Classify *text* with the emotion pipeline.

    Returns:
        A ``(label, score)`` tuple taken from the first prediction, with the
        label lower-cased.
    """
    top_prediction = emotion_pipe(text)[0]
    label = top_prediction['label'].lower()
    return label, top_prediction['score']
def get_download_link(df, filename):
    """Return an HTML anchor that downloads *df* as ``<filename>.csv``.

    The frame is serialised without its index and embedded in the link as a
    base64 ``data:`` URI.
    """
    csv_bytes = df.to_csv(index=False).encode()
    payload = base64.b64encode(csv_bytes).decode()
    return f'<a href="data:file/csv;base64,{payload}" download="{filename}.csv">Download CSV</a>'
def get_word_freq_download_link(word_freq_df):
    """Return an HTML anchor that downloads the word-frequency table as CSV.

    Args:
        word_freq_df: DataFrame of words and their counts; the callers in
            this file pass a frame whose index was already reset into the
            ``Word``/``Frequency`` columns.

    Returns:
        An ``<a>`` element embedding the CSV as a base64 ``data:`` URI with
        the download name ``word_frequency.csv``.
    """
    # The index is a plain positional counter after reset_index(), so leave
    # it out of the export (consistent with get_download_link).
    csv_text = word_freq_df.to_csv(index=False)
    payload = base64.b64encode(csv_text.encode()).decode()
    return f'<a href="data:file/csv;base64,{payload}" download="word_frequency.csv">Download Word Frequency CSV</a>'
def main():
    """Run the Streamlit page.

    Flow: require the slang dictionary upload, let the user pick between
    direct text analysis and file import, run the matching analysis and show
    the result table together with a CSV download link.
    """
    st.title("Aplikasi Analisis Sentimen dan Prediksi Emosi")

    # NOTE(review): the uploader offers CSV, but load_slank_formal reports an
    # error for files that are not .txt - confirm which is intended.
    slank_file = st.file_uploader("Upload file slank (CSV atau TXT)", type=["csv", "txt"])
    if slank_file is None:
        st.warning("Harap upload file slank terlebih dahulu.")
        st.stop()
    df_slank_formal = load_slank_formal(slank_file)
    if df_slank_formal is None:
        st.stop()

    menu = st.sidebar.selectbox("Pilih Metode", ["Analisis Langsung", "Import dari File"])

    df = None
    if menu == "Analisis Langsung":
        user_input = st.text_area("Masukkan teks yang ingin dianalisis (pisahkan dengan enter):")
        if st.button("Analisis"):
            if user_input.strip():
                df = combined_analysis(user_input, df_slank_formal)
            else:
                # Nothing to analyse; ask for input instead of running on blanks.
                st.warning("Harap masukkan teks terlebih dahulu.")
    elif menu == "Import dari File":
        uploaded_file = st.file_uploader("Upload file CSV atau XLSX", type=["csv", "xlsx"])
        if uploaded_file is not None:
            df = process_file(uploaded_file, df_slank_formal)

    # process_file returns None after reporting an error; only render the
    # result table and download link when an analysis actually produced one.
    if df is not None:
        st.write("Hasil Analisis:")
        st.write(df)
        st.markdown(get_download_link(df, "analisis_sentimen_emosi"), unsafe_allow_html=True)


if __name__ == '__main__':
    main()