import streamlit as st
import pandas as pd
from transformers import pipeline
import base64
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from PIL import ImageFont
import os
# Fetch the NLTK data needed below: 'punkt' for word_tokenize and
# 'stopwords' for the Indonesian stopword filter in generate_wordcloud.
# NOTE(review): runs on every import / Streamlit rerun; nltk skips
# already-downloaded resources, so this is slow only on the first run.
nltk.download('punkt')
nltk.download('stopwords')
# Load pipelines
# Hugging Face text-classification pipelines for Indonesian sentiment and
# emotion prediction. Loaded at module import time, so the first app start
# downloads/loads both models before any UI is shown.
sentiment_pipe = pipeline("text-classification", model="dhanikitkat/indo_smsa-1.5G_sentiment_analysis")
emotion_pipe = pipeline("text-classification", model="azizp128/prediksi-emosi-indobert")
def load_slank_formal(file):
    """Load a slang-to-formal word mapping from an uploaded file.

    Parameters
    ----------
    file : uploaded file object (must expose ``.name`` and be readable)
        Expected to be a ``.txt`` file with one ``slang;formal`` pair per line.

    Returns
    -------
    pandas.DataFrame with columns ``['Slank', 'Formal']``, or ``None`` when
    the file format is unsupported (an error is shown in the Streamlit UI).
    """
    # Guard clause: anything but .txt is rejected up front.
    if not file.name.endswith('.txt'):
        st.error("Format file tidak didukung. Harap unggah file TXT.")
        return None
    # header=None treats every line as data; names= already labels the
    # columns, so the original's second `df.columns = ...` assignment was
    # redundant and has been dropped.
    # NOTE(review): the uploader prompt says the first line is "Slank;Formal";
    # if so, that header line is read as a (harmless) mapping entry — confirm
    # whether it should be skipped with skiprows=1.
    return pd.read_csv(file, sep=';', header=None, names=['Slank', 'Formal'])
def replace_slank_to_formal(sentence, slank_formal_df):
    """Replace slang tokens in *sentence* with their formal equivalents.

    Tokenization keeps word-like runs, punctuation runs and non-ASCII
    sequences (e.g. emoji) as separate tokens; matching is done on the
    lower-cased token. The tokens are re-joined with single spaces.

    Parameters
    ----------
    sentence : str
    slank_formal_df : pandas.DataFrame with columns ['Slank', 'Formal']

    Returns
    -------
    str : the sentence with slang words replaced.
    """
    # PERF: the original filtered the whole DataFrame once PER TOKEN
    # (O(tokens x rows)). Build a dict once instead. setdefault keeps the
    # FIRST occurrence of a duplicated slang key, matching the original's
    # `.values[0]` behavior; str() matches its explicit cast of the value.
    mapping = {}
    for slank, formal in zip(slank_formal_df['Slank'], slank_formal_df['Formal']):
        mapping.setdefault(slank, str(formal))
    tokens = re.findall(r'[\w\',./:-]+|[.,]+|[^\x00-\x7F]+', sentence)
    return ' '.join(mapping.get(tok.lower(), tok) for tok in tokens)
def preprocess_text(text, slank_formal_df):
    """Normalize raw Indonesian text, then map slang words to formal ones.

    The regex steps below are ORDER-DEPENDENT: padding emoji/punctuation
    must happen before whitespace collapsing, and number re-joining must
    happen after punctuation padding. Returns the cleaned, re-tokenized text.
    """
    # Lowercase so later slang lookups and char-dedup rules are uniform.
    text = text.lower()
    # Strip URLs (http/https/www).
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Drop @mentions and bare '#' characters.
    text = re.sub(r'\@\w+|\#', '', text)
    # Collapse runs of a repeated symbol char ("!!!" -> "!"), excluding
    # the U+1F000-U+1F9FF emoji range so repeated emoji are kept.
    text = re.sub(r'([^\w\s\U0001F000-\U0001F9FF])\1+', r'\1', text)
    # Pad emoji with spaces so each one becomes a standalone token.
    text = re.sub(r'([\U0001F600-\U0001F64F\U0001F900-\U0001F9FF\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F])', r' \1 ', text)
    # Pad '.' and ',' with spaces (numbers are re-joined further below).
    text = re.sub(r'([.,])', r' \1 ', text)
    # Pad '&' and '%' with spaces.
    text = re.sub(r'[&%]', lambda x: f' {x.group()} ', text)
    # Squeeze 3+ repeated characters down to exactly two ("baguuus" -> "baguus").
    text = re.sub(r'(\w)\1{1,}', r'\1\1', text)
    # Collapse runs of whitespace.
    text = re.sub(r'\s+', ' ', text).strip()
    # Re-join Indonesian reduplication: "kata - kata" -> "kata-kata".
    text = re.sub(r'\b(\w+)\b\s*-\s*\b\1\b', r'\1-\1', text)
    # Undo the punctuation padding inside numbers: "1 . 5" -> "1.5".
    text = re.sub(r'(?<=\d)\s*\.\s*(?=\d)', '.', text)
    text = re.sub(r'(?<=\d)\s*,\s*(?=\d)', ',', text)
    # Collapse whitespace again after the number re-joining.
    text = re.sub(r'\s+', ' ', text).strip()
    # Map slang tokens to their formal forms.
    text = replace_slank_to_formal(text, slank_formal_df)
    # Final NLTK tokenization; joining back gives canonical single spacing.
    tokens = word_tokenize(text)
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
def generate_wordcloud(text, font_path, colormap, title):
    """Render a circular word cloud for *text*, save it as '<title>.png',
    display it in Streamlit and offer a download link.

    Indonesian stopwords are removed before rendering. Does nothing but
    show a warning when no words remain (the original crashed with a
    ValueError from WordCloud.generate on empty input).
    """
    # Remove Indonesian stopwords so the cloud highlights content words.
    indo_stopwords = set(stopwords.words('indonesian'))
    words = [word for word in text.split() if word.lower() not in indo_stopwords]
    text = ' '.join(words)
    # FIX: WordCloud.generate raises ValueError on empty text; callers can
    # legitimately pass an empty string (e.g. no rows matched a filter).
    if not text:
        st.warning(f"Tidak ada kata untuk '{title}'.")
        return
    # Circular mask on a 1400x1400 canvas: pixels outside the radius-630
    # circle centred at (700, 700) are masked out (value 255 = excluded).
    x, y = np.ogrid[:1400, :1400]
    mask = (x - 700) ** 2 + (y - 700) ** 2 > 630 ** 2
    mask = 255 * mask.astype(int)
    wordcloud = WordCloud(
        width=1400,
        height=1400,
        background_color='white',
        font_path=font_path,
        prefer_horizontal=1.0,
        colormap=colormap,
        max_words=100,
        mask=mask
    ).generate(text)
    # High-resolution matplotlib rendering of the cloud.
    plt.figure(figsize=(14, 14))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=20, pad=20)
    # Save with high DPI; the title doubles as the output filename, so it
    # must stay filesystem-safe (current callers use letters/spaces/&).
    plt.savefig(f"{title}.png", dpi=300, bbox_inches='tight', pad_inches=0.1)
    # FIX: close the figure — the original leaked one open figure per call,
    # growing memory across Streamlit reruns.
    plt.close()
    # Show the saved image in the app and offer it for download.
    st.image(f"{title}.png", use_column_width=True)
    st.markdown(get_image_download_link(f"{title}.png"), unsafe_allow_html=True)
def analyze_sentiment(text):
    """Classify *text* with the Indonesian sentiment pipeline.

    Returns a ``(label, score)`` tuple; the label is lower-cased.
    """
    prediction = sentiment_pipe(text)[0]
    return prediction['label'].lower(), prediction['score']
def analyze_emotion(text):
    """Classify *text* with the Indonesian emotion pipeline.

    Returns a ``(label, score)`` tuple; the label is lower-cased.
    """
    prediction = emotion_pipe(text)[0]
    return prediction['label'].lower(), prediction['score']
def get_download_link(df, filename):
    """Return an HTML anchor that downloads *df* as '<filename>.csv'.

    Meant to be rendered via ``st.markdown(..., unsafe_allow_html=True)``.
    """
    csv = df.to_csv(index=False)
    # Embed the CSV as a base64 data URI so no server-side file is needed.
    b64 = base64.b64encode(csv.encode()).decode()
    # FIX: the original returned the bare text 'Download CSV' — the <a>
    # markup was lost, leaving b64 and filename unused and the link dead.
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}.csv">Download CSV</a>'
    return href
def get_word_freq_download_link(word_freq_df):
    """Return an HTML anchor that downloads the word-frequency table as CSV.

    Meant to be rendered via ``st.markdown(..., unsafe_allow_html=True)``.
    """
    # index=True keeps the positional index column, as the original did.
    csv = word_freq_df.to_csv(index=True)
    b64 = base64.b64encode(csv.encode()).decode()
    # FIX: the original returned bare text — the <a> markup was lost and
    # the base64 payload was unused, so the link downloaded nothing.
    href = f'<a href="data:file/csv;base64,{b64}" download="word_frequency.csv">Download Word Frequency CSV</a>'
    return href
def get_example_download_link(file_path, link_text):
    """Return an HTML anchor that downloads the bundled file at *file_path*,
    labeled with *link_text*.

    Meant to be rendered via ``st.markdown(..., unsafe_allow_html=True)``.
    """
    with open(file_path, "rb") as file:
        b64 = base64.b64encode(file.read()).decode()
    # FIX: the original returned the bare f'{link_text}' — the <a> markup
    # was lost and b64 was unused. Use the basename as the download name.
    filename = os.path.basename(file_path)
    return f'<a href="data:application/octet-stream;base64,{b64}" download="{filename}">{link_text}</a>'
def get_image_download_link(image_path):
    """Return an HTML anchor that downloads the PNG at *image_path*.

    Meant to be rendered via ``st.markdown(..., unsafe_allow_html=True)``.
    """
    with open(image_path, "rb") as image_file:
        b64 = base64.b64encode(image_file.read()).decode()
    # FIX: the original returned the bare text 'Download {image_path}' —
    # the <a> markup was lost and the base64 payload was unused.
    href = f'<a href="data:image/png;base64,{b64}" download="{image_path}">Download {image_path}</a>'
    return href
def combined_analysis(text, slank_formal_df):
    """Analyze newline-separated *text* and render the full dashboard.

    For each non-blank line: preprocess, classify sentiment and emotion.
    Then render sentiment/emotion pie charts, three word clouds, and a
    word-frequency table with a download link.

    Parameters
    ----------
    text : str
        Raw input; one text per line, blank lines are skipped.
    slank_formal_df : pandas.DataFrame
        Slang-to-formal mapping with columns ['Slank', 'Formal'].

    Returns
    -------
    pandas.DataFrame with columns ['Content', 'Cleaned Content',
    'Sentiment', 'Score Sentiment', 'Emotion', 'Score Emotion'].
    """
    results = []
    for line in text.split('\n'):
        if not line.strip():
            continue
        cleaned_text = preprocess_text(line, slank_formal_df)
        # Use the module helpers (instead of calling the pipelines inline,
        # as the original did) so label lower-casing stays in one place.
        sentiment, score_sentiment = analyze_sentiment(cleaned_text)
        emotion, score_emotion = analyze_emotion(cleaned_text)
        results.append((line, cleaned_text, sentiment, score_sentiment, emotion, score_emotion))
    df = pd.DataFrame(results, columns=['Content', 'Cleaned Content', 'Sentiment', 'Score Sentiment', 'Emotion', 'Score Emotion'])
    # NOTE(review): the original injected custom CSS / <div> wrappers via
    # st.markdown around the charts, but the HTML content was lost (the
    # string literals were broken mid-line, a syntax error). The broken
    # wrapper calls are dropped; restore the HTML if it can be recovered.
    # --- Sentiment distribution pie chart ---------------------------------
    sentiment_counts = df['Sentiment'].value_counts()
    sentiment_colors = {
        'positive': px.colors.qualitative.Set3[0],
        'negative': px.colors.qualitative.Set3[3],
        'neutral': px.colors.qualitative.Set3[1]
    }
    fig_sentiment = px.pie(
        sentiment_counts,
        values=sentiment_counts.values,
        names=sentiment_counts.index,
        title='Sentiment Distribution',
        width=400,
        height=400,
        color=sentiment_counts.index,
        color_discrete_map=sentiment_colors
    )
    # Mean classifier confidence, shown as an annotation under the chart.
    sentiment_average = df['Score Sentiment'].mean()
    fig_sentiment.add_annotation(
        text=f"Average Sentiment Score: {sentiment_average:.4f}",
        xref="paper", yref="paper",
        x=0.5, y=-0.2,
        showarrow=False,
        font=dict(size=18)
    )
    st.plotly_chart(fig_sentiment, use_container_width=True)
    # --- Emotion distribution pie chart -----------------------------------
    emotion_counts = df['Emotion'].value_counts()
    emotion_colors = {
        'marah': px.colors.qualitative.Safe[9],
        'sedih': px.colors.qualitative.Safe[1],
        'senang': px.colors.qualitative.Safe[0],
        'cinta': px.colors.qualitative.Safe[2],
        'jijik': px.colors.qualitative.Safe[6],
        'takut': px.colors.qualitative.Safe[7],
    }
    fig_emotion = px.pie(
        emotion_counts,
        values=emotion_counts.values,
        names=emotion_counts.index,
        title='Emotion Distribution',
        width=400,
        height=400,
        color=emotion_counts.index,
        color_discrete_map=emotion_colors
    )
    emotion_average = df['Score Emotion'].mean()
    fig_emotion.add_annotation(
        text=f"Average Emotion Score: {emotion_average:.4f}",
        xref="paper", yref="paper",
        x=0.5, y=-0.2,
        showarrow=False,
        font=dict(size=18)
    )
    st.plotly_chart(fig_emotion, use_container_width=True)
    # --- Word clouds -------------------------------------------------------
    # assumes assets/Poppins-Regular.ttf exists relative to the CWD — TODO confirm.
    font_path = os.path.join('assets', 'Poppins-Regular.ttf')
    overall_text = ' '.join(df['Cleaned Content'].dropna())
    generate_wordcloud(overall_text, font_path, 'hsv_r', 'Overall Word Cloud')
    positive_happy_text = ' '.join(df[(df['Sentiment'] == 'positive') & (df['Emotion'] == 'senang')]['Cleaned Content'].dropna())
    generate_wordcloud(positive_happy_text, font_path, 'gist_rainbow_r', 'Positive Sentiment & Happy Emotion Word Cloud')
    negative_angry_sad_text = ' '.join(df[(df['Sentiment'] == 'negative') & (df['Emotion'].isin(['marah', 'sedih']))]['Cleaned Content'].dropna())
    generate_wordcloud(negative_angry_sad_text, font_path, 'inferno', 'Negative Sentiment & Angry or Sad Emotion Word Cloud')
    # --- Word frequency table + download ----------------------------------
    word_freq = pd.Series(' '.join(df['Cleaned Content'].dropna()).split()).value_counts()
    st.write("Word Frequency:")
    st.write(word_freq)
    word_freq_df = word_freq.reset_index()
    word_freq_df.columns = ['Word', 'Frequency']
    st.markdown(get_word_freq_download_link(word_freq_df), unsafe_allow_html=True)
    return df
def process_file(file, slank_formal_df):
    """Analyze an uploaded CSV/XLSX file and render the full dashboard.

    The file must have a 'content' column; each string row is preprocessed
    and classified, results are appended as new columns, and the same
    charts/word clouds/word-frequency output as `combined_analysis` are
    rendered.

    Parameters
    ----------
    file : uploaded file object (must expose ``.name``)
    slank_formal_df : pandas.DataFrame with columns ['Slank', 'Formal']

    Returns
    -------
    pandas.DataFrame with added ['Cleaned Content', 'Sentiment',
    'Score Sentiment', 'Emotion', 'Score Emotion'] columns, or None for an
    unsupported file format.
    """
    if file.name.endswith('.xlsx'):
        df = pd.read_excel(file)
    elif file.name.endswith('.csv'):
        df = pd.read_csv(file)
    else:
        st.error("Format file tidak didukung. Harap unggah file CSV atau XLSX.")
        return None
    # NOTE(review): assumes a 'content' column exists — a missing column
    # raises KeyError below; confirm whether a friendlier st.error is wanted.
    results = []
    for _, row in df.iterrows():
        content = row['content']
        if pd.notna(content) and isinstance(content, str):
            cleaned_text = preprocess_text(content, slank_formal_df)
            sentiment, score_sentiment = analyze_sentiment(cleaned_text)
            emotion, score_emotion = analyze_emotion(cleaned_text)
            results.append((content, cleaned_text, sentiment, score_sentiment, emotion, score_emotion))
        else:
            # Non-string / missing rows keep their position with empty results.
            results.append((content, None, None, None, None, None))
    df['Cleaned Content'] = [r[1] for r in results]
    df['Sentiment'] = [r[2] for r in results]
    df['Score Sentiment'] = [r[3] for r in results]
    df['Emotion'] = [r[4] for r in results]
    df['Score Emotion'] = [r[5] for r in results]
    # NOTE(review): the original injected custom CSS / <div> wrappers via
    # st.markdown around the charts, but the HTML content was lost (the
    # string literals were broken mid-line, a syntax error). The broken
    # wrapper calls are dropped; restore the HTML if it can be recovered.
    # --- Sentiment distribution pie chart ---------------------------------
    sentiment_counts = df['Sentiment'].value_counts()
    sentiment_colors = {
        'positive': px.colors.qualitative.Set3[0],
        'negative': px.colors.qualitative.Set3[3],
        'neutral': px.colors.qualitative.Set3[1]
    }
    fig_sentiment = px.pie(
        sentiment_counts,
        values=sentiment_counts.values,
        names=sentiment_counts.index,
        title='Sentiment Distribution',
        width=400,
        height=400,
        color=sentiment_counts.index,
        color_discrete_map=sentiment_colors
    )
    # Mean classifier confidence, shown as an annotation under the chart.
    sentiment_average = df['Score Sentiment'].mean()
    fig_sentiment.add_annotation(
        text=f"Average Sentiment Score: {sentiment_average:.4f}",
        xref="paper", yref="paper",
        x=0.5, y=-0.2,
        showarrow=False,
        font=dict(size=18)
    )
    st.plotly_chart(fig_sentiment, use_container_width=True)
    # --- Emotion distribution pie chart -----------------------------------
    emotion_counts = df['Emotion'].value_counts()
    emotion_colors = {
        'marah': px.colors.qualitative.Safe[9],
        'sedih': px.colors.qualitative.Safe[1],
        'senang': px.colors.qualitative.Safe[0],
        'cinta': px.colors.qualitative.Safe[2],
        'jijik': px.colors.qualitative.Safe[6],
        'takut': px.colors.qualitative.Safe[7],
    }
    fig_emotion = px.pie(
        emotion_counts,
        values=emotion_counts.values,
        names=emotion_counts.index,
        title='Emotion Distribution',
        width=400,
        height=400,
        color=emotion_counts.index,
        color_discrete_map=emotion_colors
    )
    emotion_average = df['Score Emotion'].mean()
    fig_emotion.add_annotation(
        text=f"Average Emotion Score: {emotion_average:.4f}",
        xref="paper", yref="paper",
        x=0.5, y=-0.2,
        showarrow=False,
        font=dict(size=18)
    )
    st.plotly_chart(fig_emotion, use_container_width=True)
    # --- Word clouds -------------------------------------------------------
    # assumes assets/Poppins-Regular.ttf exists relative to the CWD — TODO confirm.
    font_path = os.path.join('assets', 'Poppins-Regular.ttf')
    overall_text = ' '.join(df['Cleaned Content'].dropna())
    generate_wordcloud(overall_text, font_path, 'hsv_r', 'Overall Word Cloud')
    positive_happy_text = ' '.join(df[(df['Sentiment'] == 'positive') & (df['Emotion'] == 'senang')]['Cleaned Content'].dropna())
    generate_wordcloud(positive_happy_text, font_path, 'gist_rainbow_r', 'Positive Sentiment & Happy Emotion Word Cloud')
    negative_angry_sad_text = ' '.join(df[(df['Sentiment'] == 'negative') & (df['Emotion'].isin(['marah', 'sedih']))]['Cleaned Content'].dropna())
    generate_wordcloud(negative_angry_sad_text, font_path, 'inferno', 'Negative Sentiment & Angry or Sad Emotion Word Cloud')
    # --- Word frequency table + download ----------------------------------
    word_freq = pd.Series(' '.join(df['Cleaned Content'].dropna()).split()).value_counts()
    st.write("Word Frequency:")
    st.write(word_freq)
    word_freq_df = word_freq.reset_index()
    word_freq_df.columns = ['Word', 'Frequency']
    st.markdown(get_word_freq_download_link(word_freq_df), unsafe_allow_html=True)
    return df
def main():
    """Streamlit entry point: collect the slang mapping, then run either
    direct text analysis or file-based analysis based on the sidebar menu."""
    st.title("Aplikasi Analisis Sentimen dan Prediksi Emosi")

    # Offer the example slang template, then require the slang file upload.
    slank_template_path = "assets/contoh template data slank.txt"
    st.markdown(get_example_download_link(slank_template_path, "Download Contoh Template Data Slank (TXT)"), unsafe_allow_html=True)
    slank_file = st.file_uploader("Upload file slank dengan baris pertama Slank;Formal (TXT)", type=["txt"])

    # Guard clauses: nothing below runs without a valid slang mapping.
    if slank_file is None:
        st.warning("Harap upload file slank terlebih dahulu.")
        st.stop()
    df_slank_formal = load_slank_formal(slank_file)
    if df_slank_formal is None:
        st.stop()

    menu = st.sidebar.selectbox("Pilih Metode", ["Analisis Langsung", "Import dari File"])

    if menu == "Analisis Langsung":
        # One text per line, analyzed on button press.
        user_input = st.text_area("Masukkan teks yang ingin dianalisis (pisahkan dengan enter):")
        if st.button("Analisis"):
            df = combined_analysis(user_input, df_slank_formal)
            st.write("Hasil Analisis:")
            st.write(df)
            st.markdown(get_download_link(df, "analisis_sentimen_emosi"), unsafe_allow_html=True)
    elif menu == "Import dari File":
        # Offer the example content template, then analyze the uploaded file.
        content_template_path = "assets/contoh template data content.xlsx"
        st.markdown(get_example_download_link(content_template_path, "Download Contoh Template Data Content (XLSX)"), unsafe_allow_html=True)
        uploaded_file = st.file_uploader("Upload file CSV atau XLSX", type=["csv", "xlsx"])
        if uploaded_file is not None:
            df = process_file(uploaded_file, df_slank_formal)
            st.write("Hasil Analisis:")
            st.write(df)
            st.markdown(get_download_link(df, "analisis_sentimen_emosi"), unsafe_allow_html=True)


if __name__ == '__main__':
    main()