# Streamlit app: Indonesian sentiment analysis and emotion prediction.
# (Header reconstructed — the original lines were HuggingFace Spaces page
# residue, not source code.)
# --- Dependencies -----------------------------------------------------------
import base64
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from PIL import ImageFont
from transformers import pipeline
from wordcloud import WordCloud

# Tokenizer model and stopword lists used by preprocess_text / generate_wordcloud.
nltk.download('punkt')
nltk.download('stopwords')

# Pre-trained Indonesian sentiment and emotion classifiers (HuggingFace Hub).
# Loaded once at import time; every Streamlit rerun reuses these pipeline objects.
sentiment_pipe = pipeline("text-classification", model="dhanikitkat/indo_smsa-1.5G_sentiment_analysis")
emotion_pipe = pipeline("text-classification", model="azizp128/prediksi-emosi-indobert")
def load_slank_formal(file):
    """Load a slang-to-formal dictionary from an uploaded ';'-separated TXT file.

    Parameters
    ----------
    file : uploaded file object (must expose ``.name`` and be readable)

    Returns
    -------
    pandas.DataFrame with columns ``['Slank', 'Formal']``, or ``None`` when
    the upload is not a ``.txt`` file (an error is shown in the Streamlit UI).
    """
    if not file.name.endswith('.txt'):
        st.error("Format file tidak didukung. Harap unggah file TXT.")
        return None
    # Each line is "slang;formal"; the file has no header row, so the column
    # names are supplied here (the original re-assigned them redundantly).
    return pd.read_csv(file, sep=';', header=None, names=['Slank', 'Formal'])
def replace_slank_to_formal(sentence, slank_formal_df):
    """Replace each slang token in *sentence* with its formal equivalent.

    Tokens are looked up (lower-cased) against the ``Slank`` column; tokens
    without a mapping pass through unchanged. The result is re-joined with
    single spaces, so original spacing is not preserved.
    """
    # Build the lookup table once instead of scanning the DataFrame for every
    # word (the original did an O(rows) boolean-mask lookup per token).
    # setdefault keeps the FIRST entry on duplicate slang keys, matching the
    # original's `.values[0]` first-match behaviour.
    mapping = {}
    for slang, formal in zip(slank_formal_df['Slank'], slank_formal_df['Formal']):
        mapping.setdefault(slang, formal)
    # Words, punctuation runs, and non-ASCII runs (e.g. emoji) are separate tokens.
    words = re.findall(r'[\w\',./:-]+|[.,]+|[^\x00-\x7F]+', sentence)
    return ' '.join(str(mapping.get(word.lower(), word)) for word in words)
def preprocess_text(text, slank_formal_df):
    """Normalize raw Indonesian text for classification.

    Pipeline: lowercase, strip URLs/mentions/'#' marks, collapse repeated
    punctuation and characters, pad emoji and punctuation with spaces,
    re-attach separators inside numbers, substitute slang with formal words,
    then re-tokenize with NLTK. The regex steps are order-dependent.
    """
    cleaned = text.lower()
    # Strip URLs, then @mentions and bare '#' markers.
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)
    cleaned = re.sub(r'\@\w+|\#', '', cleaned)
    # Collapse runs of repeated punctuation (emoji codepoints excluded).
    cleaned = re.sub(r'([^\w\s\U0001F000-\U0001F9FF])\1+', r'\1', cleaned)
    # Surround emoji with spaces so each becomes its own token.
    cleaned = re.sub(r'([\U0001F600-\U0001F64F\U0001F900-\U0001F9FF\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F])', r' \1 ', cleaned)
    # Separate '.', ',' and '&', '%' from adjacent words.
    cleaned = re.sub(r'([.,])', r' \1 ', cleaned)
    cleaned = re.sub(r'[&%]', lambda m: f' {m.group()} ', cleaned)
    # Squeeze any character repeated 2+ times down to exactly two.
    cleaned = re.sub(r'(\w)\1{1,}', r'\1\1', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    # Tighten reduplicated words: "jalan - jalan" -> "jalan-jalan".
    cleaned = re.sub(r'\b(\w+)\b\s*-\s*\b\1\b', r'\1-\1', cleaned)
    # Re-attach thousands/decimal separators inside numbers: "1 . 000" -> "1.000".
    cleaned = re.sub(r'(?<=\d)\s*\.\s*(?=\d)', '.', cleaned)
    cleaned = re.sub(r'(?<=\d)\s*,\s*(?=\d)', ',', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    cleaned = replace_slank_to_formal(cleaned, slank_formal_df)
    return ' '.join(word_tokenize(cleaned))
def generate_wordcloud(text, font_path, colormap, title):
    """Render a circular word cloud of *text*, save it as ``<title>.png``,
    display it in Streamlit, and offer the image for download.

    Indonesian stopwords are removed before drawing; at most 100 words are
    rendered at 1400x1400 px.
    """
    # Drop Indonesian stopwords so function words do not dominate the cloud.
    indo_stopwords = set(stopwords.words('indonesian'))
    kept = [w for w in text.split() if w.lower() not in indo_stopwords]
    if not kept:
        # BUG FIX: WordCloud.generate raises ValueError on empty input,
        # crashing the whole app when a sentiment/emotion bucket is empty.
        st.warning(f"Tidak ada kata untuk '{title}'.")
        return
    filtered_text = ' '.join(kept)
    # Circular mask: 255 (masked/white) outside a radius-630 circle centred
    # at (700, 700) on a 1400x1400 canvas.
    x, y = np.ogrid[:1400, :1400]
    mask = (x - 700) ** 2 + (y - 700) ** 2 > 630 ** 2
    mask = 255 * mask.astype(int)
    wordcloud = WordCloud(
        width=1400,
        height=1400,
        background_color='white',
        font_path=font_path,
        prefer_horizontal=1.0,
        colormap=colormap,
        max_words=100,
        mask=mask,
    ).generate(filtered_text)
    # High-resolution matplotlib render with the title baked into the plot.
    plt.figure(figsize=(14, 14))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=20, pad=20)
    plt.savefig(f"{title}.png", dpi=300, bbox_inches='tight', pad_inches=0.1)
    # BUG FIX: close the figure — Streamlit reruns this function repeatedly
    # and unclosed figures accumulate in matplotlib's global state (leak).
    plt.close()
    # Show the saved image and a download link for it.
    st.image(f"{title}.png", use_column_width=True)
    st.markdown(get_image_download_link(f"{title}.png"), unsafe_allow_html=True)
def analyze_sentiment(text):
    """Classify *text* with the Indonesian sentiment model.

    Returns a ``(label, score)`` tuple; the label is lower-cased.
    """
    prediction = sentiment_pipe(text)[0]
    return prediction['label'].lower(), prediction['score']
def analyze_emotion(text):
    """Classify *text* with the Indonesian emotion model.

    Returns a ``(label, score)`` tuple; the label is lower-cased.
    """
    prediction = emotion_pipe(text)[0]
    return prediction['label'].lower(), prediction['score']
def get_download_link(df, filename):
    """Return an HTML anchor that downloads *df* as ``<filename>.csv``.

    The CSV is embedded in the link as a base64 data URI, so nothing is
    written to disk.
    """
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    # BUG FIX: the download name ignored the `filename` parameter and was
    # hard-coded; use the caller-supplied name.
    return f'<a href="data:file/csv;base64,{b64}" download="{filename}.csv">Download CSV</a>'
def get_word_freq_download_link(word_freq_df):
    """Return an HTML anchor that downloads the word-frequency table as CSV
    (index included), embedded as a base64 data URI."""
    encoded = base64.b64encode(word_freq_df.to_csv(index=True).encode()).decode()
    return (
        f'<a href="data:file/csv;base64,{encoded}" '
        f'download="word_frequency.csv">Download Word Frequency CSV</a>'
    )
def get_example_download_link(file_path, link_text):
    """Return an HTML anchor (labelled *link_text*) that downloads the file
    at *file_path*, embedded as a base64 data URI; the download name is the
    file's basename."""
    with open(file_path, "rb") as fh:
        payload = base64.b64encode(fh.read()).decode()
    name = os.path.basename(file_path)
    return f'<a href="data:file/txt;base64,{payload}" download="{name}">{link_text}</a>'
def get_image_download_link(image_path):
    """Return an HTML anchor that downloads the image at *image_path*,
    embedded as a base64 data URI.

    Note: the download attribute deliberately carries the full *image_path*
    (matching how callers label the link with the same path).
    """
    with open(image_path, "rb") as fh:
        payload = base64.b64encode(fh.read()).decode()
    return (
        f'<a href="data:file/png;base64,{payload}" '
        f'download="{image_path}">Download {image_path}</a>'
    )
def _render_distribution_chart(counts, title, color_map, avg_label, avg_value):
    """Draw one centred 400x400 pie chart with an average-score annotation."""
    fig = px.pie(
        counts,
        values=counts.values,
        names=counts.index,
        title=title,
        width=400,
        height=400,
        color=counts.index,
        color_discrete_map=color_map,
    )
    # Average score shown below the pie, anchored in paper coordinates.
    fig.add_annotation(
        text=f"{avg_label}: {avg_value:.4f}",
        xref="paper", yref="paper",
        x=0.5, y=-0.2,
        showarrow=False,
        font=dict(size=18),
    )
    st.markdown('<div class="chart-container">', unsafe_allow_html=True)
    st.plotly_chart(fig, use_container_width=True)
    st.markdown('</div>', unsafe_allow_html=True)


def _render_analysis(df):
    """Shared presentation for both analysis entry points: sentiment/emotion
    pie charts, three word clouds, and the word-frequency table.

    *df* must carry the columns 'Cleaned Content', 'Sentiment',
    'Score Sentiment', 'Emotion' and 'Score Emotion'.
    (This body was previously duplicated verbatim in combined_analysis and
    process_file.)
    """
    # Custom CSS: centre the charts and cap the rendered plot height.
    st.markdown(
        """
        <style>
        .chart-container {
            display: flex;
            justify-content: center;
        }
        .user-select-none.svg-container {
            height: 360px !important;
        }
        .average-score {
            text-align: center;
        }
        </style>
        """,
        unsafe_allow_html=True
    )
    # Sentiment distribution.
    sentiment_colors = {
        'positive': px.colors.qualitative.Set3[0],
        'negative': px.colors.qualitative.Set3[3],
        'neutral': px.colors.qualitative.Set3[1],
    }
    _render_distribution_chart(
        df['Sentiment'].value_counts(),
        'Sentiment Distribution',
        sentiment_colors,
        'Average Sentiment Score',
        df['Score Sentiment'].mean(),
    )
    # Emotion distribution (labels are the Indonesian emotion classes).
    emotion_colors = {
        'marah': px.colors.qualitative.Safe[9],
        'sedih': px.colors.qualitative.Safe[1],
        'senang': px.colors.qualitative.Safe[0],
        'cinta': px.colors.qualitative.Safe[2],
        'jijik': px.colors.qualitative.Safe[6],
        'takut': px.colors.qualitative.Safe[7],
    }
    _render_distribution_chart(
        df['Emotion'].value_counts(),
        'Emotion Distribution',
        emotion_colors,
        'Average Emotion Score',
        df['Score Emotion'].mean(),
    )
    # Word clouds: overall, positive+happy, and negative+angry/sad subsets.
    font_path = os.path.join('assets', 'Poppins-Regular.ttf')
    overall_text = ' '.join(df['Cleaned Content'].dropna())
    generate_wordcloud(overall_text, font_path, 'hsv_r', 'Overall Word Cloud')
    positive_happy_text = ' '.join(
        df[(df['Sentiment'] == 'positive') & (df['Emotion'] == 'senang')]['Cleaned Content'].dropna())
    generate_wordcloud(positive_happy_text, font_path, 'gist_rainbow_r',
                       'Positive Sentiment & Happy Emotion Word Cloud')
    negative_angry_sad_text = ' '.join(
        df[(df['Sentiment'] == 'negative') & (df['Emotion'].isin(['marah', 'sedih']))]['Cleaned Content'].dropna())
    generate_wordcloud(negative_angry_sad_text, font_path, 'inferno',
                       'Negative Sentiment & Angry or Sad Emotion Word Cloud')
    # Word-frequency table plus a CSV download link.
    word_freq = pd.Series(' '.join(df['Cleaned Content'].dropna()).split()).value_counts()
    st.write("Word Frequency:")
    st.write(word_freq)
    word_freq_df = word_freq.reset_index()
    word_freq_df.columns = ['Word', 'Frequency']
    st.markdown(get_word_freq_download_link(word_freq_df), unsafe_allow_html=True)


def combined_analysis(text, slank_formal_df):
    """Analyse newline-separated free text: classify each non-blank line for
    sentiment and emotion, render the shared visualisations, and return the
    result table.

    Returns a DataFrame with columns ['Content', 'Cleaned Content',
    'Sentiment', 'Score Sentiment', 'Emotion', 'Score Emotion'].
    """
    results = []
    for line in text.split('\n'):
        if not line.strip():
            continue  # skip blank lines
        cleaned = preprocess_text(line, slank_formal_df)
        sentiment, score_sentiment = analyze_sentiment(cleaned)
        emotion, score_emotion = analyze_emotion(cleaned)
        results.append((line, cleaned, sentiment, score_sentiment, emotion, score_emotion))
    df = pd.DataFrame(
        results,
        columns=['Content', 'Cleaned Content', 'Sentiment', 'Score Sentiment', 'Emotion', 'Score Emotion'])
    _render_analysis(df)
    return df


def process_file(file, slank_formal_df):
    """Analyse an uploaded CSV/XLSX that has a 'content' column: classify each
    textual row, render the shared visualisations, and return the frame
    augmented with the analysis columns.

    Returns None (after showing a UI error) for unsupported file types.
    """
    if file.name.endswith('.xlsx'):
        df = pd.read_excel(file)
    elif file.name.endswith('.csv'):
        df = pd.read_csv(file)
    else:
        st.error("Format file tidak didukung. Harap unggah file CSV atau XLSX.")
        return None
    results = []
    for _, row in df.iterrows():
        content = row['content']
        if pd.notna(content) and isinstance(content, str):
            cleaned = preprocess_text(content, slank_formal_df)
            sentiment, score_sentiment = analyze_sentiment(cleaned)
            emotion, score_emotion = analyze_emotion(cleaned)
            results.append((content, cleaned, sentiment, score_sentiment, emotion, score_emotion))
        else:
            # Non-text rows keep their position but get empty analysis columns.
            results.append((content, None, None, None, None, None))
    df['Cleaned Content'] = [r[1] for r in results]
    df['Sentiment'] = [r[2] for r in results]
    df['Score Sentiment'] = [r[3] for r in results]
    df['Emotion'] = [r[4] for r in results]
    df['Score Emotion'] = [r[5] for r in results]
    _render_analysis(df)
    return df
def main():
    """Streamlit entry point: require a slang dictionary upload, then run
    either direct-text analysis or file-based analysis from the sidebar menu."""
    st.title("Aplikasi Analisis Sentimen dan Prediksi Emosi")
    # Example slang dictionary the user can download and adapt.
    slank_template_path = "assets/contoh template data slank.txt"
    st.markdown(get_example_download_link(slank_template_path, "Download Contoh Template Data Slank (TXT)"),
                unsafe_allow_html=True)
    slank_file = st.file_uploader("Upload file slank dengan baris pertama Slank;Formal (TXT)", type=["txt"])
    # Guard clauses: nothing below runs without a valid slang dictionary.
    if slank_file is None:
        st.warning("Harap upload file slank terlebih dahulu.")
        st.stop()
    df_slank_formal = load_slank_formal(slank_file)
    if df_slank_formal is None:
        st.stop()
    menu = st.sidebar.selectbox("Pilih Metode", ["Analisis Langsung", "Import dari File"])
    if menu == "Analisis Langsung":
        user_input = st.text_area("Masukkan teks yang ingin dianalisis (pisahkan dengan enter):")
        if st.button("Analisis"):
            df = combined_analysis(user_input, df_slank_formal)
            st.write("Hasil Analisis:")
            st.write(df)
            st.markdown(get_download_link(df, "analisis_sentimen_emosi"), unsafe_allow_html=True)
    elif menu == "Import dari File":
        # Example content file the user can download and adapt.
        content_template_path = "assets/contoh template data content.xlsx"
        st.markdown(get_example_download_link(content_template_path, "Download Contoh Template Data Content (XLSX)"),
                    unsafe_allow_html=True)
        uploaded_file = st.file_uploader("Upload file CSV atau XLSX", type=["csv", "xlsx"])
        if uploaded_file is not None:
            df = process_file(uploaded_file, df_slank_formal)
            st.write("Hasil Analisis:")
            st.write(df)
            st.markdown(get_download_link(df, "analisis_sentimen_emosi"), unsafe_allow_html=True)


if __name__ == '__main__':
    main()