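"""Streamlit app for Indonesian sentiment analysis and emotion prediction.

Normalizes slang with a user-supplied Slank;Formal dictionary, cleans the text,
classifies sentiment and emotion with two Hugging Face text-classification
pipelines, and renders pie charts, word clouds, word frequencies, and CSV
download links.
"""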
import streamlit as st
import pandas as pd
from transformers import pipeline
import base64
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
import os
nltk.download('punkt')
nltk.download('stopwords')
# Load pipelines
sentiment_pipe = pipeline("text-classification", model="dhanikitkat/indo_smsa-1.5G_sentiment_analysis")
emotion_pipe = pipeline("text-classification", model="azizp128/prediksi-emosi-indobert")
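# Note: Streamlit reruns this script on every interaction, so the pipelines above
# are rebuilt each time. A cached loader is one possible refinement (a minimal
# sketch, assuming streamlit>=1.18, which provides st.cache_resource):
#
# @st.cache_resource
# def load_pipelines():
#     return (
#         pipeline("text-classification", model="dhanikitkat/indo_smsa-1.5G_sentiment_analysis"),
#         pipeline("text-classification", model="azizp128/prediksi-emosi-indobert"),
#     )

# Load the slang dictionary from a semicolon-separated TXT file of Slank;Formal pairs.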
def load_slank_formal(file):
if file.name.endswith('.txt'):
df = pd.read_csv(file, sep=';', header=None, names=['Slank', 'Formal'])
else:
st.error("Format file tidak didukung. Harap unggah file TXT.")
return None
return df
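# Replace every slang token in a sentence with its formal equivalent from the dictionary.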
def replace_slank_to_formal(sentence, slank_formal_df):
    # Split into word-like tokens, punctuation runs, and non-ASCII runs (e.g. emoji)
    words = re.findall(r'[\w\',./:-]+|[.,]+|[^\x00-\x7F]+', sentence)
    for i, word in enumerate(words):
        # Look up the lowercased token in the dictionary and substitute its formal form
        replacement = slank_formal_df.loc[slank_formal_df['Slank'] == word.lower(), 'Formal'].values
        if replacement.size > 0:
            words[i] = str(replacement[0])
    return ' '.join(words)
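# Clean and normalize raw text before classification: lowercase, strip URLs and
# mentions, space out emoji and punctuation, collapse character repeats, map slang
# to formal words, then tokenize.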
def preprocess_text(text, slank_formal_df):
    text = text.lower()
    # Remove URLs, @mentions, and '#' symbols
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    # Collapse repeated punctuation (emoji excluded) into a single character
    text = re.sub(r'([^\w\s\U0001F000-\U0001F9FF])\1+', r'\1', text)
    # Surround emoji, '.', ',', '&', and '%' with spaces so they become separate tokens
    text = re.sub(r'([\U0001F600-\U0001F64F\U0001F900-\U0001F9FF\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F])', r' \1 ', text)
    text = re.sub(r'([.,])', r' \1 ', text)
    text = re.sub(r'[&%]', lambda x: f' {x.group()} ', text)
    # Collapse runs of a repeated character down to two (e.g. "baguuus" -> "baguus")
    text = re.sub(r'(\w)\1{1,}', r'\1\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Normalize reduplicated words such as "jalan - jalan" -> "jalan-jalan"
    text = re.sub(r'\b(\w+)\b\s*-\s*\b\1\b', r'\1-\1', text)
    # Re-join digits split around '.' and ',' (decimal and thousands separators)
    text = re.sub(r'(?<=\d)\s*\.\s*(?=\d)', '.', text)
    text = re.sub(r'(?<=\d)\s*,\s*(?=\d)', ',', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Replace slang with formal words, then tokenize and re-join
    text = replace_slank_to_formal(text, slank_formal_df)
    tokens = word_tokenize(text)
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
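# Render a circular 1400x1400 word cloud (Indonesian stopwords removed), save it as
# "<title>.png", display it in Streamlit, and add a download link for the image.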
def generate_wordcloud(text, font_path, colormap, title):
    # Build a circular mask on a 1400x1400 canvas
    x, y = np.ogrid[:1400, :1400]
    mask = (x - 700) ** 2 + (y - 700) ** 2 > 630 ** 2  # mask out points outside a radius-630 circle
mask = 255 * mask.astype(int)
# Remove Indonesian stopwords
indo_stopwords = set(stopwords.words('indonesian'))
words = text.split()
words = [word for word in words if word.lower() not in indo_stopwords]
text = ' '.join(words)
wordcloud = WordCloud(
width=1400,
height=1400,
background_color='white',
font_path=font_path,
prefer_horizontal=1.0,
colormap=colormap,
max_words=100,
mask=mask
).generate(text)
# Configure plot settings for high-quality output
plt.figure(figsize=(14, 14)) # Adjusted figure size for 1400x1400 resolution
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title(title, fontsize=20, pad=20) # Title directly in matplotlib plot
# Save word cloud to file with high DPI for better quality
plt.savefig(f"{title}.png", dpi=300, bbox_inches='tight', pad_inches=0.1)
# Display word cloud in Streamlit
st.image(f"{title}.png", use_column_width=True)
# Add download link for word cloud
st.markdown(get_image_download_link(f"{title}.png"), unsafe_allow_html=True)
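# Thin wrappers around the two pipelines: return the top label (lowercased) and its score.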
def analyze_sentiment(text):
result = sentiment_pipe(text)[0]
return result['label'].lower(), result['score']
def analyze_emotion(text):
result = emotion_pipe(text)[0]
return result['label'].lower(), result['score']
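# Download-link helpers: encode a DataFrame, file, or image as base64 and return an
# HTML anchor for st.markdown(..., unsafe_allow_html=True).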
def get_download_link(df, filename):
csv = df.to_csv(index=False)
b64 = base64.b64encode(csv.encode()).decode()
href = f'<a href="data:file/csv;base64,{b64}" download="{filename}.csv">Download CSV</a>'
return href
def get_word_freq_download_link(word_freq_df):
csv = word_freq_df.to_csv(index=True)
b64 = base64.b64encode(csv.encode()).decode()
href = f'<a href="data:file/csv;base64,{b64}" download="word_frequency.csv">Download Word Frequency CSV</a>'
return href
def get_example_download_link(file_path, link_text):
with open(file_path, "rb") as file:
b64 = base64.b64encode(file.read()).decode()
return f'<a href="data:file/txt;base64,{b64}" download="{os.path.basename(file_path)}">{link_text}</a>'
def get_image_download_link(image_path):
with open(image_path, "rb") as image_file:
b64 = base64.b64encode(image_file.read()).decode()
href = f'<a href="data:file/png;base64,{b64}" download="{image_path}">Download {image_path}</a>'
return href
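# Analyze newline-separated text entered by the user: classify each non-empty line,
# then render sentiment/emotion pie charts, word clouds, and a word-frequency table.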
def combined_analysis(text, slank_formal_df):
texts = text.split('\n')
results = []
for text in texts:
if text.strip():
cleaned_text = preprocess_text(text, slank_formal_df)
sentiment_result = sentiment_pipe(cleaned_text)[0]
emotion_result = emotion_pipe(cleaned_text)[0]
results.append((text, cleaned_text, sentiment_result['label'].lower(), sentiment_result['score'], emotion_result['label'].lower(), emotion_result['score']))
df = pd.DataFrame(results, columns=['Content', 'Cleaned Content', 'Sentiment', 'Score Sentiment', 'Emotion', 'Score Emotion'])
# Define custom CSS to adjust the height
st.markdown(
"""
<style>
.chart-container {
display: flex;
justify-content: center;
}
.user-select-none.svg-container {
height: 360px !important;
}
.average-score {
text-align: center;
}
</style>
""",
unsafe_allow_html=True
)
# Sentiment pie chart
sentiment_counts = df['Sentiment'].value_counts()
sentiment_colors = {
'positive': px.colors.qualitative.Set3[0],
'negative': px.colors.qualitative.Set3[3],
'neutral': px.colors.qualitative.Set3[1]
}
fig_sentiment = px.pie(
sentiment_counts,
values=sentiment_counts.values,
names=sentiment_counts.index,
title='Sentiment Distribution',
width=400,
height=400,
color=sentiment_counts.index,
color_discrete_map=sentiment_colors
)
# Calculate sentiment average
sentiment_average = df['Score Sentiment'].mean()
# Add average sentiment score as an annotation
fig_sentiment.add_annotation(
text=f"Average Sentiment Score: {sentiment_average:.4f}",
xref="paper", yref="paper",
x=0.5, y=-0.2,
showarrow=False,
font=dict(size=18)
)
st.markdown('<div class="chart-container">', unsafe_allow_html=True)
st.plotly_chart(fig_sentiment, use_container_width=True)
st.markdown('</div>', unsafe_allow_html=True)
    # Emotion pie chart
emotion_counts = df['Emotion'].value_counts()
emotion_colors = {
'marah': px.colors.qualitative.Safe[9],
'sedih': px.colors.qualitative.Safe[1],
'senang': px.colors.qualitative.Safe[0],
'cinta': px.colors.qualitative.Safe[2],
'jijik': px.colors.qualitative.Safe[6],
'takut': px.colors.qualitative.Safe[7],
}
fig_emotion = px.pie(
emotion_counts,
values=emotion_counts.values,
names=emotion_counts.index,
title='Emotion Distribution',
width=400,
height=400,
color=emotion_counts.index,
color_discrete_map=emotion_colors
)
# Calculate emotion average
emotion_average = df['Score Emotion'].mean()
# Add average emotion score as an annotation
fig_emotion.add_annotation(
text=f"Average Emotion Score: {emotion_average:.4f}",
xref="paper", yref="paper",
x=0.5, y=-0.2,
showarrow=False,
font=dict(size=18)
)
st.markdown('<div class="chart-container">', unsafe_allow_html=True)
st.plotly_chart(fig_emotion, use_container_width=True)
st.markdown('</div>', unsafe_allow_html=True)
# Generate word clouds
font_path = os.path.join('assets', 'Poppins-Regular.ttf')
    # Word clouds: overall, positive & 'senang', and negative & 'marah'/'sedih' segments
overall_text = ' '.join(df['Cleaned Content'].dropna())
generate_wordcloud(overall_text, font_path, 'hsv_r', 'Overall Word Cloud')
positive_happy_text = ' '.join(df[(df['Sentiment'] == 'positive') & (df['Emotion'] == 'senang')]['Cleaned Content'].dropna())
generate_wordcloud(positive_happy_text, font_path, 'gist_rainbow_r', 'Positive Sentiment & Happy Emotion Word Cloud')
negative_angry_sad_text = ' '.join(df[(df['Sentiment'] == 'negative') & (df['Emotion'].isin(['marah', 'sedih']))]['Cleaned Content'].dropna())
generate_wordcloud(negative_angry_sad_text, font_path, 'inferno', 'Negative Sentiment & Angry or Sad Emotion Word Cloud')
# Word frequency
word_freq = pd.Series(' '.join(df['Cleaned Content'].dropna()).split()).value_counts()
st.write("Word Frequency:")
st.write(word_freq)
# Download link for word frequency
word_freq_df = word_freq.reset_index()
word_freq_df.columns = ['Word', 'Frequency']
st.markdown(get_word_freq_download_link(word_freq_df), unsafe_allow_html=True)
return df
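# File-based counterpart of combined_analysis: expects a 'content' column in the
# uploaded CSV/XLSX, appends the analysis columns to the DataFrame, and renders the
# same charts, word clouds, and word-frequency output.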
def process_file(file, slank_formal_df):
if file.name.endswith('.xlsx'):
df = pd.read_excel(file)
elif file.name.endswith('.csv'):
df = pd.read_csv(file)
else:
st.error("Format file tidak didukung. Harap unggah file CSV atau XLSX.")
return None
results = []
for index, row in df.iterrows():
if pd.notna(row['content']) and isinstance(row['content'], str):
cleaned_text = preprocess_text(row['content'], slank_formal_df)
sentiment, score_sentiment = analyze_sentiment(cleaned_text)
emotion, score_emotion = analyze_emotion(cleaned_text)
results.append((row['content'], cleaned_text, sentiment, score_sentiment, emotion, score_emotion))
else:
results.append((row['content'], None, None, None, None, None))
df['Cleaned Content'] = [r[1] for r in results]
df['Sentiment'] = [r[2] for r in results]
df['Score Sentiment'] = [r[3] for r in results]
df['Emotion'] = [r[4] for r in results]
df['Score Emotion'] = [r[5] for r in results]
# Define custom CSS to adjust the height
st.markdown(
"""
<style>
.chart-container {
display: flex;
justify-content: center;
}
.user-select-none.svg-container {
height: 360px !important;
}
.average-score {
text-align: center;
}
</style>
""",
unsafe_allow_html=True
)
# Sentiment pie chart
sentiment_counts = df['Sentiment'].value_counts()
sentiment_colors = {
'positive': px.colors.qualitative.Set3[0],
'negative': px.colors.qualitative.Set3[3],
'neutral': px.colors.qualitative.Set3[1]
}
fig_sentiment = px.pie(
sentiment_counts,
values=sentiment_counts.values,
names=sentiment_counts.index,
title='Sentiment Distribution',
width=400,
height=400,
color=sentiment_counts.index,
color_discrete_map=sentiment_colors
)
# Calculate sentiment average
sentiment_average = df['Score Sentiment'].mean()
# Add average sentiment score as an annotation
fig_sentiment.add_annotation(
text=f"Average Sentiment Score: {sentiment_average:.4f}",
xref="paper", yref="paper",
x=0.5, y=-0.2,
showarrow=False,
font=dict(size=18)
)
st.markdown('<div class="chart-container">', unsafe_allow_html=True)
st.plotly_chart(fig_sentiment, use_container_width=True)
st.markdown('</div>', unsafe_allow_html=True)
    # Emotion pie chart
emotion_counts = df['Emotion'].value_counts()
emotion_colors = {
'marah': px.colors.qualitative.Safe[9],
'sedih': px.colors.qualitative.Safe[1],
'senang': px.colors.qualitative.Safe[0],
'cinta': px.colors.qualitative.Safe[2],
'jijik': px.colors.qualitative.Safe[6],
'takut': px.colors.qualitative.Safe[7],
}
fig_emotion = px.pie(
emotion_counts,
values=emotion_counts.values,
names=emotion_counts.index,
title='Emotion Distribution',
width=400,
height=400,
color=emotion_counts.index,
color_discrete_map=emotion_colors
)
# Calculate emotion average
emotion_average = df['Score Emotion'].mean()
# Add average emotion score as an annotation
fig_emotion.add_annotation(
text=f"Average Emotion Score: {emotion_average:.4f}",
xref="paper", yref="paper",
x=0.5, y=-0.2,
showarrow=False,
font=dict(size=18)
)
st.markdown('<div class="chart-container">', unsafe_allow_html=True)
st.plotly_chart(fig_emotion, use_container_width=True)
st.markdown('</div>', unsafe_allow_html=True)
# Generate word clouds
font_path = os.path.join('assets', 'Poppins-Regular.ttf')
    # Word clouds: overall, positive & 'senang', and negative & 'marah'/'sedih' segments
overall_text = ' '.join(df['Cleaned Content'].dropna())
generate_wordcloud(overall_text, font_path, 'hsv_r', 'Overall Word Cloud')
positive_happy_text = ' '.join(df[(df['Sentiment'] == 'positive') & (df['Emotion'] == 'senang')]['Cleaned Content'].dropna())
generate_wordcloud(positive_happy_text, font_path, 'gist_rainbow_r', 'Positive Sentiment & Happy Emotion Word Cloud')
negative_angry_sad_text = ' '.join(df[(df['Sentiment'] == 'negative') & (df['Emotion'].isin(['marah', 'sedih']))]['Cleaned Content'].dropna())
generate_wordcloud(negative_angry_sad_text, font_path, 'inferno', 'Negative Sentiment & Angry or Sad Emotion Word Cloud')
# Word frequency
word_freq = pd.Series(' '.join(df['Cleaned Content'].dropna()).split()).value_counts()
st.write("Word Frequency:")
st.write(word_freq)
# Download link for word frequency
word_freq_df = word_freq.reset_index()
word_freq_df.columns = ['Word', 'Frequency']
st.markdown(get_word_freq_download_link(word_freq_df), unsafe_allow_html=True)
return df
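# Streamlit entry point: requires a slang-dictionary upload, then offers direct text
# analysis or file import via the sidebar menu.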
def main():
st.title("Aplikasi Analisis Sentimen dan Prediksi Emosi")
# Add download link for example slank template
slank_template_path = "assets/contoh template data slank.txt"
st.markdown(get_example_download_link(slank_template_path, "Download Contoh Template Data Slank (TXT)"), unsafe_allow_html=True)
slank_file = st.file_uploader("Upload file slank dengan baris pertama Slank;Formal (TXT)", type=["txt"])
if slank_file is not None:
df_slank_formal = load_slank_formal(slank_file)
if df_slank_formal is None:
st.stop()
else:
st.warning("Harap upload file slank terlebih dahulu.")
st.stop()
menu = st.sidebar.selectbox("Pilih Metode", ["Analisis Langsung", "Import dari File"])
if menu == "Analisis Langsung":
user_input = st.text_area("Masukkan teks yang ingin dianalisis (pisahkan dengan enter):")
if st.button("Analisis"):
df = combined_analysis(user_input, df_slank_formal)
st.write("Hasil Analisis:")
st.write(df)
st.markdown(get_download_link(df, "analisis_sentimen_emosi"), unsafe_allow_html=True)
elif menu == "Import dari File":
# Add download link for example content template
content_template_path = "assets/contoh template data content.xlsx"
st.markdown(get_example_download_link(content_template_path, "Download Contoh Template Data Content (XLSX)"), unsafe_allow_html=True)
uploaded_file = st.file_uploader("Upload file CSV atau XLSX", type=["csv", "xlsx"])
if uploaded_file is not None:
df = process_file(uploaded_file, df_slank_formal)
st.write("Hasil Analisis:")
st.write(df)
st.markdown(get_download_link(df, "analisis_sentimen_emosi"), unsafe_allow_html=True)
if __name__ == '__main__':
main()