"""Streamlit app: sentiment and emotion analysis for Indonesian text.

Users upload a slang->formal dictionary (TXT, ``Slank;Formal`` per line),
then analyze either pasted text (one entry per line) or an uploaded
CSV/XLSX file with a ``content`` column.  Results are shown as a table,
pie charts, word clouds, and word-frequency listings, each downloadable.
"""

import base64
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from PIL import ImageFont  # noqa: F401  (kept from original; may be used elsewhere)
from transformers import pipeline
from wordcloud import WordCloud

# NLTK resources needed by word_tokenize / stopwords below.
nltk.download('punkt')
nltk.download('stopwords')

# Heavyweight Hugging Face pipelines, loaded once at module import.
sentiment_pipe = pipeline("text-classification", model="dhanikitkat/indo_smsa-1.5G_sentiment_analysis")
emotion_pipe = pipeline("text-classification", model="azizp128/prediksi-emosi-indobert")

# Color maps shared by the sentiment / emotion pie charts.
SENTIMENT_COLORS = {
    'positive': px.colors.qualitative.Set3[0],
    'negative': px.colors.qualitative.Set3[3],
    'neutral': px.colors.qualitative.Set3[1],
}
EMOTION_COLORS = {
    'marah': px.colors.qualitative.Safe[9],
    'sedih': px.colors.qualitative.Safe[1],
    'senang': px.colors.qualitative.Safe[0],
    'cinta': px.colors.qualitative.Safe[2],
    'jijik': px.colors.qualitative.Safe[6],
    'takut': px.colors.qualitative.Safe[7],
}


def load_slank_formal(file):
    """Load the slang->formal dictionary from an uploaded ``;``-separated TXT.

    Returns a DataFrame with columns ``['Slank', 'Formal']``, or ``None``
    (after showing a Streamlit error) for unsupported file types.
    """
    if not file.name.endswith('.txt'):
        st.error("Format file tidak didukung. Harap unggah file TXT.")
        return None
    # names= already labels the columns; no further renaming needed.
    return pd.read_csv(file, sep=';', header=None, names=['Slank', 'Formal'])


def replace_slank_to_formal(sentence, slank_formal_df):
    """Replace each slang token in *sentence* with its formal equivalent.

    Tokens are matched case-insensitively against the ``Slank`` column;
    tokens with no dictionary entry are kept as-is.
    """
    words = re.findall(r'[\w\',./:-]+|[.,]+|[^\x00-\x7F]+', sentence)
    for i, word in enumerate(words):
        replacement = slank_formal_df.loc[
            slank_formal_df['Slank'] == word.lower(), 'Formal'
        ].values
        if replacement.size > 0:
            words[i] = str(replacement[0])
    return ' '.join(words)


def preprocess_text(text, slank_formal_df):
    """Normalize raw text: strip URLs/mentions, space out emoji and
    punctuation, collapse repeats, apply slang replacement, and re-tokenize.
    """
    text = text.lower()
    # Drop URLs and social-media artifacts (@mentions, hashtags marks).
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    # Collapse runs of repeated punctuation (emoji excluded from the class).
    text = re.sub(r'([^\w\s\U0001F000-\U0001F9FF])\1+', r'\1', text)
    # Surround emoji with spaces so they become standalone tokens.
    text = re.sub(r'([\U0001F600-\U0001F64F\U0001F900-\U0001F9FF\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F])', r' \1 ', text)
    # Space out ., and &% as separate tokens.
    text = re.sub(r'([.,])', r' \1 ', text)
    text = re.sub(r'[&%]', lambda x: f' {x.group()} ', text)
    # Squeeze letter runs of length >2 down to exactly two (e.g. "baguuus" -> "baguus").
    text = re.sub(r'(\w)\1{1,}', r'\1\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Re-join Indonesian reduplication written with spaced hyphens ("anak - anak").
    text = re.sub(r'\b(\w+)\b\s*-\s*\b\1\b', r'\1-\1', text)
    # Re-join numeric separators split by the punctuation spacing above.
    text = re.sub(r'(?<=\d)\s*\.\s*(?=\d)', '.', text)
    text = re.sub(r'(?<=\d)\s*,\s*(?=\d)', ',', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = replace_slank_to_formal(text, slank_formal_df)
    return ' '.join(word_tokenize(text))


def generate_wordcloud(text, font_path, colormap, title):
    """Render a circular word cloud, save it as ``{title}.png``, display it
    in Streamlit, and offer a download link.
    """
    # Circular mask at 1400x1400; points outside radius 630 are masked out.
    x, y = np.ogrid[:1400, :1400]
    mask = (x - 700) ** 2 + (y - 700) ** 2 > 630 ** 2
    mask = 255 * mask.astype(int)

    # Drop Indonesian stopwords before building the cloud.
    indo_stopwords = set(stopwords.words('indonesian'))
    text = ' '.join(w for w in text.split() if w.lower() not in indo_stopwords)

    wordcloud = WordCloud(
        width=1400,
        height=1400,
        background_color='white',
        font_path=font_path,
        prefer_horizontal=1.0,
        colormap=colormap,
        max_words=100,
        mask=mask,
    ).generate(text)

    # High-DPI matplotlib rendering for crisp output.
    plt.figure(figsize=(14, 14))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=20, pad=20)
    plt.savefig(f"{title}.png", dpi=300, bbox_inches='tight', pad_inches=0.1)

    st.image(f"{title}.png", use_column_width=True)
    st.markdown(get_image_download_link(f"{title}.png"), unsafe_allow_html=True)


def analyze_sentiment(text):
    """Return ``(label, score)`` from the sentiment pipeline, label lowercased."""
    result = sentiment_pipe(text)[0]
    return result['label'].lower(), result['score']


def analyze_emotion(text):
    """Return ``(label, score)`` from the emotion pipeline, label lowercased."""
    result = emotion_pipe(text)[0]
    return result['label'].lower(), result['score']


# NOTE(review): in the four link builders below, the original source computed
# a base64 payload but the f-strings contained no placeholders — the HTML
# anchors were evidently stripped from the file, leaving dead links.  The
# data-URI anchors are reconstructed here so the payload is actually used.

def get_download_link(df, filename):
    """Build an HTML anchor that downloads *df* as ``{filename}.csv``."""
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    return (f'<a href="data:file/csv;base64,{b64}" '
            f'download="{filename}.csv">Download CSV</a>')


def get_word_freq_download_link(word_freq_df):
    """Build an HTML anchor that downloads the word-frequency table as CSV."""
    csv = word_freq_df.to_csv(index=True)
    b64 = base64.b64encode(csv.encode()).decode()
    return (f'<a href="data:file/csv;base64,{b64}" '
            f'download="word_frequency.csv">Download Word Frequency CSV</a>')


def get_example_download_link(file_path, link_text):
    """Build an HTML anchor that downloads the template file at *file_path*."""
    with open(file_path, "rb") as file:
        b64 = base64.b64encode(file.read()).decode()
    return (f'<a href="data:application/octet-stream;base64,{b64}" '
            f'download="{os.path.basename(file_path)}">{link_text}</a>')


def get_image_download_link(image_path):
    """Build an HTML anchor that downloads the PNG at *image_path*."""
    with open(image_path, "rb") as image_file:
        b64 = base64.b64encode(image_file.read()).decode()
    return (f'<a href="data:image/png;base64,{b64}" '
            f'download="{os.path.basename(image_path)}">Download {image_path}</a>')


def _render_pie(counts, color_map, title, score_label, average):
    """Render one pie chart with an average-score annotation below it."""
    fig = px.pie(
        counts,
        values=counts.values,
        names=counts.index,
        title=title,
        width=400,
        height=400,
        color=counts.index,
        color_discrete_map=color_map,
    )
    fig.add_annotation(
        text=f"{score_label}: {average:.4f}",
        xref="paper", yref="paper",
        x=0.5, y=-0.2,
        showarrow=False,
        font=dict(size=18),
    )
    st.plotly_chart(fig, use_container_width=True)


def _render_analysis(df):
    """Render the shared result views: pie charts, word clouds, word frequency.

    Previously duplicated verbatim in both combined_analysis and process_file.
    NOTE(review): the original wrapped the charts in st.markdown() calls whose
    HTML/CSS payloads were stripped from the source; those empty wrappers are
    omitted here rather than guessed at.
    """
    _render_pie(df['Sentiment'].value_counts(), SENTIMENT_COLORS,
                'Sentiment Distribution', 'Average Sentiment Score',
                df['Score Sentiment'].mean())
    _render_pie(df['Emotion'].value_counts(), EMOTION_COLORS,
                'Emotion Distribution', 'Average Emotion Score',
                df['Score Emotion'].mean())

    # Word clouds: overall, positive+happy, and negative+angry/sad subsets.
    font_path = os.path.join('assets', 'Poppins-Regular.ttf')
    overall_text = ' '.join(df['Cleaned Content'].dropna())
    generate_wordcloud(overall_text, font_path, 'hsv_r', 'Overall Word Cloud')

    positive_happy_text = ' '.join(
        df[(df['Sentiment'] == 'positive') & (df['Emotion'] == 'senang')]['Cleaned Content'].dropna())
    generate_wordcloud(positive_happy_text, font_path, 'gist_rainbow_r',
                       'Positive Sentiment & Happy Emotion Word Cloud')

    negative_angry_sad_text = ' '.join(
        df[(df['Sentiment'] == 'negative') & (df['Emotion'].isin(['marah', 'sedih']))]['Cleaned Content'].dropna())
    generate_wordcloud(negative_angry_sad_text, font_path, 'inferno',
                       'Negative Sentiment & Angry or Sad Emotion Word Cloud')

    # Word frequency table plus CSV download.
    word_freq = pd.Series(' '.join(df['Cleaned Content'].dropna()).split()).value_counts()
    st.write("Word Frequency:")
    st.write(word_freq)
    word_freq_df = word_freq.reset_index()
    word_freq_df.columns = ['Word', 'Frequency']
    st.markdown(get_word_freq_download_link(word_freq_df), unsafe_allow_html=True)


def combined_analysis(text, slank_formal_df):
    """Analyze newline-separated *text*, render all views, return results df."""
    results = []
    for line in text.split('\n'):  # renamed from `text` to avoid shadowing the parameter
        if not line.strip():
            continue
        cleaned_text = preprocess_text(line, slank_formal_df)
        sentiment_result = sentiment_pipe(cleaned_text)[0]
        emotion_result = emotion_pipe(cleaned_text)[0]
        results.append((line, cleaned_text,
                        sentiment_result['label'].lower(), sentiment_result['score'],
                        emotion_result['label'].lower(), emotion_result['score']))
    df = pd.DataFrame(results, columns=['Content', 'Cleaned Content', 'Sentiment',
                                        'Score Sentiment', 'Emotion', 'Score Emotion'])
    _render_analysis(df)
    return df


def process_file(file, slank_formal_df):
    """Analyze the ``content`` column of an uploaded CSV/XLSX, render all
    views, and return the augmented DataFrame (or ``None`` on bad file type).
    """
    if file.name.endswith('.xlsx'):
        df = pd.read_excel(file)
    elif file.name.endswith('.csv'):
        df = pd.read_csv(file)
    else:
        st.error("Format file tidak didukung. Harap unggah file CSV atau XLSX.")
        return None

    results = []
    for _, row in df.iterrows():
        content = row['content']
        if pd.notna(content) and isinstance(content, str):
            cleaned_text = preprocess_text(content, slank_formal_df)
            sentiment, score_sentiment = analyze_sentiment(cleaned_text)
            emotion, score_emotion = analyze_emotion(cleaned_text)
            results.append((content, cleaned_text, sentiment, score_sentiment,
                            emotion, score_emotion))
        else:
            # Non-text rows are kept so the output aligns with the input.
            results.append((content, None, None, None, None, None))

    df['Cleaned Content'] = [r[1] for r in results]
    df['Sentiment'] = [r[2] for r in results]
    df['Score Sentiment'] = [r[3] for r in results]
    df['Emotion'] = [r[4] for r in results]
    df['Score Emotion'] = [r[5] for r in results]

    _render_analysis(df)
    return df


def main():
    """Streamlit entry point: dictionary upload, then direct or file analysis."""
    st.title("Aplikasi Analisis Sentimen dan Prediksi Emosi")

    # Slang dictionary template + upload (required before any analysis).
    slank_template_path = "assets/contoh template data slank.txt"
    st.markdown(get_example_download_link(slank_template_path,
                                          "Download Contoh Template Data Slank (TXT)"),
                unsafe_allow_html=True)
    slank_file = st.file_uploader("Upload file slank dengan baris pertama Slank;Formal (TXT)",
                                  type=["txt"])
    if slank_file is not None:
        df_slank_formal = load_slank_formal(slank_file)
        if df_slank_formal is None:
            st.stop()
    else:
        st.warning("Harap upload file slank terlebih dahulu.")
        st.stop()

    menu = st.sidebar.selectbox("Pilih Metode", ["Analisis Langsung", "Import dari File"])
    if menu == "Analisis Langsung":
        user_input = st.text_area("Masukkan teks yang ingin dianalisis (pisahkan dengan enter):")
        if st.button("Analisis"):
            df = combined_analysis(user_input, df_slank_formal)
            st.write("Hasil Analisis:")
            st.write(df)
            st.markdown(get_download_link(df, "analisis_sentimen_emosi"),
                        unsafe_allow_html=True)
    elif menu == "Import dari File":
        content_template_path = "assets/contoh template data content.xlsx"
        st.markdown(get_example_download_link(content_template_path,
                                              "Download Contoh Template Data Content (XLSX)"),
                    unsafe_allow_html=True)
        uploaded_file = st.file_uploader("Upload file CSV atau XLSX", type=["csv", "xlsx"])
        if uploaded_file is not None:
            df = process_file(uploaded_file, df_slank_formal)
            st.write("Hasil Analisis:")
            st.write(df)
            st.markdown(get_download_link(df, "analisis_sentimen_emosi"),
                        unsafe_allow_html=True)


if __name__ == '__main__':
    main()