Spaces:
Runtime error
Runtime error
import pandas as pd | |
import numpy as np | |
from PIL import Image | |
import plotly.express as px | |
from wordcloud import WordCloud | |
import matplotlib.pyplot as plt | |
import string | |
import re #regex library | |
#umap | |
import umap | |
import hdbscan | |
import plotly.graph_objects as go | |
from bertopic import BERTopic | |
from sklearn.feature_extraction.text import CountVectorizer | |
# import word_tokenize from NLTK | |
from transformers import AutoTokenizer | |
from script.plotting import visualize_barchart | |
def load_stopwords(): | |
stopwords = pd.read_csv("assets/stopwordbahasa.csv", header=None) | |
stopwords = stopwords[0].tolist() | |
stopwords = stopwords + list(string.punctuation) | |
return stopwords | |
def tokenisasi(df): | |
stopwords = load_stopwords() | |
tokenizer = AutoTokenizer.from_pretrained('indobert') | |
tokens = df.content.apply(lambda x: tokenizer.tokenize(x)) | |
tokens = tokens.apply(lambda x: [x for x in x if (not x.startswith('##') and x not in stopwords and len(x) > 4)]) | |
return tokens | |
def get_wordcloud(df,kelas_sentiment): | |
mask = np.array(Image.open('./assets/twitter.png')) | |
cmap_dict = {'positif': 'YlGn', 'negatif': 'OrRd', 'netral': 'GnBu'} | |
tokens = tokenisasi(df[df.sentiment == kelas_sentiment]) | |
tokens = tokens.apply(lambda x: ' '.join(x)) | |
text = ' '.join(tokens) | |
# check if text empty or not | |
try : | |
wordcloud = WordCloud(width = 800, height = 800, | |
background_color ='black', | |
min_font_size = 10, | |
colormap = cmap_dict[kelas_sentiment], | |
mask = mask).generate(text) | |
except: | |
wordcloud = WordCloud(width = 800, height = 800, | |
background_color ='black', | |
min_font_size = 10, | |
colormap = cmap_dict[kelas_sentiment], | |
mask = mask).generate("None") | |
return wordcloud | |
def plot_text(df,kelas,embedding_model): | |
df = df[df.sentiment == kelas] | |
data = embedding_model.encode(df.values.tolist()) | |
umap_model = umap.UMAP(n_neighbors=min(df.shape[0],5),random_state = 42) | |
umap_data = umap_model.fit_transform(data) | |
clusterer = hdbscan.HDBSCAN(min_cluster_size=round((df.shape[0])**(0.5)-1),min_samples=3) | |
clusterer.fit(umap_data) | |
labels = ['cluster ' + str(i) for i in clusterer.labels_] | |
# replace cluster -1 with outlier | |
labels = ["outlier" if i == "cluster -1" else i for i in labels ] | |
text = df["content"].str.wrap(50).apply(lambda x: x.replace('\n', '<br>')) | |
fig = px.scatter(x=umap_data[:,0], y=umap_data[:,1],color = clusterer.labels_) | |
# remove legend | |
fig = px.scatter(x=umap_data[:,0], y=umap_data[:,1],color = labels,text = text) | |
#set text color | |
fig.update_traces(textfont_color='rgba(0,0,0,0)',marker_size = 8) | |
# set background color | |
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)') | |
# set margin | |
fig.update_layout(margin=dict(l=40, r=5, t=0, b=40)) | |
# set axis color to grey | |
fig.update_xaxes(showgrid=False, zeroline=False, linecolor='rgb(200,200,200)') | |
fig.update_yaxes( zeroline=False, linecolor='rgb(200,200,200)') | |
# set font sans-serif | |
fig.update_layout(font_family="sans-serif") | |
# remove legend | |
fig.update_layout(showlegend=False) | |
# set legend title to cluster | |
return df["content"],data,fig | |
def topic_modelling(df,embed_df): | |
data = df.apply(lambda x: ' '.join([w for w in x.split() if len(w)>3])) | |
stopwords = load_stopwords() | |
# remove empty data | |
topic_model = BERTopic( | |
calculate_probabilities=True, | |
# cluster model | |
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=5,prediction_data=True), | |
vectorizer_model=CountVectorizer(stop_words=stopwords), | |
language="indonesian", | |
) | |
topics, probs = topic_model.fit_transform(data,embed_df) | |
topic_labels = topic_model.generate_topic_labels( | |
topic_prefix = False, | |
separator = ", ", | |
) | |
topic_model.set_topic_labels(topic_labels) | |
fig = visualize_barchart(topic_model) | |
# set title to Kata Kunci tiap Topic | |
# fig.update_layout(title_text="Topic yang sering muncul") | |
return fig,topic_model |