Spaces:
Runtime error
Runtime error
Duplicate from dafqi/indo_twitter_sentiment_app
Browse files
Co-authored-by: dafqii <[email protected]>
- .gitattributes +34 -0
- README.md +18 -0
- __pycache__/functions.cpython-310.pyc +0 -0
- __pycache__/plotting.cpython-310.pyc +0 -0
- __pycache__/text_proc.cpython-310.pyc +0 -0
- app.py +139 -0
- assets/data.csv +99 -0
- assets/df_model.pkl +3 -0
- assets/notebook.ipynb +0 -0
- assets/stopwordbahasa.csv +782 -0
- assets/twitter.png +0 -0
- assets/valid.csv +0 -0
- indobert/config.json +46 -0
- indobert/pytorch_model.bin +3 -0
- indobert/special_tokens_map.json +7 -0
- indobert/tokenizer_config.json +15 -0
- indobert/vocab.txt +0 -0
- pages/1__model_information.py +36 -0
- requirements.txt +12 -0
- scraping.ipynb +213 -0
- script/__init__.py +0 -0
- script/__pycache__/__init__.cpython-310.pyc +0 -0
- script/__pycache__/functions.cpython-310.pyc +0 -0
- script/__pycache__/plotting.cpython-310.pyc +0 -0
- script/__pycache__/text_proc.cpython-310.pyc +0 -0
- script/functions.py +132 -0
- script/plotting.py +116 -0
- script/text_proc.py +107 -0
- sentence_bert/1_Pooling/config.json +7 -0
- sentence_bert/README.md +136 -0
- sentence_bert/config.json +47 -0
- sentence_bert/config_sentence_transformers.json +7 -0
- sentence_bert/modules.json +14 -0
- sentence_bert/pytorch_model.bin +3 -0
- sentence_bert/sentence_bert_config.json +4 -0
- sentence_bert/special_tokens_map.json +7 -0
- sentence_bert/tokenizer.json +0 -0
- sentence_bert/tokenizer_config.json +16 -0
- sentence_bert/vocab.txt +0 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Indo Twitter Sentiment App
|
3 |
+
emoji: 👀
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.15.2
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
duplicated_from: dafqi/indo_twitter_sentiment_app
|
11 |
+
---
|
12 |
+
|
13 |
+
# twitter sentiment app
|
14 |
+
|
15 |
+
Aplikasi sederhana untuk melakukan analisis sentimen terhadap tweet yang diinputkan dan mengekstrak topik dari setiap sentimen
|
16 |
+
|
17 |
+
link website : https://dafiqrahman-twitter-sentiment-app-app-shcgk3.streamlit.app/
|
18 |
+
|
__pycache__/functions.cpython-310.pyc
ADDED
Binary file (2.23 kB). View file
|
|
__pycache__/plotting.cpython-310.pyc
ADDED
Binary file (3.61 kB). View file
|
|
__pycache__/text_proc.cpython-310.pyc
ADDED
Binary file (4.41 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Streamlit entry point: scrape tweets, predict sentiment, and visualise results.

Flow: the user picks a search mode and a model, submits a query, then the app
fetches tweets via ``script.functions``, predicts sentiment, and renders a
preview table, per-sentiment bar chart, wordclouds, and topic clustering.
"""
import streamlit as st
import pandas as pd
import script.functions as fn
import plotly.express as px
import matplotlib.pyplot as plt
# import text_proc in script folder
import script.text_proc as tp
from sentence_transformers import SentenceTransformer


st.set_page_config(
    page_title="twitter sentiment analysis",
    page_icon="👋",
)

st.sidebar.markdown("📚 Twitter Sentiment Analysis App")

# Page header
st.markdown("<h1 style='text-align: center;'>📚 Twitter Sentiment Analysis App</h1>", unsafe_allow_html=True)
st.write("Aplikasi sederhana untuk melakukan analisis sentimen terhadap tweet yang diinputkan dan mengekstrak topik dari setiap sentimen.")

# Search mode (Simple/Advanced) and sentiment model selection
sb1, sb2 = st.columns([2, 4])
with sb1:
    option = st.selectbox('Pilih Mode Pencarian', ('Simple', 'Advanced'))
with sb2:
    option_model = st.selectbox('Pilih Model', ("IndoBERT (Accurate,Slow)", 'Naive Bayes', 'Logistic Regression (Less Accurate,Fast)', 'XGBoost', 'Catboost', 'SVM', 'Random Forest'))

# Query input. NOTE: named `query` (not `input`) to avoid shadowing the builtin.
if option == 'Simple':
    col1, col2 = st.columns([3, 2])
    with col1:
        query = st.text_input("Masukkan User/Hastag", "@traveloka")
    with col2:
        length = st.number_input("Jumlah Tweet", 10, 500, 100)
else:
    col1, col2 = st.columns([3, 1])
    with col1:
        query = st.text_input("Masukkan Parameter Pencarian", "(to:@traveloka AND @traveloka) -filter:links filter:replies lang:id")
    with col2:
        length = st.number_input("Jumlah Tweet", 10, 500, 100)
    st.caption("anda bisa menggunakan parameter pencarian yang lebih spesifik, parameter ini sama dengan paremeter pencarian di twitter")

submit = st.button("🔍Cari Tweet")

st.caption("semakin banyak tweet yang diambil maka semakin lama proses analisis sentimen")


@st.experimental_singleton
def load_sentence_model():
    """Load and cache the sentence-embedding model used for clustering.

    Cached as a singleton so the model is loaded once per process, not on
    every Streamlit rerun.
    """
    embedding_model = SentenceTransformer('sentence_bert')
    return embedding_model


def _color_sentiment(val):
    """Styler helper: map a sentiment label to its display colour."""
    color_dict = {"positif": "#00cc96", "negatif": "#ef553b", "netral": "#636efa"}
    return f'color: {color_dict[val]}'


def _render_wordcloud(df, sentiment):
    """Render the wordcloud for one sentiment label inside the current tab."""
    wordcloud = tp.get_wordcloud(df, sentiment)
    fig = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    st.pyplot(fig)


def _render_clustering(df, sentiment, embedding_model):
    """Render the clustering scatter plot and extracted topics for one sentiment.

    Clustering needs a minimum sample size; with fewer than 11 tweets the raw
    rows are shown instead.
    """
    subset = df[df["sentiment"] == sentiment]
    if len(subset) < 11:
        st.write("Tweet Terlalu Sedikit, Tidak dapat melakukan clustering")
        st.write(subset)
    else:
        with st.spinner('Sedang Membuat Grafik...(1/2)'):
            text, data, fig = tp.plot_text(df, sentiment, embedding_model)
        st.plotly_chart(fig, use_container_width=True, theme=None)
        with st.spinner('Sedang Mengekstrak Topik... (2/2)'):
            fig, topic_modelling = tp.topic_modelling(text, data)
        st.plotly_chart(fig, use_container_width=True, theme="streamlit")


if submit:
    # Fetch tweets, then run the chosen sentiment model over them.
    with st.spinner('Mengambil data dari twitter... (1/2)'):
        df = fn.get_tweets(query, length, option)
    with st.spinner('Melakukan Prediksi Sentimen... (2/2)'):
        df = fn.get_sentiment(df, option_model)
    df.to_csv('assets/data.csv', index=False)

    # Dataset preview with colour-coded sentiment column
    st.write("<b>Preview Dataset</b>", unsafe_allow_html=True)
    st.dataframe(df.style.applymap(_color_sentiment, subset=['sentiment']), use_container_width=True, height=200)
    st.write("Jumlah Tweet: ", df.shape[0])

    # Sentiment analysis section: bar chart + per-sentiment wordclouds
    st.write("<h3>📊 Analisis Sentimen</h3>", unsafe_allow_html=True)
    col_fig1, col_fig2 = st.columns([4, 3])
    with col_fig1:
        with st.spinner('Sedang Membuat Grafik...'):
            st.write("<b>Jumlah Tweet Tiap Sentiment</b>", unsafe_allow_html=True)
            fig_1 = fn.get_bar_chart(df)
            st.plotly_chart(fig_1, use_container_width=True, theme="streamlit")
    with col_fig2:
        st.write("<b>Wordcloud Tiap Sentiment</b>", unsafe_allow_html=True)
        tab1, tab2, tab3 = st.tabs(["😞 negatif", "😐 netral", "😃 positif"])
        with tab1:
            _render_wordcloud(df, "negatif")
        with tab2:
            _render_wordcloud(df, "netral")
        with tab3:
            _render_wordcloud(df, "positif")

    # Clustering section: embed tweets and extract topics per sentiment
    st.write("<h3>✨ Sentiment Clustering</h3>", unsafe_allow_html=True)
    embedding_model = load_sentence_model()
    tab4, tab5, tab6 = st.tabs(["😞 negatif", "😐 netral", "😃 positif"])
    with tab4:
        _render_clustering(df, "negatif", embedding_model)
    with tab5:
        _render_clustering(df, "netral", embedding_model)
    with tab6:
        _render_clustering(df, "positif", embedding_model)
assets/data.csv
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
sentiment,content
|
2 |
+
netral,"Gabisa DM min ,gaada tempat buat dm kamu hehe"
|
3 |
+
netral,"Halo Haeni, mimin belum menerima DM kamu, yuk infokan mimin kembali melalui DM ya. Terima kasih. -OR"
|
4 |
+
positif,"halo ini saya mau atur ulang jam penerbangan, tulisannya tersedia untuk re schedule, tapi setelah di ajukan kenapa tidak tertolak dan tulisannya tidak tersedia untuk penerbangan ini ya ,padahal ada pilihan untuk jam yang ingin diubah"
|
5 |
+
netral,"Well, the night comes, but my refund money is still no where in sight. 92 days of waiting and still counting. Been told to wait for another 2x24hr time."
|
6 |
+
netral,Mimin meluncur ke DM kamu ya. Ditunggu -TK
|
7 |
+
netral,Hai Riri. DM kamu sudah mimin balas ya. Terima kasih. -DA
|
8 |
+
netral,min cek dm
|
9 |
+
netral,"hallo, min. Cek dm ya. Terima kasih"
|
10 |
+
netral,"Siap, Kak. Udah mimin balas ya. -SM"
|
11 |
+
negatif,Aamin. ehem ehem
|
12 |
+
netral,Tolong dibalas kka
|
13 |
+
netral,Hai Kak. Mohon maaf banget ya atas kendala kamu. Supaya nyaman dan aman ngobrolnya boleh informasikan ke mimin nomor pemesanan kamu via DM Kak Supaya mimin bisa cek lebih lanjut. Mimin tunggu ya Kak. Terima kasih. -DA
|
14 |
+
netral,kalau purchase pending artinya apa ya Cek akun udah ada receipt nya. Mohon dibantu ya min. Thanks.
|
15 |
+
netral,"Sudah bsa,sudah aku DM kak"
|
16 |
+
negatif,Ayam rasa rasa yang dulu pernah ada.
|
17 |
+
positif,"Aku coba DM tapi ga bsa , coba KK dluan ya"
|
18 |
+
netral,"Hai, Kak. Untuk lakukan pengecekan lebih lanjut, yuk infoin nomor pesanan kamu dulu via DM ya. Mimin tunggu -TK"
|
19 |
+
netral,"kak aku mau reschedule tiket balik, biaya yg sebelumnya akan direfund atau bagaimana"
|
20 |
+
netral,min tolong cek DM saya mau reschedule tiket balik
|
21 |
+
netral,admin tolong cek DM
|
22 |
+
netral,Cek DM min
|
23 |
+
netral,"Hai, Kak. Mohon maaf atas kendala yang kamu alami. Untuk lakukan pengecekan lebih lanjut, mohon infokan nomor pesanan Traveloka kamu via DM ya. Mimin tunggu -TK"
|
24 |
+
netral,"Hai Kak Hendri, pada dasarnya untuk asuransi pada produk yang kamu pesan bersifat optional ya Kak, dapat dipesan atau tidak. Apabila kamu memiliki ketidaksesuaian silakan lirkan screenshot dan detail kendalanya via DM, agar mimin dapat coba cek lebih lanjut, terima k"
|
25 |
+
netral,Admin saya mau bertanya² ke CS tentang pembatalan penerbangan maskapai melalui aplikasi traveloka. Saya sudah hubungi CS di apk tapi belum ada tanggapan. Harusnya besok saya sudah flight
|
26 |
+
positif,"kenapa setiap kita booking traveloka kita selalu harus bayar asuransi Chubb, saya sudah ada asuransi sendiri ngapain banyak2"
|
27 |
+
netral,-PA
|
28 |
+
netral,Liburan ke Jawa Barat seru dan murah murah kak :D
|
29 |
+
netral,"Hai, Kak. Mimin udah balas DM kamu ya. Yuk cek DM dari mimin Kak. -SM"
|
30 |
+
netral,"hi min, cek dm dong"
|
31 |
+
netral,"Hai Kak. Mimin sudah balas DM kamu ya, silakan dicek kembali. Terima kasih. -KA"
|
32 |
+
netral,"Hi min, cek DM ya terima kasih"
|
33 |
+
netral,mohon cek dm
|
34 |
+
netral,Gk bisa dm
|
35 |
+
netral,"Halo, Kak Budi. Pindah ke DMyuk, biar mimin bisa jelasin detail. -Mimin tunggu DM-nya. Terima kasih. -OE"
|
36 |
+
netral,"halo min, tolong cek DM yaa"
|
37 |
+
netral,"Kalau kamu terkendala hapus akun dengan cara sebelumnya, silakan email langsung ke tim internal mimin dengan alamat email privacy ya. Infokan aja kalo kamu mau hapus akun Traveloka dengan email xxx atau telepon xxx. Terima kasih. -AM 2/2"
|
38 |
+
netral,"Hai, Kak. kamu bisa nonaktifkan akun kamu dengan cara Self Deactivation. Kamu klik forgot password, nanti akan ada link untuk nonaktifkan akun yang bisa kamu klik ya dengan pilih Self Deactivation 1/2."
|
39 |
+
positif,"hi admin saya mau bayar via uangku kok gaada pilihannya ya Cuma asa debit card sama paylater, mohon bantuannya"
|
40 |
+
positif,Ini link apa coba
|
41 |
+
netral,"Fly Eat Sleep Shop Repeat No Refund on Cancelation please add it to your tagline , -I Dont think do do the same."
|
42 |
+
netral,hahahahah
|
43 |
+
netral,"Sudah kak mohon dicheck dmnya ya, terima kasih"
|
44 |
+
netral,"Hai, Kak. Mohon maaf udah buat kamu gak nyaman ya. Supaya bisa dibantu cek lebih lanjut, bisa infoin dulu email kamu via DM Mimin tunggu ya. -WR"
|
45 |
+
positif,"hi admin, saya udah topup uangku kok gak masuk² udah 30 menit lebih, thanks"
|
46 |
+
netral,"Oh sure, thank you, Ive been waiting for 92 days, surely its a very delightful experience to keep waiting another 5x60 minutes. Im sure the Flight Specialist has been doing a great job to fight for my rights, keep up the good work What would I do without you"
|
47 |
+
netral,"Hi, Kiky. We do apologize for the inconvenience. Regarding your issue, we inform you that currently it is still in the escalation process by the Flight Specialist team. Please wait for further information in an estimated 5x60 minutes via the Traveloka inbox feature o"
|
48 |
+
netral,Good morning world. Oh hi i see its the 92th day I havent been getting my refund back from you. Thank you for the helpful and re-assuring messages from you.
|
49 |
+
netral,"hi min, please cek dm ya"
|
50 |
+
netral,Kak Jajang bisa aja deh -RZ
|
51 |
+
netral,ada kode promo untuk tiket bus gak min
|
52 |
+
netral,tetep gak bisa min
|
53 |
+
netral,"Hai Kak Rhea. Terkait gambar yang kamu berikan, saat ini tidak ada kendala pada sistem pemesanan kami. Mohon pastikan jaringan yang kamu gunakan stabil serta sudah menggunakan aplikasi Traveloka versi terbaru. Mohon untuk relogin akun Traveloka kamu terlebih dahulu da"
|
54 |
+
netral,wkwkwkkw
|
55 |
+
netral,"Lewat agen kak, di ig narendra ada. Monggo dicek"
|
56 |
+
netral,minn tolong cek dm urgent banget ini min
|
57 |
+
netral,oke minn cek dm min
|
58 |
+
netral,beli tiketnya lewat aplikasi atau gmn kak
|
59 |
+
netral,"Hai, Kak. Sebelumnya bisa infokan nomor pemesanan Traveloka kamu via DM Supaya bisa kami bantu cek lebih lanjut. Terima kasih. -FR"
|
60 |
+
positif,minn klo udah mesen tiket esawat tapi nama penumpang ada yang salah gimana ya min bisa diperbaiki ga aku nulis namanya ke double gitu min
|
61 |
+
netral,"Ada Rosalia Indah, Harapan Jaya, Narendra"
|
62 |
+
netral,Hai Kak Setyo. Bergantung tanggal dan jam yang kamu pilih saat melakukan pemesanan ya. Kamu bisa cek ketersediaan jam keberangkatan melalui aplikasi Traveloka kamu. Yuk di cek -FR
|
63 |
+
netral,Dari jakarta jam berapa
|
64 |
+
netral,"Bisa kak, di sesuaiin aja sm kebutuhan"
|
65 |
+
positif,Nice sowbat mintownquh...
|
66 |
+
netral,"Hai, Kak. Untuk perihal tersebut tergantung ketersediaan dari pihak PO Bus ya Kak. Nantinya kamu bisa cek detailnya melalui aplikasi Traveloka pada saat pesan, ataupun bisa konfirmasi kepada pihak PO Bus Kak. Nantinya apabila kamu ada pertanyaan lain, jangan ragu DM mimi"
|
67 |
+
positif,ada gak sih double decker yg jkt-madiunn pliss ingfonyaa
|
68 |
+
netral,"Baik Kak, silakan cek DM-nya kembali ya -RU"
|
69 |
+
netral,"Hai, Kak. Mohon menginformasikan kembali kendala atau pertanyaan yang ingin Kakak saikan terkait Booking ID tersebut via DM agar mimin bisa bantu lebih lanjut informasinya. Terima kasih. -RU"
|
70 |
+
netral,Hi kak tolong cdm yaa urgent
|
71 |
+
netral,Terima kasih ya min
|
72 |
+
netral,"You know what, Im gonna keep this thread long. I will start my Day-91. Please anticipate for more tweets days ahead ok Im counting days on you."
|
73 |
+
netral,"So now tell me, what actions you have been doing for the past 90 days Can you show me your communication history with PAL"
|
74 |
+
netral,We apologize for the inconvenience caused. Our team is currently looking into your issue and we will get back to you shortly via inbox on Traveloka app with the latest update. Thank you for your understanding -RU
|
75 |
+
netral,Sudah DM
|
76 |
+
netral,"Hai, Nita. Makasih udah hubungin mimin ya. Sambil mimin infoin pertanyaan kamu, bisikin ke mimin juga yuk data email kamu yang terdaftar di Traveloka via DM , mimin mau ajak kamu untuk ikutan isi survey layanan nih. Mimin tunggu ya. Terima kasih : -RU"
|
77 |
+
positif,"Klo paspor lama expired, masih bisa beli tiket ga dg paspor tsb, Perpanjangan paspor msh nunggu antrian soalnya."
|
78 |
+
netral,"Hai Kak, makasih udah hubungin mimin ya. Sambil mimin infoin pertanyaan kamu, bisikin ke mimin juga yuk data email kamu yang terdaftar di Traveloka melalui DM, mimin mau ajak kamu untuk ikutan isi survey layanan nih -DZ 1/2"
|
79 |
+
netral,Next trip cobain double decker
|
80 |
+
netral,Min event 2.2 bakal ada promo bus gkkkk Promo kemaren udh gk kebagiann
|
81 |
+
netral,"Baik Kak Hazna, mimin sudah balas DM kamu ya, mohon untuk cek DM Kak -NT"
|
82 |
+
netral,Trip kapan min
|
83 |
+
netral,Is 4.6million too much for you to return Arent you a Unicorn arent you ranked 1st as The Best Place to Work in Indonesia So you take care your employees but you dont take your customers seriously
|
84 |
+
netral,You know what I am out of patience. You guys did not take me seriously for the past 90 days What is this International Flight Specialist have been doing with my inquiry the whole 3 MONTHS Were you aware that this is a SERIOUS issue You guys are keeping my money
|
85 |
+
netral,We are coordinating closely with related airlines regarding to your issue. Our International Flight Specialist team will update and provide you the confirmation through inbox on Traveloka app on Case ID 30xxxx77. We thank you for your patience -RU
|
86 |
+
netral,What is now your offer to solve this How are you gonna escalated this To who Who is the one going to ensure my money will return
|
87 |
+
netral,"Ive been contacting your cs team dozen times. No answers every time. You all keep wanting me to wait and throw your responsibilities to the airlines. Do remember I made my transaction to you, I transferred my money to you. You got your profit already from my transaction. But me"
|
88 |
+
netral,"Iyaa, aku berharapnya yaa bisa dibedakan gituu"
|
89 |
+
netral,Dear mohon bantuannya. Saya ingin melaporkan ketidakpuasan saya terhadap penanganan isu refund dari Mohon dapat diberikan info prosedur pelaporan. Terima kasih.
|
90 |
+
netral,"Hi, Kiky. Were deeply sorry for the inconvenience youve experienced. We understand how frustrating this experience must be. However we will need your help to kindly inform us your booking details via DM so we can address your concern to internal team as well as our"
|
91 |
+
netral,"Hehe, minimal atur jadwal dan ambil cuti kamu aja dulu Kak Ecko, sambil ngumpulin dananya -NT"
|
92 |
+
netral,"Baik Kak, mohon cek DM-nya kembali ya. Terima kasih -RU"
|
93 |
+
netral,"valid no debat, sering disandingin, biar kaya lagi ngobrol, twitter kiri, traveloka kanan"
|
94 |
+
netral,"Hai Kak Hazna, agar mimin dapat bantu cek lebih lanjut mengenai pemesanan kamu, mohon sebutkan nomor pemesanan kamu di Traveloka yang terdiri dari 9 Digit angka via DM ya Kak, mimin tunggu konfirmasinya. -NT"
|
95 |
+
positif,Udh saya inbox
|
96 |
+
positif,Servis makannya ga bedaa ya sama class yg lain:
|
97 |
+
netral,"Halo, Vazeryn. Kami mohon maaf atas kendala yang dialami saat menggunakan aplikasi kami. Untuk investigasi lebih lanjut, kamu bisa kirimkan screenshot error atau penjelasan kendala yang ditemui via DM ya Kak. Terima kasih -RU"
|
98 |
+
netral,"Makasih ya Kak Kadek udah berbagi cerita Pasti seru banget perjalanannya Oh Iya, mimin minta bantuan Kakak buat ngisi survey meningkatkan kualitas pelayanan Traveloka. Boleh infoin alamat email kamu via DM Mimin tunggu ya. Makasih -RU"
|
99 |
+
netral,cek dm min
|
assets/df_model.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:29ceb6f9e4327d2b9181cf5a651f8e88876920b94c205a52340f401f8c2ae536
|
3 |
+
size 63943096
|
assets/notebook.ipynb
ADDED
File without changes
|
assets/stopwordbahasa.csv
ADDED
@@ -0,0 +1,782 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
mimin
|
2 |
+
gimana
|
3 |
+
min
|
4 |
+
ga
|
5 |
+
iya
|
6 |
+
dg
|
7 |
+
dengan
|
8 |
+
ia
|
9 |
+
bahwa
|
10 |
+
oleh
|
11 |
+
sy
|
12 |
+
kl
|
13 |
+
gak
|
14 |
+
ah
|
15 |
+
apa
|
16 |
+
kok
|
17 |
+
mau
|
18 |
+
yg
|
19 |
+
pak
|
20 |
+
bapak
|
21 |
+
ibu
|
22 |
+
krn
|
23 |
+
nya
|
24 |
+
ya
|
25 |
+
ada
|
26 |
+
adalah
|
27 |
+
adanya
|
28 |
+
adapun
|
29 |
+
agak
|
30 |
+
agaknya
|
31 |
+
agar
|
32 |
+
akan
|
33 |
+
akankah
|
34 |
+
akhir
|
35 |
+
akhiri
|
36 |
+
akhirnya
|
37 |
+
aku
|
38 |
+
akulah
|
39 |
+
amat
|
40 |
+
amatlah
|
41 |
+
anda
|
42 |
+
andalah
|
43 |
+
antar
|
44 |
+
antara
|
45 |
+
antaranya
|
46 |
+
apa
|
47 |
+
apaan
|
48 |
+
apabila
|
49 |
+
apakah
|
50 |
+
apalagi
|
51 |
+
apatah
|
52 |
+
artinya
|
53 |
+
asal
|
54 |
+
asalkan
|
55 |
+
atas
|
56 |
+
atau
|
57 |
+
ataukah
|
58 |
+
ataupun
|
59 |
+
awal
|
60 |
+
awalnya
|
61 |
+
bagai
|
62 |
+
bagaikan
|
63 |
+
bagaimana
|
64 |
+
bagaimanakah
|
65 |
+
bagaimanapun
|
66 |
+
bagi
|
67 |
+
bagian
|
68 |
+
bahkan
|
69 |
+
bahwa
|
70 |
+
bahwasanya
|
71 |
+
baik
|
72 |
+
bakal
|
73 |
+
bakalan
|
74 |
+
balik
|
75 |
+
banyak
|
76 |
+
bapak
|
77 |
+
baru
|
78 |
+
bawah
|
79 |
+
beberapa
|
80 |
+
begini
|
81 |
+
beginian
|
82 |
+
beginikah
|
83 |
+
beginilah
|
84 |
+
begitu
|
85 |
+
begitukah
|
86 |
+
begitulah
|
87 |
+
begitupun
|
88 |
+
bekerja
|
89 |
+
belakang
|
90 |
+
belakangan
|
91 |
+
belum
|
92 |
+
belumlah
|
93 |
+
benar
|
94 |
+
benarkah
|
95 |
+
benarlah
|
96 |
+
berada
|
97 |
+
berakhir
|
98 |
+
berakhirlah
|
99 |
+
berakhirnya
|
100 |
+
berapa
|
101 |
+
berapakah
|
102 |
+
berapalah
|
103 |
+
berapapun
|
104 |
+
berarti
|
105 |
+
berawal
|
106 |
+
berbagai
|
107 |
+
berdatangan
|
108 |
+
beri
|
109 |
+
berikan
|
110 |
+
berikut
|
111 |
+
berikutnya
|
112 |
+
berjumlah
|
113 |
+
berkali-kali
|
114 |
+
berkata
|
115 |
+
berkehendak
|
116 |
+
berkeinginan
|
117 |
+
berkenaan
|
118 |
+
berlainan
|
119 |
+
berlalu
|
120 |
+
berlangsung
|
121 |
+
berlebihan
|
122 |
+
bermacam
|
123 |
+
bermacam-macam
|
124 |
+
bermaksud
|
125 |
+
bermula
|
126 |
+
bersama
|
127 |
+
bersama-sama
|
128 |
+
bersiap
|
129 |
+
bersiap-siap
|
130 |
+
bertanya
|
131 |
+
bertanya-tanya
|
132 |
+
berturut
|
133 |
+
berturut-turut
|
134 |
+
bertutur
|
135 |
+
berujar
|
136 |
+
berupa
|
137 |
+
besar
|
138 |
+
betul
|
139 |
+
betulkah
|
140 |
+
biasa
|
141 |
+
biasanya
|
142 |
+
bila
|
143 |
+
bilakah
|
144 |
+
bisa
|
145 |
+
bisakah
|
146 |
+
boleh
|
147 |
+
bolehkah
|
148 |
+
bolehlah
|
149 |
+
buat
|
150 |
+
bukan
|
151 |
+
bukankah
|
152 |
+
bukanlah
|
153 |
+
bukannya
|
154 |
+
bulan
|
155 |
+
bung
|
156 |
+
cara
|
157 |
+
caranya
|
158 |
+
cukup
|
159 |
+
cukupkah
|
160 |
+
cukuplah
|
161 |
+
cuma
|
162 |
+
dahulu
|
163 |
+
dalam
|
164 |
+
dan
|
165 |
+
dapat
|
166 |
+
dari
|
167 |
+
daripada
|
168 |
+
datang
|
169 |
+
dekat
|
170 |
+
demi
|
171 |
+
demikian
|
172 |
+
demikianlah
|
173 |
+
dengan
|
174 |
+
depan
|
175 |
+
di
|
176 |
+
dia
|
177 |
+
diakhiri
|
178 |
+
diakhirinya
|
179 |
+
dialah
|
180 |
+
diantara
|
181 |
+
diantaranya
|
182 |
+
diberi
|
183 |
+
diberikan
|
184 |
+
diberikannya
|
185 |
+
dibuat
|
186 |
+
dibuatnya
|
187 |
+
didapat
|
188 |
+
didatangkan
|
189 |
+
digunakan
|
190 |
+
diibaratkan
|
191 |
+
diibaratkannya
|
192 |
+
diingat
|
193 |
+
diingatkan
|
194 |
+
diinginkan
|
195 |
+
dijawab
|
196 |
+
dijelaskan
|
197 |
+
dijelaskannya
|
198 |
+
dikarenakan
|
199 |
+
dikatakan
|
200 |
+
dikatakannya
|
201 |
+
dikerjakan
|
202 |
+
diketahui
|
203 |
+
diketahuinya
|
204 |
+
dikira
|
205 |
+
dilakukan
|
206 |
+
dilalui
|
207 |
+
dilihat
|
208 |
+
dimaksud
|
209 |
+
dimaksudkan
|
210 |
+
dimaksudkannya
|
211 |
+
dimaksudnya
|
212 |
+
diminta
|
213 |
+
dimintai
|
214 |
+
dimisalkan
|
215 |
+
dimulai
|
216 |
+
dimulailah
|
217 |
+
dimulainya
|
218 |
+
dimungkinkan
|
219 |
+
dini
|
220 |
+
dipastikan
|
221 |
+
diperbuat
|
222 |
+
diperbuatnya
|
223 |
+
dipergunakan
|
224 |
+
diperkirakan
|
225 |
+
diperlihatkan
|
226 |
+
diperlukan
|
227 |
+
diperlukannya
|
228 |
+
dipersoalkan
|
229 |
+
dipertanyakan
|
230 |
+
dipunyai
|
231 |
+
diri
|
232 |
+
dirinya
|
233 |
+
disampaikan
|
234 |
+
disebut
|
235 |
+
disebutkan
|
236 |
+
disebutkannya
|
237 |
+
disini
|
238 |
+
disinilah
|
239 |
+
ditambahkan
|
240 |
+
ditandaskan
|
241 |
+
ditanya
|
242 |
+
ditanyai
|
243 |
+
ditanyakan
|
244 |
+
ditegaskan
|
245 |
+
ditujukan
|
246 |
+
ditunjuk
|
247 |
+
ditunjuki
|
248 |
+
ditunjukkan
|
249 |
+
ditunjukkannya
|
250 |
+
ditunjuknya
|
251 |
+
dituturkan
|
252 |
+
dituturkannya
|
253 |
+
diucapkan
|
254 |
+
diucapkannya
|
255 |
+
diungkapkan
|
256 |
+
dong
|
257 |
+
dua
|
258 |
+
dulu
|
259 |
+
empat
|
260 |
+
enggak
|
261 |
+
enggaknya
|
262 |
+
entah
|
263 |
+
entahlah
|
264 |
+
guna
|
265 |
+
gunakan
|
266 |
+
hal
|
267 |
+
hampir
|
268 |
+
hanya
|
269 |
+
hanyalah
|
270 |
+
hari
|
271 |
+
harus
|
272 |
+
haruslah
|
273 |
+
harusnya
|
274 |
+
hendak
|
275 |
+
hendaklah
|
276 |
+
hendaknya
|
277 |
+
hingga
|
278 |
+
ia
|
279 |
+
ialah
|
280 |
+
ibarat
|
281 |
+
ibaratkan
|
282 |
+
ibaratnya
|
283 |
+
ibu
|
284 |
+
ikut
|
285 |
+
ingat
|
286 |
+
ingat-ingat
|
287 |
+
ingin
|
288 |
+
inginkah
|
289 |
+
inginkan
|
290 |
+
ini
|
291 |
+
inikah
|
292 |
+
inilah
|
293 |
+
itu
|
294 |
+
itukah
|
295 |
+
itulah
|
296 |
+
jadi
|
297 |
+
jadilah
|
298 |
+
jadinya
|
299 |
+
jangan
|
300 |
+
jangankan
|
301 |
+
janganlah
|
302 |
+
jauh
|
303 |
+
jawab
|
304 |
+
jawaban
|
305 |
+
jawabnya
|
306 |
+
jelas
|
307 |
+
jelaskan
|
308 |
+
jelaslah
|
309 |
+
jelasnya
|
310 |
+
jika
|
311 |
+
jikalau
|
312 |
+
juga
|
313 |
+
jumlah
|
314 |
+
jumlahnya
|
315 |
+
justru
|
316 |
+
kala
|
317 |
+
kalau
|
318 |
+
kalaulah
|
319 |
+
kalaupun
|
320 |
+
kalian
|
321 |
+
kami
|
322 |
+
kamilah
|
323 |
+
kamu
|
324 |
+
kamulah
|
325 |
+
kan
|
326 |
+
kapan
|
327 |
+
kapankah
|
328 |
+
kapanpun
|
329 |
+
karena
|
330 |
+
karenanya
|
331 |
+
kasus
|
332 |
+
kata
|
333 |
+
katakan
|
334 |
+
katakanlah
|
335 |
+
katanya
|
336 |
+
ke
|
337 |
+
keadaan
|
338 |
+
kebetulan
|
339 |
+
kecil
|
340 |
+
kedua
|
341 |
+
keduanya
|
342 |
+
keinginan
|
343 |
+
kelamaan
|
344 |
+
kelihatan
|
345 |
+
kelihatannya
|
346 |
+
kelima
|
347 |
+
keluar
|
348 |
+
kembali
|
349 |
+
kemudian
|
350 |
+
kemungkinan
|
351 |
+
kemungkinannya
|
352 |
+
kenapa
|
353 |
+
kepada
|
354 |
+
kepadanya
|
355 |
+
kesampaian
|
356 |
+
keseluruhan
|
357 |
+
keseluruhannya
|
358 |
+
keterlaluan
|
359 |
+
ketika
|
360 |
+
khususnya
|
361 |
+
kini
|
362 |
+
kinilah
|
363 |
+
kira
|
364 |
+
kira-kira
|
365 |
+
kiranya
|
366 |
+
kita
|
367 |
+
kitalah
|
368 |
+
kok
|
369 |
+
kurang
|
370 |
+
lagi
|
371 |
+
lagian
|
372 |
+
lah
|
373 |
+
lain
|
374 |
+
lainnya
|
375 |
+
lalu
|
376 |
+
lama
|
377 |
+
lamanya
|
378 |
+
lanjut
|
379 |
+
lanjutnya
|
380 |
+
lebih
|
381 |
+
lewat
|
382 |
+
lima
|
383 |
+
luar
|
384 |
+
macam
|
385 |
+
maka
|
386 |
+
makanya
|
387 |
+
makin
|
388 |
+
malah
|
389 |
+
malahan
|
390 |
+
mampu
|
391 |
+
mampukah
|
392 |
+
mana
|
393 |
+
manakala
|
394 |
+
manalagi
|
395 |
+
masa
|
396 |
+
masalah
|
397 |
+
masalahnya
|
398 |
+
masih
|
399 |
+
masihkah
|
400 |
+
masing
|
401 |
+
masing-masing
|
402 |
+
mau
|
403 |
+
maupun
|
404 |
+
melainkan
|
405 |
+
melakukan
|
406 |
+
melalui
|
407 |
+
melihat
|
408 |
+
melihatnya
|
409 |
+
memang
|
410 |
+
memastikan
|
411 |
+
memberi
|
412 |
+
memberikan
|
413 |
+
membuat
|
414 |
+
memerlukan
|
415 |
+
memihak
|
416 |
+
meminta
|
417 |
+
memintakan
|
418 |
+
memisalkan
|
419 |
+
memperbuat
|
420 |
+
mempergunakan
|
421 |
+
memperkirakan
|
422 |
+
memperlihatkan
|
423 |
+
mempersiapkan
|
424 |
+
mempersoalkan
|
425 |
+
mempertanyakan
|
426 |
+
mempunyai
|
427 |
+
memulai
|
428 |
+
memungkinkan
|
429 |
+
menaiki
|
430 |
+
menambahkan
|
431 |
+
menandaskan
|
432 |
+
menanti
|
433 |
+
menanti-nanti
|
434 |
+
menantikan
|
435 |
+
menanya
|
436 |
+
menanyai
|
437 |
+
menanyakan
|
438 |
+
mendapat
|
439 |
+
mendapatkan
|
440 |
+
mendatang
|
441 |
+
mendatangi
|
442 |
+
mendatangkan
|
443 |
+
menegaskan
|
444 |
+
mengakhiri
|
445 |
+
mengapa
|
446 |
+
mengatakan
|
447 |
+
mengatakannya
|
448 |
+
mengenai
|
449 |
+
mengerjakan
|
450 |
+
mengetahui
|
451 |
+
menggunakan
|
452 |
+
menghendaki
|
453 |
+
mengibaratkan
|
454 |
+
mengibaratkannya
|
455 |
+
mengingat
|
456 |
+
mengingatkan
|
457 |
+
menginginkan
|
458 |
+
mengira
|
459 |
+
mengucapkan
|
460 |
+
mengucapkannya
|
461 |
+
mengungkapkan
|
462 |
+
menjadi
|
463 |
+
menjawab
|
464 |
+
menjelaskan
|
465 |
+
menuju
|
466 |
+
menunjuk
|
467 |
+
menunjuki
|
468 |
+
menunjukkan
|
469 |
+
menunjuknya
|
470 |
+
menurut
|
471 |
+
menuturkan
|
472 |
+
menyampaikan
|
473 |
+
menyangkut
|
474 |
+
menyatakan
|
475 |
+
menyebutkan
|
476 |
+
menyeluruh
|
477 |
+
menyiapkan
|
478 |
+
merasa
|
479 |
+
mereka
|
480 |
+
merekalah
|
481 |
+
merupakan
|
482 |
+
meski
|
483 |
+
meskipun
|
484 |
+
meyakini
|
485 |
+
meyakinkan
|
486 |
+
minta
|
487 |
+
mirip
|
488 |
+
misal
|
489 |
+
misalkan
|
490 |
+
misalnya
|
491 |
+
mula
|
492 |
+
mulai
|
493 |
+
mulailah
|
494 |
+
mulanya
|
495 |
+
mungkin
|
496 |
+
mungkinkah
|
497 |
+
nah
|
498 |
+
naik
|
499 |
+
namun
|
500 |
+
nanti
|
501 |
+
nantinya
|
502 |
+
nyaris
|
503 |
+
nyatanya
|
504 |
+
oleh
|
505 |
+
olehnya
|
506 |
+
pada
|
507 |
+
padahal
|
508 |
+
padanya
|
509 |
+
pak
|
510 |
+
paling
|
511 |
+
panjang
|
512 |
+
pantas
|
513 |
+
para
|
514 |
+
pasti
|
515 |
+
pastilah
|
516 |
+
penting
|
517 |
+
pentingnya
|
518 |
+
per
|
519 |
+
percuma
|
520 |
+
perlu
|
521 |
+
perlukah
|
522 |
+
perlunya
|
523 |
+
pernah
|
524 |
+
persoalan
|
525 |
+
pertama
|
526 |
+
pertama-tama
|
527 |
+
pertanyaan
|
528 |
+
pertanyakan
|
529 |
+
pihak
|
530 |
+
pihaknya
|
531 |
+
pukul
|
532 |
+
pula
|
533 |
+
pun
|
534 |
+
punya
|
535 |
+
rasa
|
536 |
+
rasanya
|
537 |
+
rata
|
538 |
+
rupanya
|
539 |
+
saat
|
540 |
+
saatnya
|
541 |
+
saja
|
542 |
+
sajalah
|
543 |
+
saling
|
544 |
+
sama
|
545 |
+
sama-sama
|
546 |
+
sambil
|
547 |
+
sampai
|
548 |
+
sampai-sampai
|
549 |
+
sampaikan
|
550 |
+
sana
|
551 |
+
sangat
|
552 |
+
sangatlah
|
553 |
+
satu
|
554 |
+
saya
|
555 |
+
sayalah
|
556 |
+
se
|
557 |
+
sebab
|
558 |
+
sebabnya
|
559 |
+
sebagai
|
560 |
+
sebagaimana
|
561 |
+
sebagainya
|
562 |
+
sebagian
|
563 |
+
sebaik
|
564 |
+
sebaik-baiknya
|
565 |
+
sebaiknya
|
566 |
+
sebaliknya
|
567 |
+
sebanyak
|
568 |
+
sebegini
|
569 |
+
sebegitu
|
570 |
+
sebelum
|
571 |
+
sebelumnya
|
572 |
+
sebenarnya
|
573 |
+
seberapa
|
574 |
+
sebesar
|
575 |
+
sebetulnya
|
576 |
+
sebisanya
|
577 |
+
sebuah
|
578 |
+
sebut
|
579 |
+
sebutlah
|
580 |
+
sebutnya
|
581 |
+
secara
|
582 |
+
secukupnya
|
583 |
+
sedang
|
584 |
+
sedangkan
|
585 |
+
sedemikian
|
586 |
+
sedikit
|
587 |
+
sedikitnya
|
588 |
+
seenaknya
|
589 |
+
segala
|
590 |
+
segalanya
|
591 |
+
segera
|
592 |
+
seharusnya
|
593 |
+
sehingga
|
594 |
+
seingat
|
595 |
+
sejak
|
596 |
+
sejauh
|
597 |
+
sejenak
|
598 |
+
sejumlah
|
599 |
+
sekadar
|
600 |
+
sekadarnya
|
601 |
+
sekali
|
602 |
+
sekali-kali
|
603 |
+
sekalian
|
604 |
+
sekaligus
|
605 |
+
sekalipun
|
606 |
+
sekarang
|
607 |
+
sekarang
|
608 |
+
sekecil
|
609 |
+
seketika
|
610 |
+
sekiranya
|
611 |
+
sekitar
|
612 |
+
sekitarnya
|
613 |
+
sekurang-kurangnya
|
614 |
+
sekurangnya
|
615 |
+
sela
|
616 |
+
selain
|
617 |
+
selaku
|
618 |
+
selalu
|
619 |
+
selama
|
620 |
+
selama-lamanya
|
621 |
+
selamanya
|
622 |
+
selanjutnya
|
623 |
+
seluruh
|
624 |
+
seluruhnya
|
625 |
+
semacam
|
626 |
+
semakin
|
627 |
+
semampu
|
628 |
+
semampunya
|
629 |
+
semasa
|
630 |
+
semasih
|
631 |
+
semata
|
632 |
+
semata-mata
|
633 |
+
semaunya
|
634 |
+
sementara
|
635 |
+
semisal
|
636 |
+
semisalnya
|
637 |
+
sempat
|
638 |
+
semua
|
639 |
+
semuanya
|
640 |
+
semula
|
641 |
+
sendiri
|
642 |
+
sendirian
|
643 |
+
sendirinya
|
644 |
+
seolah
|
645 |
+
seolah-olah
|
646 |
+
seorang
|
647 |
+
sepanjang
|
648 |
+
sepantasnya
|
649 |
+
sepantasnyalah
|
650 |
+
seperlunya
|
651 |
+
seperti
|
652 |
+
sepertinya
|
653 |
+
sepihak
|
654 |
+
sering
|
655 |
+
seringnya
|
656 |
+
serta
|
657 |
+
serupa
|
658 |
+
sesaat
|
659 |
+
sesama
|
660 |
+
sesampai
|
661 |
+
sesegera
|
662 |
+
sesekali
|
663 |
+
seseorang
|
664 |
+
sesuatu
|
665 |
+
sesuatunya
|
666 |
+
sesudah
|
667 |
+
sesudahnya
|
668 |
+
setelah
|
669 |
+
setempat
|
670 |
+
setengah
|
671 |
+
seterusnya
|
672 |
+
setiap
|
673 |
+
setiba
|
674 |
+
setibanya
|
675 |
+
setidak-tidaknya
|
676 |
+
setidaknya
|
677 |
+
setinggi
|
678 |
+
seusai
|
679 |
+
sewaktu
|
680 |
+
siap
|
681 |
+
siapa
|
682 |
+
siapakah
|
683 |
+
siapapun
|
684 |
+
sini
|
685 |
+
sinilah
|
686 |
+
soal
|
687 |
+
soalnya
|
688 |
+
suatu
|
689 |
+
sudah
|
690 |
+
sudahkah
|
691 |
+
sudahlah
|
692 |
+
supaya
|
693 |
+
tadi
|
694 |
+
tadinya
|
695 |
+
tahu
|
696 |
+
tahun
|
697 |
+
tak
|
698 |
+
tambah
|
699 |
+
tambahnya
|
700 |
+
tampak
|
701 |
+
tampaknya
|
702 |
+
tandas
|
703 |
+
tandasnya
|
704 |
+
tanpa
|
705 |
+
tanya
|
706 |
+
tanyakan
|
707 |
+
tanyanya
|
708 |
+
tapi
|
709 |
+
tegas
|
710 |
+
tegasnya
|
711 |
+
telah
|
712 |
+
tempat
|
713 |
+
tengah
|
714 |
+
tentang
|
715 |
+
tentu
|
716 |
+
tentulah
|
717 |
+
tentunya
|
718 |
+
tepat
|
719 |
+
terakhir
|
720 |
+
terasa
|
721 |
+
terbanyak
|
722 |
+
terdahulu
|
723 |
+
terdapat
|
724 |
+
terdiri
|
725 |
+
terhadap
|
726 |
+
terhadapnya
|
727 |
+
teringat
|
728 |
+
teringat-ingat
|
729 |
+
terjadi
|
730 |
+
terjadilah
|
731 |
+
terjadinya
|
732 |
+
terkira
|
733 |
+
terlalu
|
734 |
+
terlebih
|
735 |
+
terlihat
|
736 |
+
termasuk
|
737 |
+
ternyata
|
738 |
+
tersampaikan
|
739 |
+
tersebut
|
740 |
+
tersebutlah
|
741 |
+
tertentu
|
742 |
+
tertuju
|
743 |
+
terus
|
744 |
+
terutama
|
745 |
+
tetap
|
746 |
+
tetapi
|
747 |
+
tiap
|
748 |
+
tiba
|
749 |
+
tiba-tiba
|
750 |
+
tidak
|
751 |
+
tidakkah
|
752 |
+
tidaklah
|
753 |
+
tiga
|
754 |
+
tinggi
|
755 |
+
toh
|
756 |
+
tunjuk
|
757 |
+
turut
|
758 |
+
tutur
|
759 |
+
tuturnya
|
760 |
+
ucap
|
761 |
+
ucapnya
|
762 |
+
ujar
|
763 |
+
ujarnya
|
764 |
+
umum
|
765 |
+
umumnya
|
766 |
+
ungkap
|
767 |
+
ungkapnya
|
768 |
+
untuk
|
769 |
+
usah
|
770 |
+
usai
|
771 |
+
waduh
|
772 |
+
wah
|
773 |
+
wahai
|
774 |
+
waktu
|
775 |
+
waktunya
|
776 |
+
walau
|
777 |
+
walaupun
|
778 |
+
wong
|
779 |
+
yaitu
|
780 |
+
yakin
|
781 |
+
yakni
|
782 |
+
yang
|
assets/twitter.png
ADDED
![]() |
assets/valid.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
indobert/config.json
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "indobenchmark/indobert-lite-base-p1",
|
3 |
+
"_num_labels": 3,
|
4 |
+
"architectures": [
|
5 |
+
"AlbertForSequenceClassification"
|
6 |
+
],
|
7 |
+
"attention_probs_dropout_prob": 0,
|
8 |
+
"bos_token_id": 2,
|
9 |
+
"classifier_dropout_prob": 0.1,
|
10 |
+
"down_scale_factor": 1,
|
11 |
+
"embedding_size": 128,
|
12 |
+
"eos_token_id": 3,
|
13 |
+
"gap_size": 0,
|
14 |
+
"hidden_act": "gelu",
|
15 |
+
"hidden_dropout_prob": 0,
|
16 |
+
"hidden_size": 768,
|
17 |
+
"id2label": {
|
18 |
+
"0": 0,
|
19 |
+
"1": 1,
|
20 |
+
"2": 2
|
21 |
+
},
|
22 |
+
"initializer_range": 0.02,
|
23 |
+
"inner_group_num": 1,
|
24 |
+
"intermediate_size": 3072,
|
25 |
+
"label2id": {
|
26 |
+
"positif": 0,
|
27 |
+
"netral": 1,
|
28 |
+
"negatif": 2
|
29 |
+
},
|
30 |
+
"layer_norm_eps": 1e-12,
|
31 |
+
"max_position_embeddings": 512,
|
32 |
+
"model_type": "albert",
|
33 |
+
"net_structure_type": 0,
|
34 |
+
"num_attention_heads": 12,
|
35 |
+
"num_hidden_groups": 1,
|
36 |
+
"num_hidden_layers": 12,
|
37 |
+
"num_memory_blocks": 0,
|
38 |
+
"output_past": true,
|
39 |
+
"pad_token_id": 0,
|
40 |
+
"position_embedding_type": "absolute",
|
41 |
+
"problem_type": "single_label_classification",
|
42 |
+
"torch_dtype": "float32",
|
43 |
+
"transformers_version": "4.20.1",
|
44 |
+
"type_vocab_size": 2,
|
45 |
+
"vocab_size": 30000
|
46 |
+
}
|
indobert/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7acc22f7091c678b59d2953576814b63e8ed5339e7d43e4eedbd895e15cac77a
|
3 |
+
size 46756497
|
indobert/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
indobert/tokenizer_config.json
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"do_basic_tokenize": true,
|
4 |
+
"do_lower_case": true,
|
5 |
+
"mask_token": "[MASK]",
|
6 |
+
"name_or_path": "indobenchmark/indobert-lite-base-p1",
|
7 |
+
"never_split": null,
|
8 |
+
"pad_token": "[PAD]",
|
9 |
+
"sep_token": "[SEP]",
|
10 |
+
"special_tokens_map_file": "/root/.cache/huggingface/transformers/b6b586d8d94e9bf6e42220e38946991ba8ef18d03b186ced76a4f19ae21ed603.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d",
|
11 |
+
"strip_accents": null,
|
12 |
+
"tokenize_chinese_chars": true,
|
13 |
+
"tokenizer_class": "BertTokenizer",
|
14 |
+
"unk_token": "[UNK]"
|
15 |
+
}
|
indobert/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pages/1__model_information.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import time
|
3 |
+
import numpy as np
|
4 |
+
import joblib
|
5 |
+
import plotly.express as px
|
6 |
+
import script.functions as fn
|
7 |
+
import pandas as pd
|
8 |
+
|
9 |
+
st.set_page_config(page_title="Model Information", page_icon="📈")
|
10 |
+
|
11 |
+
st.sidebar.markdown("📈 Model Information")
|
12 |
+
|
13 |
+
st.markdown("<h1 style='text-align: center;'>📈 Model Information</h1>", unsafe_allow_html=True)
|
14 |
+
st.write("halaman ini berisi mengenai informasi model yang tersedia pada aplikasi. anda bisa melihat bagaimana performa model dalam memprediksi sentiment baik dari waktu maupun hasil prediksi.")
|
15 |
+
|
16 |
+
st.markdown("<h3>⌛ Model Perfomance</h3>", unsafe_allow_html=True)
|
17 |
+
st.caption("Perfomance model dihitung berdasarkan akurasi dan waktu yang dibutuhkan model untuk memprediksi 100 data")
|
18 |
+
df_model = joblib.load("./assets/df_model.pkl")
|
19 |
+
fig = fn.plot_model_summary(df_model)
|
20 |
+
st.plotly_chart(fig,use_container_width=True,theme="streamlit")
|
21 |
+
|
22 |
+
|
23 |
+
st.markdown("<h3>🚀 Model Evaluation</h3>", unsafe_allow_html=True)
|
24 |
+
st.caption("Hasil evaluasi model berdasarkan data IndoNLU subset smsa pada validation split")
|
25 |
+
|
26 |
+
df = pd.read_csv("./assets/valid.csv")
|
27 |
+
option = st.selectbox('Pilih Model',["IndoBERT",'Naive Bayes','Logistic Regression','XGBoost','Catboost','SVM','Random Forest'],key = "model1")
|
28 |
+
clfr_fig = fn.plot_clfr(df_model,option,df)
|
29 |
+
conf_m_fig = fn.plot_confusion_matrix(df_model,option,df)
|
30 |
+
clfr,conf_m = st.columns([1,1])
|
31 |
+
with clfr:
|
32 |
+
st.plotly_chart(clfr_fig,use_container_width=True,theme="streamlit")
|
33 |
+
with conf_m:
|
34 |
+
st.plotly_chart(conf_m_fig,use_container_width=True,theme="streamlit")
|
35 |
+
st.caption("CLassification Report : Classification report merupakan metode evaluasi yang menyedakan data mengenai akurasi klasifikasi, recall, precision, dan F1 score.")
|
36 |
+
st.caption("Confusion Matrix : mengukur jumlah prediksi benar dan salah yang dibuat oleh model yang berguna untuk menunjukkan kinerja dari model untuk setiap kelas")
|
requirements.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
scikit-learn == 1.2.1
|
2 |
+
pandas
|
3 |
+
numpy
|
4 |
+
matplotlib
|
5 |
+
plotly
|
6 |
+
wordcloud
|
7 |
+
transformers
|
8 |
+
sentence-transformers
|
9 |
+
bertopic
|
10 |
+
snscrape == 0.5.0.20230113
|
11 |
+
xgboost == 1.7.3
|
12 |
+
catboost == 1.1.1
|
scraping.ipynb
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import snscrape.modules.twitter as sntwitter"
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "code",
|
14 |
+
"execution_count": 9,
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [
|
17 |
+
{
|
18 |
+
"ename": "AttributeError",
|
19 |
+
"evalue": "module 'snscrape' has no attribute '__version__'",
|
20 |
+
"output_type": "error",
|
21 |
+
"traceback": [
|
22 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
23 |
+
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
24 |
+
"Cell \u001b[1;32mIn[9], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39msnscrape\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m snscrape\u001b[39m.\u001b[39;49m__version__\n",
|
25 |
+
"\u001b[1;31mAttributeError\u001b[0m: module 'snscrape' has no attribute '__version__'"
|
26 |
+
]
|
27 |
+
}
|
28 |
+
],
|
29 |
+
"source": [
|
30 |
+
"import snscrape\n",
|
31 |
+
"snscrape.__version__"
|
32 |
+
]
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"cell_type": "code",
|
36 |
+
"execution_count": 9,
|
37 |
+
"metadata": {},
|
38 |
+
"outputs": [
|
39 |
+
{
|
40 |
+
"name": "stderr",
|
41 |
+
"output_type": "stream",
|
42 |
+
"text": [
|
43 |
+
"C:\\Users\\syair dafiq\\AppData\\Local\\Temp\\ipykernel_11116\\2950995507.py:8: FutureWarning: content is deprecated, use rawContent instead\n",
|
44 |
+
" tweets.append([tweet.content])\n"
|
45 |
+
]
|
46 |
+
}
|
47 |
+
],
|
48 |
+
"source": [
|
49 |
+
"query = \"@traveloka -filter:links filter:replies lang:id\"\n",
|
50 |
+
"tweets = []\n",
|
51 |
+
"# Using TwitterSearchScraper to scrape\n",
|
52 |
+
"# Using TwitterSearchScraper to scrape\n",
|
53 |
+
"for i,tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):\n",
|
54 |
+
" if i>=100:\n",
|
55 |
+
" break\n",
|
56 |
+
" tweets.append([tweet.content])"
|
57 |
+
]
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"cell_type": "code",
|
61 |
+
"execution_count": 10,
|
62 |
+
"metadata": {},
|
63 |
+
"outputs": [
|
64 |
+
{
|
65 |
+
"data": {
|
66 |
+
"text/plain": [
|
67 |
+
"[['@traveloka Terima kasih 🙏'],\n",
|
68 |
+
" ['@SSusaneey Aamiin! Semoga segera terwujud di tahun 2023 ya, Kak Susan 💙\\n\\n\\xa0-AM'],\n",
|
69 |
+
" ['@traveloka Pagi Admin, mau tanya kenapa skrg ditraveloka penerbangan ke Iran ngga ada semua ya ? \\n\\nTerimakasih'],\n",
|
70 |
+
" ['@adm1nwaras Halo, Kak Fata. Mengenai hal tersebut, kamu bisa langsung konfirmasi ke pihak\\nKAI dengan menghubungi call center PT. KAI di nomor 021 – 121. Terima kasih. -WR'],\n",
|
71 |
+
" ['@traveloka halo min.. kenapa ya sekarang ga bisa add tiket ke KAIAccess kalau pesan tiket nya dari traveloka? mau print boarding pass jadi harus ke counter'],\n",
|
72 |
+
" ['@humminglzbrd Halo Kak. Pada dasarnya kamu bisa centang salah satu nama penumpang saat ajukan\\nrefund di aplikasi. Namun untuk pengecekan lebih lanjut pesanan kamu, infoin\\nnomor pesanan Traveloka kamu via DM dulu yuk. Mimin tunggu ya. Terima kasih. -MZ'],\n",
|
73 |
+
" ['@traveloka min, saya beli 4 tiket pesawat dalam 1 kode booking. kalau mau cancel 1 orang saja apakah bisa?'],\n",
|
74 |
+
" ['Dear @traveloka Segera tanggapi komplain saya no. Laporan #30462475'],\n",
|
75 |
+
" ['@DDPRXYZ Hai, Kak. Agar kami bisa cek lebih lanjut, mohon infokan No. Pesanan Traveloka\\nkamu beserta bukti pembayaran kamu via DM ya. Terima kasih. -SY'],\n",
|
76 |
+
" ['@hefihendri Hai, Hefi. Mohon maaf atas kendalanya. Agar kami bisa cek lebih lanjut, mohon\\ninfokan No. Pesanan Traveloka kamu via DM ya. Terima kasih. -SY'],\n",
|
77 |
+
" ['@traveloka Akan lebih banyak air mata saat ngerti telah ditipu oleh @traveloka'],\n",
|
78 |
+
" ['@notbadkuy Sudah mimin respon DM kamu lagi Kak. Terima kasih -DZ'],\n",
|
79 |
+
" ['@traveloka Cek dm lgi ka'],\n",
|
80 |
+
" ['@notbadkuy Sudah mimin respon DM kamu Kak. Terima kasih -DZ'],\n",
|
81 |
+
" ['@traveloka @naljenal Cek dm min'],\n",
|
82 |
+
" ['@naljenal Mimin tunggu loh ya pemesanan kamu kak.\\xa0😊 -DA'],\n",
|
83 |
+
" ['@traveloka @senjatanuklir Harga akomodasi keluar dari jawa luar biasa yaa :))) wishlist dulu ya'],\n",
|
84 |
+
" ['@shazzzha Hai Kak. Mohon maaf banget ya atas kendala kamu. Saat ini untuk pembelian tiket\\npesawat sedang mengalami perbaikan system kak. Mohon kesediaannya untuk dapat\\ndilakukan pengecekan secara berkala ya kak. Terima kasih. -DA'],\n",
|
85 |
+
" ['@traveloka error terus mau pesen tiket pesawat dari jam 00.01'],\n",
|
86 |
+
" ['@Haamdee Hai Kak Hamdi, apabila kamu memiliki kendala pada pemesanan kamu, silakan\\nsampaikan ke mimin di DM ya Kak, terima kaish. -NT'],\n",
|
87 |
+
" ['@traveloka Dengan senang hati, kak. Aku menuju dm, ya!'],\n",
|
88 |
+
" ['@bifrozt Sama-sama kak. Oh ya, mimin butuh survey kamu nih untuk meningkatkan kualitas\\npelayanan Traveloka. Boleh infoin alamat email kamu yang aktif via DM Kak? Mimin\\ntunggu konfirmasinya ya. Terima kasih. -DA'],\n",
|
89 |
+
" ['@traveloka Terima kasih kak ^^'],\n",
|
90 |
+
" ['@traveloka cek dm min thankyou'],\n",
|
91 |
+
" ['@rossafosho Hai Kak Rosa. Untuk DM kamu sudah mimin balas ya. Mohon kesediaannya untuk\\nmelakukan pengecekan. Terima kasih. -DA'],\n",
|
92 |
+
" ['@traveloka @senjatanuklir Ke Likupang aja kak, Bunaken udah ga se bagus duluuu'],\n",
|
93 |
+
" ['@springberyl Hai Kak. DM kamu sudah mimin balas ya. Silakan dicek. -CA'],\n",
|
94 |
+
" ['@traveloka Ditunggu balasannya kak'],\n",
|
95 |
+
" ['@fadnov Mimin meluncur ke DM kamu ya Kak Fadli. Ditunggu 😊-TK'],\n",
|
96 |
+
" ['@traveloka halo kak. Mohon cek DM'],\n",
|
97 |
+
" ['@sbyfess 1. Beli pas low season\\n2. Giat mencari promo\\n3. Ikutan giveaway potongan harga di acara Live dll\\n4. Pake fitur buat ngsh tau kalo tiket lagi murah\\n\\nNah, semua itu tidak lain dan tidak bukan bisa kamu dapetin di aplikasi Traveloka🤭'],\n",
|
98 |
+
" ['@thohirjassin Hai, Thohir. Mohon maaf sebelumnya ya. Supaya bisa dilakukan pengecekan lebih\\nlanjut, silakan diinfoin ya nomor pesanan Traveloka kamu melalui DM. Ditunggu ya\\nKak. Terima kasih. -IN'],\n",
|
99 |
+
" ['@syakilamufida Ohh iya mimin butuh survey kamu nih untuk meningkatkan kualitas pelayanan\\nTraveloka. Boleh infoin alamat email akun Traveloka kamu via DM, Kak? Mimin\\ntunggu konfirmasinya ya. Makasih. -LA'],\n",
|
100 |
+
" ['@syakilamufida Hai, Kak. Terima kasih juga sudah percaya dengan Traveloka ya. Have a nice day😉\\n-LA'],\n",
|
101 |
+
" ['@lovelydeb_ Halo, Kak Pia. Mimin udah balas DM kamu yaaa. Yuk cek DM-nya 😊 -SM'],\n",
|
102 |
+
" ['@traveloka Mimin cantik/ganteng, buka dm dong plis\\U0001f979'],\n",
|
103 |
+
" ['@jhiyonatan Halo Yonatan, agar dapat mimin cek lebih lanjut, infokan email dan nomor ponsel\\nkamu melalui DM ya. Terima kasih. -OR'],\n",
|
104 |
+
" ['@trxshh @traveloka haloo kak, saya jg dpt email yg sama nih, DM nya udah dijawab atau blm sama mereka? so, itu bener valid dan aman kah??'],\n",
|
105 |
+
" ['@byutivi Sudah ya Kak, silakan dicek ya balasan mimin. Terima kasih. -IN'],\n",
|
106 |
+
" ['@traveloka ka jawab dong naik pesawat perlu booster atau gaak'],\n",
|
107 |
+
" ['@byutivi Oh iya nih Kak, mimin butuh survey kamu nih untuk meningkatkan kualitas\\npelayanan Traveloka. Boleh infoin alamat email akun Traveloka kamu melalui DM?\\nMimin tunggu konfirmasinya ya. Makasih. -IN'],\n",
|
108 |
+
" ['@traveloka min untuk naik pesawat sekarang apakah masih dibutuhkan syarat minimal vaksin booster?'],\n",
|
109 |
+
" ['@trxshh Hai, Kak. Untuk pengecekan lebih lanjut, silakan DM mimin email atau nomor\\ntelepon akun PayLater kamu. Mimin tunggu konfirmasinya. Terima kasih. -SM'],\n",
|
110 |
+
" ['@astrianov19 Aamiin! Semoga segera terwujud di tahun 2023 ya, Kak Astria 💙 -AM'],\n",
|
111 |
+
" ['@kuyaxs Semoga banyak hal baik yang datang ke kamu di tahun ini ya Kak 💙 -WR'],\n",
|
112 |
+
" ['@aljeihateu @traveloka haloo, aku udah di refund sehari setelah lapor. coba km hubungin customer service traveloka pake apps nya yaa'],\n",
|
113 |
+
" ['@traveloka halo admin minta tolong balas DM lagi yaah krn super urgent, thanks before 🥺🥺🥺🙏🏻🙏🏻🙏🏻'],\n",
|
114 |
+
" ['@aljeihateu Hai, Kak. Jika kamu memiliki kendala salah transfer ke rekening Traveloka, mohon\\nhubungi mimin melalui DM serta lampirkan bukti bayarnya di DM ya, agar bisa\\ndibantu cek lebih lanjut. Terima kasih. -WR'],\n",
|
115 |
+
" ['@rzkamlputrii @traveloka Maaf kak, uangnya sudah di refund,,?\\n soalnya saya jga punya masalah yg sama, salah tranfer ke rekening bni traveloka sudah lapor tapi masih panik'],\n",
|
116 |
+
" ['@thendless_D @traveloka @hiro_ngelag BIAR DAPET HOTEL'],\n",
|
117 |
+
" ['@wahyuadya Hai Kak Wahyu. Betul Kak. Kalo ada kendala atau pertanyaan, kamu bisa langsung\\nhubungi mimin melalui DM ya. Supaya bisa mimin bantu lebih lanjut. Terima kasih.\\n😊 -FR'],\n",
|
118 |
+
" ['@mgfiratulistqmh @traveloka Selamat malam kak Fira, terkait dengan informasi perubahan jadwal maskapai kakak, bisa diinfomasikan secara langsung melalui pesan/inbox via traveloka Apps atau kakak bisa via email juga, agar bisa dilakukan lebih lanjut ya kak. Terima kasih :)'],\n",
|
119 |
+
" ['@nekopurrin_ @traveloka min kok disini ga ada hotel bintang 100'],\n",
|
120 |
+
" ['@NathArtistOwO @traveloka @hiro_ngelag BELL ANJER, KENAPA SAMPEK NGETAG TRAVELOKA 😭😭 mana ditanggepin'],\n",
|
121 |
+
" ['@pencarioppa Hai Kak. Apabila ada pertanyaan bisa DM ke mimin ya. -CA'],\n",
|
122 |
+
" ['@makannteyussss @traveloka Halo k mau tanya dong kakaknya udah tau belombaliknya uang atau ap?'],\n",
|
123 |
+
" ['@susukhalal Ada yang bisa mimin bantu Kak? Kalo ada pertanyaan atau kendala kamu bisa\\ninfokan melalui DM ya Kak. Supaya bisa mimin bantu lebih lanjut. Terima kasih.\\n-FR'],\n",
|
124 |
+
" ['@traveloka mana mimin AK nya? kenapa mimin FR yang bales 🧐'],\n",
|
125 |
+
" ['@traveloka @hiro_ngelag WAH AKU DI NOTICE, MINTA HOTEL MIN'],\n",
|
126 |
+
" ['@NathArtistOwO Hai Kak Bella, apabila kamu memiliki pertanyaan tentang pemesanan kamu, silakan\\nsampaikan detailnya melalui DM ya Kak, mimin tunggu. -NT'],\n",
|
127 |
+
" ['@hiro_ngelag @traveloka Maap🙏biar dapet hotel bintang 100'],\n",
|
128 |
+
" ['@NathArtistOwO @traveloka kebiasaan ngetag org lain ajr'],\n",
|
129 |
+
" ['@hiro_ngelag @traveloka ada yang booking min'],\n",
|
130 |
+
" ['@eurush_ Hai Kak, untuk dapat kami bantu cek pemesanan kamu yang terkendala, mohon\\ninformasikan detail bukti screenshot beserta rute dan tanggal perjalanan yang\\nkamu inginkan melalui DM Traveloka ya Kak. Ditunggu😊 -AK'],\n",
|
131 |
+
" ['@kuchingtaimu @CommuterLine Padahal ayamnya mau berangkat kerja cari nafkah\\U0001f979\\U0001f979'],\n",
|
132 |
+
" ['@traveloka @traveloka nama lengkap Novie Rumuy\\nAlamat email [email protected]'],\n",
|
133 |
+
" ['@NovieRumuy Hai, Novie. Supaya bisa dilakukan pengecekan lebih lanjut, sialakn diinfoin ya\\nnama lengkap dan alamat email yang terdaftar pada akun PayLater kamu melalui DM.\\nDitunggu ya Kak. Terima kasih. -IN'],\n",
|
134 |
+
" ['@traveloka kenapa sy memanfaatkan saldo paylater saya, lalu tiba2 saldo saya di turunkan, padahal sy membayar tepat waktu... apa manfaatnya paylater kalo tdk dimanfaatkan.... hapuskan aja produknya kalo bgtu...'],\n",
|
135 |
+
" ['@traveloka Sudah saya DM yaa, mohon dibantu cek.. terimakasih min'],\n",
|
136 |
+
" ['@freshappiness Hai Kak. Maaf sebelumnya, untuk dapat mimin bantu cek lebih lanjut infokan dulu yuk nomor pesanan Traveloka kamu via DM. Terima kasih. -RM'],\n",
|
137 |
+
" ['@siiofyorluv AK itu singkatan nama mimin Kak. 😁 -FR'],\n",
|
138 |
+
" ['@traveloka ak itu akuntansi atau apa 😵'],\n",
|
139 |
+
" ['@diinoraurus DM kamu sudah mimin balas ya Kak. Silakan dicek🙏 -MZ'],\n",
|
140 |
+
" ['@traveloka mohon bantuan, sudah saya DM 🙏'],\n",
|
141 |
+
" ['@viqtinx @traveloka @BPerempuan Udah lewat kaka hehe'],\n",
|
142 |
+
" ['@traveloka tahun ini jg masih bisa apply visa waiver ga min?'],\n",
|
143 |
+
" ['@LJumardi Hai Kak Lucia, kendala kamu sudah mimin balas via DM ya. Silakan dicek. -NA'],\n",
|
144 |
+
" ['@traveloka @uway__ Minimal skill 2 dulu min....maen ulti aja\\U0001fae0\\U0001fae0\\U0001fae0'],\n",
|
145 |
+
" ['@traveloka Promo yang tidak ada gunanya mending promo. Ini dihapus, dibuat BDOBESTIE tapi hanya berlaku untuk Nam air dan sriwijaya sedangkan pesawat dari bandung tidak ada yang naik pesawat itu\\n\\nKalau tidak niat mending jangan berikan promo kak\\n\\nHanya berkoar koar saja apa yg bisa diambil'],\n",
|
146 |
+
" ['@traveloka selamat pagi ka, mohon dibantu untuk di respon DM saya. Terimakasih 🙏🏼'],\n",
|
147 |
+
" ['@Iexicone Mimin sudah balas via DM, Kak. Silakan dicek ya🙏 -MZ'],\n",
|
148 |
+
" ['@_Utariyulianti Mimin meluncur ke DM ya, Kak Utari :) -VK'],\n",
|
149 |
+
" ['@traveloka halo min, tolong cek dm'],\n",
|
150 |
+
" ['@Iexicone Hai Megan. Sebelumnya mohon maaf atas ketidaknyamanannya. Supaya bisa mimin bantu cek lebih lanjut, mohon bantuannya infokan dahulu nomor pesanan Traveloka kamu ke DM ya. Terima kasih. -KK'],\n",
|
151 |
+
" ['@hajelnuts Makasih Kak 💙 -AH'],\n",
|
152 |
+
" ['@traveloka Mimin jugaa!!🤩\\U0001faf6🏻'],\n",
|
153 |
+
" ['@dandelionsin1 Aamiin, semoga terwujud ya Kak. -AH'],\n",
|
154 |
+
" ['@traveloka @woypriss Mimin nya udh siap tuh jadi temen traveling kamu man😁'],\n",
|
155 |
+
" ['@llhamkomarudin Hai Kak, apabila kamu memiliki kendala atau pertanyaan silakan sampaikan pada mimin ya, terima kasih. -NT'],\n",
|
156 |
+
" ['@traveloka Tolong...'],\n",
|
157 |
+
" ['@warga_plus26 Sebenernya mimin juga lho Kak Gal😁 -KA'],\n",
|
158 |
+
" ['@traveloka Kangen promo penerbangan ke Bali lagi min ♌😁'],\n",
|
159 |
+
" ['@lightbluesvv Saluttt sama kakaknya😁🙌🙌🙌 -KA'],\n",
|
160 |
+
" ['@traveloka Karena promo-promo kaliaannn😂'],\n",
|
161 |
+
" ['@traveloka boleh² aja min asal ada pap karena no pic = hoax'],\n",
|
162 |
+
" ['@yazwang Setelah mimin cek harga penerbangan dewasa dan anak sama Kak. Namun, harga yang tertera mengikuti yang diberikan maskapai ke Traveloka Kak. Terima kasih -DZ'],\n",
|
163 |
+
" ['@traveloka Saya no debat ya bapak ibu, saya capek abis terombang ambing dilautan Maluku 27 jam, saya sudah dapat hotel lain, saya mau istirahat, no rekening sudah saya DM makasih ya'],\n",
|
164 |
+
" ['@traveloka Untuk singapore airlines bagaimana kak? Apakah harga child sama dengan adult? Karena di web maskapai nya harga child lebih murah dari adult'],\n",
|
165 |
+
" ['@GebbyErcha Halo Kak. Sebelumnya mohon maaf atas kendala yang kamu alami. Silakan informasikan nomor pesanan Traveloka dengan 9 digit angka melalui DM. Terima kasih. -SC'],\n",
|
166 |
+
" ['@yazwang Hai Kak Wang, setiap maskapai punya kebijakan masing-masing terkait harga penumpang anak dan dewasa. Ada yang harganya sama dan ada yang berbeda ya Kak. Terima kasih -DZ']]"
|
167 |
+
]
|
168 |
+
},
|
169 |
+
"execution_count": 10,
|
170 |
+
"metadata": {},
|
171 |
+
"output_type": "execute_result"
|
172 |
+
}
|
173 |
+
],
|
174 |
+
"source": [
|
175 |
+
"tweets"
|
176 |
+
]
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"cell_type": "code",
|
180 |
+
"execution_count": null,
|
181 |
+
"metadata": {},
|
182 |
+
"outputs": [],
|
183 |
+
"source": []
|
184 |
+
}
|
185 |
+
],
|
186 |
+
"metadata": {
|
187 |
+
"kernelspec": {
|
188 |
+
"display_name": "sentiment_app",
|
189 |
+
"language": "python",
|
190 |
+
"name": "python3"
|
191 |
+
},
|
192 |
+
"language_info": {
|
193 |
+
"codemirror_mode": {
|
194 |
+
"name": "ipython",
|
195 |
+
"version": 3
|
196 |
+
},
|
197 |
+
"file_extension": ".py",
|
198 |
+
"mimetype": "text/x-python",
|
199 |
+
"name": "python",
|
200 |
+
"nbconvert_exporter": "python",
|
201 |
+
"pygments_lexer": "ipython3",
|
202 |
+
"version": "3.10.8"
|
203 |
+
},
|
204 |
+
"orig_nbformat": 4,
|
205 |
+
"vscode": {
|
206 |
+
"interpreter": {
|
207 |
+
"hash": "060cb0c314b72233d054837cbf7114a14e911c92ed126864d61595f4324447df"
|
208 |
+
}
|
209 |
+
}
|
210 |
+
},
|
211 |
+
"nbformat": 4,
|
212 |
+
"nbformat_minor": 2
|
213 |
+
}
|
script/__init__.py
ADDED
File without changes
|
script/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (161 Bytes). View file
|
|
script/__pycache__/functions.cpython-310.pyc
ADDED
Binary file (4.54 kB). View file
|
|
script/__pycache__/plotting.cpython-310.pyc
ADDED
Binary file (3.66 kB). View file
|
|
script/__pycache__/text_proc.cpython-310.pyc
ADDED
Binary file (4.72 kB). View file
|
|
script/functions.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import re
|
4 |
+
import snscrape.modules.twitter as sntwitter
|
5 |
+
from transformers import pipeline
|
6 |
+
import plotly.express as px
|
7 |
+
import joblib
|
8 |
+
from sklearn.metrics import classification_report,confusion_matrix
|
9 |
+
|
10 |
+
|
11 |
+
import nltk
|
12 |
+
nltk.download("punkt")
|
13 |
+
nltk.download('stopwords')
|
14 |
+
from nltk.tokenize import word_tokenize
|
15 |
+
|
16 |
+
|
17 |
+
def get_tweets(username, length=10, option=None):
    """Scrape up to `length` tweets for `username`/query and return cleaned text.

    Args:
        username: Twitter handle or raw search query.
        length: maximum number of tweets to collect.
        option: when "Advanced", `username` is used verbatim as the query;
            otherwise it is restricted to Indonesian replies without links.

    Returns:
        DataFrame with a single "content" column of cleaned tweet texts
        (rows that become empty after cleaning are dropped).
    """
    query = username + " -filter:links filter:replies lang:id"
    if option == "Advanced":
        query = username
    tweets = []
    # Using TwitterSearchScraper to scrape
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i >= length:
            break
        tweets.append([tweet.content])

    # Creating a dataframe from the tweets list above
    tweets_df = pd.DataFrame(tweets, columns=["content"])

    # Strip mentions, hashtags, URLs and picture links.
    # regex=True is passed explicitly: since pandas 2.0 str.replace treats
    # patterns literally by default, which would silently disable these rules.
    tweets_df['content'] = tweets_df['content'].str.replace(r'@[^\s]+', '', regex=True)
    tweets_df['content'] = tweets_df['content'].str.replace(r'#[^\s]+', '', regex=True)
    tweets_df['content'] = tweets_df['content'].str.replace(r'http\S+', '', regex=True)
    tweets_df['content'] = tweets_df['content'].str.replace(r'pic\.twitter\.com\S+', '', regex=True)

    # \b keeps the "RT"/"amp" removal from eating substrings of real words
    # (the original literal replace would turn e.g. "ARTIS" into "AIS").
    tweets_df['content'] = tweets_df['content'].str.replace(r'\bRT\b', '', regex=True)
    tweets_df['content'] = tweets_df['content'].str.replace(r'\bamp\b', '', regex=True)

    # remove emoticons / symbols outside the allowed character set
    tweets_df['content'] = tweets_df['content'].str.replace(r'[^\w\s#@/:%.,_-]', '',
                                                            flags=re.UNICODE, regex=True)

    # remove whitespace leading & trailing
    tweets_df['content'] = tweets_df['content'].str.strip()

    # collapse multiple whitespace into a single space
    tweets_df['content'] = tweets_df['content'].str.replace(r'\s+', ' ', regex=True)

    # remove rows with empty content
    tweets_df = tweets_df[tweets_df['content'] != '']
    return tweets_df
|
50 |
+
|
51 |
+
|
52 |
+
def get_sentiment(df, option_model):
    """Attach a "sentiment" column to `df` using the selected model.

    Args:
        df: DataFrame with a "content" column of cleaned tweet texts.
        option_model: UI label of the classifier to use.

    Returns:
        The same DataFrame with "sentiment" added and rotated to be
        the first column.
    """
    id2label = {0: "negatif", 1: "netral", 2: "positif"}

    if option_model == "IndoBERT (Accurate,Slow)":
        classifier = pipeline("sentiment-analysis", model="indobert")

        # NOTE(review): assumes the pipeline's 'label' field indexes
        # id2label — confirm against the indobert config's label names.
        def predict(text):
            return id2label[classifier(text)[0]['label']]
    else:
        # Both sklearn branches only differ in the model-name lookup key.
        lookup = ("Logistic Regression"
                  if option_model == "Logistic Regression (Less Accurate,Fast)"
                  else option_model)
        df_model = joblib.load('assets/df_model.pkl')
        classifier = df_model[df_model.model_name == lookup].model.values[0]

        def predict(text):
            return id2label[classifier.predict([text])[0]]

    df['sentiment'] = df['content'].apply(predict)

    # rotate columns so the newly added "sentiment" comes first
    ordering = df.columns.tolist()
    df = df[ordering[-1:] + ordering[:-1]]

    return df
|
71 |
+
|
72 |
+
def get_bar_chart(df):
    """Bar chart of comment counts per sentiment class.

    Args:
        df: DataFrame with "sentiment" and "content" columns.

    Returns:
        A plotly bar figure (one bar per sentiment, count annotated outside).
    """
    counts = df.groupby(['sentiment']).count().reset_index()
    palette = {"positif": "#00cc96", "negatif": "#ef553b", "netral": "#636efa"}

    fig = px.bar(counts, x="sentiment", y="content", color="sentiment",
                 text="content", color_discrete_map=palette)

    # legend is redundant — the x axis already names each class
    fig.update_layout(showlegend=False)
    fig.update_layout(margin=dict(t=0, b=150, l=0, r=0))

    # annotate each bar with its count, hide labels that would not fit
    fig.update_traces(textposition='outside')
    fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

    fig.update_yaxes(title_text='Jumlah Komentar')
    return fig
|
90 |
+
|
91 |
+
def plot_model_summary(df_model):
    """Accuracy-vs-runtime scatter for every model on the test split.

    Args:
        df_model: model registry DataFrame with "set_data", "score",
            "time" and "model_name" columns.

    Returns:
        A plotly scatter figure (x = inference time, y = accuracy).
    """
    test_rows = df_model[df_model.set_data == "test"][["score", "time", "model_name"]]

    fig = px.scatter(test_rows, x="time", y="score",
                     color="model_name", hover_data=['model_name'])
    fig.update_xaxes(title_text="time (s)")
    fig.update_yaxes(title_text="accuracy")

    # enlarge markers for readability
    fig.update_traces(marker=dict(size=10))
    fig.update_layout(autosize=False, margin=dict(t=0, l=0, r=0), height=400)
    return fig
|
104 |
+
|
105 |
+
def plot_clfr(df_model,option_model,df):
    """Render sklearn's classification report for `option_model` as a heatmap.

    Args:
        df_model: unused here; kept for signature parity with plot_confusion_matrix.
        option_model: model name; predictions are read from df[f"{option_model}_pred"].
        df: validation DataFrame with a "label" column and per-model prediction columns.

    Returns:
        A plotly heatmap of per-class precision/recall/f1 plus averages.
    """
    df_clfr = pd.DataFrame(classification_report(df["label"],df[f"{option_model}_pred"],output_dict=True))
    # heatmap using plotly
    # NOTE(review): the rename below is positional and assumes classification_report
    # emits the classes in the order [positif, netral, negatif] — confirm against
    # the label encoding; a mismatch would silently mislabel the rows.
    df_clfr.columns = ["positif","netral","negatif","accuracy","macro_avg","weighted_avg"]
    # transpose so metrics become columns; iloc[:,:-1] drops the last metric column
    fig = px.imshow(df_clfr.T.iloc[:,:-1], x=df_clfr.T.iloc[:,:-1].columns, y=df_clfr.T.iloc[:,:-1].index)
    # remove colorbar
    fig.update_layout(coloraxis_showscale=False)
    fig.update_layout(coloraxis_colorscale='gnbu')
    # get annot
    annot = df_clfr.T.iloc[:,:-1].values
    # add annot and set font size
    fig.update_traces(text=annot, texttemplate='%{text:.2f}',textfont_size=12)
    # set title to classification report
    fig.update_layout(title_text="📄 Classification Report")
    return fig
|
120 |
+
|
121 |
+
def plot_confusion_matrix(df_model, option_model, df):
    """Heatmap of the confusion matrix for `option_model`'s predictions.

    Args:
        df_model: unused here; kept for signature parity with plot_clfr.
        option_model: model name; predictions are read from df[f"{option_model}_pred"].
        df: validation DataFrame with a "label" column.

    Returns:
        A plotly heatmap annotated with raw counts.
    """
    matrix = confusion_matrix(df['label'], df[f"{option_model}_pred"])
    class_names = ['negatif', 'netral', 'positif']

    fig = px.imshow(matrix, x=class_names, y=class_names)

    # drop the colorbar; counts are annotated directly on the cells
    fig.update_layout(coloraxis_showscale=False)
    fig.update_layout(coloraxis_colorscale='gnbu', title_text="📊 Confusion Matrix")
    fig.update_traces(text=matrix, texttemplate='%{text:.0f}', textfont_size=15)
    return fig
|
script/plotting.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import itertools
|
2 |
+
import numpy as np
|
3 |
+
from typing import List
|
4 |
+
|
5 |
+
import plotly.graph_objects as go
|
6 |
+
from plotly.subplots import make_subplots
|
7 |
+
|
8 |
+
|
9 |
+
def visualize_barchart(topic_model,
                       topics: List[int] = None,
                       top_n_topics: int = 8,
                       n_words: int = 5,
                       custom_labels: bool = False,
                       title: str = "Kata Kunci tiap Topic",
                       width: int = 250,
                       height: int = 250) -> go.Figure:
    """Visualize a barchart of keyword scores for selected topics.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to visualize.
        top_n_topics: Only select the top n most frequent topics.
        n_words: Number of words to show per topic.
        custom_labels: Whether to use labels set via `topic_model.set_topic_labels`.
        title: Title of the plot.
        width: Width of each subplot.
        height: Height of each subplot.

    Returns:
        fig: A plotly figure with one horizontal-bar subplot per topic.
    """
    palette = itertools.cycle(['#636EFA', '#EF553B', '#00CC96', '#AB63FA',
                               '#FFA15A', '#19D3F3', '#FF6692', '#B6E880',
                               '#FF97FF', '#FECB52'])

    # Select topics based on the topics / top_n_topics arguments,
    # always excluding the outlier topic (-1) when others exist.
    freq_df = topic_model.get_topic_freq()
    if len(freq_df) > 1:
        freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        selected = list(topics)
    elif top_n_topics is not None:
        selected = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        selected = sorted(freq_df.Topic.to_list()[0:6])

    # Subplot titles: user-defined labels when requested, else "Topic N".
    if topic_model.custom_labels_ is not None and custom_labels:
        subplot_titles = [topic_model.custom_labels_[t + topic_model._outliers]
                          for t in selected]
    else:
        subplot_titles = [f"Topic {t}" for t in selected]

    n_cols = 3
    n_rows = int(np.ceil(len(selected) / n_cols))
    fig = make_subplots(rows=n_rows,
                        cols=n_cols,
                        shared_xaxes=False,
                        horizontal_spacing=.1,
                        vertical_spacing=.4 / n_rows if n_rows > 1 else 0,
                        subplot_titles=subplot_titles)

    # One horizontal barchart per topic, filled row-major across the grid.
    for idx, t in enumerate(selected):
        terms = topic_model.get_topic(t)
        words = [word + " " for word, _ in terms][:n_words][::-1]
        scores = [score for _, score in terms][:n_words][::-1]

        fig.add_trace(
            go.Bar(x=scores,
                   y=words,
                   orientation='h',
                   marker_color=next(palette)),
            row=idx // n_cols + 1,
            col=idx % n_cols + 1)

    # Stylize graph
    fig.update_layout(
        showlegend=False,
        title={
            'text': f"<b>{title}",
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        width=width * 3,
        height=height * n_rows if n_rows > 1 else height * 1.3,
        hoverlabel=dict(
            bgcolor="white",
            font_size=13,
            font_family="Rockwell"
        ),
        margin=dict(l=40, r=40)
    )

    fig.update_xaxes(showgrid=True)
    fig.update_yaxes(showgrid=True)

    return fig
|
script/text_proc.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from PIL import Image
|
4 |
+
import plotly.express as px
|
5 |
+
from wordcloud import WordCloud
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
import string
|
8 |
+
import re #regex library
|
9 |
+
#umap
|
10 |
+
import umap
|
11 |
+
import hdbscan
|
12 |
+
import plotly.graph_objects as go
|
13 |
+
from bertopic import BERTopic
|
14 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
15 |
+
|
16 |
+
# import word_tokenize from NLTK
|
17 |
+
from transformers import AutoTokenizer
|
18 |
+
from script.plotting import visualize_barchart
|
19 |
+
|
20 |
+
def load_stopwords():
    """Return the Indonesian stopword list plus ASCII punctuation characters."""
    words = pd.read_csv("assets/stopwordbahasa.csv", header=None)[0].tolist()
    return words + list(string.punctuation)
|
25 |
+
|
26 |
+
def tokenisasi(df):
    """Tokenise df.content with the local indobert tokenizer and filter tokens.

    Keeps only tokens that are not WordPiece continuations ('##...'),
    not stopwords, and longer than 4 characters.

    Returns:
        A Series of token lists, aligned with df's index.
    """
    stopwords = load_stopwords()
    tokenizer = AutoTokenizer.from_pretrained('indobert')

    def keep(tok):
        # drop subword pieces, stopwords and short tokens
        return not tok.startswith('##') and tok not in stopwords and len(tok) > 4

    return df.content.apply(tokenizer.tokenize).apply(
        lambda toks: [t for t in toks if keep(t)])
|
32 |
+
|
33 |
+
def get_wordcloud(df, kelas_sentiment):
    """Build a twitter-mask word cloud for one sentiment class.

    Args:
        df: DataFrame with "content" and "sentiment" columns.
        kelas_sentiment: one of "positif", "negatif", "netral" — selects
            both the rows and the colormap.

    Returns:
        A generated WordCloud object (placeholder cloud when the class
        has no usable words).
    """
    mask = np.array(Image.open('./assets/twitter.png'))
    cmap_dict = {'positif': 'YlGn', 'negatif': 'OrRd', 'netral': 'GnBu'}

    tokens = tokenisasi(df[df.sentiment == kelas_sentiment])
    text = ' '.join(tokens.apply(lambda x: ' '.join(x)))

    def build(source_text):
        # single place for the (identical) WordCloud configuration
        return WordCloud(width=800, height=800,
                         background_color='black',
                         min_font_size=10,
                         colormap=cmap_dict[kelas_sentiment],
                         mask=mask).generate(source_text)

    try:
        wordcloud = build(text)
    except ValueError:
        # WordCloud.generate raises ValueError when there are no words to
        # draw; fall back to a placeholder cloud instead of crashing the app.
        # (The original bare `except:` also hid unrelated errors such as a
        # missing mask file.)
        wordcloud = build("None")
    return wordcloud
|
53 |
+
|
54 |
+
def plot_text(df, kelas, embedding_model):
    """UMAP + HDBSCAN scatter of the tweets belonging to one sentiment class.

    Args:
        df: DataFrame with "content" and "sentiment" columns.
        kelas: sentiment class to visualise.
        embedding_model: sentence-transformers style model with .encode().

    Returns:
        (contents, embeddings, figure): the selected texts, their embeddings,
        and a 2-D cluster scatter with hover text.
    """
    df = df[df.sentiment == kelas]
    # NOTE(review): this embeds entire rows (content AND sentiment columns),
    # not just the text — presumably df["content"] was intended; confirm
    # before changing since downstream clustering depends on it.
    data = embedding_model.encode(df.values.tolist())

    # 2-D projection; n_neighbors is capped for very small classes
    umap_model = umap.UMAP(n_neighbors=min(df.shape[0], 5), random_state=42)
    umap_data = umap_model.fit_transform(data)

    # heuristic minimum cluster size ~ sqrt(n) - 1
    clusterer = hdbscan.HDBSCAN(min_cluster_size=round((df.shape[0]) ** (0.5) - 1),
                                min_samples=3)
    clusterer.fit(umap_data)

    labels = ['cluster ' + str(i) for i in clusterer.labels_]
    # HDBSCAN labels noise points -1; show them as "outlier"
    labels = ["outlier" if lbl == "cluster -1" else lbl for lbl in labels]
    # wrap hover text so long tweets stay readable in the tooltip
    text = df["content"].str.wrap(50).apply(lambda x: x.replace('\n', '<br>'))

    # (the original built a first px.scatter here and immediately discarded
    # it — that dead call has been removed)
    fig = px.scatter(x=umap_data[:, 0], y=umap_data[:, 1], color=labels, text=text)
    # hide the per-point text; it is only used for hover
    fig.update_traces(textfont_color='rgba(0,0,0,0)', marker_size=8)
    # transparent plot background
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
    fig.update_layout(margin=dict(l=40, r=5, t=0, b=40))
    # grey axis lines, no grid on x
    fig.update_xaxes(showgrid=False, zeroline=False, linecolor='rgb(200,200,200)')
    fig.update_yaxes(zeroline=False, linecolor='rgb(200,200,200)')
    fig.update_layout(font_family="sans-serif")
    fig.update_layout(showlegend=False)

    return df["content"], data, fig
|
86 |
+
|
87 |
+
def topic_modelling(df, embed_df):
    """Fit BERTopic on the tweet texts and return a keyword barchart.

    Args:
        df: Series of tweet texts.
        embed_df: precomputed embeddings passed straight to fit_transform.

    Returns:
        (fig, topic_model): keyword barchart figure and the fitted model.
    """
    # keep only words longer than 3 characters per document
    docs = df.apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

    topic_model = BERTopic(
        calculate_probabilities=True,
        # cluster model
        hdbscan_model=hdbscan.HDBSCAN(min_cluster_size=5, prediction_data=True),
        vectorizer_model=CountVectorizer(stop_words=load_stopwords()),
        language="indonesian",
    )
    topics, probs = topic_model.fit_transform(docs, embed_df)

    # human-readable labels: comma-separated keywords, no "N_" prefix
    topic_model.set_topic_labels(
        topic_model.generate_topic_labels(topic_prefix=False, separator=", "))

    fig = visualize_barchart(topic_model)
    return fig, topic_model
|
sentence_bert/1_Pooling/config.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"word_embedding_dimension": 768,
|
3 |
+
"pooling_mode_cls_token": false,
|
4 |
+
"pooling_mode_mean_tokens": true,
|
5 |
+
"pooling_mode_max_tokens": false,
|
6 |
+
"pooling_mode_mean_sqrt_len_tokens": false
|
7 |
+
}
|
sentence_bert/README.md
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
pipeline_tag: sentence-similarity
|
3 |
+
tags:
|
4 |
+
- sentence-transformers
|
5 |
+
- feature-extraction
|
6 |
+
- sentence-similarity
|
7 |
+
- transformers
|
8 |
+
|
9 |
+
---
|
10 |
+
|
11 |
+
# indo-sentence-bert-base
|
12 |
+
|
13 |
+
This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.
|
14 |
+
|
15 |
+
<!--- Describe your model here -->
|
16 |
+
|
17 |
+
## Usage (Sentence-Transformers)
|
18 |
+
|
19 |
+
Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
|
20 |
+
|
21 |
+
```
|
22 |
+
pip install -U sentence-transformers
|
23 |
+
```
|
24 |
+
|
25 |
+
Then you can use the model like this:
|
26 |
+
|
27 |
+
```python
|
28 |
+
from sentence_transformers import SentenceTransformer
|
29 |
+
sentences = ["Ibukota Perancis adalah Paris",
|
30 |
+
"Menara Eifel terletak di Paris, Perancis",
|
31 |
+
"Pizza adalah makanan khas Italia",
|
32 |
+
"Saya kuliah di Carnegie Mellon University"]
|
33 |
+
|
34 |
+
model = SentenceTransformer('firqaaa/indo-sentence-bert-base')
|
35 |
+
embeddings = model.encode(sentences)
|
36 |
+
print(embeddings)
|
37 |
+
```
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
## Usage (HuggingFace Transformers)
|
42 |
+
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
|
43 |
+
|
44 |
+
```python
|
45 |
+
from transformers import AutoTokenizer, AutoModel
|
46 |
+
import torch
|
47 |
+
|
48 |
+
|
49 |
+
#Mean Pooling - Take attention mask into account for correct averaging
|
50 |
+
def mean_pooling(model_output, attention_mask):
|
51 |
+
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
|
52 |
+
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
|
53 |
+
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
54 |
+
|
55 |
+
|
56 |
+
# Sentences we want sentence embeddings for
|
57 |
+
sentences = ["Ibukota Perancis adalah Paris",
|
58 |
+
"Menara Eifel terletak di Paris, Perancis",
|
59 |
+
"Pizza adalah makanan khas Italia",
|
60 |
+
"Saya kuliah di Carnegie Mellon University"]
|
61 |
+
|
62 |
+
|
63 |
+
# Load model from HuggingFace Hub
|
64 |
+
tokenizer = AutoTokenizer.from_pretrained('firqaaa/indo-sentence-bert-base')
|
65 |
+
model = AutoModel.from_pretrained('firqaaa/indo-sentence-bert-base')
|
66 |
+
|
67 |
+
# Tokenize sentences
|
68 |
+
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
69 |
+
|
70 |
+
# Compute token embeddings
|
71 |
+
with torch.no_grad():
|
72 |
+
model_output = model(**encoded_input)
|
73 |
+
|
74 |
+
# Perform pooling. In this case, mean pooling.
|
75 |
+
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
76 |
+
|
77 |
+
print("Sentence embeddings:")
|
78 |
+
print(sentence_embeddings)
|
79 |
+
```
|
80 |
+
|
81 |
+
|
82 |
+
|
83 |
+
## Evaluation Results
|
84 |
+
|
85 |
+
<!--- Describe how your model was evaluated -->
|
86 |
+
|
87 |
+
For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
|
88 |
+
|
89 |
+
|
90 |
+
## Training
|
91 |
+
The model was trained with the parameters:
|
92 |
+
|
93 |
+
**DataLoader**:
|
94 |
+
|
95 |
+
`sentence_transformers.datasets.NoDuplicatesDataLoader.NoDuplicatesDataLoader` of length 19644 with parameters:
|
96 |
+
```
|
97 |
+
{'batch_size': 16}
|
98 |
+
```
|
99 |
+
|
100 |
+
**Loss**:
|
101 |
+
|
102 |
+
`sentence_transformers.losses.MultipleNegativesRankingLoss.MultipleNegativesRankingLoss` with parameters:
|
103 |
+
```
|
104 |
+
{'scale': 20.0, 'similarity_fct': 'cos_sim'}
|
105 |
+
```
|
106 |
+
|
107 |
+
Parameters of the fit()-Method:
|
108 |
+
```
|
109 |
+
{
|
110 |
+
"epochs": 5,
|
111 |
+
"evaluation_steps": 0,
|
112 |
+
"evaluator": "NoneType",
|
113 |
+
"max_grad_norm": 1,
|
114 |
+
"optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
|
115 |
+
"optimizer_params": {
|
116 |
+
"lr": 2e-05
|
117 |
+
},
|
118 |
+
"scheduler": "WarmupLinear",
|
119 |
+
"steps_per_epoch": null,
|
120 |
+
"warmup_steps": 9930,
|
121 |
+
"weight_decay": 0.01
|
122 |
+
}
|
123 |
+
```
|
124 |
+
|
125 |
+
|
126 |
+
## Full Model Architecture
|
127 |
+
```
|
128 |
+
SentenceTransformer(
|
129 |
+
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel
|
130 |
+
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
|
131 |
+
)
|
132 |
+
```
|
133 |
+
|
134 |
+
## Citing & Authors
|
135 |
+
|
136 |
+
<!--- Describe where people can find more information -->
|
sentence_bert/config.json
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "C:\\Users\\syair dafiq/.cache\\torch\\sentence_transformers\\firqaaa_indo-sentence-bert-base\\",
|
3 |
+
"_num_labels": 5,
|
4 |
+
"architectures": [
|
5 |
+
"BertModel"
|
6 |
+
],
|
7 |
+
"attention_probs_dropout_prob": 0.1,
|
8 |
+
"classifier_dropout": null,
|
9 |
+
"directionality": "bidi",
|
10 |
+
"hidden_act": "gelu",
|
11 |
+
"hidden_dropout_prob": 0.1,
|
12 |
+
"hidden_size": 768,
|
13 |
+
"id2label": {
|
14 |
+
"0": "LABEL_0",
|
15 |
+
"1": "LABEL_1",
|
16 |
+
"2": "LABEL_2",
|
17 |
+
"3": "LABEL_3",
|
18 |
+
"4": "LABEL_4"
|
19 |
+
},
|
20 |
+
"initializer_range": 0.02,
|
21 |
+
"intermediate_size": 3072,
|
22 |
+
"label2id": {
|
23 |
+
"LABEL_0": 0,
|
24 |
+
"LABEL_1": 1,
|
25 |
+
"LABEL_2": 2,
|
26 |
+
"LABEL_3": 3,
|
27 |
+
"LABEL_4": 4
|
28 |
+
},
|
29 |
+
"layer_norm_eps": 1e-12,
|
30 |
+
"max_position_embeddings": 512,
|
31 |
+
"model_type": "bert",
|
32 |
+
"num_attention_heads": 12,
|
33 |
+
"num_hidden_layers": 12,
|
34 |
+
"output_past": true,
|
35 |
+
"pad_token_id": 0,
|
36 |
+
"pooler_fc_size": 768,
|
37 |
+
"pooler_num_attention_heads": 12,
|
38 |
+
"pooler_num_fc_layers": 3,
|
39 |
+
"pooler_size_per_head": 128,
|
40 |
+
"pooler_type": "first_token_transform",
|
41 |
+
"position_embedding_type": "absolute",
|
42 |
+
"torch_dtype": "float32",
|
43 |
+
"transformers_version": "4.25.1",
|
44 |
+
"type_vocab_size": 2,
|
45 |
+
"use_cache": true,
|
46 |
+
"vocab_size": 50000
|
47 |
+
}
|
sentence_bert/config_sentence_transformers.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"__version__": {
|
3 |
+
"sentence_transformers": "2.2.2",
|
4 |
+
"transformers": "4.20.1",
|
5 |
+
"pytorch": "1.11.0"
|
6 |
+
}
|
7 |
+
}
|
sentence_bert/modules.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"idx": 0,
|
4 |
+
"name": "0",
|
5 |
+
"path": "",
|
6 |
+
"type": "sentence_transformers.models.Transformer"
|
7 |
+
},
|
8 |
+
{
|
9 |
+
"idx": 1,
|
10 |
+
"name": "1",
|
11 |
+
"path": "1_Pooling",
|
12 |
+
"type": "sentence_transformers.models.Pooling"
|
13 |
+
}
|
14 |
+
]
|
sentence_bert/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:531e8399cbb7941b3f71bb0dc25d6abf70805ef05342f603fb4d4f158ce54b6d
|
3 |
+
size 497833773
|
sentence_bert/sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_seq_length": 512,
|
3 |
+
"do_lower_case": false
|
4 |
+
}
|
sentence_bert/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
sentence_bert/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
sentence_bert/tokenizer_config.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"do_basic_tokenize": true,
|
4 |
+
"do_lower_case": true,
|
5 |
+
"mask_token": "[MASK]",
|
6 |
+
"model_max_length": 1000000000000000019884624838656,
|
7 |
+
"name_or_path": "C:\\Users\\syair dafiq/.cache\\torch\\sentence_transformers\\firqaaa_indo-sentence-bert-base\\",
|
8 |
+
"never_split": null,
|
9 |
+
"pad_token": "[PAD]",
|
10 |
+
"sep_token": "[SEP]",
|
11 |
+
"special_tokens_map_file": "/root/.cache/huggingface/transformers/b515a756d9ddf12a7a391ea596c488ac805f0576790934e590ce250a3e4ff056.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d",
|
12 |
+
"strip_accents": null,
|
13 |
+
"tokenize_chinese_chars": true,
|
14 |
+
"tokenizer_class": "BertTokenizer",
|
15 |
+
"unk_token": "[UNK]"
|
16 |
+
}
|
sentence_bert/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|