Commit
·
c7f5132
0
Parent(s):
init
Browse files- Home.py +204 -0
- Product Reviews Ecommerce Multilabel Dataset.json +0 -0
- model_services/__pycache__/model.cpython-312.pyc +0 -0
- model_services/__pycache__/pipeline.cpython-310.pyc +0 -0
- model_services/__pycache__/pipeline.cpython-312.pyc +0 -0
- model_services/model.py +55 -0
- model_services/pipeline.py +95 -0
- pages/Dataset.py +62 -0
- requirements.txt +0 -0
- tes.py +40 -0
Home.py
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# streamlit
|
2 |
+
import streamlit as st
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
st.set_page_config(page_title="Home", page_icon="π ", layout="centered")
|
6 |
+
|
7 |
+
st.markdown("# ποΈ Aspect-Based Multilabel Classification of Ecommerce Reviews")
|
8 |
+
st.write("Ever wondered what people think about the products, customer service, and shipping of your favorite online store? Try this out!")
|
9 |
+
|
10 |
+
# help me create sidebar
|
11 |
+
st.sidebar.markdown("## π About"
|
12 |
+
"\nThis is a simple web app to classify the aspect of reviews from an e-commerce dataset."
|
13 |
+
"\n\nThe dataset used is a multilabel dataset, which means a review can have multiple labels."
|
14 |
+
"\n\nThe labels are:"
|
15 |
+
"\n- π¦ **Product**"
|
16 |
+
"\n- π©βπΌ **Customer Service**"
|
17 |
+
"\n- π **Shipping/Delivery**")
|
18 |
+
|
19 |
+
# add create by Fahrendra Khoirul Ihtada and Rizha Alfianita using streamlit and Hugging Face's IndoBERT model
|
20 |
+
st.sidebar.markdown("## π¨βπ» Created by"
|
21 |
+
"\n[Fahrendra Khoirul Ihtada](https://www.linkedin.com/in/fahrendra-khoirul-ihtada/) "
|
22 |
+
"and [Rizha Alfianita](https://www.linkedin.com/in/rizha-alfianita/)"
|
23 |
+
"\n Using Streamlit and Hugging Face's [IndoBERT](https://huggingface.co/indobenchmark/indobert-base-p1) model.")
|
24 |
+
|
25 |
+
# add my hugging face profile
|
26 |
+
st.sidebar.markdown("## π€ Hugging Face"
|
27 |
+
"\n- [Fahrendra Khoirul Ihtada](https://huggingface.co/fahrendrakhoirul)")
|
28 |
+
|
29 |
+
|
30 |
+
|
31 |
+
# import here because why not??
|
32 |
+
import model_services.pipeline as pipeline
|
33 |
+
|
34 |
+
container_1 = st.container(border=True)
|
35 |
+
|
36 |
+
|
37 |
+
# create rows and 2 dropdown menus side by side
|
38 |
+
row1_1, row1_2 = container_1.columns((2, 1))
|
39 |
+
with row1_1:
|
40 |
+
df = pd.read_json("Product Reviews Ecommerce Multilabel Dataset.json", lines=True)
|
41 |
+
selected_review = st.selectbox(
|
42 |
+
"You can pick a review from dataset",
|
43 |
+
df["review"].values,
|
44 |
+
)
|
45 |
+
with row1_2:
|
46 |
+
selected_model = st.selectbox(
|
47 |
+
"Choose the model",
|
48 |
+
("IndoBERT", "IndoBERT-CNN (Best)", "IndoBERT-LSTM"),
|
49 |
+
)
|
50 |
+
|
51 |
+
# text input
|
52 |
+
input_review = container_1.text_area("Or you can input multiple review with separated line", selected_review, height=200)
|
53 |
+
|
54 |
+
# create button submit
|
55 |
+
button_submit = container_1.button("Classify")
|
56 |
+
|
57 |
+
|
58 |
+
def show_label_desc():
    """Render a divider followed by a short explanation of each label."""
    # Each entry is written with its own st.write call, in this order.
    descriptions = (
        "Let's see what is the meaning of eachΒ labels:",
        "- π¦**Product** : related Customer satisfaction with the quality, performance, and conformity of the product to the description given",
        "- π©βπΌ**Customer Service** : Interaction between customers and sellers, friendliness and speed of response from sellers, and handling complaints.",
        "- π**Shipping/Delivery** : related to shipping speed, condition of goods when received, and timelinessΒ ofΒ shipping",
    )
    st.divider()
    for text in descriptions:
        st.write(text)
|
64 |
+
|
65 |
+
def submit():
    """Classify the text currently in the input area and render the outcome.

    Reads the module-level ``input_review`` value, runs it through
    ``do_calculation`` and then draws the result section followed by the
    label descriptions.
    """
    # Run the model before announcing success, so the banner cannot appear
    # while the classification is still running (or after it has failed).
    with st.spinner("Classifying reviews..."):
        outputs = do_calculation(input_review)
    st.success("Done! π")
    show_result(outputs)
    show_label_desc()
|
72 |
+
|
73 |
+
def do_calculation(texts):
    """Split the raw input into individual reviews and classify each one.

    Args:
        texts: the text-area content; one review per line.

    Returns:
        Whatever ``pipeline.get_result`` returns for the cleaned review list
        and the module-level ``selected_model``.
    """
    # Strip every line first: lines holding only whitespace (or a stray "\r"
    # from pasted Windows text) are not reviews and must not reach the model.
    stripped_lines = (line.strip() for line in texts.split("\n"))
    reviews = [line for line in stripped_lines if line]
    return pipeline.get_result(reviews, selected_model)
|
81 |
+
|
82 |
+
st.markdown("""
|
83 |
+
<style>
|
84 |
+
.label-container {
|
85 |
+
display: flex;
|
86 |
+
flex-wrap: wrap;
|
87 |
+
gap: 5px;
|
88 |
+
}
|
89 |
+
.rounded-label-product {
|
90 |
+
background-color: #FFD700;
|
91 |
+
color: black;
|
92 |
+
border-radius: 20px;
|
93 |
+
padding: 5px 10px;
|
94 |
+
font-size: 14px;
|
95 |
+
margin-bottom: 20px;
|
96 |
+
}
|
97 |
+
|
98 |
+
.rounded-label-customer-service {
|
99 |
+
background-color: #FFA07A;
|
100 |
+
color: black;
|
101 |
+
border-radius: 20px;
|
102 |
+
padding: 5px 10px;
|
103 |
+
font-size: 14px;
|
104 |
+
margin-bottom: 20px;
|
105 |
+
}
|
106 |
+
|
107 |
+
.rounded-label-shipping-delivery {
|
108 |
+
background-color: #20B2AA;
|
109 |
+
color: black;
|
110 |
+
border-radius: 20px;
|
111 |
+
padding: 5px 10px;
|
112 |
+
font-size: 14px;
|
113 |
+
margin-bottom: 20px;
|
114 |
+
}
|
115 |
+
|
116 |
+
.rounded-label-undefined {
|
117 |
+
background-color: #DCDCDC;
|
118 |
+
color: black;
|
119 |
+
border-radius: 20px;
|
120 |
+
padding: 5px 10px;
|
121 |
+
font-size: 14px;
|
122 |
+
margin-bottom: 20px;
|
123 |
+
}
|
124 |
+
|
125 |
+
</style>
|
126 |
+
""", unsafe_allow_html=True)
|
127 |
+
|
128 |
+
def chips_label(output):
    """Render one coloured chip per predicted label, with its score as a percentage.

    Args:
        output: a result dict; ``output["predicted_labels"]`` is iterated for
            label names and ``output["predicted_score"]`` is indexed for the
            matching score (0 = Product, 1 = Customer Service,
            2 = Shipping/Delivery).

    When no known label is present a single grey "Undefined" chip is shown.
    """
    # label name -> (index into predicted_score, CSS class suffix, chip caption)
    chip_specs = {
        "Product": (0, "product", "π¦Product"),
        "Customer Service": (1, "customer-service", "π©βπΌCustomer Service"),
        "Shipping/Delivery": (2, "shipping-delivery", "πShipping/Delivery"),
    }
    scores = output["predicted_score"]

    chips = []
    for label in output["predicted_labels"]:
        spec = chip_specs.get(label)
        if spec is None:
            # Labels outside the three known ones produce no chip.
            continue
        score_index, css_suffix, caption = spec
        # Build the percentage separately: reusing the outer quote character
        # inside an f-string expression only parses on Python 3.12+.
        percent = f"{scores[score_index] * 100:.2f}%"
        chips.append(
            f"<div class='rounded-label-{css_suffix}'>"
            f"{caption} <strong>{percent}</strong></div>"
        )

    if not chips:
        chips.append("<div class='rounded-label-undefined'>Undefined</div>")
    labels_html = "".join(chips)
    st.markdown(f"<div class='label-container'>{labels_html}</div>", unsafe_allow_html=True)
|
156 |
+
|
157 |
+
def show_result(outputs):
    """Render the classification results: summary, per-review chips, table, CSV download.

    Args:
        outputs: list of result dicts, each with the keys ``review``,
            ``predicted_score`` (three scores, indexable) and
            ``predicted_labels``. The dicts are not modified.
    """
    import html  # local import: only needed here, for escaping review text

    st.title("Result")
    total_col, model_col = st.columns(2)
    with total_col:
        st.write("π Total reviews : ", len(outputs))
    with model_col:
        st.write("π₯οΈ Model used : ", selected_model)

    for i, output in enumerate(outputs):
        st.markdown(
            f"<p style='color:grey; margin: 0; padding: 0;'>Review {i+1}:</p>",
            unsafe_allow_html=True)
        # The review is free text rendered through unsafe_allow_html, so it is
        # escaped to keep "<", ">" and "&" from being interpreted as markup.
        review_html = html.escape(output['review'])
        st.markdown(
            f"<p style='font-size:20px; margin-bottom: 5px;'><strong>{review_html}</strong></p>",
            unsafe_allow_html=True)
        chips_label(output)
    st.balloons()

    # Build display rows on shallow copies so the caller's dicts keep their
    # numeric scores instead of being overwritten with formatted strings.
    score_names = ("Product", "Customer Service", "Shipping/Delivery")
    rows = []
    for output in outputs:
        row = dict(output)
        row['predicted_score'] = [
            f"{name} {score * 100:.2f}%"
            for name, score in zip(score_names, output['predicted_score'])
        ]
        rows.append(row)

    result_df = pd.DataFrame(rows)
    st.write(result_df)

    # A plain st.button here could never trigger the export: clicking it
    # reruns the script with the "Classify" button unset, so this function is
    # not reached again. st.download_button sends the CSV to the browser.
    st.download_button(
        "Download as csv",
        data=result_df.to_csv(index=False),
        file_name="result.csv",
        mime="text/csv",
    )
|
197 |
+
|
198 |
+
|
199 |
+
# Act only on the run triggered by the "Classify" button.
if button_submit:
    if pipeline.ready_status:
        submit()
    else:
        st.error("Models are not ready yet, please wait a moment")
|
203 |
+
|
204 |
+
|
Product Reviews Ecommerce Multilabel Dataset.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
model_services/__pycache__/model.cpython-312.pyc
ADDED
Binary file (4.05 kB). View file
|
|
model_services/__pycache__/pipeline.cpython-310.pyc
ADDED
Binary file (1.79 kB). View file
|
|
model_services/__pycache__/pipeline.cpython-312.pyc
ADDED
Binary file (3.87 kB). View file
|
|
model_services/model.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch.nn as nn
|
2 |
+
import torch
|
3 |
+
from huggingface_hub import PyTorchModelHubMixin
|
4 |
+
|
5 |
+
class IndoBertEcommerceReview(nn.Module, PyTorchModelHubMixin):
    """Wrap a model exposing ``.logits`` and map those logits through a sigmoid."""

    def __init__(self, bert):
        """Store the wrapped model; ``bert`` must return an object with ``.logits``."""
        super().__init__()
        self.bert = bert
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return the element-wise sigmoid of the wrapped model's logits."""
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.sigmoid(bert_output.logits)
|
16 |
+
|
17 |
+
class IndoBertCNNEcommerceReview(nn.Module, PyTorchModelHubMixin):
    """BERT encoder followed by Conv1d, average pooling and a 3-way sigmoid head."""

    def __init__(self, bert):
        """Build the head on top of ``bert``; reads ``bert.config.hidden_size``."""
        super().__init__()
        self.bert = bert
        self.conv1 = nn.Conv1d(bert.config.hidden_size, 512, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.linear = nn.Linear(512, 3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return sigmoid outputs of the linear head for the given batch."""
        hidden = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # Conv1d wants channels on dim 1: [batch_size, hidden_size, seq_len].
        channels_first = hidden.permute(0, 2, 1)
        # Pool the sequence axis down to one value per channel, then drop it.
        features = self.pool(self.conv1(channels_first)).squeeze(-1)
        return self.sigmoid(self.linear(features))
|
38 |
+
|
39 |
+
class IndoBertLSTMEcommerceReview(nn.Module, PyTorchModelHubMixin):
    """BERT encoder followed by an LSTM and a 3-way sigmoid head."""

    def __init__(self, bert):
        """Build the head on top of ``bert``; reads ``bert.config.hidden_size``."""
        super().__init__()
        self.bert = bert
        self.lstm = nn.LSTM(bert.config.hidden_size, 128)
        self.linear = nn.Linear(128, 3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return sigmoid outputs of the linear head for the given batch."""
        hidden = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # NOTE(review): the LSTM is built without batch_first=True, so it reads
        # dim 0 of its input as the time axis. `hidden` is presumably
        # (batch, seq, hidden) — TODO confirm. If so, the slice below takes the
        # output at the last token position rather than a state carried across
        # the sequence. Deliberately left as is: the published weights were
        # presumably trained with this layout.
        sequence_out, _ = self.lstm(hidden)
        last_step = sequence_out[:, -1, :]
        return self.sigmoid(self.linear(last_step))
|
55 |
+
|
model_services/pipeline.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
from transformers import BertModel, AutoTokenizer, BertForSequenceClassification
|
4 |
+
from model_services.model import IndoBertEcommerceReview, IndoBertCNNEcommerceReview, IndoBertLSTMEcommerceReview
|
5 |
+
import streamlit as st
|
6 |
+
|
7 |
+
|
8 |
+
# Module-level state read by the UI and by predict(); filled in by the
# loading block below.
ready_status = False
bert = None
tokenizer = None
indobert_model = None
indobertcnn_model = None
indobertlstm_model = None

_BASE_CHECKPOINT = "indobenchmark/indobert-base-p1"


with st.status("Loading models...", expanded=True, state='running') as status:
    # Load the base models and tokenizer
    bertSequence = BertForSequenceClassification.from_pretrained(
        _BASE_CHECKPOINT,
        num_labels=3,
        problem_type="multi_label_classification",
    )
    bert = BertModel.from_pretrained(_BASE_CHECKPOINT)
    # The LSTM wrapper gets its own encoder instance. Loading a wrapper
    # checkpoint presumably also loads its `bert.*` weights; with one shared
    # BertModel the checkpoint loaded last would then overwrite the encoder
    # used by the CNN wrapper.
    lstm_bert = BertModel.from_pretrained(_BASE_CHECKPOINT)
    tokenizer = AutoTokenizer.from_pretrained("fahrendrakhoirul/indobert-finetuned-ecommerce-reviews")

    # Load custom models
    indobert_model = IndoBertEcommerceReview.from_pretrained(
        "fahrendrakhoirul/indobert-finetuned-ecommerce-reviews", bert=bertSequence)
    st.write("IndoBERT model loaded")
    indobertcnn_model = IndoBertCNNEcommerceReview.from_pretrained(
        "fahrendrakhoirul/indobert-cnn-finetuned-ecommerce-reviews", bert=bert)
    st.write("IndoBERT-CNN model loaded")
    indobertlstm_model = IndoBertLSTMEcommerceReview.from_pretrained(
        "fahrendrakhoirul/indobert-lstm-finetuned-ecommerce-reviews", bert=lstm_bert)
    st.write("IndoBERT-LSTM model loaded")

    # Ready only when every model object exists; an explicit identity check
    # replaces the mixed truthiness / `!= None` chain.
    ready_status = all(
        model is not None
        for model in (indobert_model, indobertcnn_model, indobertlstm_model)
    )
    if ready_status:
        status.update(label="Models loaded successfully", expanded=False)
        status.success("Models loaded successfully", icon="✅")
    else:
        status.error("Failed to load models")
|
40 |
+
|
41 |
+
|
42 |
+
# def init():
|
43 |
+
# global ready_status, bert, tokenizer, indobert_model, indobertcnn_model, indobertlstm_model
|
44 |
+
# try:
|
45 |
+
# # Load the base model and tokenizer
|
46 |
+
# bert = BertModel.from_pretrained("indobenchmark/indobert-base-p1")
|
47 |
+
# tokenizer = AutoTokenizer.from_pretrained("fahrendrakhoirul/indobert-finetuned-ecommerce-reviews")
|
48 |
+
|
49 |
+
# # Load custom models
|
50 |
+
# indobert_model = IndoBertEcommerceReview.from_pretrained("fahrendrakhoirul/indobert-finetuned-ecommerce-reviews", bert=bert)
|
51 |
+
# print("IndoBERT model loaded")
|
52 |
+
# indobertcnn_model = IndoBertCNNEcommerceReview.from_pretrained("fahrendrakhoirul/indobert-cnn-finetuned-ecommerce-reviews", bert=bert)
|
53 |
+
# print("IndoBERT-CNN model loaded")
|
54 |
+
# indobertlstm_model = IndoBertLSTMEcommerceReview.from_pretrained("fahrendrakhoirul/indobert-lstm-finetuned-ecommerce-reviews", bert=bert)
|
55 |
+
# print("IndoBERT-LSTM model loaded")
|
56 |
+
# ready_status = True
|
57 |
+
# return True
|
58 |
+
# except Exception as e:
|
59 |
+
# print(f"Failed to initialize models: {e}")
|
60 |
+
# ready_status = False
|
61 |
+
# return False
|
62 |
+
|
63 |
+
def predict(text: str, model_name: str):
    """Return the per-label probabilities of one review.

    Args:
        text: the review to classify.
        model_name: one of "IndoBERT", "IndoBERT-CNN (Best)" or
            "IndoBERT-LSTM".

    Returns:
        The model output for the first (only) item of the batch, as a NumPy
        array.

    Raises:
        ValueError: if ``model_name`` is not a known name or that model is
            not loaded.
    """
    available_models = {
        "IndoBERT": indobert_model,
        "IndoBERT-CNN (Best)": indobertcnn_model,
        "IndoBERT-LSTM": indobertlstm_model,
    }
    model = available_models.get(model_name)
    if model is None:
        # Fail with a clear message instead of trying to call None.
        raise ValueError(f"Unknown or unloaded model: {model_name!r}")

    # Truncate to the encoder's position limit so a very long review cannot
    # overflow the position embeddings.
    token_result = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=bert.config.max_position_embeddings,
    )
    with torch.no_grad():
        # The wrapper models already apply a sigmoid in forward(), so the
        # output is used directly. A second sigmoid would squash every score
        # into roughly (0.5, 0.73) and distort any threshold applied later.
        probabilities = model(
            input_ids=token_result['input_ids'],
            attention_mask=token_result['attention_mask'],
        )
    return probabilities.detach().cpu().numpy()[0]
|
78 |
+
|
79 |
+
def get_label(preds, threshold=0.6):
    """Return the names of the labels whose score exceeds ``threshold``.

    Args:
        preds: scores in the order Product, Customer Service,
            Shipping/Delivery.
        threshold: a label is kept only when its score is strictly greater
            than this value. Defaults to 0.6, the value used previously.

    Returns:
        A list of label names, in the fixed label order.
    """
    labels = ("Product", "Customer Service", "Shipping/Delivery")
    return [label for label, score in zip(labels, preds) if score > threshold]
|
83 |
+
|
84 |
+
def get_result(reviews: list[str], model_name: str):
    """Classify every review and return one result dict per review.

    Each dict holds the original ``review`` text, the ``predicted_score``
    returned by ``predict`` and the ``predicted_labels`` that ``get_label``
    derives from those scores.
    """
    def classify(review):
        scores = predict(review, model_name)
        return {
            "review": review,
            "predicted_score": scores,
            "predicted_labels": get_label(scores),
        }

    return [classify(review) for review in reviews]
|
pages/Dataset.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# streamlit
|
2 |
+
import streamlit as st
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
# import matplotlib.pyplot as plt
|
6 |
+
# from wordcloud import WordCloud
|
7 |
+
|
8 |
+
st.set_page_config(page_title="Dataset", page_icon="π ")
|
9 |
+
|
10 |
+
# help me create sidebar
|
11 |
+
st.sidebar.markdown("## π About"
|
12 |
+
"\nThis is a simple web app to classify the aspect of reviews from an e-commerce dataset."
|
13 |
+
"\n\nThe dataset used is a multilabel dataset, which means a review can have multiple labels."
|
14 |
+
"\n\nThe labels are:"
|
15 |
+
"\n- π¦ **Product**"
|
16 |
+
"\n- π©βπΌ **Customer Service**"
|
17 |
+
"\n- π **Shipping/Delivery**")
|
18 |
+
|
19 |
+
# add create by Fahrendra Khoirul Ihtada and Rizha Alfianita using streamlit and Hugging Face's IndoBERT model
|
20 |
+
st.sidebar.markdown("## π¨βπ» Created by"
|
21 |
+
"\n[Fahrendra Khoirul Ihtada](https://www.linkedin.com/in/fahrendra-khoirul-ihtada/) "
|
22 |
+
"and [Rizha Alfianita](https://www.linkedin.com/in/rizha-alfianita/)"
|
23 |
+
"\n Using Streamlit and Hugging Face's [IndoBERT](https://huggingface.co/indobenchmark/indobert-base-p1) model.")
|
24 |
+
|
25 |
+
# add my hugging face profile
|
26 |
+
st.sidebar.markdown("## π€ Hugging Face"
|
27 |
+
"\n- [Fahrendra Khoirul Ihtada](https://huggingface.co/fahrendrakhoirul)")
|
28 |
+
|
29 |
+
# Title and Caption
|
30 |
+
st.title("πDataset Overview")
|
31 |
+
|
32 |
+
# Descriptive text
|
33 |
+
st.write("""
|
34 |
+
This dataset is full of customer reviews that give us a great idea of what it's like to buy things online. The reviews talk about everything from how good the product is to how fast it got here and how helpful the seller was""")
|
35 |
+
|
36 |
+
# Dataset link
|
37 |
+
st.markdown("[Access the Dataset](https://huggingface.co/datasets/fahrendrakhoirul/ecommerce-reviews-multilabel-dataset)")
|
38 |
+
|
39 |
+
|
40 |
+
df = pd.read_json("Product Reviews Ecommerce Multilabel Dataset.json", lines=True)
|
41 |
+
st.write(df)
|
42 |
+
|
43 |
+
# # Combine all reviews into a single string
|
44 |
+
# all_reviews = " ".join(df['review']) Β
|
45 |
+
|
46 |
+
|
47 |
+
# # Generate the word cloud
|
48 |
+
# wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_reviews)
|
49 |
+
|
50 |
+
# # Display the word cloud
|
51 |
+
# plt.figure(figsize=(10, 6))
|
52 |
+
# plt.imshow(wordcloud, interpolation='bilinear')
|
53 |
+
# plt.axis("off")
|
54 |
+
# plt.show()
|
55 |
+
|
56 |
+
# st.pyplot(plt.gcf())
|
57 |
+
|
58 |
+
# # Create a bar chart for sentiment analysis (example)
|
59 |
+
# # Assuming you have a column 'sentiment' in your dataframe
|
60 |
+
# # st.write("**Distribusi Sentimen**")
|
61 |
+
# # sentiment_counts = df['sentiment'].value_counts()
|
62 |
+
# # st.bar_chart(sentiment_counts)
|
requirements.txt
ADDED
File without changes
|
tes.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch.nn as nn
|
2 |
+
import torch
|
3 |
+
import numpy as np
|
4 |
+
from huggingface_hub import PyTorchModelHubMixin
|
5 |
+
from transformers import BertModel, AutoTokenizer
|
6 |
+
|
7 |
+
class IndoBertLSTMEcommerceReview(nn.Module, PyTorchModelHubMixin):
    """BERT encoder followed by an LSTM and a 3-way sigmoid head."""

    def __init__(self, bert):
        """Build the head on top of ``bert``; reads ``bert.config.hidden_size``."""
        super().__init__()
        self.bert = bert
        self.lstm = nn.LSTM(bert.config.hidden_size, 128)
        self.linear = nn.Linear(128, 3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return sigmoid outputs of the linear head for the given batch."""
        hidden = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # NOTE(review): the LSTM has no batch_first=True, so dim 0 of `hidden`
        # is read as the time axis; `hidden` is presumably
        # (batch, seq, hidden) — TODO confirm before changing anything here.
        sequence_out, _ = self.lstm(hidden)
        last_step = sequence_out[:, -1, :]
        return self.sigmoid(self.linear(last_step))
|
24 |
+
|
25 |
+
# Smoke test: load the LSTM variant on CPU and print its output for one string.
bert = BertModel.from_pretrained("indobenchmark/indobert-base-p1")
tokenizer = AutoTokenizer.from_pretrained("fahrendrakhoirul/indobert-finetuned-ecommerce-reviews")

indobertlstm_model = IndoBertLSTMEcommerceReview.from_pretrained(
    "fahrendrakhoirul/indobert-lstm-finetuned-ecommerce-reviews", bert=bert).to('cpu')

# Run the model
res_token = tokenizer("hahahah", return_tensors="pt").to('cpu')
input_ids = res_token['input_ids']
attention_mask = res_token['attention_mask']

print(res_token)
with torch.no_grad():
    # forward() already returns sigmoid outputs, so no further sigmoid is
    # applied here.
    probabilities = indobertlstm_model(input_ids=input_ids, attention_mask=attention_mask)
preds = probabilities.detach().cpu().numpy()[0]

print(preds)
|