Spaces:

fahrendrakhoirul
/

EcomReviewClassification

Sleeping

App Files Files Community

fahrendrakhoirul committed on Jul 31, 2024

Commit

c7f5132

0 Parent(s):

init

Browse files

Files changed (10) hide show

Home.py +204 -0
Product Reviews Ecommerce Multilabel Dataset.json +0 -0
model_services/__pycache__/model.cpython-312.pyc +0 -0
model_services/__pycache__/pipeline.cpython-310.pyc +0 -0
model_services/__pycache__/pipeline.cpython-312.pyc +0 -0
model_services/model.py +55 -0
model_services/pipeline.py +95 -0
pages/Dataset.py +62 -0
requirements.txt +0 -0
tes.py +40 -0

Home.py ADDED Viewed

	@@ -0,0 +1,204 @@

+# streamlit
+import streamlit as st
+import pandas as pd
+st.set_page_config(page_title="Home", page_icon="🏠", layout="centered")
+st.markdown("# 🛍️ Aspect-Based Multilabel Classification of Ecommerce Reviews")
+st.write("Ever wondered what people think about the products, customer service, and shipping of your favorite online store? Try this out!")
+# Sidebar: describe the app and list the three aspect labels.
+st.sidebar.markdown("## 📚 About"
+                    "\nThis is a simple web app to classify the aspect of reviews from an e-commerce dataset."
+                    "\n\nThe dataset used is a multilabel dataset, which means a review can have multiple labels."
+                    "\n\nThe labels are:"
+                    "\n- 📦 **Product**"
+                    "\n- 👩‍💼 **Customer Service**"
+                    "\n- 🚚 **Shipping/Delivery**")
+# Sidebar: credit the authors and name the tools used (Streamlit, IndoBERT).
+st.sidebar.markdown("## 👨‍💻 Created by"
+                    "\n[Fahrendra Khoirul Ihtada](https://www.linkedin.com/in/fahrendra-khoirul-ihtada/) "
+                    "and [Rizha Alfianita](https://www.linkedin.com/in/rizha-alfianita/)"
+                    "\n Using Streamlit and Hugging Face's [IndoBERT](https://huggingface.co/indobenchmark/indobert-base-p1) model.")
+# Sidebar: link to the author's Hugging Face profile.
+st.sidebar.markdown("## 🤖 Hugging Face"
+                    "\n- [Fahrendra Khoirul Ihtada](https://huggingface.co/fahrendrakhoirul)")
+# NOTE(review): importing this module runs its top-level code, which loads the models inside an st.status block; it is presumably imported here, after st.set_page_config and the page header, so that the loading widget renders below them - confirm.
+import model_services.pipeline as pipeline
+container_1 = st.container(border=True)
+# Split the container into two columns (width ratio 2:1) holding one dropdown each.
+row1_1, row1_2 = container_1.columns((2, 1))
+with row1_1:
+    df = pd.read_json("Product Reviews Ecommerce Multilabel Dataset.json", lines=True)
+    selected_review = st.selectbox(
+        "You can pick a review from dataset",
+        df["review"].values,
+    )
+with row1_2:
+    selected_model = st.selectbox(
+        "Choose the model",
+        ("IndoBERT", "IndoBERT-CNN (Best)", "IndoBERT-LSTM"),
+    )
+# Text area pre-filled with the review selected above; the text can be edited or replaced.
+input_review = container_1.text_area("Or you can input multiple review with separated line", selected_review, height=200)
+# Button that triggers classification of the text area content (handled at the bottom of this script).
+button_submit = container_1.button("Classify")
+def show_label_desc():
+    st.divider()
+    st.write("Let's see what is the meaning of each labels:")
+    st.write("- 📦**Product**         : related Customer satisfaction with the quality, performance, and conformity of the product to the description given")
+    st.write("- 👩‍💼**Customer Service**  : Interaction between customers and sellers, friendliness and speed of response from sellers, and handling complaints.")
+    st.write("- 🚚**Shipping/Delivery** : related to shipping speed, condition of goods when received, and timeliness of shipping")
+def submit():
+    # Create UI for Result
+    st.success("Done! 👌")
+    outputs = do_calculation(input_review)
+    # input_review = ""
+    show_result(outputs)
+    show_label_desc()
+def do_calculation(texts):
+    # split text by newline
+    reviews = texts.split("\n")
+    # remove empty string
+    reviews = list(filter(None, reviews))
+    # do the prediction
+    outputs = pipeline.get_result(reviews, selected_model)
+    return outputs
+st.markdown("""
+    <style>
+    .label-container {
+        display: flex;
+        flex-wrap: wrap;
+        gap: 5px;
+    }
+    .rounded-label-product {
+        background-color: #FFD700;
+        color: black;
+        border-radius: 20px;
+        padding: 5px 10px;
+        font-size: 14px;
+        margin-bottom: 20px;
+    }
+    .rounded-label-customer-service {
+        background-color: #FFA07A;
+        color: black;
+        border-radius: 20px;
+        padding: 5px 10px;
+        font-size: 14px;
+        margin-bottom: 20px;
+    }
+    .rounded-label-shipping-delivery {
+        background-color: #20B2AA;
+        color: black;
+        border-radius: 20px;
+        padding: 5px 10px;
+        font-size: 14px;
+        margin-bottom: 20px;
+    }
+    .rounded-label-undefined {
+        background-color: #DCDCDC;
+        color: black;
+        border-radius: 20px;
+        padding: 5px 10px;
+        font-size: 14px;
+        margin-bottom: 20px;
+    }
+    </style>
+    """, unsafe_allow_html=True)  # Injects the CSS classes that chips_label() references for the label container and the coloured label chips.
+def chips_label(output):
+    asd = []
+    for label in output["predicted_labels"]:
+        if label == "Product":
+            score = f"{output["predicted_score"][0] * 100:.2f}%"
+            score = f"<strong>{score}</strong>"
+            asd.append(f"<div class='rounded-label-product'>📦Product {score}</div>")
+        elif label == "Customer Service":
+            score = f"{output["predicted_score"][1] * 100:.2f}%"
+            score = f"<strong>{score}</strong>"
+            asd.append(f"<div class='rounded-label-customer-service'>👩‍💼Customer Service {score}</div>")
+        elif label == "Shipping/Delivery":
+            score = f"{output["predicted_score"][2] * 100:.2f}%"
+            score = f"<strong>{score}</strong>"
+            asd.append(f"<div class='rounded-label-shipping-delivery'>🚚Shipping/Delivery {score}</div>")
+    # for label, score in zip(output["predicted_labels"], output["predicted_score"]):
+    #     score = f"{score * 100:.2f}%"
+    #     score = f"<strong>{score}</strong>"
+    #     if label == "Product":
+    #         asd.append(f"<div class='rounded-label-product'>📦Product {score}</div>")
+    #     elif label == "Customer Service":
+    #         asd.append(f"<div class='rounded-label-customer-service'>👩‍💼Customer Service {score}</div>")
+    #     elif label == "Shipping/Delivery":
+    #         asd.append(f"<div class='rounded-label-shipping-delivery'>🚚Shipping/Delivery {score}</div>")
+    if asd == []:
+            asd.append("<div class='rounded-label-undefined'>Undefined</div>")
+    labels_html = "".join(asd)
+    st.markdown(f"<div class='label-container'>{labels_html}</div>", unsafe_allow_html=True)
+def show_result(outputs):
+    st.title("Result")
+    # create 2 column
+    col1, col2 = st.columns(2)
+    with col1:
+        st.write("📑 Total reviews   : ", len(outputs))
+    with col2:
+        st.write("🖥️ Model used      : ", selected_model)
+    for i, output in enumerate(outputs):
+        st.markdown(
+        f"<p style='color:grey; margin: 0; padding: 0;'>Review {i+1}:</p>",
+        unsafe_allow_html=True)
+        st.markdown(f"<p style='font-size:20px; margin-bottom: 5px;'><strong>{output['review']}</strong></p>", unsafe_allow_html=True)
+        chips_label(output)
+    st.balloons()
+    # change predicted_labels to dict with key is the label
+    new_outputs = []
+    for output in outputs:
+        temp = output
+        temp['predicted_score'] = [
+            f"Product {output['predicted_score'][0] * 100:.2f}%",
+            f"Customer Service {output['predicted_score'][1] * 100:.2f}%",
+            f"Shipping/Delivery {output['predicted_score'][2] * 100:.2f}%"
+        ]
+        new_outputs.append(temp)
+    df = pd.DataFrame(new_outputs)
+    st.write(df)
+    # # Display the scores in a more beautiful way
+    # for output in new_outputs:
+    #     st.markdown("### Predicted Scores")
+    #     for label, score in output['predicted_score'].items():
+    #         st.markdown(f"<div style='padding: 5px; border-radius: 5px; background-color: #f0f0f0; margin-bottom: 5px;'>"
+    #                     f"<strong>{label}:</strong> <span style='color: #007BFF;'>{score}</span></div>",
+    #                     unsafe_allow_html=True)
+    button_download = st.button("Download as csv")
+    if button_download:
+        df.to_csv("result.csv", index=False)
+        st.success("Result has been downloaded as csv")
+if button_submit and pipeline.ready_status:
+    submit()
+elif button_submit and not pipeline.ready_status:
+    st.error("Models are not ready yet, please wait a moment")

Product Reviews Ecommerce Multilabel Dataset.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model_services/__pycache__/model.cpython-312.pyc ADDED Viewed

Binary file (4.05 kB). View file

model_services/__pycache__/pipeline.cpython-310.pyc ADDED Viewed

Binary file (1.79 kB). View file

model_services/__pycache__/pipeline.cpython-312.pyc ADDED Viewed

Binary file (3.87 kB). View file

model_services/model.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import torch.nn as nn
+import torch
+from huggingface_hub import PyTorchModelHubMixin
+class IndoBertEcommerceReview(nn.Module, PyTorchModelHubMixin):
+    def __init__(self, bert):
+        super().__init__()
+        self.bert  = bert
+        self.sigmoid = nn.Sigmoid()
+    def forward(self, input_ids, attention_mask):
+        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        logits = outputs.logits
+        probabilities = self.sigmoid(logits)
+        return probabilities
+class IndoBertCNNEcommerceReview(nn.Module, PyTorchModelHubMixin):
+    def __init__(self, bert):
+        super().__init__()
+        self.bert = bert
+        self.conv1 = nn.Conv1d(in_channels=bert.config.hidden_size, out_channels=512, kernel_size=3, padding=1)
+        self.pool = nn.AdaptiveAvgPool1d(1)
+        self.linear = nn.Linear(512, 3)
+        self.sigmoid = nn.Sigmoid()
+    def forward(self, input_ids, attention_mask):
+        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        last_hidden_state = outputs.last_hidden_state
+        # Permute to [batch_size, hidden_size, seq_len]
+        last_hidden_state = last_hidden_state.permute(0, 2, 1)
+        conv1_output = self.conv1(last_hidden_state)
+        pooled_output = self.pool(conv1_output).squeeze(-1)
+        logits = self.linear(pooled_output)
+        probabilities = self.sigmoid(logits)
+        return probabilities
+class IndoBertLSTMEcommerceReview(nn.Module, PyTorchModelHubMixin):
+    def __init__(self, bert):
+      super().__init__()
+      self.bert = bert
+      self.lstm = nn.LSTM(bert.config.hidden_size, 128)
+      self.linear = nn.Linear(128, 3)
+      self.sigmoid = nn.Sigmoid()
+    def forward(self, input_ids, attention_mask):
+      outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+      last_hidden_state = outputs.last_hidden_state
+      lstm_out, _ = self.lstm(last_hidden_state)
+      pooled = lstm_out[:, -1, :]
+      logits = self.linear(pooled)
+      probabilities = self.sigmoid(logits)
+      return probabilities

model_services/pipeline.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import torch
+import numpy as np
+from transformers import BertModel, AutoTokenizer, BertForSequenceClassification
+from model_services.model import IndoBertEcommerceReview, IndoBertCNNEcommerceReview, IndoBertLSTMEcommerceReview
+import streamlit as st
+ready_status = False
+bert = None
+tokenizer = None
+indobert_model = None
+indobertcnn_model = None
+indobertlstm_model = None
+with st.status("Loading models...", expanded=True, state='running') as status:
+    # Load the base model and tokenizer
+    bertSequence = BertForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p1",
+                                                            num_labels=3,
+                                                           problem_type="multi_label_classification")
+    bert = BertModel.from_pretrained("indobenchmark/indobert-base-p1")
+    tokenizer = AutoTokenizer.from_pretrained("fahrendrakhoirul/indobert-finetuned-ecommerce-reviews")
+    # Load custom models
+    indobert_model = IndoBertEcommerceReview.from_pretrained("fahrendrakhoirul/indobert-finetuned-ecommerce-reviews", bert=bertSequence)
+    st.write("IndoBERT model loaded")
+    indobertcnn_model = IndoBertCNNEcommerceReview.from_pretrained("fahrendrakhoirul/indobert-cnn-finetuned-ecommerce-reviews", bert=bert)
+    st.write("IndoBERT-CNN model loaded")
+    indobertlstm_model = IndoBertLSTMEcommerceReview.from_pretrained("fahrendrakhoirul/indobert-lstm-finetuned-ecommerce-reviews", bert=bert)
+    st.write("IndoBERT-LSTM model loaded")
+    # Update status to indicate models are ready
+    if indobert_model and indobertcnn_model and indobertlstm_model != None:
+        ready_status = True
+    if ready_status:
+        status.update(label="Models loaded successfully", expanded=False)
+        status.success("Models loaded successfully", icon="✅")
+    else:
+        status.error("Failed to load models")
+# def init():
+#     global ready_status, bert, tokenizer, indobert_model, indobertcnn_model, indobertlstm_model
+#     try:
+#         # Load the base model and tokenizer
+#         bert = BertModel.from_pretrained("indobenchmark/indobert-base-p1")
+#         tokenizer = AutoTokenizer.from_pretrained("fahrendrakhoirul/indobert-finetuned-ecommerce-reviews")
+#         # Load custom models
+#         indobert_model = IndoBertEcommerceReview.from_pretrained("fahrendrakhoirul/indobert-finetuned-ecommerce-reviews", bert=bert)
+#         print("IndoBERT model loaded")
+#         indobertcnn_model = IndoBertCNNEcommerceReview.from_pretrained("fahrendrakhoirul/indobert-cnn-finetuned-ecommerce-reviews", bert=bert)
+#         print("IndoBERT-CNN model loaded")
+#         indobertlstm_model = IndoBertLSTMEcommerceReview.from_pretrained("fahrendrakhoirul/indobert-lstm-finetuned-ecommerce-reviews", bert=bert)
+#         print("IndoBERT-LSTM model loaded")
+#         ready_status = True
+#         return True
+#     except Exception as e:
+#         print(f"Failed to initialize models: {e}")
+#         ready_status = False
+#         return False
+def predict(text: str, model_name: str):
+    token_result = tokenizer(text, return_tensors="pt")
+    model = None
+    if model_name == "IndoBERT":
+        model = indobert_model
+    if model_name == "IndoBERT-CNN (Best)":
+        model = indobertcnn_model
+    if model_name == "IndoBERT-LSTM":
+        model = indobertlstm_model
+    input_ids = token_result['input_ids']
+    attention_mask = token_result['attention_mask']
+    with torch.no_grad():
+        logits = model(input_ids=input_ids, attention_mask=attention_mask)
+        preds = torch.sigmoid(logits).detach().cpu().numpy()[0]
+    return preds
+def get_label(preds):
+    labels = ["Product", "Customer Service", "Shipping/Delivery"]
+    result = [label for i, label in enumerate(labels) if preds[i] > 0.6]
+    return result
+def get_result(reviews: list[str], model_name: str):
+    outputs = []
+    for review in reviews:
+        preds = predict(review, model_name)
+        labels = get_label(preds)
+        output = {
+            "review": review,
+            "predicted_score": preds,
+            "predicted_labels": labels
+        }
+        outputs.append(output)
+    return outputs

pages/Dataset.py ADDED Viewed

	@@ -0,0 +1,62 @@

+# streamlit
+import streamlit as st
+import pandas as pd
+import numpy as np
+# import matplotlib.pyplot as plt
+# from wordcloud import WordCloud
+st.set_page_config(page_title="Dataset", page_icon="🏠")
+# Sidebar: describe the app and list the three aspect labels.
+st.sidebar.markdown("## 📚 About"
+                    "\nThis is a simple web app to classify the aspect of reviews from an e-commerce dataset."
+                    "\n\nThe dataset used is a multilabel dataset, which means a review can have multiple labels."
+                    "\n\nThe labels are:"
+                    "\n- 📦 **Product**"
+                    "\n- 👩‍💼 **Customer Service**"
+                    "\n- 🚚 **Shipping/Delivery**")
+# Sidebar: credit the authors and name the tools used (Streamlit, IndoBERT).
+st.sidebar.markdown("## 👨‍💻 Created by"
+                    "\n[Fahrendra Khoirul Ihtada](https://www.linkedin.com/in/fahrendra-khoirul-ihtada/) "
+                    "and [Rizha Alfianita](https://www.linkedin.com/in/rizha-alfianita/)"
+                    "\n Using Streamlit and Hugging Face's [IndoBERT](https://huggingface.co/indobenchmark/indobert-base-p1) model.")
+# Sidebar: link to the author's Hugging Face profile.
+st.sidebar.markdown("## 🤖 Hugging Face"
+                    "\n- [Fahrendra Khoirul Ihtada](https://huggingface.co/fahrendrakhoirul)")
+# Page title.
+st.title("📊Dataset Overview")
+# Short description of what the reviews in the dataset talk about.
+st.write("""
+This dataset is full of customer reviews that give us a great idea of what it's like to buy things online. The reviews talk about everything from how good the product is to how fast it got here and how helpful the seller was""")
+# Link to the dataset on the Hugging Face Hub.
+st.markdown("[Access the Dataset](https://huggingface.co/datasets/fahrendrakhoirul/ecommerce-reviews-multilabel-dataset)")
+df = pd.read_json("Product Reviews Ecommerce Multilabel Dataset.json", lines=True)  # lines=True: the file is parsed as one JSON record per line
+st.write(df)  # display the loaded DataFrame on the page
+# # Combine all reviews into a single string
+# all_reviews = " ".join(df['review'])
+# # Generate the word cloud
+# wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_reviews)
+# # Display the word cloud
+# plt.figure(figsize=(10, 6))
+# plt.imshow(wordcloud, interpolation='bilinear')
+# plt.axis("off")
+# plt.show()
+# st.pyplot(plt.gcf())
+# # Create a bar chart for sentiment analysis (example)
+# # Assuming you have a column 'sentiment' in your dataframe
+# # st.write("**Distribusi Sentimen**")
+# # sentiment_counts = df['sentiment'].value_counts()
+# # st.bar_chart(sentiment_counts)

requirements.txt ADDED Viewed

File without changes

tes.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import torch.nn as nn
+import torch
+import numpy as np
+from huggingface_hub import PyTorchModelHubMixin
+from transformers import BertModel, AutoTokenizer
+class IndoBertLSTMEcommerceReview(nn.Module, PyTorchModelHubMixin):
+    def __init__(self, bert):
+      super().__init__()
+      self.bert = bert
+      self.lstm = nn.LSTM(bert.config.hidden_size, 128)
+      self.linear = nn.Linear(128, 3)
+      self.sigmoid = nn.Sigmoid()
+    def forward(self, input_ids, attention_mask):
+      outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+      # print(outputs.keys())
+      last_hidden_state = outputs.last_hidden_state
+      lstm_out, _ = self.lstm(last_hidden_state)
+      pooled = lstm_out[:, -1, :]
+      logits = self.linear(pooled)
+      probabilities = self.sigmoid(logits)
+      return probabilities
+bert = BertModel.from_pretrained("indobenchmark/indobert-base-p1")
+tokenizer = AutoTokenizer.from_pretrained("fahrendrakhoirul/indobert-finetuned-ecommerce-reviews")
+indobertlstm_model = IndoBertLSTMEcommerceReview.from_pretrained("fahrendrakhoirul/indobert-lstm-finetuned-ecommerce-reviews", bert=bert).to('cpu')
+# Run the model once on a sample string, on CPU, and print the tokenizer output and the scores.
+res_token = tokenizer("hahahah",  return_tensors="pt").to('cpu')
+input_ids = res_token['input_ids']  # token ids taken from the tokenizer output
+attention_mask = res_token['attention_mask']# attention mask taken from the tokenizer output
+print(res_token)
+with torch.no_grad():
+    logits = indobertlstm_model(input_ids=input_ids, attention_mask=attention_mask)
+    preds = torch.sigmoid(logits).detach().cpu().numpy()[0]  # NOTE(review): forward() above already ends with a sigmoid, so this applies a second one - confirm this is intended
+print(preds)