Spaces:
Sleeping
Sleeping
Uploaded initial files
Browse files- MultinominalModel.py +64 -0
- README.md +12 -12
- amazon_reviews.csv +0 -0
- app.py +60 -0
- requirements.txt +5 -0
MultinominalModel.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from nltk.tokenize import word_tokenize
|
2 |
+
from nltk.stem import WordNetLemmatizer
|
3 |
+
from nltk.corpus import stopwords
|
4 |
+
from sklearn.metrics import accuracy_score
|
5 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
6 |
+
from sklearn.model_selection import train_test_split
|
7 |
+
from imblearn.over_sampling import SMOTE
|
8 |
+
from sklearn.naive_bayes import MultinomialNB
|
9 |
+
import nltk
|
10 |
+
import pandas as pd
|
11 |
+
|
12 |
+
# Download NLTK data (the 'all-corpora' collection) before the stop-word
# list below is loaded from it.
nltk.download('all-corpora')

# Shared text-normalisation helpers used by preprocess().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Raw review dataset; the training code reads the "reviewText" and
# "overall" columns from it.
df = pd.read_csv("amazon_reviews.csv")
|
19 |
+
|
20 |
+
# Preprocess text data
|
21 |
+
def preprocess(review):
    """Normalise a raw review into a space-separated string of lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens
    found in ``stop_words`` are dropped and every remaining token is passed
    through ``lemmatizer.lemmatize``.

    Args:
        review: The review text as a ``str``.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    kept = []
    for word in word_tokenize(review.lower()):
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return " ".join(kept)
|
26 |
+
|
27 |
+
|
28 |
+
# Format csv data into a list of [review, rating] pairs.
# Iterating the two columns directly avoids building a row Series for every
# ``df.loc[i]`` lookup, which is very slow on large frames.
review_ratings = [
    [str(text), int(score)]
    for text, score in zip(df["reviewText"], df["overall"])
]

# Create corpus of preprocessed text
corpus = [preprocess(text) for text, _ in review_ratings]

# Convert to vector representation
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(corpus).toarray()
y = [score for _, score in review_ratings]

# Hold out the test set BEFORE oversampling. SMOTE interpolates between
# existing samples, so resampling first would let synthetic points derived
# from test rows leak into the training data and inflate the accuracy below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Generate synthetic samples for the training split only, as 5 star rating
# reviews are over-represented in the dataset.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Create model and fit on the balanced training data
model = MultinomialNB()
model.fit(X_resampled, y_resampled)

# Evaluate on the untouched (real, non-synthetic) test reviews
y_predict = model.predict(X_test)
print("Accuracy", accuracy_score(y_test, y_predict))
|
60 |
+
|
61 |
+
def predict_rating(review):
    """Predict the rating for a single review text.

    The review is run through ``preprocess``, vectorized with the fitted
    module-level ``vectorizer`` and classified by the fitted ``model``.

    Args:
        review: Raw review text.

    Returns:
        The result of ``model.predict`` for the one-row feature matrix
        (a one-element array holding the predicted rating).
    """
    features = vectorizer.transform([preprocess(review)])
    return model.predict(features.toarray())
|
README.md
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
-
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 4.41.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
---
|
2 |
+
title: AmazonRatingPrediction
|
3 |
+
emoji: 🌍
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: pink
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.41.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
|
12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
amazon_reviews.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from MultinominalModel import predict_rating
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
import requests
|
4 |
+
import gradio as gr
|
5 |
+
|
6 |
+
# Number of review text boxes shown in the UI; also the maximum number of
# scraped reviews that are run through the model.
max_review_count = 5

# Product pages offered as clickable examples in the Gradio interface.
example_urls = [
    "https://www.amazon.co.uk/Trintion-Scratching-Scratcher-Activity-Dangling/dp/B08FT54NRM",
    "https://www.amazon.co.uk/Indoor-Hanging-playing-sleeping-suitable/dp/B0BTVW7G66",
    "https://www.amazon.co.uk/PlayStation-5-Digital-Console-Slim/dp/B0CM9VKQ5N",
    "https://www.amazon.co.uk/Celebrations-Chocolate-Chocolates-Centerpiece-Maltesers/dp/B07L8D6XM8",
    "https://www.amazon.co.uk/HyRich-SIM-Free-Unlocked-Smartphone-Bluetooth-Note-80-Black/dp/B0BG5KBMYK",
    "https://www.amazon.co.uk/Hama-HS-P350-headset-Binaural-Plastic/dp/B07ZR24KQZ",
    "https://www.amazon.co.uk/Skinapeel-Sonic-Facial-Cleanser-Replaceable/dp/B011V6FUG0",
    "https://www.amazon.co.uk/dp/B0BX47X1K9/",
]
|
18 |
+
|
19 |
+
def _clean_review_text(raw_text):
    """Remove Amazon page boilerplate from a scraped review body."""
    text = raw_text.replace("The media could not be loaded.", "").strip()
    # NOTE: str.strip("Read more") treats its argument as a *set of
    # characters* and would eat real letters from both ends of the review
    # (e.g. "more bread" -> "b"), so the trailing label is removed explicitly.
    if text.endswith("Read more"):
        text = text[:-len("Read more")]
    return text.strip()


def scrape_amazon_reviews(url):
    """Scrape reviews from an Amazon product page and predict their ratings.

    Args:
        url: URL of the Amazon product page.

    Returns:
        A list of ``max_review_count + 1`` items: one string per review box
        (review text, predicted rating and actual rating; ``""`` for unused
        boxes) followed by the product image URL, or ``None`` when the page
        has no ``#landingImage`` element.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = { "accept-language": "en-GB,en;q=0.9",
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15"}

    # A timeout keeps a stalled connection from hanging the UI callback.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")

    # Retrieve image from product page (absent on e.g. captcha pages)
    image_tag = soup.select_one('#landingImage')
    image = image_tag.get('src') if image_tag is not None else None

    # Extract review description, rating, and predict a rating from the model
    output_reviews = []
    for review in soup.select("div.review")[:max_review_count]:
        text_tag = review.select_one("span.review-text")
        rating_tag = review.select_one("i.review-rating")
        if text_tag is None or rating_tag is None:
            # Skip review containers that lack the expected markup.
            continue
        review_text = _clean_review_text(text_tag.text)
        rating = rating_tag.text.replace("out of 5 stars", "").strip()
        # predict_rating returns a one-element array; read the value itself
        # instead of slicing its string representation.
        predicted_rating = int(predict_rating(review_text)[0])
        output_reviews.append(
            f"{review_text}\n\nPredicted Rating: {predicted_rating}.0\nActual Rating: {rating}"
        )

    # If there aren't enough reviews, leave the remaining review text boxes empty
    output_reviews.extend([""] * (max_review_count - len(output_reviews)))

    output_reviews.append(image)
    return output_reviews
|
45 |
+
|
46 |
+
# Main gradio app: URL input on the left, review boxes and product image on
# the right.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            url = gr.Textbox(label="Amazon URL")
            button = gr.Button(variant="primary")
            gr.Examples(inputs=url, examples=example_urls)
        with gr.Column():
            reviews = []
            for box_number in range(1, max_review_count + 1):
                reviews.append(gr.Text(label="Review " + str(box_number)))
            image = gr.Image(label="Amazon Product Image", interactive=False)

    # The callback returns one value per review box followed by the image.
    button.click(fn=scrape_amazon_reviews, inputs=url, outputs=[*reviews, image])

demo.launch(share=True)
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
nltk
|
2 |
+
scikit-learn
|
3 |
+
imblearn
|
4 |
+
pandas
|
5 |
+
BeautifulSoup4
|