sweetfelinity committed on
Commit
f321966
·
verified ·
1 Parent(s): ac21ac8

Uploaded initial files

Browse files
Files changed (5) hide show
  1. MultinominalModel.py +64 -0
  2. README.md +12 -12
  3. amazon_reviews.csv +0 -0
  4. app.py +60 -0
  5. requirements.txt +5 -0
MultinominalModel.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nltk.tokenize import word_tokenize
2
+ from nltk.stem import WordNetLemmatizer
3
+ from nltk.corpus import stopwords
4
+ from sklearn.metrics import accuracy_score
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from sklearn.model_selection import train_test_split
7
+ from imblearn.over_sampling import SMOTE
8
+ from sklearn.naive_bayes import MultinomialNB
9
+ import nltk
10
+ import pandas as pd
11
+
12
lemmatizer = WordNetLemmatizer()

# Fetch only the NLTK resources this module uses instead of the whole
# 'all-corpora' collection, which is a very large download on every import.
# NOTE(review): word_tokenize needs the 'punkt' tokenizer model ('punkt_tab'
# on newer NLTK releases), which does not appear to be part of 'all-corpora';
# both ids are requested so either NLTK generation is covered - confirm
# against the pinned NLTK version.
for _resource in ("stopwords", "wordnet", "omw-1.4", "punkt", "punkt_tab"):
    # quiet=True keeps the import non-interactive: an unknown id is reported
    # and skipped rather than prompting for a retry.
    nltk.download(_resource, quiet=True)
stop_words = set(stopwords.words('english'))

# Labelled reviews; the training code below reads the "reviewText" and
# "overall" columns.
df = pd.read_csv("amazon_reviews.csv")
19
+
20
+ # Preprocess text data
21
def preprocess(review):
    """Normalise a raw review into a space-separated string of lemmas.

    The text is lower-cased and tokenised; tokens present in ``stop_words``
    are dropped and every remaining token is lemmatised.
    """
    kept = []
    for word in word_tokenize(review.lower()):
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return " ".join(kept)
26
+
27
+
28
# Format csv data into array of [review, rating]
review_ratings = [
    [str(text), int(stars)]
    for text, stars in zip(df["reviewText"], df["overall"])
]

# Create corpus of preprocessed text and the matching rating labels
corpus = [preprocess(text) for text, _ in review_ratings]
y = [stars for _, stars in review_ratings]

# Hold out the test set BEFORE any resampling. Oversampling first would place
# synthetic points interpolated from test reviews into the training data and
# inflate the reported accuracy.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y, test_size=0.2, random_state=42
)

# Convert to vector representation; the vocabulary and IDF weights are learnt
# from the training text only and then applied unchanged to the test text.
vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(corpus_train).toarray()
X_test = vectorizer.transform(corpus_test).toarray()

# Generate synthetic training samples because the rating classes are
# imbalanced (5 star reviews dominate). Only the training split is resampled.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Create model and fit
model = MultinomialNB()
model.fit(X_resampled, y_resampled)

# Accuracy is measured on untouched, real reviews
y_predict = model.predict(X_test)
print("Accuracy", accuracy_score(y_test, y_predict))
60
+
61
def predict_rating(review):
    """Predict the rating for a single raw review string.

    The review goes through ``preprocess`` and the fitted ``vectorizer``
    before being passed to ``model``; the result of ``model.predict`` for that
    one-row input is returned as-is.
    """
    features = vectorizer.transform([preprocess(review)])
    return model.predict(features.toarray())
README.md CHANGED
@@ -1,12 +1,12 @@
1
- ---
2
- title: AmazonRatingPredictor
3
- emoji:
4
- colorFrom: blue
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 4.41.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: AmazonRatingPrediction
3
+ emoji: 🌍
4
+ colorFrom: yellow
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 4.41.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
amazon_reviews.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from MultinominalModel import predict_rating
2
+ from bs4 import BeautifulSoup
3
+ import requests
4
+ import gradio as gr
5
+
6
+ max_review_count = 5
7
+
8
+ example_urls = [
9
+ "https://www.amazon.co.uk/Trintion-Scratching-Scratcher-Activity-Dangling/dp/B08FT54NRM",
10
+ "https://www.amazon.co.uk/Indoor-Hanging-playing-sleeping-suitable/dp/B0BTVW7G66",
11
+ "https://www.amazon.co.uk/PlayStation-5-Digital-Console-Slim/dp/B0CM9VKQ5N",
12
+ "https://www.amazon.co.uk/Celebrations-Chocolate-Chocolates-Centerpiece-Maltesers/dp/B07L8D6XM8",
13
+ "https://www.amazon.co.uk/HyRich-SIM-Free-Unlocked-Smartphone-Bluetooth-Note-80-Black/dp/B0BG5KBMYK",
14
+ "https://www.amazon.co.uk/Hama-HS-P350-headset-Binaural-Plastic/dp/B07ZR24KQZ",
15
+ "https://www.amazon.co.uk/Skinapeel-Sonic-Facial-Cleanser-Replaceable/dp/B011V6FUG0",
16
+ "https://www.amazon.co.uk/dp/B0BX47X1K9/"
17
+ ]
18
+
19
def _clean_review_text(raw):
    """Remove Amazon's media placeholder and a trailing "Read more" link label."""
    text = raw.replace("The media could not be loaded.", "").strip()
    # str.strip("Read more") would strip any of those *characters* from both
    # ends of the review, so the suffix is removed explicitly instead.
    if text.endswith("Read more"):
        text = text[:-len("Read more")].rstrip()
    return text


def scrape_amazon_reviews(url):
    """Scrape a product page and compare predicted with actual review ratings.

    Args:
        url: Address of the Amazon product page to fetch.

    Returns:
        A list of ``max_review_count`` strings followed by one final element.
        Each string holds a review's text plus its predicted and actual
        rating, or "" for an unused slot. The final element is the ``src`` of
        the ``#landingImage`` element, or None when the page has no such
        element.

    Raises:
        requests.RequestException: If the page cannot be fetched (including a
            timeout).
    """
    headers = {
        "accept-language": "en-GB,en;q=0.9",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    }

    # A timeout stops the UI callback hanging forever on an unresponsive host
    response = requests.get(url, headers=headers, timeout=30)
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed
    soup = BeautifulSoup(response.content, "html.parser")

    # Retrieve image from product page (absent on error or bot-check pages)
    image_tag = soup.select_one("#landingImage")
    image = image_tag.get("src") if image_tag is not None else None

    # Extract review description, rating, and predict a rating from the model
    output_reviews = []
    for review in soup.select("div.review")[:max_review_count]:
        text_tag = review.select_one("span.review-text")
        rating_tag = review.select_one("i.review-rating")
        if text_tag is None or rating_tag is None:
            # Skip review cards that lack a text body or a star rating
            continue
        review_text = _clean_review_text(text_tag.text)
        rating = rating_tag.text.replace("out of 5 stars", "").strip()
        # predict_rating returns a one-element array; read the value itself
        # rather than indexing into the array's printed representation
        predicted_rating = int(predict_rating(review_text)[0])
        output_reviews.append(
            f"{review_text}\n\nPredicted Rating: {predicted_rating}.0\nActual Rating: {rating}"
        )

    # If there aren't enough reviews, leave the remaining review text boxes empty
    output_reviews.extend([""] * (max_review_count - len(output_reviews)))

    output_reviews.append(image)
    return output_reviews
45
+
46
# Gradio UI: URL input, submit button and example links in the first column;
# one text box per scraped review plus the product image in the second.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            url = gr.Textbox(label="Amazon URL")
            button = gr.Button(variant="primary")
            gr.Examples(inputs=url, examples=example_urls)
        with gr.Column():
            reviews = [
                gr.Text(label=f"Review {number}")
                for number in range(1, max_review_count + 1)
            ]
            image = gr.Image(label="Amazon Product Image", interactive=False)

    # Output order matches scrape_amazon_reviews: review strings, then image
    button.click(fn=scrape_amazon_reviews, inputs=url, outputs=[*reviews, image])

demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ nltk
2
+ scikit-learn
3
+ imblearn
4
+ pandas
5
+ BeautifulSoup4