Spaces:
Sleeping
Sleeping
from nltk.tokenize import word_tokenize | |
from nltk.stem import WordNetLemmatizer | |
from nltk.corpus import stopwords | |
from sklearn.metrics import accuracy_score | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.model_selection import train_test_split | |
from imblearn.over_sampling import SMOTE | |
from sklearn.naive_bayes import MultinomialNB | |
import nltk | |
import pandas as pd | |
# Fetch only the NLTK resources this script actually uses:
#   - 'stopwords' backs stopwords.words('english')
#   - 'punkt_tab' backs word_tokenize
#   - 'wordnet'   backs WordNetLemmatizer
# The previous 'all-corpora' download pulled every NLTK corpus on each
# start-up just to obtain WordNet.
# NOTE(review): 'omw-1.4' is included because some NLTK releases look for it
# when WordNet is first loaded; drop it if the pinned NLTK does not need it.
for _resource in ('stopwords', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Shared text-processing helpers used by preprocess().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # set => O(1) membership tests

# Raw review data; later code reads the "reviewText" and "overall" columns.
df = pd.read_csv("amazon_reviews.csv")
# Preprocess text data
def preprocess(review):
    """Normalize one review into a space-separated string of lemmas.

    The text is lower-cased and split with ``word_tokenize``. Tokens present
    in the module-level ``stop_words`` set are discarded; every remaining
    token is passed through ``lemmatizer.lemmatize``.

    Args:
        review: Review text; must support ``.lower()``.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        nothing survives).
    """
    kept = []
    for word in word_tokenize(review.lower()):
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return " ".join(kept)
# Format csv data into array of [review, rating]
# Rows without a rating cannot serve as labelled examples, so they are
# skipped instead of crashing int(). Missing review text becomes an empty
# string rather than the literal word "nan" that str(NaN) would inject into
# the vocabulary. Iterating the columns directly also avoids the per-row
# df.loc[i] lookup, which is slow and assumes a default 0..n-1 index.
labelled = df.dropna(subset=["overall"])
review_ratings = [
    [review_text, int(rating)]
    for review_text, rating in zip(
        labelled["reviewText"].fillna("").astype(str), labelled["overall"]
    )
]

# Create corpus of preprocessed text, with the matching labels
corpus = [preprocess(review_text) for review_text, _ in review_ratings]
y = [rating for _, rating in review_ratings]

# Hold out the test set BEFORE fitting or resampling anything, so that neither
# the TF-IDF statistics nor the synthetic SMOTE samples are derived from test
# rows. Stratifying keeps the rare ratings represented in both splits.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y, test_size=0.2, random_state=42, stratify=y
)

# Convert to vector representation (vocabulary/IDF learned from training text
# only). The matrices stay sparse: SMOTE and MultinomialNB both accept sparse
# input, and a dense n x 10000 array is needlessly large.
vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)

# Generate synthetic samples as 5 star rating reviews are over-represented.
# Only the training split is resampled; the test split keeps the real class
# distribution so the reported accuracy reflects real data.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Create model and fit
model = MultinomialNB()
model.fit(X_resampled, y_resampled)
y_predict = model.predict(X_test)
print("Accuracy", accuracy_score(y_test, y_predict))
def predict_rating(review):
    """Predict the rating for a single raw review text.

    The review is cleaned with ``preprocess``, turned into a one-row feature
    matrix by the fitted module-level ``vectorizer`` and handed to ``model``.

    Args:
        review: Raw review text.

    Returns:
        The result of ``model.predict`` for the one-row input, returned
        as-is (not unwrapped to a scalar).
    """
    cleaned = preprocess(review)
    features = vectorizer.transform([cleaned])
    return model.predict(features.toarray())