# Hugging Face Spaces page header (status: Sleeping); not part of the script.
# Importing the required packages
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
# Set the style sheet for plots
plt.style.use('ggplot')

# Read the TripAdvisor hotel reviews CSV straight from the Hugging Face Hub.
# NOTE(review): the hf:// URL scheme presumably needs `huggingface_hub`
# installed so that pandas/fsspec can resolve it -- confirm in the deployment.
df = pd.read_csv("hf://datasets/patrickbdevaney/tripadvisor_hotel_reviews/data/tripadvisor_hotel_reviews.csv")

# Turn the default positional index into an explicit 'Id' column so that every
# review has a stable key that can be joined on later.
df = df.reset_index().rename(columns={'index': 'Id'})

# Preview the first rows. A bare `df.head()` only renders inside a notebook;
# in a script it has to be printed to be visible.
print(df.head())

# Check the shape of the DataFrame
print(df.shape)
# Count the number of reviews for each rating and plot a bar chart.
# sort_index() orders the bars by star rating rather than by frequency.
rating_counts = df['Rating'].value_counts().sort_index()
ax = rating_counts.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review Stars')
# Bar height is the number of reviews with that rating, not a number of stars.
ax.set_ylabel('No. of Reviews')
plt.show()
# Make sure the NLTK data used below is available; a fresh environment has
# none of it and word_tokenize / pos_tag / ne_chunk would raise LookupError.
# Resource ids differ between NLTK releases, so both the classic and the newer
# ids are requested. nltk.download() does not raise by default when an id is
# unavailable, so ids unknown to the installed release are simply skipped.
for _resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(_resource, quiet=True)

# Select a review for sentiment analysis.
# NOTE(review): the variable is called rev250 but the row label used is 200 --
# confirm which one was intended. The name is kept because later code uses it.
rev250 = df['Review'][200]
print(rev250)

# Preprocess the review text
tokens = nltk.word_tokenize(rev250)      # Tokenization
tagged = nltk.pos_tag(tokens)            # Part-of-speech tagging
entities = nltk.chunk.ne_chunk(tagged)   # Entity recognition
entities.pprint()
# Perform sentiment analysis using VADER.
# The analyzer loads its lexicon from NLTK data, which is not bundled with the
# package; fetch it first so construction does not fail with LookupError.
nltk.download('vader_lexicon', quiet=True)
sia = SentimentIntensityAnalyzer()

# Analyze sentiment for a positive sentence
print(sia.polarity_scores('I am so happy!'))
#>> {'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'compound': 0.6468}

# Analyze sentiment for a negative sentence
print(sia.polarity_scores('I hate sweet aroma!'))
#>> {'neg': 0.499, 'neu': 0.125, 'pos': 0.375, 'compound': -0.2481}

# Analyze sentiment for the selected review
print(sia.polarity_scores(rev250))
#>> {'neg': 0.1, 'neu': 0.612, 'pos': 0.288, 'compound': 0.9556}
# Perform sentiment analysis on the entire dataset.
# `res` maps each review's Id to the score dict returned by polarity_scores().
# Only the 'Id' and 'Review' columns are needed, so the two columns are
# iterated directly instead of building a Series per row with iterrows().
res = {
    review_id: sia.polarity_scores(text)
    for review_id, text in tqdm(zip(df['Id'], df['Review']), total=len(df))
}

# Create a DataFrame from the sentiment scores (one row per Id, one column per
# score) and merge it with the original DataFrame.
vaders = pd.DataFrame.from_dict(res, orient='index')
vaders = vaders.reset_index().rename(columns={'index': 'Id'})
# Join on 'Id' explicitly so the key does not depend on which column names the
# two frames happen to share.
vaders = vaders.merge(df, how='left', on='Id')

# A bare `vaders.head()` only renders inside a notebook; print it in a script.
print(vaders.head())
# Visualize the sentiment scores: one panel per VADER score column, each
# showing that score against the star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
score_panels = (
    ('pos', 'Positive'),
    ('neu', 'Neutral'),
    ('neg', 'Negative'),
)
for panel, (score_column, panel_title) in zip(axs, score_panels):
    sns.barplot(data=vaders, x='Rating', y=score_column, ax=panel)
    panel.set_title(panel_title)

# Add spacing between the subplots
plt.tight_layout()
plt.show()