# Uploaded by 0xEmir
# Commit c9652bd (verified): "Create app.py"
# Importing the required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
# Use the ggplot style sheet for every figure drawn by this script.
plt.style.use('ggplot')

# Read the TripAdvisor hotel reviews CSV from the Hugging Face Hub.
# NOTE(review): the `hf://` URL scheme presumably needs the
# `huggingface_hub` package installed alongside pandas - confirm for the
# deployment environment.
df = pd.read_csv("hf://datasets/patrickbdevaney/tripadvisor_hotel_reviews/data/tripadvisor_hotel_reviews.csv")

# Turn the positional row index into an explicit 'Id' column so each review
# carries a stable key that later steps can join on.
df = df.reset_index().rename(columns={'index': 'Id'})

# Preview the first rows. A bare `df.head()` only renders inside a notebook,
# so print it once instead of evaluating (and discarding) it twice.
print(df.head())

# Check the shape of the DataFrame as (rows, columns).
print(df.shape)
# Count the number of reviews for each star rating and plot a bar chart.
# sort_index() orders the bars by rating value instead of by frequency.
rating_counts = df['Rating'].value_counts().sort_index()
ax = rating_counts.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review Stars')
# The bar heights are review counts per rating, not a number of stars.
ax.set_ylabel('No. of Reviews')
plt.show()
# Make sure the NLTK data used below is installed; without it the tokenizer,
# tagger and chunker raise LookupError on a fresh machine. Both the legacy
# and the newer package names are requested so the step works across NLTK
# releases. nltk.download() skips packages that are already up to date and
# returns False (rather than raising) when a download fails.
for _nltk_pkg in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(_nltk_pkg, quiet=True)

# Select a review for sentiment analysis.
# NOTE(review): the variable is called `rev250` but the row selected is
# index 200 - confirm which one was intended. The name is kept because later
# code refers to it.
rev250 = df['Review'][200]
print(rev250)

# Preprocess the review text.
tokens = nltk.word_tokenize(rev250)     # split the text into word tokens
tagged = nltk.pos_tag(tokens)           # attach a part-of-speech tag to each token
entities = nltk.chunk.ne_chunk(tagged)  # group tagged tokens into named-entity chunks
entities.pprint()
# Perform sentiment analysis using VADER.
from nltk.sentiment import SentimentIntensityAnalyzer

# The analyzer loads the 'vader_lexicon' data package on construction and
# raises LookupError when it is missing, so fetch it first. The call skips
# the download when the package is already up to date.
nltk.download('vader_lexicon', quiet=True)
sia = SentimentIntensityAnalyzer()

# polarity_scores() returns a dict with 'neg', 'neu', 'pos' and 'compound'
# entries. The `#>>` lines record the output seen when this was written.

# Analyze sentiment for a positive sentence.
print(sia.polarity_scores('I am so happy!'))
#>> {'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'compound': 0.6468}

# Analyze sentiment for a negative sentence.
print(sia.polarity_scores('I hate sweet aroma!'))
#>> {'neg': 0.499, 'neu': 0.125, 'pos': 0.375, 'compound': -0.2481}

# Analyze sentiment for the selected review.
print(sia.polarity_scores(rev250))
#>> {'neg': 0.1, 'neu': 0.612, 'pos': 0.288, 'compound': 0.9556}
# Perform sentiment analysis on the entire dataset.
from tqdm import tqdm

# Map each review Id to its VADER score dict. Iterating the two columns with
# zip() avoids building a Series per row the way iterrows() does; tqdm only
# adds a progress bar and needs `total` because zip() has no length.
res = {
    review_id: sia.polarity_scores(text)
    for review_id, text in tqdm(zip(df['Id'], df['Review']), total=len(df))
}

# Build one row per review (index = Id, columns = neg/neu/pos/compound)
# directly, instead of creating a one-column-per-review frame and transposing.
vaders = pd.DataFrame.from_dict(res, orient='index')
vaders = vaders.reset_index().rename(columns={'index': 'Id'})

# Attach the original review text and rating. The join key is named
# explicitly so the merge cannot silently pick up other shared column names.
vaders = vaders.merge(df, how='left', on='Id')

# Preview the merged result (a bare `.head()` shows nothing in a script).
print(vaders.head())
# Visualize the sentiment scores: one bar plot per VADER component, each
# showing that component against the star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))

# (score column, subplot title) for the panels, left to right.
panels = (
    ('pos', 'Positive'),
    ('neu', 'Neutral'),
    ('neg', 'Negative'),
)
for axis, (score_col, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Rating', y=score_col, ax=axis)
    axis.set_title(panel_title)

# Add spacing between the subplots so labels and titles do not overlap.
plt.tight_layout()
plt.show()