# Importing the required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
# Use the ggplot style sheet for every figure drawn below
plt.style.use('ggplot')

# Read the TripAdvisor hotel reviews CSV from the hf:// dataset path
df = pd.read_csv("hf://datasets/patrickbdevaney/tripadvisor_hotel_reviews/data/tripadvisor_hotel_reviews.csv")

# Turn the default row index into an explicit 'Id' column so every review
# carries a key that the sentiment scores can later be joined on
df = df.reset_index().rename(columns={'index': 'Id'})

# Preview the first rows. Printed explicitly (and only once): a bare
# `df.head()` expression is discarded when this file runs as a script.
print(df.head())

# Check the shape of the DataFrame
print(df.shape)

# Count the number of reviews for each star rating (ordered by rating value)
# and plot the counts as a bar chart
rating_counts = df['Rating'].value_counts().sort_index()
ax = rating_counts.plot(kind='bar',
                        title='Count of Reviews by Stars',
                        figsize=(10, 5))
ax.set_xlabel('Review Stars')
# The bar heights are review counts per rating, not a number of stars
ax.set_ylabel('No. of Reviews')
plt.show()

# Fetch the NLTK data packages needed by the tokenizer, POS tagger and
# named-entity chunker below; without them these calls raise LookupError on a
# fresh environment. Both the classic and the newer '*_tab' / '*_eng' package
# names are requested because the expected name depends on the installed NLTK
# release — TODO(review): trim to the names matching the pinned NLTK version.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(resource, quiet=True)

# Select a review for sentiment analysis
# NOTE(review): the variable is called rev250 but the row label is 200 —
# confirm which one was intended; the name is kept because later code uses it.
rev250 = df.loc[200, 'Review']
print(rev250)

# Preprocess the review text
tokens = nltk.word_tokenize(rev250)  # Tokenization
tagged = nltk.pos_tag(tokens)  # Part-of-speech tagging
entities = nltk.chunk.ne_chunk(tagged)  # Entity recognition

# Pretty-print the resulting chunk tree
entities.pprint()

# Perform sentiment analysis using VADER
from nltk.sentiment import SentimentIntensityAnalyzer

# The analyzer reads the 'vader_lexicon' data package when it is constructed;
# fetch it first so a fresh environment does not fail with LookupError.
nltk.download('vader_lexicon', quiet=True)
sia = SentimentIntensityAnalyzer()

# Analyze sentiment for a positive sentence
print(sia.polarity_scores('I am so happy!'))
#>> {'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'compound': 0.6468}

# Analyze sentiment for a negative sentence
print(sia.polarity_scores('I hate sweet aroma!'))
#>> {'neg': 0.499, 'neu': 0.125, 'pos': 0.375, 'compound': -0.2481}

# Analyze sentiment for the selected review
print(sia.polarity_scores(rev250))
#>> {'neg': 0.1, 'neu': 0.612, 'pos': 0.288, 'compound': 0.9556}

# Perform sentiment analysis on the entire dataset
from tqdm import tqdm

res = {}  # Maps review Id -> dict of VADER scores (neg / neu / pos / compound)

# Walk the two needed columns in lockstep instead of using iterrows(), which
# builds a full Series for every row only to read two fields from it.
for myid, text in tqdm(zip(df['Id'], df['Review']), total=len(df)):
    res[myid] = sia.polarity_scores(text)

# Build one row per review directly (orient='index' uses the dict keys as the
# row labels), rather than creating one column per review and transposing.
vaders = pd.DataFrame.from_dict(res, orient='index')
vaders = vaders.reset_index().rename(columns={'index': 'Id'})

# Attach the original review text and rating; the join key is named explicitly
# so the merge cannot silently pick up other same-named columns.
vaders = vaders.merge(df, how='left', on='Id')

# Preview the merged result. Printed explicitly: a bare `vaders.head()`
# expression is discarded when this file runs as a script.
print(vaders.head())

# Draw one bar chart per VADER score component, grouped by star rating
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(12, 3))

# (score column, subplot title) for each panel, left to right
panels = (
    ('pos', 'Positive'),
    ('neu', 'Neutral'),
    ('neg', 'Negative'),
)
for axis, (score_col, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Rating', y=score_col, ax=axis)
    axis.set_title(panel_title)

# Keep the subplots from overlapping, then render the figure
plt.tight_layout()
plt.show()