"""Exploratory VADER sentiment analysis of TripAdvisor hotel reviews.

The script loads the review CSV, plots how many reviews each star rating
received, runs NLTK's tokenizer / POS tagger / named-entity chunker on one
sample review, scores every review with VADER, and finally plots the
positive / neutral / negative scores against the star rating.
"""

# Importing the required packages
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm import tqdm

# Set the style sheet for plots
plt.style.use('ggplot')

# Read the data.
# NOTE(review): the hf:// scheme presumably needs the `huggingface_hub`
# fsspec backend installed -- confirm in the target environment.
df = pd.read_csv(
    "hf://datasets/patrickbdevaney/tripadvisor_hotel_reviews/data/tripadvisor_hotel_reviews.csv"
)
# Turn the default positional index into an explicit 'Id' column so each
# review keeps a stable key for the merge with the sentiment scores below.
df = df.reset_index().rename(columns={'index': 'Id'})
df.head()  # only renders when run as a notebook cell

# Check the shape of the DataFrame
print(df.shape)

# Count the number of reviews for each rating and plot a bar chart
ax = (
    df['Rating']
    .value_counts()
    .sort_index()
    .plot(kind='bar', title='Count of Reviews by Stars', figsize=(10, 5))
)
ax.set_xlabel('Review Stars')
# The bar height is the number of reviews that received each star rating.
ax.set_ylabel('No. of Reviews')
plt.show()

# Select a review for sentiment analysis.
# NOTE(review): the variable is called rev250 but the row read is 200 --
# confirm which one was intended. The recorded outputs below belong to
# whichever review the original author actually ran.
rev250 = df['Review'][200]
print(rev250)

# Preprocess the review text.
# NOTE(review): these calls need NLTK data packages (tokenizer, POS tagger,
# NE chunker, word list) to be downloaded beforehand via nltk.download();
# the exact resource names depend on the installed NLTK version.
tokens = nltk.word_tokenize(rev250)       # Tokenization
tagged = nltk.pos_tag(tokens)             # Part-of-speech tagging
entities = nltk.chunk.ne_chunk(tagged)    # Entity recognition
entities.pprint()

# Perform sentiment analysis using VADER
# NOTE(review): presumably requires the 'vader_lexicon' NLTK resource.
sia = SentimentIntensityAnalyzer()

# Analyze sentiment for a positive sentence
print(sia.polarity_scores('I am so happy!'))
#>> {'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'compound': 0.6468}

# Analyze sentiment for a negative sentence
print(sia.polarity_scores('I hate sweet aroma!'))
#>> {'neg': 0.499, 'neu': 0.125, 'pos': 0.375, 'compound': -0.2481}

# Analyze sentiment for the selected review
print(sia.polarity_scores(rev250))
#>> {'neg': 0.1, 'neu': 0.612, 'pos': 0.288, 'compound': 0.9556}

# Perform sentiment analysis on the entire dataset.
# Iterating the two columns directly avoids building a Series per row the
# way DataFrame.iterrows() does; tqdm still shows progress over all rows.
res = {
    review_id: sia.polarity_scores(text)
    for review_id, text in tqdm(zip(df['Id'], df['Review']), total=len(df))
}

# Create a DataFrame from the sentiment scores (one row per review Id, one
# column per score) and merge it with the original DataFrame.
vaders = pd.DataFrame.from_dict(res, orient='index')
vaders = vaders.reset_index().rename(columns={'index': 'Id'})
# 'Id' is the only column the two frames share; naming it explicitly keeps
# the join stable if either frame later gains an overlapping column.
vaders = vaders.merge(df, how='left', on='Id')
vaders.head()  # only renders when run as a notebook cell

# Visualize the sentiment scores: one panel per VADER component, each
# showing the score against the star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = (('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative'))
for axis, (score_column, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Rating', y=score_column, ax=axis)
    axis.set_title(panel_title)

# Add spacing between the subplots
plt.tight_layout()
plt.show()