Spaces:
Build error
Build error
| # Importing the required packages | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import nltk | |
# Use the ggplot style sheet for every figure created below.
plt.style.use('ggplot')

# Read the TripAdvisor hotel reviews CSV from the Hugging Face Hub.
# NOTE(review): the hf:// scheme presumably requires `huggingface_hub` to be
# installed so pandas can resolve it -- confirm it is listed in the
# environment's requirements.
df = pd.read_csv("hf://datasets/patrickbdevaney/tripadvisor_hotel_reviews/data/tripadvisor_hotel_reviews.csv")

# Turn the positional row index into an explicit 'Id' column; the per-review
# sentiment scores computed further down are keyed by this column.
df = df.reset_index().rename(columns={'index': 'Id'})

# Preview the first rows once. A bare `df.head()` expression only renders in
# a notebook cell, so print it to make the preview visible in a script run.
print(df.head())

# Check the shape of the DataFrame
print(df.shape)
# Count the number of reviews for each star rating (ordered by rating) and
# plot the counts as a bar chart.
rating_counts = df['Rating'].value_counts().sort_index()
ax = rating_counts.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review Stars')
# The bar height is the number of reviews with that rating, not a number of
# stars, so label the axis accordingly.
ax.set_ylabel('No. of Reviews')
plt.show()
# NLTK ships without its data files. Fetch what the tokenizer, POS tagger and
# named-entity chunker below need, so a fresh environment does not fail with
# LookupError. Both the legacy resource ids and the ids used by newer NLTK
# releases are requested; with the default settings a failed download is
# reported by the downloader rather than raised.
for resource in (
    'punkt', 'punkt_tab',
    'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker', 'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(resource, quiet=True)

# Select a review for sentiment analysis
# NOTE(review): the variable is named rev250 but the row label used is 200.
# The name is kept because later code refers to it -- confirm which row was
# actually intended.
rev250 = df['Review'][200]
print(rev250)

# Preprocess the review text
tokens = nltk.word_tokenize(rev250)     # Tokenization
tagged = nltk.pos_tag(tokens)           # Part-of-speech tagging
entities = nltk.chunk.ne_chunk(tagged)  # Entity recognition
entities.pprint()
# Perform sentiment analysis using VADER
from nltk.sentiment import SentimentIntensityAnalyzer

# The analyzer needs the VADER lexicon data file, which is not bundled with
# NLTK; download it first so construction does not fail with LookupError on a
# fresh environment.
nltk.download('vader_lexicon', quiet=True)
sia = SentimentIntensityAnalyzer()

# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound'
# entries; the `#>>` lines show the output recorded for each call.

# Analyze sentiment for a positive sentence
print(sia.polarity_scores('I am so happy!'))
#>> {'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'compound': 0.6468}

# Analyze sentiment for a negative sentence
print(sia.polarity_scores('I hate sweet aroma!'))
#>> {'neg': 0.499, 'neu': 0.125, 'pos': 0.375, 'compound': -0.2481}

# Analyze sentiment for the selected review
print(sia.polarity_scores(rev250))
#>> {'neg': 0.1, 'neu': 0.612, 'pos': 0.288, 'compound': 0.9556}
# Perform sentiment analysis on the entire dataset
from tqdm import tqdm

# Map each review's Id to the dict of VADER scores for its text. Iterating
# the two columns directly avoids building a Series per row with iterrows();
# tqdm is given the total because zip() has no length.
res = {}
for myid, text in tqdm(zip(df['Id'], df['Review']), total=len(df)):
    res[myid] = sia.polarity_scores(text)

# Build one row per review with one column per score; the dict keys become
# the index, which is then moved into an 'Id' column.
vaders = pd.DataFrame.from_dict(res, orient='index')
vaders = vaders.reset_index().rename(columns={'index': 'Id'})

# Attach the original review columns. Join explicitly on 'Id' instead of
# relying on pandas picking whichever column names the two frames share.
vaders = vaders.merge(df, how='left', on='Id')

# A bare `vaders.head()` only renders in a notebook; print it so the preview
# is visible when this runs as a script.
print(vaders.head())
# Visualize the sentiment scores: one panel per VADER score column, each a
# bar plot of that score against the review's Rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))

# (score column in `vaders`, panel title), in left-to-right panel order.
panels = (
    ('pos', 'Positive'),
    ('neu', 'Neutral'),
    ('neg', 'Negative'),
)
for panel_ax, (score_col, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Rating', y=score_col, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Add spacing between the subplots
plt.tight_layout()
plt.show()