Spaces:
Build error
Build error
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Importing the required packages
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import seaborn as sns
|
| 6 |
+
import nltk
|
| 7 |
+
# Set the style sheet for plots
|
| 8 |
+
plt.style.use('ggplot')
|
| 9 |
+
|
| 10 |
+
# Read the data
|
| 11 |
+
df = pd.read_csv("hf://datasets/patrickbdevaney/tripadvisor_hotel_reviews/data/tripadvisor_hotel_reviews.csv")
|
| 12 |
+
|
| 13 |
+
df = df.reset_index().rename(columns={'index': 'Id'})
|
| 14 |
+
|
| 15 |
+
# A bare expression is discarded when run as a script (only a notebook/REPL
# echoes it), so print the preview explicitly.
print(df.head())
|
| 16 |
+
|
| 17 |
+
df.head()
|
| 18 |
+
|
| 19 |
+
# Check the shape of the DataFrame
|
| 20 |
+
print(df.shape)
|
| 21 |
+
|
| 22 |
+
# Count the number of reviews for each rating and plot a bar chart
|
| 23 |
+
ax = df['Rating'].value_counts().sort_index() \
|
| 24 |
+
.plot(kind='bar',
|
| 25 |
+
title='Count of Reviews by Stars',
|
| 26 |
+
figsize=(10, 5))
|
| 27 |
+
ax.set_xlabel('Review Stars')
|
| 28 |
+
# Bar heights come from value_counts(), i.e. how many reviews have each rating.
ax.set_ylabel('No. of Reviews')
|
| 29 |
+
plt.show()
|
| 30 |
+
|
| 31 |
+
# Select one review (row label 200) for sentiment analysis; note that the variable name `rev250` does not match this index
|
| 32 |
+
rev250 = df['Review'][200]
|
| 33 |
+
print(rev250)
|
| 34 |
+
|
| 35 |
+
# Preprocess the review text
|
| 36 |
+
# word_tokenize, pos_tag and ne_chunk each load NLTK data packages that a fresh
# environment does not ship with; without them they raise LookupError.
# Both the legacy and the newer resource names are requested because the names
# differ between NLTK releases; with the default raise_on_error=False an
# unknown name is reported by download() rather than raised.
for _nltk_pkg in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(_nltk_pkg, quiet=True)
tokens = nltk.word_tokenize(rev250) # Tokenization
|
| 37 |
+
tagged = nltk.pos_tag(tokens) # Part-of-speech tagging
|
| 38 |
+
entities = nltk.chunk.ne_chunk(tagged) # Entity recognition
|
| 39 |
+
|
| 40 |
+
entities.pprint()
|
| 41 |
+
|
| 42 |
+
# Perform sentiment analysis using VADER
|
| 43 |
+
from nltk.sentiment import SentimentIntensityAnalyzer
|
| 44 |
+
# VADER reads its word scores from the 'vader_lexicon' NLTK data package;
# fetch it first so the analyzer can be constructed on a machine that has no
# NLTK data installed (otherwise the constructor raises LookupError).
nltk.download('vader_lexicon', quiet=True)
sia = SentimentIntensityAnalyzer()
|
| 45 |
+
|
| 46 |
+
# Analyze sentiment for a positive sentence
|
| 47 |
+
print(sia.polarity_scores('I am so happy!'))
|
| 48 |
+
#>> {'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'compound': 0.6468}
|
| 49 |
+
|
| 50 |
+
# Analyze sentiment for a negative sentence
|
| 51 |
+
print(sia.polarity_scores('I hate sweet aroma!'))
|
| 52 |
+
#>> {'neg': 0.499, 'neu': 0.125, 'pos': 0.375, 'compound': -0.2481}
|
| 53 |
+
|
| 54 |
+
# Analyze sentiment for the selected review
|
| 55 |
+
print(sia.polarity_scores(rev250))
|
| 56 |
+
#>> {'neg': 0.1, 'neu': 0.612, 'pos': 0.288, 'compound': 0.9556}
|
| 57 |
+
|
| 58 |
+
# Perform sentiment analysis on the entire dataset
|
| 59 |
+
from tqdm import tqdm
|
| 60 |
+
|
| 61 |
+
res = {} # Store the sentiment scores
|
| 62 |
+
|
| 63 |
+
# Score every review with VADER and store the result dict under the review's Id.
# The loop body is indented here (it had been flattened to column 0, which is a
# syntax error), and the two needed columns are iterated directly instead of
# materialising a Series per row with iterrows().
for myid, text in tqdm(zip(df['Id'], df['Review']), total=len(df)):
    res[myid] = sia.polarity_scores(text)
|
| 67 |
+
|
| 68 |
+
# Create a DataFrame from the sentiment scores and merge it with the original DataFrame
|
| 69 |
+
vaders = pd.DataFrame(res).T
|
| 70 |
+
vaders = vaders.reset_index().rename(columns={'index': 'Id'})
|
| 71 |
+
# Join the scores back onto the reviews by the explicit key. Without `on`,
# merge() joins on every column name the two frames share, which would change
# silently if the dataset ever gained a column named like one of the scores.
vaders = vaders.merge(df, how='left', on='Id')
|
| 72 |
+
|
| 73 |
+
# A bare expression is discarded when run as a script (only a notebook/REPL
# echoes it), so print the preview explicitly.
print(vaders.head())
|
| 74 |
+
|
| 75 |
+
# Visualize the sentiment scores
|
| 76 |
+
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
|
| 77 |
+
sns.barplot(data=vaders, x='Rating', y='pos', ax=axs[0])
|
| 78 |
+
sns.barplot(data=vaders, x='Rating', y='neu', ax=axs[1])
|
| 79 |
+
sns.barplot(data=vaders, x='Rating', y='neg', ax=axs[2])
|
| 80 |
+
|
| 81 |
+
# Set titles for the subplots
|
| 82 |
+
axs[0].set_title('Positive')
|
| 83 |
+
axs[1].set_title('Neutral')
|
| 84 |
+
axs[2].set_title('Negative')
|
| 85 |
+
|
| 86 |
+
# Add spacing between the subplots
|
| 87 |
+
plt.tight_layout()
|
| 88 |
+
plt.show()
|