import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
/kaggle/input/reviews/Restaurant_Reviews.tsv
reviews = pd.read_table("/kaggle/input/reviews/Restaurant_Reviews.tsv")
reviews.head(10)
|   | Review | Liked |
|---|---|---|
| 0 | Wow... Loved this place. | 1 |
| 1 | Crust is not good. | 0 |
| 2 | Not tasty and the texture was just nasty. | 0 |
| 3 | Stopped by during the late May bank holiday of... | 1 |
| 4 | The selection on the menu was great and so wer... | 1 |
| 5 | Now I am getting angry and I want my damn pho. | 0 |
| 6 | Honeslty it didn't taste THAT fresh.) | 0 |
| 7 | The potatoes were like rubber and you could te... | 0 |
| 8 | The fries were great too. | 1 |
| 9 | A great touch. | 1 |
liked_counts = reviews['Liked'].value_counts()
print(liked_counts)
1    500
0    500
Name: Liked, dtype: int64
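The classes are perfectly balanced at 500 reviews each. Since seaborn is already imported, here is a minimal sketch of a class-balance plot (not part of the original analysis):
# Visualize the 500/500 class balance with a seaborn count plot
sns.countplot(x='Liked', data=reviews)
plt.title('Class balance of the Liked label')
plt.show()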
# Libraries for text cleaning
import re
import string
# Remove emoji and pictographic characters
def strip_emoji(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing, shapes & misc symbols
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"  # enclosed characters
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector
        u"\u3030"
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
# Remove punctuation, links, mentions, and \r\n newline characters
def strip_all_entities(text):
    text = text.replace('\r', '').replace('\n', ' ').lower()  # remove \r and \n, lowercase
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)  # remove links and mentions
    text = re.sub(r'[^\x00-\x7f]', r'', text)  # remove non-ASCII characters such as '\x9a\x91\x97\x9a\x97'
    banned_list = string.punctuation + 'Ã±ã¼â»§'
    table = str.maketrans('', '', banned_list)
    text = text.translate(table)
    return text
# Clean hashtags at the end of the sentence, and keep those in the middle by removing just the # symbol
def clean_hashtags(text):
    new_text = " ".join(word.strip() for word in re.split(r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', text))  # remove trailing hashtags
    new_text2 = " ".join(word.strip() for word in re.split(r'#|_', new_text))  # strip the # symbol from mid-sentence hashtags
    return new_text2
# Filter out words containing special characters such as & and $
def filter_chars(a):
    sent = []
    for word in a.split(' '):
        if ('$' in word) or ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

# Collapse multiple spaces into one
def remove_mult_spaces(text):
    return re.sub(r"\s\s+", " ", text)
# Note: strip_all_entities strips '#' and '_' along with the rest of the punctuation,
# so clean_hashtags and filter_chars act as safety nets in this pipeline
reviews['cleaned_reviews'] = (reviews['Review']
                              .apply(strip_emoji)
                              .apply(strip_all_entities)
                              .apply(clean_hashtags)
                              .apply(filter_chars)
                              .apply(remove_mult_spaces))
reviews.head()
|   | Review | Liked | cleaned_reviews |
|---|---|---|---|
| 0 | Wow... Loved this place. | 1 | wow loved this place |
| 1 | Crust is not good. | 0 | crust is not good |
| 2 | Not tasty and the texture was just nasty. | 0 | not tasty and the texture was just nasty |
| 3 | Stopped by during the late May bank holiday of... | 1 | stopped by during the late may bank holiday of... |
| 4 | The selection on the menu was great and so wer... | 1 | the selection on the menu was great and so wer... |
from wordcloud import WordCloud
# Separate positive and negative reviews
positive_reviews = reviews[reviews['Liked'] == 1]['cleaned_reviews'].values
negative_reviews = reviews[reviews['Liked'] == 0]['cleaned_reviews'].values
# Combine the positive and negative reviews as single strings
positive_text = " ".join(positive_reviews)
negative_text = " ".join(negative_reviews)
# Create word clouds for positive and negative reviews
wordcloud_positive = WordCloud(width=800, height=400, background_color='white').generate(positive_text)
wordcloud_negative = WordCloud(width=800, height=400, background_color='white').generate(negative_text)
# Plot the word clouds side by side
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.imshow(wordcloud_positive, interpolation='bilinear')
plt.title('Word Cloud for positive reviews')
plt.axis('off')
plt.subplot(1, 2, 2)
plt.imshow(wordcloud_negative, interpolation='bilinear')
plt.title('Word Cloud for negative reviews')
plt.axis('off')
plt.tight_layout()
plt.show()
Before using any ML algorithm, let us see how well a lexicon-based sentiment analysis tool such as vaderSentiment can identify the negative and positive reviews left by patrons.
pip install vaderSentiment
Requirement already satisfied: vaderSentiment in /opt/conda/lib/python3.10/site-packages (3.3.2)
Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from vaderSentiment) (2.31.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->vaderSentiment) (3.1.0)
Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->vaderSentiment) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->vaderSentiment) (1.26.15)
Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->vaderSentiment) (2023.5.7)
Note: you may need to restart the kernel to use updated packages.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Select the first three positive and negative reviews
positive_reviews = reviews[reviews['Liked'] == 1]['cleaned_reviews'][:3].tolist()
negative_reviews = reviews[reviews['Liked'] == 0]['cleaned_reviews'][:3].tolist()
# Combine the positive and negative reviews
all_reviews = positive_reviews + negative_reviews
# Initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()
# Analyze sentiment for each review
results = []
for review in all_reviews:
    sentiment = analyzer.polarity_scores(review)
    sentiment_score = sentiment['compound']
    # Note: a compound score of exactly 0 (neutral) counts as positive here
    sentiment_label = "this is a positive review" if sentiment_score >= 0 else "this is a negative review"
    results.append({
        "Review": review,
        "Sentiment Score": sentiment_score,
        "Sentiment Label": sentiment_label
    })
# Create a DataFrame to display the results
output_table = pd.DataFrame(results)
# Print the output table
print(output_table)
Review Sentiment Score \
0 wow loved this place 0.8271
1 stopped by during the late may bank holiday of... 0.6908
2 the selection on the menu was great and so wer... 0.6249
3 crust is not good -0.3412
4 not tasty and the texture was just nasty -0.5574
5 now i am getting angry and i want my damn pho -0.6908
Sentiment Label
0 this is a positive review
1 this is a positive review
2 this is a positive review
3 this is a negative review
4 this is a negative review
5 this is a negative review
Using vaderSentiment, we correctly classified 6 out of 6 reviews. Impressive performance. It makes me wonder how far we can get by simply using sentiment scores to classify all the other reviews...
# Initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()
# Calculate sentiment scores for each review
reviews['Sentiment Score'] = reviews['cleaned_reviews'].apply(lambda review: analyzer.polarity_scores(review)['compound'])
# Count positive and negative reviews
positive_reviews_count = reviews[reviews['Sentiment Score'] >= 0].shape[0]
negative_reviews_count = reviews[reviews['Sentiment Score'] < 0].shape[0]
reviews.head()
|   | Review | Liked | cleaned_reviews | Sentiment Score |
|---|---|---|---|---|
| 0 | Wow... Loved this place. | 1 | wow loved this place | 0.8271 |
| 1 | Crust is not good. | 0 | crust is not good | -0.3412 |
| 2 | Not tasty and the texture was just nasty. | 0 | not tasty and the texture was just nasty | -0.5574 |
| 3 | Stopped by during the late May bank holiday of... | 1 | stopped by during the late may bank holiday of... | 0.6908 |
| 4 | The selection on the menu was great and so wer... | 1 | the selection on the menu was great and so wer... | 0.6249 |
# Print the counts
print("Number of Positive Reviews:", positive_reviews_count)
print("Number of Negative Reviews:", negative_reviews_count)
Number of Positive Reviews: 735
Number of Negative Reviews: 265
Well, this is not good performance: the dataset is perfectly balanced at 500 positive and 500 negative reviews, yet VADER labels 735 as positive. I guess we have to go the machine learning route to get better results. VADER, while a valuable tool for sentiment analysis, operates on a lexicon-based approach that assigns a polarity score to each word in a text and then computes an overall sentiment score from these word scores. This method has its strengths, particularly in capturing the sentiment of individual words, idioms, and expressions, but it also has limitations.
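To put a number on it, a minimal sketch (assuming the reviews DataFrame built above and the same >= 0 threshold):
# Compare VADER's thresholded predictions against the ground-truth Liked labels
from sklearn.metrics import accuracy_score
vader_pred = (reviews['Sentiment Score'] >= 0).astype(int)
print("VADER accuracy vs. Liked:", accuracy_score(reviews['Liked'], vader_pred))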
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Split the data into training and testing sets
X = reviews['cleaned_reviews']
y = reviews['Liked']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Feature extraction (TF-IDF or CountVectorizer can be used)
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
# Train the Random Forest classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train_vec, y_train)
RandomForestClassifier(random_state=42)
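For intuition about the bag-of-words features the classifier sees, a small sketch (assuming the fitted vectorizer from above):
# Peek at the learned vocabulary size and the shape of the document-term matrix
print("Vocabulary size:", len(vectorizer.vocabulary_))
print("Train matrix shape:", X_train_vec.shape)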
# Make predictions
y_pred = classifier.predict(X_test_vec)
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Accuracy: 0.76
Classification Report:
precision recall f1-score support
0 0.72 0.82 0.77 96
1 0.81 0.70 0.75 104
accuracy 0.76 200
macro avg 0.76 0.76 0.76 200
weighted avg 0.77 0.76 0.76 200
Confusion Matrix:
[[79 17]
[31 73]]
from sklearn.model_selection import GridSearchCV
# Define the parameter grid to search
param_grid = {
'n_estimators': [50, 100, 200], # Number of trees in the forest
'max_depth': [None, 10, 20, 30], # Maximum depth of the tree
'min_samples_split': [2, 5, 10], # Minimum number of samples required to split an internal node
'min_samples_leaf': [1, 2, 4] # Minimum number of samples required to be at a leaf node
}
# Create the Random Forest classifier
classifier = RandomForestClassifier(random_state=142)
# Create the GridSearchCV instance
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')
# Fit the GridSearchCV instance to the data
grid_search.fit(X_train_vec, y_train)
# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best Score: 0.8099999999999999
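A side note: GridSearchCV refits the best configuration on the full training set by default (refit=True), so its best_estimator_ could be used directly, as sketched below. Re-creating the classifier from the printed parameters, as in the next cell, is equivalent and more explicit.
# Equivalent shortcut, relying on GridSearchCV's default refit=True
y_pred = grid_search.best_estimator_.predict(X_test_vec)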
# Best parameters from grid search
best_params = {
'min_samples_leaf': 1,
'max_depth': None,
'min_samples_split': 5,
'n_estimators': 50
}
# Create the Random Forest classifier with the best parameters
best_classifier = RandomForestClassifier(**best_params, random_state=142)
# Train the classifier
best_classifier.fit(X_train_vec, y_train)
# Make predictions
y_pred = best_classifier.predict(X_test_vec)
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Accuracy: 0.77
Classification Report:
precision recall f1-score support
0 0.74 0.81 0.77 96
1 0.81 0.73 0.77 104
accuracy 0.77 200
macro avg 0.77 0.77 0.77 200
weighted avg 0.77 0.77 0.77 200
Confusion Matrix:
[[78 18]
[28 76]]
Hyperparameter tuning bought us a one-percentage-point improvement in accuracy, from 0.76 to 0.77.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Create the Naive Bayes classifier
naive_bayes_classifier = MultinomialNB()
# Train the classifier
naive_bayes_classifier.fit(X_train_vec, y_train)
# Make predictions
y_pred_nb = naive_bayes_classifier.predict(X_test_vec)
# Evaluate the classifier
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes Accuracy: {accuracy_nb:.2f}")
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))
print("Naive Bayes Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))
Naive Bayes Accuracy: 0.78
Naive Bayes Classification Report:
precision recall f1-score support
0 0.74 0.83 0.78 96
1 0.83 0.73 0.78 104
accuracy 0.78 200
macro avg 0.78 0.78 0.78 200
weighted avg 0.79 0.78 0.78 200
Naive Bayes Confusion Matrix:
[[80 16]
[28 76]]
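The feature-extraction comment earlier noted that TF-IDF could be used instead of raw counts. A minimal sketch of that variant (results not shown and may differ from the counts-based run):
# Hypothetical variant: Naive Bayes on TF-IDF features instead of raw counts
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train)
print("NB + TF-IDF accuracy:", accuracy_score(y_test, nb_tfidf.predict(X_test_tfidf)))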
Hyperparameter tuning of the radial basis function (RBF) kernel SVM:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Define the parameter grid to search
param_grid = {
'C': [0.1, 1, 10],
'gamma': [0.1, 1, 'scale', 'auto'],
'kernel': ['rbf']
}
# Create the SVM classifier
svm_classifier = SVC()
# Create the GridSearchCV instance
grid_search_svm = GridSearchCV(svm_classifier, param_grid, cv=5, scoring='accuracy')
# Fit the GridSearchCV instance to the data
grid_search_svm.fit(X_train_vec, y_train)
# Print the best parameters and best score
print("Best Parameters for SVM:", grid_search_svm.best_params_)
print("Best Score for SVM:", grid_search_svm.best_score_)
Best Parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score for SVM: 0.7975000000000001
Model fitting using optimal hyperparameters:
# Create the SVM classifier with the best parameters
best_svm_classifier = SVC(**grid_search_svm.best_params_)
# Train the classifier
best_svm_classifier.fit(X_train_vec, y_train)
# Make predictions
y_pred_svm = best_svm_classifier.predict(X_test_vec)
# Evaluate the classifier
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.2f}")
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))
print("SVM Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))
SVM Accuracy: 0.78
SVM Classification Report:
precision recall f1-score support
0 0.73 0.83 0.78 96
1 0.82 0.72 0.77 104
accuracy 0.78 200
macro avg 0.78 0.78 0.77 200
weighted avg 0.78 0.78 0.77 200
SVM Confusion Matrix:
[[80 16]
[29 75]]
The SVM classifier achieved an accuracy of 78%, the Naive Bayes classifier matched it at 78%, and the tuned Random Forest came last at 77%.
Difference between the performance of ML algorithms and VADER sentiment scores:
While the machine learning algorithms learn from labeled data and take into account relationships between words across a whole review, VADER operates largely word by word. The difference in results highlights how hard it is to assess sentiment automatically, and why lexicon-based scores alone are often not enough: for accurate and reliable sentiment classification, they need to be complemented by machine learning models trained on labeled examples.
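A quick way to probe such context-dependent cases with vaderSentiment (hypothetical example sentences; outputs not shown):
# Probe phrasings where word-by-word scoring can struggle
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
for text in ["the food was not bad at all",
             "i expected great food but got the opposite"]:
    print(text, "->", analyzer.polarity_scores(text)['compound'])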