# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load
library(tidyverse) # metapackage of all tidyverse packages
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
list.files(path = "../input")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
library(rpart)
library(e1071)
library(randomForest)
library(pROC)
Loading required package: lattice

Attaching package: ‘caret’

The following object is masked from ‘package:purrr’:

    lift

The following object is masked from ‘package:httr’:

    progress

randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.

Attaching package: ‘randomForest’

The following object is masked from ‘package:gridExtra’:

    combine

The following object is masked from ‘package:dplyr’:

    combine

The following object is masked from ‘package:ggplot2’:

    margin

Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var
red_wine <- read.csv("/kaggle/input/wine-quality-red-and-white/winequality-red.csv")
white_wine <- read.csv("/kaggle/input/wine-quality-red-and-white/winequality-white.csv")
str(red_wine)
'data.frame':	1599 obs. of  12 variables:
 $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
 $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
 $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
 $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
 $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
 $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
 $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
 $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
 $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
 $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
 $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
 $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...
# Label each observation by wine type: red = 0, white = 1
red_wine$type <- 0
white_wine$type <- 1
# Combine the red and white wine datasets
combined_data <- rbind(red_wine, white_wine)
# Convert the "type" variable to a factor
combined_data$type <- as.factor(combined_data$type)
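# The same labeling-and-stacking step can also be written with dplyr, which
# the tidyverse load above already attached (an equivalent sketch):
combined_data <- bind_rows(
  red_wine   %>% mutate(type = "0"),
  white_wine %>% mutate(type = "1")
) %>%
  mutate(type = as.factor(type))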
head(combined_data)
|   | fixed.acidity | volatile.acidity | citric.acid | residual.sugar | chlorides | free.sulfur.dioxide | total.sulfur.dioxide | density | pH | sulphates | alcohol | quality | type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|   | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | <fct> |
| 1 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11 | 34 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 0 |
| 2 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25 | 67 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 | 0 |
| 3 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15 | 54 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 | 0 |
| 4 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17 | 60 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 | 0 |
| 5 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11 | 34 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 0 |
| 6 | 7.4 | 0.66 | 0.00 | 1.8 | 0.075 | 13 | 40 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 0 |
# Plot the scatterplot matrix
pairs(combined_data)
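# With thirteen variables the matrix above is dense; coloring points by wine
# type makes any separation easier to spot (a quick sketch, red vs. blue):
pairs(combined_data[, 1:12],
      col = ifelse(combined_data$type == 0, "red", "blue"),
      cex = 0.3)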
library(ggplot2)   # already attached via the tidyverse; reloading is harmless
library(gridExtra) # for grid.arrange()
# Boxplots for the first four features
plot1 <- ggplot(combined_data, aes(x = factor(type), y = `fixed.acidity`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Fixed Acidity")
plot2 <- ggplot(combined_data, aes(x = factor(type), y = `volatile.acidity`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Volatile Acidity")
plot3 <- ggplot(combined_data, aes(x = factor(type), y = `citric.acid`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Citric Acid")
plot4 <- ggplot(combined_data, aes(x = factor(type), y = `residual.sugar`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Residual Sugar")
# Arrange the plots in a 2x2 matrix
grid.arrange(plot1, plot2, plot3, plot4, nrow = 2)
# Boxplots for features 5-8
plot5 <- ggplot(combined_data, aes(x = factor(type), y = `chlorides`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Chlorides")
plot6 <- ggplot(combined_data, aes(x = factor(type), y = `free.sulfur.dioxide`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Free Sulfur Dioxide")
plot7 <- ggplot(combined_data, aes(x = factor(type), y = `total.sulfur.dioxide`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Total Sulfur Dioxide")
plot8 <- ggplot(combined_data, aes(x = factor(type), y = `density`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Density")
# Arrange the plots in a 2x2 matrix
grid.arrange(plot5, plot6, plot7, plot8, nrow = 2)
# Boxplots for the remaining four features
plot9 <- ggplot(combined_data, aes(x = factor(type), y = `pH`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "pH")
plot10 <- ggplot(combined_data, aes(x = factor(type), y = `sulphates`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Sulphates")
plot11 <- ggplot(combined_data, aes(x = factor(type), y = `alcohol`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Alcohol")
plot12 <- ggplot(combined_data, aes(x = factor(type), y = `quality`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Quality")
# Arrange the plots in a 2x2 matrix
grid.arrange(plot9, plot10, plot11, plot12, nrow = 2)
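# The twelve boxplots above can also be drawn in one pass with tidyr and
# ggplot2 facets, a more compact sketch of the same comparison:
combined_data %>%
  pivot_longer(cols = -type, names_to = "feature", values_to = "value") %>%
  ggplot(aes(x = type, y = value, fill = type)) +
  geom_boxplot() +
  facet_wrap(~ feature, scales = "free_y") +
  labs(x = "Wine Type", y = NULL)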
# Split the data into training and testing sets
set.seed(123) # For reproducibility
train_index <- sample(1:nrow(combined_data), 0.7 * nrow(combined_data)) # 70% for training
train_data <- combined_data[train_index, ]
test_data <- combined_data[-train_index, ]
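# sample() draws an unstratified split; since caret is loaded, the split can
# instead be stratified by type so both classes keep their proportions in
# train and test (an alternative sketch, using new strat_* names):
strat_index <- createDataPartition(combined_data$type, p = 0.7, list = FALSE)
strat_train <- combined_data[strat_index, ]
strat_test  <- combined_data[-strat_index, ]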
# Perform a grid search for the optimal cp (complexity parameter) value
# Note: the error below is measured on the training data, so smaller cp values
# (deeper trees) will almost always win; a cross-validated alternative is
# sketched after the result.
cp_values <- seq(0.001, 0.1, by = 0.001) # Range of cp values to search
cp_errors <- sapply(cp_values, function(cp) {
  model <- rpart(type ~ ., data = train_data, method = "class", control = rpart.control(cp = cp))
  predictions <- predict(model, newdata = train_data, type = "class")
  mean(predictions != train_data$type) # Training misclassification rate
})
# Visualize the errors based on different cp values
plot(cp_values, cp_errors, type = "b", xlab = "cp", ylab = "Error", main = "Grid Search: Complexity Parameter (cp)")
# Find the optimal cp value with the minimum error
optimal_cp <- cp_values[which.min(cp_errors)]
cat("Optimal cp value:", optimal_cp, "\n")
Optimal cp value: 0.001
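# As noted above, training error favors the smallest cp. rpart already runs
# 10-fold cross-validation internally (xval = 10 by default), so the
# cross-validated error in cptable gives a less biased choice (a sketch):
cv_tree <- rpart(type ~ ., data = train_data, method = "class",
                 control = rpart.control(cp = 0.001))
cv_cp <- cv_tree$cptable[which.min(cv_tree$cptable[, "xerror"]), "CP"]
cat("Cross-validated cp:", cv_cp, "\n")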
# Fit the final tree using the optimal cp value
tree_model <- rpart(type ~ ., data = train_data, method = "class", control = rpart.control(cp = optimal_cp))
# Make predictions using the final tree
predictions <- predict(tree_model, newdata = test_data, type = "class")
# Evaluate performance using confusion matrix, accuracy, precision, and F1 score
confusion_mat <- confusionMatrix(predictions, test_data$type)
accuracy <- confusion_mat$overall["Accuracy"]
precision <- confusion_mat$byClass["Pos Pred Value"]
f1_score <- confusion_mat$byClass["F1"]
# Print the results
print(confusion_mat)
print(paste("Accuracy:", accuracy))
print(paste("Precision:", precision))
print(paste("F1 Score:", f1_score))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 460 5
1 24 1461
Accuracy : 0.9851
95% CI : (0.9787, 0.99)
No Information Rate : 0.7518
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9596
Mcnemar's Test P-Value : 0.0008302
Sensitivity : 0.9504
Specificity : 0.9966
Pos Pred Value : 0.9892
Neg Pred Value : 0.9838
Prevalence : 0.2482
Detection Rate : 0.2359
Detection Prevalence : 0.2385
Balanced Accuracy : 0.9735
'Positive' Class : 0
[1] "Accuracy: 0.985128205128205"
[1] "Precision: 0.989247311827957"
[1] "F1 Score: 0.969441517386723"
# Define the range of values for mtry and ntree
mtry_values <- seq(2, ncol(train_data) - 1)
ntree_values <- seq(50, 300, by = 50)
# Perform a grid search for optimal mtry and ntree values
# Caution: candidates are scored on test-set accuracy, which leaks test
# information into model selection; an OOB-based alternative is sketched
# after the results below.
best_accuracy <- 0
best_mtry <- 0
best_ntree <- 0
for (mtry in mtry_values) {
  for (ntree in ntree_values) {
    rf <- randomForest(type ~ ., data = train_data, mtry = mtry, ntree = ntree)
    predictions <- predict(rf, newdata = test_data, type = "class")
    accuracy <- sum(predictions == test_data$type) / nrow(test_data)
    if (accuracy > best_accuracy) {
      best_accuracy <- accuracy
      best_mtry <- mtry
      best_ntree <- ntree
    }
  }
}
# Print the optimal hyperparameters
cat("Optimal mtry:", best_mtry, "\n")
cat("Optimal ntree:", best_ntree, "\n")
cat("Best Accuracy:", best_accuracy, "\n")
Optimal mtry: 2 
Optimal ntree: 150 
Best Accuracy: 0.994359
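# Random forests provide an out-of-bag (OOB) error estimate for free, which
# allows tuning mtry without ever touching the test set (a sketch, fixing
# ntree at 150):
oob_errors <- sapply(mtry_values, function(m) {
  rf_m <- randomForest(type ~ ., data = train_data, mtry = m, ntree = 150)
  rf_m$err.rate[150, "OOB"] # OOB error after the final tree
})
cat("OOB-optimal mtry:", mtry_values[which.min(oob_errors)], "\n")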
# Fit the random forest model with the optimal hyperparameters
rf <- randomForest(type ~ ., data = train_data, mtry = best_mtry, ntree = best_ntree)
# Make predictions using the random forest model
predictions <- predict(rf, newdata = test_data, type = "class")
# Evaluate performance using confusion matrix, accuracy, precision, and F1 score
confusion_mat <- confusionMatrix(predictions, test_data$type)
accuracy <- confusion_mat$overall["Accuracy"]
precision <- confusion_mat$byClass["Pos Pred Value"]
f1_score <- confusion_mat$byClass["F1"]
# Print the results
print(confusion_mat)
print(paste("Accuracy:", accuracy))
print(paste("Precision:", precision))
print(paste("F1 Score:", f1_score))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 476 3
1 8 1463
Accuracy : 0.9944
95% CI : (0.9899, 0.9972)
No Information Rate : 0.7518
P-Value [Acc > NIR] : <2e-16
Kappa : 0.9848
Mcnemar's Test P-Value : 0.2278
Sensitivity : 0.9835
Specificity : 0.9980
Pos Pred Value : 0.9937
Neg Pred Value : 0.9946
Prevalence : 0.2482
Detection Rate : 0.2441
Detection Prevalence : 0.2456
Balanced Accuracy : 0.9907
'Positive' Class : 0
[1] "Accuracy: 0.994358974358974"
[1] "Precision: 0.993736951983298"
[1] "F1 Score: 0.988577362409138"
# Define the range of values for cost and gamma
cost_values <- c(0.1, 1, 10)
gamma_values <- c(0.1, 1, 10)
# Perform a grid search for optimal cost and gamma values
# As with the random forest, tuning on the test set biases the final accuracy
# estimate upward; a cross-validated alternative is sketched after the
# results below.
best_accuracy <- 0
best_cost <- 0
best_gamma <- 0
for (cost in cost_values) {
  for (gamma in gamma_values) {
    svm_model <- svm(type ~ ., data = train_data, kernel = "radial", cost = cost, gamma = gamma)
    predictions <- predict(svm_model, newdata = test_data)
    accuracy <- sum(predictions == test_data$type) / nrow(test_data)
    if (accuracy > best_accuracy) {
      best_accuracy <- accuracy
      best_cost <- cost
      best_gamma <- gamma
    }
  }
}
# Print the optimal hyperparameters
cat("Optimal cost:", best_cost, "\n")
cat("Optimal gamma:", best_gamma, "\n")
cat("Best Accuracy:", best_accuracy, "\n")
Optimal cost: 1 
Optimal gamma: 0.1 
Best Accuracy: 0.9953846
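# e1071's tune() runs the same grid with 10-fold cross-validation on the
# training data, keeping the test set untouched (an alternative sketch):
svm_tuned <- tune(svm, type ~ ., data = train_data, kernel = "radial",
                  ranges = list(cost = cost_values, gamma = gamma_values))
summary(svm_tuned)
# svm_tuned$best.parameters holds the selected cost and gamma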
# Fit the SVM model with the optimal hyperparameters
svm_model <- svm(type ~ ., data = train_data, kernel = "radial", cost = best_cost, gamma = best_gamma)
# Make predictions using the SVM model
predictions <- predict(svm_model, newdata = test_data)
# Evaluate performance using confusion matrix, accuracy, precision, and F1 score
confusion_mat <- confusionMatrix(predictions, test_data$type)
accuracy <- confusion_mat$overall["Accuracy"]
precision <- confusion_mat$byClass["Pos Pred Value"]
f1_score <- confusion_mat$byClass["F1"]
# Print the results
print(confusion_mat)
print(paste("Accuracy:", accuracy))
print(paste("Precision:", precision))
print(paste("F1 Score:", f1_score))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 477 2
1 7 1464
Accuracy : 0.9954
95% CI : (0.9913, 0.9979)
No Information Rate : 0.7518
P-Value [Acc > NIR] : <2e-16
Kappa : 0.9876
Mcnemar's Test P-Value : 0.1824
Sensitivity : 0.9855
Specificity : 0.9986
Pos Pred Value : 0.9958
Neg Pred Value : 0.9952
Prevalence : 0.2482
Detection Rate : 0.2446
Detection Prevalence : 0.2456
Balanced Accuracy : 0.9921
'Positive' Class : 0
[1] "Accuracy: 0.995384615384615"
[1] "Precision: 0.995824634655532"
[1] "F1 Score: 0.990654205607477"
Insights:
- All three classifiers separate red from white wine almost perfectly, confirming that the two types have clearly distinct chemical profiles.
- The tuned SVM performed best (accuracy 0.9954, F1 0.9907), narrowly ahead of the random forest (accuracy 0.9944, F1 0.9886), with the single decision tree behind at 0.9851 (F1 0.9694).
- Because the random forest and SVM hyperparameters were chosen by test-set accuracy, those figures are likely slightly optimistic; the cross-validated and OOB-based tuning sketches above would give cleaner estimates.