Predicting wine colour (red vs. white): a comparison of decision tree, random forest, and SVM models¶

In [3]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

list.files(path = "../input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
'wine-quality-red-and-white'
In [23]:
library(caret)        # model-training utilities and confusionMatrix()
library(rpart)        # decision trees
library(e1071)        # support vector machines
library(randomForest) # random forests
library(pROC)         # ROC curves and AUC
Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:purrr’:

    lift


The following object is masked from ‘package:httr’:

    progress


randomForest 4.6-14

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘randomForest’


The following object is masked from ‘package:gridExtra’:

    combine


The following object is masked from ‘package:dplyr’:

    combine


The following object is masked from ‘package:ggplot2’:

    margin


Type 'citation("pROC")' for a citation.


Attaching package: ‘pROC’


The following objects are masked from ‘package:stats’:

    cov, smooth, var


Importing the datasets¶

In [31]:
red_wine <- read.csv("/kaggle/input/wine-quality-red-and-white/winequality-red.csv")
white_wine <- read.csv("/kaggle/input/wine-quality-red-and-white/winequality-white.csv")
str(red_wine)
'data.frame':	1599 obs. of  12 variables:
 $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
 $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
 $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
 $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
 $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
 $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
 $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
 $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
 $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
 $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
 $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
 $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...

Preparing data for analysis¶

In [7]:
# Label the wine type: 0 = red, 1 = white
red_wine$type <- 0
white_wine$type <- 1

# Combine the red and white wine datasets
combined_data <- rbind(red_wine, white_wine)

# Convert the "type" variable to factor
combined_data$type <- as.factor(combined_data$type)
head(combined_data)
A data.frame: 6 × 13

  fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide density pH   sulphates alcohol quality type
1           7.4             0.70        0.00            1.9     0.076                  11                   34  0.9978 3.51      0.56     9.4       5    0
2           7.8             0.88        0.00            2.6     0.098                  25                   67  0.9968 3.20      0.68     9.8       5    0
3           7.8             0.76        0.04            2.3     0.092                  15                   54  0.9970 3.26      0.65     9.8       5    0
4          11.2             0.28        0.56            1.9     0.075                  17                   60  0.9980 3.16      0.58     9.8       6    0
5           7.4             0.70        0.00            1.9     0.076                  11                   34  0.9978 3.51      0.56     9.4       5    0
6           7.4             0.66        0.00            1.8     0.075                  13                   40  0.9978 3.51      0.56     9.4       5    0
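Before modelling, a quick sanity check on the combined data is worthwhile. A minimal sketch, assuming `combined_data` from the cell above:

```r
# Class balance: red (0) is the minority class (~25% of observations),
# which is worth keeping in mind when reading raw accuracy later.
table(combined_data$type)
prop.table(table(combined_data$type))

# Confirm there are no missing values in any column
colSums(is.na(combined_data))
```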

Visualising the data¶

In [32]:
# Plot the scatterplot matrix
pairs(combined_data)
In [ ]:
library(ggplot2)
library(gridExtra)
In [18]:
# Create a 2x2 matrix for the first 4 features
plot1 <- ggplot(combined_data, aes(x = factor(type), y = `fixed.acidity`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Fixed Acidity")

plot2 <- ggplot(combined_data, aes(x = factor(type), y = `volatile.acidity`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Volatile Acidity")

plot3 <- ggplot(combined_data, aes(x = factor(type), y = `citric.acid`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Citric Acid")

plot4 <- ggplot(combined_data, aes(x = factor(type), y = `residual.sugar`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Residual Sugar")

# Arrange the plots in a 2x2 matrix
grid.arrange(plot1, plot2, plot3, plot4, nrow = 2)
In [19]:
# Plot features 5-8

plot5 <- ggplot(combined_data, aes(x = factor(type), y = `chlorides`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Chlorides")

plot6 <- ggplot(combined_data, aes(x = factor(type), y = `free.sulfur.dioxide`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Free Sulfur Dioxide")

plot7 <- ggplot(combined_data, aes(x = factor(type), y = `total.sulfur.dioxide`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Total Sulfur Dioxide")

plot8 <- ggplot(combined_data, aes(x = factor(type), y = `density`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Density")

# Arrange the plots in a 2x2 grid
grid.arrange(plot5, plot6, plot7, plot8, nrow = 2)
In [20]:
# Create a 2x2 matrix for the remaining 4 features

plot9 <- ggplot(combined_data, aes(x = factor(type), y = `pH`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "pH")

plot10 <- ggplot(combined_data, aes(x = factor(type), y = `sulphates`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Sulphates")

plot11 <- ggplot(combined_data, aes(x = factor(type), y = `alcohol`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Alcohol")

plot12 <- ggplot(combined_data, aes(x = factor(type), y = `quality`, fill = factor(type))) +
  geom_boxplot() +
  labs(x = "Wine Type", y = "Quality")

# Arrange the plots in a 2x2 grid
grid.arrange(plot9, plot10, plot11, plot12, nrow = 2)

Predictions using Machine Learning models¶

1. Decision trees with pruning¶

In [21]:
# Split the data into training and testing sets
set.seed(123)  # For reproducibility
train_index <- sample(1:nrow(combined_data), 0.7 * nrow(combined_data))  # 70% for training
train_data <- combined_data[train_index, ]
test_data <- combined_data[-train_index, ]
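The `sample()` split above does not stratify by class. As an alternative sketch, caret's `createDataPartition()` draws a stratified split that preserves the red/white ratio in both sets (caret is loaded in an earlier cell):

```r
# Sketch: stratified 70/30 split on the type factor
set.seed(123)
idx <- createDataPartition(combined_data$type, p = 0.7, list = FALSE)
train_strat <- combined_data[idx, ]
test_strat  <- combined_data[-idx, ]
prop.table(table(train_strat$type))  # class ratio matches the full data
```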
In [24]:
# Perform grid search for optimal cp value
cp_values <- seq(0.001, 0.1, by = 0.001)  # Define the range of cp values to search
cp_errors <- sapply(cp_values, function(cp) {
  model <- rpart(type ~ ., data = train_data, method = "class", control = rpart.control(cp = cp))
  predictions <- predict(model, newdata = train_data, type = "class")
  mean(predictions != train_data$type)
})
In [25]:
# Visualize the errors based on different cp values
plot(cp_values, cp_errors, type = "b", xlab = "cp", ylab = "Error", main = "Grid Search: Complexity Parameter (cp)")
In [26]:
# Find the optimal cp value with the minimum error
optimal_cp <- cp_values[which.min(cp_errors)]
cat("Optimal cp value:", optimal_cp, "\n")
Optimal cp value: 0.001 
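The loop above scores each cp on the training data, which always favours the smallest cp (here the lower bound of the grid, 0.001). As a hedged alternative sketch, rpart's built-in 10-fold cross-validation can drive the choice instead; the 1-SE rule shown is one common convention, not the method used in this notebook:

```r
# Sketch: use rpart's internal cross-validation rather than training error.
# rpart grows the tree once and reports cross-validated error (xerror)
# for every candidate cp in its cptable.
cv_tree <- rpart(type ~ ., data = train_data, method = "class",
                 control = rpart.control(cp = 0.001))
printcp(cv_tree)  # columns: CP, nsplit, rel error, xerror, xstd

# 1-SE rule: largest cp whose xerror is within one standard error
# of the minimum cross-validated error
cp_tab    <- cv_tree$cptable
best_row  <- which.min(cp_tab[, "xerror"])
threshold <- cp_tab[best_row, "xerror"] + cp_tab[best_row, "xstd"]
cp_1se    <- max(cp_tab[cp_tab[, "xerror"] <= threshold, "CP"])
pruned    <- prune(cv_tree, cp = cp_1se)
```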
In [27]:
# Fit the final tree using the optimal cp value
tree_model <- rpart(type ~ ., data = train_data, method = "class", control = rpart.control(cp = optimal_cp))

# Make predictions using the final tree
predictions <- predict(tree_model, newdata = test_data, type = "class")

# Evaluate performance using confusion matrix, accuracy, precision, and F1 score
confusion_mat <- confusionMatrix(predictions, test_data$type)
accuracy <- confusion_mat$overall["Accuracy"]
precision <- confusion_mat$byClass["Pos Pred Value"]
f1_score <- confusion_mat$byClass["F1"]

# Print the results
print(confusion_mat)
print(paste("Accuracy:", accuracy))
print(paste("Precision:", precision))
print(paste("F1 Score:", f1_score))
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0  460    5
         1   24 1461
                                        
               Accuracy : 0.9851        
                 95% CI : (0.9787, 0.99)
    No Information Rate : 0.7518        
    P-Value [Acc > NIR] : < 2.2e-16     
                                        
                  Kappa : 0.9596        
                                        
 Mcnemar's Test P-Value : 0.0008302     
                                        
            Sensitivity : 0.9504        
            Specificity : 0.9966        
         Pos Pred Value : 0.9892        
         Neg Pred Value : 0.9838        
             Prevalence : 0.2482        
         Detection Rate : 0.2359        
   Detection Prevalence : 0.2385        
      Balanced Accuracy : 0.9735        
                                        
       'Positive' Class : 0             
                                        
[1] "Accuracy: 0.985128205128205"
[1] "Precision: 0.989247311827957"
[1] "F1 Score: 0.969441517386723"

2. Random forests with hyperparameter tuning¶

In [28]:
# Define the range of values for mtry and ntree
mtry_values <- seq(2, ncol(train_data) - 1)
ntree_values <- seq(50, 300, by = 50)

# Perform grid search for optimal mtry and ntree values
best_accuracy <- 0
best_mtry <- 0
best_ntree <- 0

for (mtry in mtry_values) {
  for (ntree in ntree_values) {
    rf <- randomForest(type ~ ., data = train_data, mtry = mtry, ntree = ntree)
    predictions <- predict(rf, newdata = test_data, type = "class")
    accuracy <- sum(predictions == test_data$type) / nrow(test_data)
    
    if (accuracy > best_accuracy) {
      best_accuracy <- accuracy
      best_mtry <- mtry
      best_ntree <- ntree
    }
  }
}

# Print the optimal hyperparameters
cat("Optimal mtry:", best_mtry, "\n")
cat("Optimal ntree:", best_ntree, "\n")
cat("Best Accuracy:", best_accuracy, "\n")
Optimal mtry: 2 
Optimal ntree: 150 
Best Accuracy: 0.994359 
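Note that this grid search selects mtry and ntree by test-set accuracy, so the test set takes part in model selection. A hedged alternative sketch, tuning on the training set only via caret's cross-validation (caret is loaded above):

```r
# Sketch: 5-fold cross-validation on the training data only;
# the test set is held back purely for the final evaluation.
ctrl  <- trainControl(method = "cv", number = 5)
rf_cv <- train(type ~ ., data = train_data, method = "rf",
               trControl = ctrl,
               tuneGrid  = expand.grid(mtry = 2:12),
               ntree     = 150)  # fixed here; accuracy is rarely sensitive to ntree
rf_cv$bestTune
```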
In [29]:
# Fit the random forest model with the optimal hyperparameters
rf <- randomForest(type ~ ., data = train_data, mtry = best_mtry, ntree = best_ntree)

# Make predictions using the random forest model
predictions <- predict(rf, newdata = test_data, type = "class")

# Evaluate performance using confusion matrix, accuracy, precision, and F1 score
confusion_mat <- confusionMatrix(predictions, test_data$type)
accuracy <- confusion_mat$overall["Accuracy"]
precision <- confusion_mat$byClass["Pos Pred Value"]
f1_score <- confusion_mat$byClass["F1"]

# Print the results
print(confusion_mat)
print(paste("Accuracy:", accuracy))
print(paste("Precision:", precision))
print(paste("F1 Score:", f1_score))
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0  476    3
         1    8 1463
                                          
               Accuracy : 0.9944          
                 95% CI : (0.9899, 0.9972)
    No Information Rate : 0.7518          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.9848          
                                          
 Mcnemar's Test P-Value : 0.2278          
                                          
            Sensitivity : 0.9835          
            Specificity : 0.9980          
         Pos Pred Value : 0.9937          
         Neg Pred Value : 0.9946          
             Prevalence : 0.2482          
         Detection Rate : 0.2441          
   Detection Prevalence : 0.2456          
      Balanced Accuracy : 0.9907          
                                          
       'Positive' Class : 0               
                                          
[1] "Accuracy: 0.994358974358974"
[1] "Precision: 0.993736951983298"
[1] "F1 Score: 0.988577362409138"

3. SVM with tuned hyperparameters¶

In [30]:
# Define the range of values for cost and gamma
cost_values <- c(0.1, 1, 10)
gamma_values <- c(0.1, 1, 10)

# Perform grid search for optimal cost and gamma values
best_accuracy <- 0
best_cost <- 0
best_gamma <- 0

for (cost in cost_values) {
  for (gamma in gamma_values) {
    svm_model <- svm(type ~ ., data = train_data, kernel = "radial", cost = cost, gamma = gamma)
    predictions <- predict(svm_model, newdata = test_data)
    accuracy <- sum(predictions == test_data$type) / nrow(test_data)
    
    if (accuracy > best_accuracy) {
      best_accuracy <- accuracy
      best_cost <- cost
      best_gamma <- gamma
    }
  }
}

# Print the optimal hyperparameters
cat("Optimal cost:", best_cost, "\n")
cat("Optimal gamma:", best_gamma, "\n")
cat("Best Accuracy:", best_accuracy, "\n")
Optimal cost: 1 
Optimal gamma: 0.1 
Best Accuracy: 0.9953846 
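The same caveat applies here: cost and gamma are chosen on the test set. e1071 ships `tune.svm()`, which runs the identical grid with 10-fold cross-validation on the training data; a sketch:

```r
# Sketch: grid search over cost and gamma with e1071's built-in
# 10-fold cross-validation (no test-set peeking).
tuned <- tune.svm(type ~ ., data = train_data, kernel = "radial",
                  cost  = c(0.1, 1, 10),
                  gamma = c(0.1, 1, 10))
summary(tuned)         # CV error for every cost/gamma pair
tuned$best.parameters  # one-row data frame with the chosen gamma and cost
```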
In [33]:
# Fit the SVM model with the optimal hyperparameters
svm_model <- svm(type ~ ., data = train_data, kernel = "radial", cost = best_cost, gamma = best_gamma)

# Make predictions using the SVM model
predictions <- predict(svm_model, newdata = test_data)

# Evaluate performance using confusion matrix, accuracy, precision, and F1 score
confusion_mat <- confusionMatrix(predictions, test_data$type)
accuracy <- confusion_mat$overall["Accuracy"]
precision <- confusion_mat$byClass["Pos Pred Value"]
f1_score <- confusion_mat$byClass["F1"]

# Print the results
print(confusion_mat)
print(paste("Accuracy:", accuracy))
print(paste("Precision:", precision))
print(paste("F1 Score:", f1_score))
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0  477    2
         1    7 1464
                                          
               Accuracy : 0.9954          
                 95% CI : (0.9913, 0.9979)
    No Information Rate : 0.7518          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.9876          
                                          
 Mcnemar's Test P-Value : 0.1824          
                                          
            Sensitivity : 0.9855          
            Specificity : 0.9986          
         Pos Pred Value : 0.9958          
         Neg Pred Value : 0.9952          
             Prevalence : 0.2482          
         Detection Rate : 0.2446          
   Detection Prevalence : 0.2456          
      Balanced Accuracy : 0.9921          
                                          
       'Positive' Class : 0               
                                          
[1] "Accuracy: 0.995384615384615"
[1] "Precision: 0.995824634655532"
[1] "F1 Score: 0.990654205607477"

Insights:

  • Most of the variables appeared strongly related to the target variable (type), so no feature selection was conducted; all features were used for prediction.
  • All three algorithms performed very well: the SVM had the highest accuracy at 99.5%, the random forest followed at 99.4%, and the decision tree was lowest at 98.5%.
  • Using the features provided, machine learning algorithms can clearly predict wine colour; in this case, the more complex algorithms delivered the best overall performance.
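pROC is loaded at the top of the notebook but never used. A sketch comparing the tree and forest on AUC, assuming tree_model, rf, and test_data from the cells above (the SVM would need refitting with probability = TRUE before it can produce class probabilities):

```r
# Sketch: ROC/AUC comparison on the held-out test set.
# Both predict() calls return a matrix of class probabilities
# with one column per level ("0" = red, "1" = white).
tree_prob <- predict(tree_model, newdata = test_data, type = "prob")[, "1"]
rf_prob   <- predict(rf,         newdata = test_data, type = "prob")[, "1"]

roc_tree <- roc(test_data$type, tree_prob)
roc_rf   <- roc(test_data$type, rf_prob)
auc(roc_tree)
auc(roc_rf)

plot(roc_tree)
lines(roc_rf, col = "red")  # overlay the two curves for comparison
```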