# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load
library(tidyverse) # metapackage of all tidyverse packages
# Kaggle mounts input data read-only under "../input/"; list that directory
# to see which files this notebook can read.
list.files("../input")
# Up to 20GB written to the working directory (/kaggle/working/) is kept as
# output of a "Save & Run All" version; /kaggle/temp/ is session-only scratch.
# Load the absenteeism records (the file uses ";" as the field separator).
absenteeism_csv <- "/kaggle/input/employee-abseentism/Absenteeism.csv"
abseentism <- read.csv(file = absenteeism_csv, sep = ";")
# Preview the first rows
head(abseentism)
| ID | Reason.for.absence | Month.of.absence | Day.of.the.week | Seasons | Transportation.expense | Distance.from.Residence.to.Work | Service.time | Age | Work.load.Average.day | ⋯ | Disciplinary.failure | Education | Son | Social.drinker | Social.smoker | Pet | Weight | Height | Body.mass.index | Absenteeism.time.in.hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <dbl> | ⋯ | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | |
| 1 | 11 | 26 | 7 | 3 | 1 | 289 | 36 | 13 | 33 | 239.554 | ⋯ | 0 | 1 | 2 | 1 | 0 | 1 | 90 | 172 | 30 | 4 |
| 2 | 36 | 0 | 7 | 3 | 1 | 118 | 13 | 18 | 50 | 239.554 | ⋯ | 1 | 1 | 1 | 1 | 0 | 0 | 98 | 178 | 31 | 0 |
| 3 | 3 | 23 | 7 | 4 | 1 | 179 | 51 | 18 | 38 | 239.554 | ⋯ | 0 | 1 | 0 | 1 | 0 | 0 | 89 | 170 | 31 | 2 |
| 4 | 7 | 7 | 7 | 5 | 1 | 279 | 5 | 14 | 39 | 239.554 | ⋯ | 0 | 1 | 2 | 1 | 1 | 0 | 68 | 168 | 24 | 4 |
| 5 | 11 | 23 | 7 | 5 | 1 | 289 | 36 | 13 | 33 | 239.554 | ⋯ | 0 | 1 | 2 | 1 | 0 | 1 | 90 | 172 | 30 | 2 |
| 6 | 3 | 23 | 7 | 6 | 1 | 179 | 51 | 18 | 38 | 239.554 | ⋯ | 0 | 1 | 0 | 1 | 0 | 0 | 89 | 170 | 31 | 2 |
# Per-column summary statistics (min, quartiles, mean, max)
summary(object = abseentism)
ID Reason.for.absence Month.of.absence Day.of.the.week
Min. : 1.00 Min. : 0.00 Min. : 0.000 Min. :2.000
1st Qu.: 9.00 1st Qu.:13.00 1st Qu.: 3.000 1st Qu.:3.000
Median :18.00 Median :23.00 Median : 6.000 Median :4.000
Mean :18.02 Mean :19.22 Mean : 6.324 Mean :3.915
3rd Qu.:28.00 3rd Qu.:26.00 3rd Qu.: 9.000 3rd Qu.:5.000
Max. :36.00 Max. :28.00 Max. :12.000 Max. :6.000
Seasons Transportation.expense Distance.from.Residence.to.Work
Min. :1.000 Min. :118.0 Min. : 5.00
1st Qu.:2.000 1st Qu.:179.0 1st Qu.:16.00
Median :3.000 Median :225.0 Median :26.00
Mean :2.545 Mean :221.3 Mean :29.63
3rd Qu.:4.000 3rd Qu.:260.0 3rd Qu.:50.00
Max. :4.000 Max. :388.0 Max. :52.00
Service.time Age Work.load.Average.day Hit.target
Min. : 1.00 Min. :27.00 Min. :205.9 Min. : 81.00
1st Qu.: 9.00 1st Qu.:31.00 1st Qu.:244.4 1st Qu.: 93.00
Median :13.00 Median :37.00 Median :264.2 Median : 95.00
Mean :12.55 Mean :36.45 Mean :271.5 Mean : 94.59
3rd Qu.:16.00 3rd Qu.:40.00 3rd Qu.:294.2 3rd Qu.: 97.00
Max. :29.00 Max. :58.00 Max. :378.9 Max. :100.00
Disciplinary.failure Education Son Social.drinker
Min. :0.00000 Min. :1.000 Min. :0.000 Min. :0.0000
1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.0000
Median :0.00000 Median :1.000 Median :1.000 Median :1.0000
Mean :0.05405 Mean :1.292 Mean :1.019 Mean :0.5676
3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:1.0000
Max. :1.00000 Max. :4.000 Max. :4.000 Max. :1.0000
Social.smoker Pet Weight Height
Min. :0.00000 Min. :0.0000 Min. : 56.00 Min. :163.0
1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.: 69.00 1st Qu.:169.0
Median :0.00000 Median :0.0000 Median : 83.00 Median :170.0
Mean :0.07297 Mean :0.7459 Mean : 79.04 Mean :172.1
3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.: 89.00 3rd Qu.:172.0
Max. :1.00000 Max. :8.0000 Max. :108.00 Max. :196.0
Body.mass.index Absenteeism.time.in.hours
Min. :19.00 Min. : 0.000
1st Qu.:24.00 1st Qu.: 2.000
Median :25.00 Median : 3.000
Mean :26.68 Mean : 6.924
3rd Qu.:31.00 3rd Qu.: 8.000
Max. :38.00 Max. :120.000
# Histogram of the response variable, using 4-hour-wide bins
ggplot(data = abseentism, mapping = aes(x = Absenteeism.time.in.hours)) +
  geom_histogram(color = "black", fill = "blue", binwidth = 4) +
  xlab("Absenteeism Time (hours)") +
  ylab("Count") +
  ggtitle("Distribution of Absenteeism Time")
# Inspect how each column is stored
str(object = abseentism)
'data.frame': 740 obs. of 21 variables: $ ID : int 11 36 3 7 11 3 10 20 14 1 ... $ Reason.for.absence : int 26 0 23 7 23 23 22 23 19 22 ... $ Month.of.absence : int 7 7 7 7 7 7 7 7 7 7 ... $ Day.of.the.week : int 3 3 4 5 5 6 6 6 2 2 ... $ Seasons : int 1 1 1 1 1 1 1 1 1 1 ... $ Transportation.expense : int 289 118 179 279 289 179 361 260 155 235 ... $ Distance.from.Residence.to.Work: int 36 13 51 5 36 51 52 50 12 11 ... $ Service.time : int 13 18 18 14 13 18 3 11 14 14 ... $ Age : int 33 50 38 39 33 38 28 36 34 37 ... $ Work.load.Average.day : num 240 240 240 240 240 ... $ Hit.target : int 97 97 97 97 97 97 97 97 97 97 ... $ Disciplinary.failure : int 0 1 0 0 0 0 0 0 0 0 ... $ Education : int 1 1 1 1 1 1 1 1 1 3 ... $ Son : int 2 1 0 2 2 0 1 4 2 1 ... $ Social.drinker : int 1 1 1 1 1 1 1 1 1 0 ... $ Social.smoker : int 0 0 0 1 0 0 0 0 0 0 ... $ Pet : int 1 0 0 0 1 0 4 0 0 1 ... $ Weight : int 90 98 89 68 90 89 80 65 95 88 ... $ Height : int 172 178 170 168 172 170 172 168 196 172 ... $ Body.mass.index : int 30 31 31 24 30 31 27 23 25 29 ... $ Absenteeism.time.in.hours : int 4 0 2 4 2 2 8 4 40 8 ...
# Integer-coded columns that should be treated as categorical
columns_to_convert <- c(
  "Reason.for.absence", "Month.of.absence", "Day.of.the.week", "Seasons",
  "Disciplinary.failure", "Education", "Son", "Social.drinker",
  "Social.smoker", "Pet"
)
# Turn all of them into factors in a single vectorised assignment
abseentism[columns_to_convert] <- lapply(
  abseentism[columns_to_convert],
  as.factor
)
# Remove the ID column from the data frame
abseentism <- subset(x = abseentism, select = -ID)
library(ggplot2)
# Absenteeism hours broken down by the coded reason for absence
ggplot(
  data = abseentism,
  mapping = aes(x = Reason.for.absence, y = Absenteeism.time.in.hours)
) +
  geom_boxplot(color = "black", fill = "lightblue") +
  xlab("Reason for Absence") +
  ylab("Total Absenteeism Time (hours)") +
  ggtitle("Boxplot of Absenteeism Time by Reason for Absence")
# Boxplot of absenteeism hours for each level of one categorical column.
#   column: name of the grouping column in `abseentism`, as a string
#   label:  display name used for the x axis and at the end of the title
absenteeism_boxplot <- function(column, label) {
  ggplot(
    abseentism,
    aes(x = factor(.data[[column]]), y = Absenteeism.time.in.hours)
  ) +
    geom_boxplot(fill = "lightblue", color = "black") +
    labs(
      x = label,
      y = "Absenteeism Time (hours)",
      title = paste("Boxplot of Absenteeism by", label)
    )
}
# Boxplot for Day of the Week
absenteeism_boxplot("Day.of.the.week", "Day of the Week")
# Boxplot for Seasons
absenteeism_boxplot("Seasons", "Seasons")
# Boxplot for Social Drinker
absenteeism_boxplot("Social.drinker", "Social Drinker")
# Boxplot for Social Smoker
absenteeism_boxplot("Social.smoker", "Social Smoker")
# Scatterplot of absenteeism hours against one numeric column.
#   column:      name of the x-axis column in `abseentism`, as a string
#   label:       display name used for the x axis and at the end of the title
#   point_color: colour of the points
absenteeism_scatterplot <- function(column, label, point_color) {
  ggplot(
    abseentism,
    aes(x = .data[[column]], y = Absenteeism.time.in.hours)
  ) +
    geom_point(color = point_color) +
    labs(
      x = label,
      y = "Absenteeism Time (hours)",
      title = paste("Scatterplot of Absenteeism by", label)
    )
}
# Scatterplot for Transportation Expense
absenteeism_scatterplot(
  "Transportation.expense", "Transportation Expense", "blue"
)
# Scatterplot for Distance from Residence to Work
absenteeism_scatterplot(
  "Distance.from.Residence.to.Work", "Distance from Residence to Work", "red"
)
# Scatterplot for Age
absenteeism_scatterplot("Age", "Age", "green")
# Scatterplot for Work Load Average per Day
absenteeism_scatterplot(
  "Work.load.Average.day", "Work Load Average per Day", "orange"
)
# Scatterplot for Weight
absenteeism_scatterplot("Weight", "Weight", "purple")
# Scatterplot for Body Mass Index
absenteeism_scatterplot("Body.mass.index", "Body Mass Index", "brown")
With 19 candidate features, running a clustering analysis on all of them at once would be hard to interpret, because every cluster would have to be understood across every feature. I will therefore use a random forest (an ensemble algorithm) to identify the five features that best explain differences in total absenteeism hours.
library(randomForest)
# Fit a random forest regression of absenteeism hours on all other features
# in order to rank the features by importance.
#
# Random forests draw random bootstrap samples and random feature subsets, so
# the importance ranking changes from run to run unless the RNG is seeded.
# The top-5 features are hard-coded further down, so the fit must be
# reproducible for that list to stay valid.
set.seed(42)
# Predictor columns (everything except the response)
predictor_columns <- c(
  "Reason.for.absence", "Month.of.absence", "Day.of.the.week", "Seasons",
  "Transportation.expense", "Distance.from.Residence.to.Work",
  "Service.time", "Age", "Work.load.Average.day", "Hit.target",
  "Disciplinary.failure", "Education", "Son", "Social.drinker",
  "Social.smoker", "Pet", "Weight", "Height", "Body.mass.index"
)
predictors <- abseentism[, predictor_columns]
# Response vector (total hours of absenteeism)
response <- abseentism$Absenteeism.time.in.hours
# importance = TRUE is required to obtain the permutation-based %IncMSE measure
rf_model <- randomForest(predictors, response, ntree = 500, importance = TRUE)
rf_model
Call:
randomForest(x = predictors, y = response, ntree = 500, importance = TRUE)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 6
Mean of squared residuals: 158.9623
% Var explained: 10.43
# Permutation importance (type = 1): one row per feature.
# The function is namespace-qualified because the result is stored under the
# same name as the function.
importance <- randomForest::importance(x = rf_model, type = 1)
importance
| %IncMSE | |
|---|---|
| Reason.for.absence | 13.0542386 |
| Month.of.absence | -0.5304347 |
| Day.of.the.week | -0.8151381 |
| Seasons | 1.7302433 |
| Transportation.expense | 1.9980789 |
| Distance.from.Residence.to.Work | 3.2309486 |
| Service.time | 3.0733594 |
| Age | -0.5891115 |
| Work.load.Average.day | 0.5318483 |
| Hit.target | -1.2323174 |
| Disciplinary.failure | 7.8033216 |
| Education | 1.2900156 |
| Son | -0.6360769 |
| Social.drinker | 2.2806310 |
| Social.smoker | -1.1022361 |
| Pet | 1.3179246 |
| Weight | 2.0410672 |
| Height | 4.3207143 |
| Body.mass.index | 0.6317616 |
# Sort the importance measures in descending order.
# `importance` is a one-column matrix whose row names are the feature names.
# Calling sort() on it would return a bare numeric vector and discard those
# names, so reorder the rows instead; drop = FALSE keeps the matrix shape
# (and therefore the row names).
sorted_importance <- importance[
  order(importance[, 1], decreasing = TRUE), ,
  drop = FALSE
]
sorted_importance
Here are the top 5 features for predicting absenteeism hours, ranked by %IncMSE (the increase in the model's out-of-bag mean squared error when a feature's values are randomly permuted):
Reason.for.absence (13.0542386), Disciplinary.failure (7.8033216), Height (4.3207143), Distance.from.Residence.to.Work (3.2309486), Service.time (3.0733594)
These features have the highest %IncMSE values, meaning the model's predictions of total absenteeism hours degrade the most when they are scrambled.
They will now be used for clustering and making sense of the absenteeism dataset.
# Keep only the five selected features plus the response variable
top_feature_columns <- c(
  "Reason.for.absence",
  "Disciplinary.failure",
  "Height",
  "Distance.from.Residence.to.Work",
  "Service.time",
  "Absenteeism.time.in.hours"
)
top_features <- abseentism[, top_feature_columns]
# Preview the first rows of the reduced data frame
head(top_features)
| Reason.for.absence | Disciplinary.failure | Height | Distance.from.Residence.to.Work | Service.time | Absenteeism.time.in.hours | |
|---|---|---|---|---|---|---|
| <fct> | <fct> | <int> | <int> | <int> | <int> | |
| 1 | 26 | 0 | 172 | 36 | 13 | 4 |
| 2 | 0 | 1 | 178 | 13 | 18 | 0 |
| 3 | 23 | 0 | 170 | 51 | 18 | 2 |
| 4 | 7 | 0 | 168 | 5 | 14 | 4 |
| 5 | 23 | 0 | 172 | 36 | 13 | 2 |
| 6 | 23 | 0 | 170 | 51 | 18 | 2 |
# Numeric features that get standardised (centred and divided by their sd)
numerical_columns <- c("Height", "Distance.from.Residence.to.Work", "Service.time")
numerical_features <- top_features[, numerical_columns]
scaled_features <- scale(numerical_features)
# Put the standardised columns next to the untouched categorical columns and
# the response, in that order
unscaled_columns <- c(
  "Disciplinary.failure", "Reason.for.absence", "Absenteeism.time.in.hours"
)
scaled_data <- cbind(
  as.data.frame(scaled_features),
  top_features[unscaled_columns]
)
# Preview the first rows of the scaled data
head(scaled_data)
| Height | Distance.from.Residence.to.Work | Service.time | Disciplinary.failure | Reason.for.absence | Absenteeism.time.in.hours | |
|---|---|---|---|---|---|---|
| <dbl> | <dbl> | <dbl> | <fct> | <fct> | <int> | |
| 1 | -0.01903313 | 0.4292653 | 0.1017010 | 0 | 26 | 4 |
| 2 | 0.97516826 | -1.1209354 | 1.2419848 | 1 | 0 | 0 |
| 3 | -0.35043360 | 1.4402658 | 1.2419848 | 0 | 23 | 2 |
| 4 | -0.68183407 | -1.6601356 | 0.3297577 | 0 | 7 | 4 |
| 5 | -0.01903313 | 0.4292653 | 0.1017010 | 0 | 23 | 2 |
| 6 | -0.35043360 | 1.4402658 | 1.2419848 | 0 | 23 | 2 |
library(cluster)
library(caret)
# Cluster the records with k-means on the five selected features.
# Select the features for clustering (the response variable is already excluded)
clustering_features <- top_features[, c(
  "Reason.for.absence", "Disciplinary.failure", "Height",
  "Distance.from.Residence.to.Work", "Service.time"
)]
# One-hot encode the categorical variables; numeric columns pass through
encoded_data <- predict(
  dummyVars(~., data = clustering_features),
  newdata = clustering_features
)
# Standardise every encoded column. The response is not part of
# `clustering_features`, so no column must be dropped here: removing the last
# column would silently exclude Service.time from the clustering.
scaled_features <- scale(encoded_data)
# k-means starts from random centres; seed the RNG and use several random
# starts so the elbow curve and the final clusters are reproducible and less
# likely to be a poor local optimum.
set.seed(42)
n_starts <- 25
max_k <- 10
# Total within-cluster sum of squares for k = 1..max_k
wcss <- vapply(
  seq_len(max_k),
  function(k) {
    kmeans(scaled_features, centers = k, nstart = n_starts, iter.max = 50)$tot.withinss
  },
  numeric(1)
)
# Plot the elbow curve
plot(seq_len(max_k), wcss, type = "b", pch = 19, frame = FALSE, xlab = "Number of Clusters", ylab = "Within-Cluster Sum of Squares")
# Ask for the number of clusters suggested by the elbow plot. When no answer
# can be read (non-interactive run) or the answer is not a usable integer,
# fall back to 3, the number of clusters in the recorded run.
default_k <- 3L
answer <- tryCatch(
  readline(prompt = "Enter the optimal number of clusters: "),
  error = function(e) ""
)
k_optimal <- suppressWarnings(as.integer(answer))
if (is.na(k_optimal) || k_optimal < 1L || k_optimal >= nrow(scaled_features)) {
  k_optimal <- default_k
}
# Perform k-means clustering with the chosen number of clusters
kmeans_model_optimal <- kmeans(
  scaled_features,
  centers = k_optimal, nstart = n_starts, iter.max = 50
)
# Display the cluster centers
print(kmeans_model_optimal$centers)
Reason.for.absence.0 Reason.for.absence.1 Reason.for.absence.2
1 4.0233565 -0.14855835 -0.03676073
2 -0.2482128 -0.14855835 -0.03676073
3 -0.2279042 0.02566222 0.00635011
Reason.for.absence.3 Reason.for.absence.4 Reason.for.absence.5
1 -0.03676073 -0.052022734 -0.06375779
2 -0.03676073 -0.052022734 -0.06375779
3 0.00635011 0.008986495 0.01101363
Reason.for.absence.6 Reason.for.absence.7 Reason.for.absence.8
1 -0.1044710 -0.14374177 -0.09035121
2 -0.1044710 -0.14374177 -0.09035121
3 0.0180465 0.02483019 0.01560742
Reason.for.absence.9 Reason.for.absence.10 Reason.for.absence.11
1 -0.07367115 -0.18686301 -0.19069703
2 -0.07367115 -0.18686301 -0.19069703
3 0.01272608 0.03227903 0.03294133
Reason.for.absence.12 Reason.for.absence.13 Reason.for.absence.14
1 -0.1044710 -0.28316686 -0.16222416
2 -0.1044710 -0.28316686 -0.16222416
3 0.0180465 0.04891472 0.02802287
Reason.for.absence.15 Reason.for.absence.16 Reason.for.absence.17
1 -0.052022734 -0.06375779 -0.03676073
2 -0.052022734 -0.06375779 -0.03676073
3 0.008986495 0.01101363 0.00635011
Reason.for.absence.18 Reason.for.absence.19 Reason.for.absence.21
1 -0.17078572 -0.23888415 -0.09035121
2 -0.17078572 -0.23888415 -0.09035121
3 0.02950181 0.04126525 0.01560742
Reason.for.absence.22 Reason.for.absence.23 Reason.for.absence.24
1 -0.23250356 -0.5017712 -0.06375779
2 -0.23250356 -0.5017712 -0.06375779
3 0.04016306 0.0866768 0.01101363
Reason.for.absence.25 Reason.for.absence.26 Reason.for.absence.27
1 -0.20896045 -0.21590049 -0.3204569
2 -0.20896045 -0.21590049 3.1163274
3 0.03609618 0.03729501 -0.3204569
Reason.for.absence.28 Disciplinary.failure.0 Disciplinary.failure.1
1 -0.42202244 -4.1804726 4.1804726
2 -0.42202244 0.2388841 -0.2388841
3 0.07290087 0.2388841 -0.2388841
Height Distance.from.Residence.to.Work
1 -0.04388817 -0.23630997
2 -0.26638276 0.49568855
3 0.03191115 -0.03922363
Assigning cluster labels to the dataset.
# One cluster id per row, in the same row order as the clustered data
cluster_labels <- kmeans_model_optimal[["cluster"]]
# Store the ids as a new Cluster column of scaled_data
scaled_data[["Cluster"]] <- cluster_labels
head(x = scaled_data)
| Height | Distance.from.Residence.to.Work | Service.time | Disciplinary.failure | Reason.for.absence | Absenteeism.time.in.hours | Cluster | |
|---|---|---|---|---|---|---|---|
| <dbl> | <dbl> | <dbl> | <fct> | <fct> | <int> | <int> | |
| 1 | -0.01903313 | 0.4292653 | 0.1017010 | 0 | 26 | 4 | 3 |
| 2 | 0.97516826 | -1.1209354 | 1.2419848 | 1 | 0 | 0 | 1 |
| 3 | -0.35043360 | 1.4402658 | 1.2419848 | 0 | 23 | 2 | 3 |
| 4 | -0.68183407 | -1.6601356 | 0.3297577 | 0 | 7 | 4 | 3 |
| 5 | -0.01903313 | 0.4292653 | 0.1017010 | 0 | 23 | 2 | 3 |
| 6 | -0.35043360 | 1.4402658 | 1.2419848 | 0 | 23 | 2 | 3 |
# Summarise each cluster.
# Averaging every column with aggregate(. ~ Cluster, ...) converts the factor
# columns to their integer level codes first, so the "means" reported for
# Disciplinary.failure and Reason.for.absence are averages of level positions
# and carry no meaning. Numeric columns are therefore averaged, while the
# categorical columns are summarised as a rate and a most frequent level.
# Height, distance and service time are in standardised (z-score) units.
numeric_summary_columns <- c(
  "Height", "Distance.from.Residence.to.Work", "Service.time",
  "Absenteeism.time.in.hours"
)
cluster_summary <- scaled_data %>%
  dplyr::group_by(Cluster) %>%
  dplyr::summarise(
    # number of records in the cluster
    Size = dplyr::n(),
    dplyr::across(dplyr::all_of(numeric_summary_columns), mean),
    # share of records flagged with a disciplinary failure (level "1")
    Disciplinary.failure.rate = mean(Disciplinary.failure == "1"),
    # most frequent reason code within the cluster
    Most.common.reason = names(which.max(table(Reason.for.absence))),
    .groups = "drop"
  )
# Print the cluster summary table
cluster_summary
| Cluster | Height | Distance.from.Residence.to.Work | Service.time | Disciplinary.failure | Reason.for.absence | Absenteeism.time.in.hours |
|---|---|---|---|---|---|---|
| <int> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
| 1 | -0.04388817 | -0.23630997 | -0.0009245544 | 2 | 1.00000 | 0.000000 |
| 2 | -0.26638276 | 0.49568855 | 0.3595042734 | 1 | 27.00000 | 2.275362 |
| 3 | 0.03191115 | -0.03922363 | -0.0392532689 | 1 | 19.99366 | 7.871632 |
# Scatterplot of absenteeism hours against one column of `scaled_data`,
# with points coloured by cluster.
#   column: name of the x-axis column in `scaled_data`, as a string
#   label:  display name used for the x axis and at the end of the title
cluster_scatterplot <- function(column, label) {
  ggplot(
    scaled_data,
    aes(
      x = .data[[column]],
      y = Absenteeism.time.in.hours,
      color = as.factor(Cluster)
    )
  ) +
    geom_point() +
    labs(
      x = label,
      y = "Absenteeism Time in Hours",
      title = paste("Scatterplot: Absenteeism Time vs", label)
    )
}
# Scatterplot: Absenteeism.time.in.hours vs Height
scatterplot_height <- cluster_scatterplot("Height", "Height")
scatterplot_height
# Scatterplot: Absenteeism.time.in.hours vs Distance.from.Residence.to.Work
scatterplot_distance <- cluster_scatterplot(
  "Distance.from.Residence.to.Work", "Distance from Residence to Work"
)
scatterplot_distance
# Scatterplot: Absenteeism.time.in.hours vs Service.time
scatterplot_service <- cluster_scatterplot("Service.time", "Service Time")
scatterplot_service
# Within each cluster, the proportion of records with and without a
# disciplinary failure (bars are normalised to 1 by position = "fill")
grouped_barplot <- ggplot(
  data = scaled_data,
  mapping = aes(x = as.factor(Cluster), fill = Disciplinary.failure)
) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("gray", "orange")) +
  labs(
    title = "Cluster Distribution by Disciplinary Failure",
    x = "Cluster",
    y = "Proportion",
    fill = "Disciplinary Failure"
  ) +
  theme_bw()
# Render the plot
grouped_barplot
# Stacked bar plot: cluster distribution by Reason.for.absence.
# geom_bar() counts rows, so each bar's height is the number of absence
# records with that reason, split by cluster. It is not a sum of hours, and
# the y-axis label says so.
stacked_barplot <- ggplot(scaled_data, aes(x = Reason.for.absence, fill = as.factor(Cluster))) +
  geom_bar(position = "stack") +
  labs(x = "Reason for Absence", y = "Number of absence records", fill = "Cluster") +
  ggtitle("Cluster Distribution by Reason for Absence") +
  theme_bw()
# Display the stacked bar plot
stacked_barplot
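Based on the per-cluster averages after k-means clustering and the descriptions of the reasons for absence, we can interpret the characteristics of each cluster and give each one a name and description. Note that k-means cluster numbers are arbitrary and can change between runs, so match each description below to a cluster using the summary table rather than its number: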
Description: This cluster represents a group of healthy professionals who have average characteristics and rarely miss work. They tend to have no disciplinary issues and are absent for various reasons not directly related to specific health conditions.
Description: This cluster represents employees who experience extended medical leaves. They tend to have above-average service time and often face disciplinary issues. The main reason for their absence is medical consultations, suggesting ongoing health concerns that require regular check-ups.
Description: This cluster represents employees who frequently face health issues and have a relatively high absenteeism rate. They experience a variety of health-related reasons for absence, such as neoplasms, mental and behavioral disorders, respiratory diseases, and digestive system issues. These individuals may require additional support or medical attention to manage their health conditions effectively.