# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages

# Kaggle mounts the competition/dataset files read-only under "../input/".
# Listing that directory shows what is available to this notebook.

input_dir <- "../input"
list.files(path = input_dir)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# Load the absenteeism records; the file uses ";" as its field separator.
absenteeism_csv <- "/kaggle/input/employee-abseentism/Absenteeism.csv"
abseentism <- read.csv(file = absenteeism_csv, sep = ";")

# Preview the first rows of the raw table.
head(x = abseentism)


# Per-column summary statistics.
summary(object = abseentism)

       ID        Reason.for.absence Month.of.absence Day.of.the.week
 Min.   : 1.00   Min.   : 0.00      Min.   : 0.000   Min.   :2.000  
 1st Qu.: 9.00   1st Qu.:13.00      1st Qu.: 3.000   1st Qu.:3.000  
 Median :18.00   Median :23.00      Median : 6.000   Median :4.000  
 Mean   :18.02   Mean   :19.22      Mean   : 6.324   Mean   :3.915  
 3rd Qu.:28.00   3rd Qu.:26.00      3rd Qu.: 9.000   3rd Qu.:5.000  
 Max.   :36.00   Max.   :28.00      Max.   :12.000   Max.   :6.000  
    Seasons      Transportation.expense Distance.from.Residence.to.Work
 Min.   :1.000   Min.   :118.0          Min.   : 5.00                  
 1st Qu.:2.000   1st Qu.:179.0          1st Qu.:16.00                  
 Median :3.000   Median :225.0          Median :26.00                  
 Mean   :2.545   Mean   :221.3          Mean   :29.63                  
 3rd Qu.:4.000   3rd Qu.:260.0          3rd Qu.:50.00                  
 Max.   :4.000   Max.   :388.0          Max.   :52.00                  
  Service.time        Age        Work.load.Average.day   Hit.target    
 Min.   : 1.00   Min.   :27.00   Min.   :205.9         Min.   : 81.00  
 1st Qu.: 9.00   1st Qu.:31.00   1st Qu.:244.4         1st Qu.: 93.00  
 Median :13.00   Median :37.00   Median :264.2         Median : 95.00  
 Mean   :12.55   Mean   :36.45   Mean   :271.5         Mean   : 94.59  
 3rd Qu.:16.00   3rd Qu.:40.00   3rd Qu.:294.2         3rd Qu.: 97.00  
 Max.   :29.00   Max.   :58.00   Max.   :378.9         Max.   :100.00  
 Disciplinary.failure   Education          Son        Social.drinker  
 Min.   :0.00000      Min.   :1.000   Min.   :0.000   Min.   :0.0000  
 1st Qu.:0.00000      1st Qu.:1.000   1st Qu.:0.000   1st Qu.:0.0000  
 Median :0.00000      Median :1.000   Median :1.000   Median :1.0000  
 Mean   :0.05405      Mean   :1.292   Mean   :1.019   Mean   :0.5676  
 3rd Qu.:0.00000      3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:1.0000  
 Max.   :1.00000      Max.   :4.000   Max.   :4.000   Max.   :1.0000  
 Social.smoker          Pet             Weight           Height     
 Min.   :0.00000   Min.   :0.0000   Min.   : 56.00   Min.   :163.0  
 1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.: 69.00   1st Qu.:169.0  
 Median :0.00000   Median :0.0000   Median : 83.00   Median :170.0  
 Mean   :0.07297   Mean   :0.7459   Mean   : 79.04   Mean   :172.1  
 3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.: 89.00   3rd Qu.:172.0  
 Max.   :1.00000   Max.   :8.0000   Max.   :108.00   Max.   :196.0  
 Body.mass.index Absenteeism.time.in.hours
 Min.   :19.00   Min.   :  0.000          
 1st Qu.:24.00   1st Qu.:  2.000          
 Median :25.00   Median :  3.000          
 Mean   :26.68   Mean   :  6.924          
 3rd Qu.:31.00   3rd Qu.:  8.000          
 Max.   :38.00   Max.   :120.000


# Histogram of absence duration, drawn with 4-hour-wide bins.
ggplot(data = abseentism, mapping = aes(x = Absenteeism.time.in.hours)) +
  geom_histogram(binwidth = 4, fill = "blue", color = "black") +
  xlab("Absenteeism Time (hours)") +
  ylab("Count") +
  ggtitle("Distribution of Absenteeism Time")


# Inspect the storage type of every column
str(object = abseentism)

'data.frame':	740 obs. of  21 variables:
 $ ID                             : int  11 36 3 7 11 3 10 20 14 1 ...
 $ Reason.for.absence             : int  26 0 23 7 23 23 22 23 19 22 ...
 $ Month.of.absence               : int  7 7 7 7 7 7 7 7 7 7 ...
 $ Day.of.the.week                : int  3 3 4 5 5 6 6 6 2 2 ...
 $ Seasons                        : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Transportation.expense         : int  289 118 179 279 289 179 361 260 155 235 ...
 $ Distance.from.Residence.to.Work: int  36 13 51 5 36 51 52 50 12 11 ...
 $ Service.time                   : int  13 18 18 14 13 18 3 11 14 14 ...
 $ Age                            : int  33 50 38 39 33 38 28 36 34 37 ...
 $ Work.load.Average.day          : num  240 240 240 240 240 ...
 $ Hit.target                     : int  97 97 97 97 97 97 97 97 97 97 ...
 $ Disciplinary.failure           : int  0 1 0 0 0 0 0 0 0 0 ...
 $ Education                      : int  1 1 1 1 1 1 1 1 1 3 ...
 $ Son                            : int  2 1 0 2 2 0 1 4 2 1 ...
 $ Social.drinker                 : int  1 1 1 1 1 1 1 1 1 0 ...
 $ Social.smoker                  : int  0 0 0 1 0 0 0 0 0 0 ...
 $ Pet                            : int  1 0 0 0 1 0 4 0 0 1 ...
 $ Weight                         : int  90 98 89 68 90 89 80 65 95 88 ...
 $ Height                         : int  172 178 170 168 172 170 172 168 196 172 ...
 $ Body.mass.index                : int  30 31 31 24 30 31 27 23 25 29 ...
 $ Absenteeism.time.in.hours      : int  4 0 2 4 2 2 8 4 40 8 ...


# Columns that hold category codes rather than quantities
columns_to_convert <- c(
  "Reason.for.absence", "Month.of.absence", "Day.of.the.week", "Seasons",
  "Disciplinary.failure", "Education", "Son", "Social.drinker",
  "Social.smoker", "Pet"
)

# Recode all of them as factors in a single column-wise assignment
abseentism[columns_to_convert] <- lapply(
  abseentism[columns_to_convert],
  as.factor
)


# Drop the ID column; every other column is kept in its original order
abseentism <- abseentism[, names(abseentism) != "ID", drop = FALSE]


library(ggplot2)

# Boxplot of absence duration for each reason-for-absence code
ggplot(
  data = abseentism,
  mapping = aes(x = Reason.for.absence, y = Absenteeism.time.in.hours)
) +
  geom_boxplot(fill = "lightblue", color = "black") +
  xlab("Reason for Absence") +
  ylab("Total Absenteeism Time (hours)") +
  ggtitle("Boxplot of Absenteeism Time by Reason for Absence")


# Shared recipe for the categorical boxplots below.
#   column: name of the grouping column in `abseentism` (as a string)
#   label:  human-readable name, used for the x axis and in the title
# Returns the ggplot object; calling it at top level prints the plot.
absence_boxplot <- function(column, label) {
  ggplot(
    abseentism,
    aes(x = factor(.data[[column]]), y = Absenteeism.time.in.hours)
  ) +
    geom_boxplot(fill = "lightblue", color = "black") +
    labs(
      x = label,
      y = "Absenteeism Time (hours)",
      title = paste("Boxplot of Absenteeism by", label)
    )
}


# Boxplot for Day of the Week
absence_boxplot("Day.of.the.week", "Day of the Week")


# Boxplot for Seasons
absence_boxplot("Seasons", "Seasons")


# Boxplot for Social Drinker
absence_boxplot("Social.drinker", "Social Drinker")


# Boxplot for Social Smoker
absence_boxplot("Social.smoker", "Social Smoker")


# Shared recipe for the numeric scatterplots below.
#   column:      name of the x-axis column in `abseentism` (as a string)
#   label:       human-readable name, used for the x axis and in the title
#   point_color: colour of the points
# Returns the ggplot object; calling it at top level prints the plot.
absence_scatterplot <- function(column, label, point_color) {
  ggplot(
    abseentism,
    aes(x = .data[[column]], y = Absenteeism.time.in.hours)
  ) +
    geom_point(color = point_color) +
    labs(
      x = label,
      y = "Absenteeism Time (hours)",
      title = paste("Scatterplot of Absenteeism by", label)
    )
}


# Scatterplot for Transportation Expense
absence_scatterplot("Transportation.expense", "Transportation Expense", "blue")


# Scatterplot for Distance from Residence to Work
absence_scatterplot(
  "Distance.from.Residence.to.Work",
  "Distance from Residence to Work",
  "red"
)


# Scatterplot for Age
absence_scatterplot("Age", "Age", "green")


# Scatterplot for Work Load Average per Day
absence_scatterplot(
  "Work.load.Average.day",
  "Work Load Average per Day",
  "orange"
)


# Scatterplot for Weight
absence_scatterplot("Weight", "Weight", "purple")


# Scatterplot for Body Mass Index
absence_scatterplot("Body.mass.index", "Body Mass Index", "brown")


library(randomForest)

# Predictors are every column except the response. "ID" is excluded as well so
# the model never sees the employee identifier, even if it is still present.
response_column <- "Absenteeism.time.in.hours"
predictor_columns <- setdiff(names(abseentism), c("ID", response_column))
predictors <- abseentism[, predictor_columns]

# Response vector: hours of absenteeism per record
response <- abseentism[[response_column]]

# Random forests draw bootstrap samples and random split candidates, so the
# fit (and the importance ranking derived from it) differs on every run unless
# the RNG is seeded. Fix the seed so the results are repeatable.
set.seed(123)

# Fit a 500-tree regression forest; importance = TRUE additionally records the
# permutation-based importance of each predictor.
rf_model <- randomForest(predictors, response, ntree = 500, importance = TRUE)


# Print the model summary (MSE and % variance explained)
rf_model

Call:
 randomForest(x = predictors, y = response, ntree = 500, importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 6

          Mean of squared residuals: 158.9623
                    % Var explained: 10.43


# Permutation importance of each predictor (type = 1, reported as %IncMSE for
# a regression forest). The result is a one-column matrix whose row names are
# the predictor names. The function is called with its namespace so the call
# still reads unambiguously once the result is stored under the same name.
importance <- randomForest::importance(rf_model, type = 1)
importance


# Sort the importance measures in descending order.
# Calling sort() on the matrix itself returns a bare numeric vector and loses
# the row names, so the values could no longer be matched to predictors.
# Extracting the column first yields a named vector, and sort() keeps names.
sorted_importance <- sort(importance[, 1], decreasing = TRUE)
sorted_importance


# The five highest-ranked predictors, plus the response variable
top_feature_columns <- c(
  "Reason.for.absence",
  "Disciplinary.failure",
  "Height",
  "Distance.from.Residence.to.Work",
  "Service.time",
  "Absenteeism.time.in.hours"
)
top_features <- abseentism[, top_feature_columns]

# Preview the reduced data frame
head(x = top_features)


# Numeric members of the top features; these are the ones to standardise
numeric_columns <- c("Height", "Distance.from.Residence.to.Work", "Service.time")
numerical_features <- top_features[, numeric_columns]

# Centre each column on its mean and divide by its standard deviation
scaled_features <- scale(numerical_features)

# Recombine the standardised columns with the untouched categorical columns
# and the response
scaled_data <- data.frame(
  scaled_features,
  Disciplinary.failure = top_features[["Disciplinary.failure"]],
  Reason.for.absence = top_features[["Reason.for.absence"]],
  Absenteeism.time.in.hours = top_features[["Absenteeism.time.in.hours"]]
)

# Preview the scaled data
head(x = scaled_data)


library(cluster)
library(caret)

# Select the features for clustering (the response variable is excluded here,
# so every remaining column is a genuine clustering input)
clustering_features <- top_features[, c(
  "Reason.for.absence", "Disciplinary.failure", "Height",
  "Distance.from.Residence.to.Work", "Service.time"
)]

# One-hot encode the factor columns; numeric columns pass through unchanged
encoder <- dummyVars(~ ., data = clustering_features)
encoded_data <- predict(encoder, newdata = clustering_features)

# Standardise every encoded column. The previous version dropped the last
# column before scaling, which removed Service.time from the clustering even
# though it is one of the selected features.
scaled_features <- scale(encoded_data)

# k-means starts from random centres: seed the RNG and use several restarts so
# the elbow curve is reproducible and not distorted by one poor start.
set.seed(42)
max_k <- 10L

# Total within-cluster sum of squares for k = 1..max_k
wcss <- numeric(max_k)
for (k in seq_len(max_k)) {
  kmeans_model <- kmeans(scaled_features, centers = k, nstart = 25)
  wcss[k] <- kmeans_model$tot.withinss
}

# Plot the elbow curve (frame.plot spelled out rather than partially matched)
plot(
  seq_len(max_k), wcss,
  type = "b", pch = 19, frame.plot = FALSE,
  xlab = "Number of Clusters", ylab = "Within-Cluster Sum of Squares"
)


# Number of clusters used when no valid answer is supplied
default_k <- 3L

# Prompt the user to select the optimal number of clusters based on the plot.
# In a non-interactive run readline() returns "" immediately, which previously
# became NA and made kmeans() fail; such input now falls back to default_k.
k_input <- readline(prompt = "Enter the optimal number of clusters: ")

# Convert the answer to an integer; non-numeric text yields NA (the coercion
# warning is suppressed because NA is handled explicitly just below)
k_optimal <- suppressWarnings(as.integer(k_input))
if (is.na(k_optimal) || k_optimal < 1L) {
  message("No valid number of clusters entered; using ", default_k, ".")
  k_optimal <- default_k
}

# Seed the RNG and use several restarts so cluster membership and numbering
# are the same on every run
set.seed(42)

# Perform k-means clustering with the chosen number of clusters
kmeans_model_optimal <- kmeans(scaled_features, centers = k_optimal, nstart = 25)

# Display the cluster centers
print(kmeans_model_optimal$centers)

  Reason.for.absence.0 Reason.for.absence.1 Reason.for.absence.2
1            4.0233565          -0.14855835          -0.03676073
2           -0.2482128          -0.14855835          -0.03676073
3           -0.2279042           0.02566222           0.00635011
  Reason.for.absence.3 Reason.for.absence.4 Reason.for.absence.5
1          -0.03676073         -0.052022734          -0.06375779
2          -0.03676073         -0.052022734          -0.06375779
3           0.00635011          0.008986495           0.01101363
  Reason.for.absence.6 Reason.for.absence.7 Reason.for.absence.8
1           -0.1044710          -0.14374177          -0.09035121
2           -0.1044710          -0.14374177          -0.09035121
3            0.0180465           0.02483019           0.01560742
  Reason.for.absence.9 Reason.for.absence.10 Reason.for.absence.11
1          -0.07367115           -0.18686301           -0.19069703
2          -0.07367115           -0.18686301           -0.19069703
3           0.01272608            0.03227903            0.03294133
  Reason.for.absence.12 Reason.for.absence.13 Reason.for.absence.14
1            -0.1044710           -0.28316686           -0.16222416
2            -0.1044710           -0.28316686           -0.16222416
3             0.0180465            0.04891472            0.02802287
  Reason.for.absence.15 Reason.for.absence.16 Reason.for.absence.17
1          -0.052022734           -0.06375779           -0.03676073
2          -0.052022734           -0.06375779           -0.03676073
3           0.008986495            0.01101363            0.00635011
  Reason.for.absence.18 Reason.for.absence.19 Reason.for.absence.21
1           -0.17078572           -0.23888415           -0.09035121
2           -0.17078572           -0.23888415           -0.09035121
3            0.02950181            0.04126525            0.01560742
  Reason.for.absence.22 Reason.for.absence.23 Reason.for.absence.24
1           -0.23250356            -0.5017712           -0.06375779
2           -0.23250356            -0.5017712           -0.06375779
3            0.04016306             0.0866768            0.01101363
  Reason.for.absence.25 Reason.for.absence.26 Reason.for.absence.27
1           -0.20896045           -0.21590049            -0.3204569
2           -0.20896045           -0.21590049             3.1163274
3            0.03609618            0.03729501            -0.3204569
  Reason.for.absence.28 Disciplinary.failure.0 Disciplinary.failure.1
1           -0.42202244             -4.1804726              4.1804726
2           -0.42202244              0.2388841             -0.2388841
3            0.07290087              0.2388841             -0.2388841
       Height Distance.from.Residence.to.Work
1 -0.04388817                     -0.23630997
2 -0.26638276                      0.49568855
3  0.03191115                     -0.03922363


# Cluster membership of every row, as assigned by the final k-means fit
cluster_labels <- kmeans_model_optimal[["cluster"]]

# Store the membership next to the scaled features and preview the result
scaled_data[["Cluster"]] <- cluster_labels
head(x = scaled_data)


# Summarise each feature by cluster.
# Averaging everything with aggregate(. ~ Cluster, ..., mean) turned the
# factor columns into their internal integer codes and averaged those, which
# is meaningless (e.g. a "mean" Disciplinary.failure of 2). Each column type is
# therefore summarised in a way that can be interpreted:
#   - numeric columns:       mean per cluster
#   - Disciplinary.failure:  share of records with a failure (level "1")
#   - Reason.for.absence:    most frequent reason code in the cluster
cluster_ids <- scaled_data$Cluster

numeric_columns <- setdiff(
  names(scaled_data)[vapply(scaled_data, is.numeric, logical(1))],
  "Cluster"
)
cluster_summary <- aggregate(
  scaled_data[numeric_columns],
  by = list(Cluster = cluster_ids),
  FUN = mean
)

# Per-cluster summaries of the two factor columns, keyed by cluster id
failure_share <- tapply(
  scaled_data$Disciplinary.failure == "1",
  cluster_ids,
  mean
)
modal_reason <- vapply(
  split(as.character(scaled_data$Reason.for.absence), cluster_ids),
  function(reasons) names(which.max(table(reasons))),
  character(1)
)

# Align the factor summaries with the row order of the numeric summary
row_of <- as.character(cluster_summary$Cluster)
cluster_summary$Disciplinary.failure <- as.numeric(failure_share[row_of])
cluster_summary$Reason.for.absence <- unname(modal_reason[row_of])

# Keep the same column order as before: Cluster first, then the data columns
cluster_summary <- cluster_summary[
  , c("Cluster", setdiff(names(scaled_data), "Cluster"))
]

# Print the cluster summary table
cluster_summary


# The three plots below show absence hours against one standardised feature
# each, coloured by cluster. The colour legend is titled "Cluster" explicitly;
# otherwise ggplot uses the raw expression "as.factor(Cluster)" as its title.

# Scatterplot: Absenteeism.time.in.hours vs Height
scatterplot_height <- ggplot(
  scaled_data,
  aes(x = Height, y = Absenteeism.time.in.hours, color = as.factor(Cluster))
) +
  geom_point() +
  labs(x = "Height", y = "Absenteeism Time in Hours", color = "Cluster") +
  ggtitle("Scatterplot: Absenteeism Time vs Height")
scatterplot_height


# Scatterplot: Absenteeism.time.in.hours vs Distance.from.Residence.to.Work
scatterplot_distance <- ggplot(
  scaled_data,
  aes(
    x = Distance.from.Residence.to.Work,
    y = Absenteeism.time.in.hours,
    color = as.factor(Cluster)
  )
) +
  geom_point() +
  labs(
    x = "Distance from Residence to Work",
    y = "Absenteeism Time in Hours",
    color = "Cluster"
  ) +
  ggtitle("Scatterplot: Absenteeism Time vs Distance from Residence to Work")
scatterplot_distance


# Scatterplot: Absenteeism.time.in.hours vs Service.time
scatterplot_service <- ggplot(
  scaled_data,
  aes(x = Service.time, y = Absenteeism.time.in.hours, color = as.factor(Cluster))
) +
  geom_point() +
  labs(x = "Service Time", y = "Absenteeism Time in Hours", color = "Cluster") +
  ggtitle("Scatterplot: Absenteeism Time vs Service Time")
scatterplot_service


# Proportional bar plot: within each cluster, the share of records with and
# without a disciplinary failure (position = "fill" scales each bar to 1)
grouped_barplot <- ggplot(
  data = scaled_data,
  mapping = aes(x = as.factor(Cluster), fill = Disciplinary.failure)
) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("gray", "orange")) +
  xlab("Cluster") +
  ylab("Proportion") +
  labs(fill = "Disciplinary Failure") +
  ggtitle("Cluster Distribution by Disciplinary Failure") +
  theme_bw()

# Render the plot
grouped_barplot


# Stacked bar plot: Cluster distribution by Reason.for.absence.
# geom_bar() with no y aesthetic counts rows, so each bar's height is the
# number of absence records for that reason (split by cluster), not a sum of
# hours. The y-axis label states that.
stacked_barplot <- ggplot(scaled_data, aes(x = Reason.for.absence, fill = as.factor(Cluster))) +
  geom_bar(position = "stack") +
  labs(x = "Reason for Absence", y = "Number of absences", fill = "Cluster") +
  ggtitle("Cluster Distribution by Reason for Absence") +
  theme_bw()

# Display the stacked bar plot
stacked_barplot

	%IncMSE
Reason.for.absence	13.0542386
Month.of.absence	-0.5304347
Day.of.the.week	-0.8151381
Seasons	1.7302433
Transportation.expense	1.9980789
Distance.from.Residence.to.Work	3.2309486
Service.time	3.0733594
Age	-0.5891115
Work.load.Average.day	0.5318483
Hit.target	-1.2323174
Disciplinary.failure	7.8033216
Education	1.2900156
Son	-0.6360769
Social.drinker	2.2806310
Social.smoker	-1.1022361
Pet	1.3179246
Weight	2.0410672
Height	4.3207143
Body.mass.index	0.6317616

	Height	Distance.from.Residence.to.Work	Service.time	Disciplinary.failure	Reason.for.absence	Absenteeism.time.in.hours
	<dbl>	<dbl>	<dbl>	<fct>	<fct>	<int>
1	-0.01903313	0.4292653	0.1017010	0	26	4
2	0.97516826	-1.1209354	1.2419848	1	0	0
3	-0.35043360	1.4402658	1.2419848	0	23	2
4	-0.68183407	-1.6601356	0.3297577	0	7	4
5	-0.01903313	0.4292653	0.1017010	0	23	2
6	-0.35043360	1.4402658	1.2419848	0	23	2

	Height	Distance.from.Residence.to.Work	Service.time	Disciplinary.failure	Reason.for.absence	Absenteeism.time.in.hours	Cluster
	<dbl>	<dbl>	<dbl>	<fct>	<fct>	<int>	<int>
1	-0.01903313	0.4292653	0.1017010	0	26	4	3
2	0.97516826	-1.1209354	1.2419848	1	0	0	1
3	-0.35043360	1.4402658	1.2419848	0	23	2	3
4	-0.68183407	-1.6601356	0.3297577	0	7	4	3
5	-0.01903313	0.4292653	0.1017010	0	23	2	3
6	-0.35043360	1.4402658	1.2419848	0	23	2	3

Cluster	Height	Distance.from.Residence.to.Work	Service.time	Disciplinary.failure	Reason.for.absence	Absenteeism.time.in.hours
<int>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	-0.04388817	-0.23630997	-0.0009245544	2	1.00000	0.000000
2	-0.26638276	0.49568855	0.3595042734	1	27.00000	2.275362
3	0.03191115	-0.03922363	-0.0392532689	1	19.99366	7.871632

Setting the relevant environment¶

Importing the dataset¶

Exploring the dataset¶

Converting the following columns into factors¶

Visually exploring the data¶

Visualising numeric variables¶

Feature importance¶

Clustering the dataset using K-means clustering¶

Performing some standardisation/ scaling of numerical features¶

Determining the optimum number of clusters¶

Visualising the clusters¶

Interpreting the distribution of clusters¶

Cluster 1: "Healthy Professionals"¶

Height: Slightly below average¶

Distance from Residence to Work: Slightly below average¶

Service time: Close to average¶

Disciplinary failure: Disciplinary failure recorded for every absence in this cluster¶

Reason for absence: No specific reason (low value)¶

Absenteeism time in hours: Low¶

Cluster 2: "Extended Medical Leaves"¶

Height: Below average¶

Distance from Residence to Work: Above average¶

Service time: Above average¶

Disciplinary failure: No disciplinary failure¶

Reason for absence: Specific medical reasons (Reason 27 - "Medical Consultations")¶

Absenteeism time in hours: Moderate¶

Cluster 3: "Frequent Health Issues"¶

Height: Slightly above average¶

Distance from Residence to Work: Slightly below average¶

Service time: Slightly below average¶

Disciplinary failure: No disciplinary failure¶

Absenteeism time in hours: High¶

~ THANK YOU ~¶

Setting the relevant environment¶

Importing the dataset¶

Exploring the dataset¶

Converting the following columns into factors¶

Visually exploring the data¶

Visualising numeric variables¶

Feature importance¶

Clustering the dataset using K-means clustering¶

Performing some standardisation/ scaling of numerical features¶

Determining the optimum number of clusters¶

Visualising the clusters¶

Interpreting the distribution of clusters¶

Cluster 1: "Healthy Professionals"¶

Height: Slightly below average¶

Distance from Residence to Work: Slightly below average¶

Service time: Close to average¶

Disciplinary failure: Disciplinary failure recorded for every absence in this cluster¶

Reason for absence: No specific reason (low value)¶

Absenteeism time in hours: Low¶

Cluster 2: "Extended Medical Leaves"¶

Height: Below average¶

Distance from Residence to Work: Above average¶

Service time: Above average¶

Disciplinary failure: No disciplinary failure¶

Reason for absence: Specific medical reasons (Reason 27 - "Medical Consultations")¶

Absenteeism time in hours: Moderate¶

Cluster 3: "Frequent Health Issues"¶

Height: Slightly above average¶

Distance from Residence to Work: Slightly below average¶

Service time: Slightly below average¶

Disciplinary failure: No disciplinary failure¶

Reason for absence: Various health-related reasons (Reasons 2, 5, 10, 11, etc.)¶

Absenteeism time in hours: High¶

~ THANK YOU ~¶