Using the French motor claims dataset to predict the likelihood of a vehicle insurance claim¶

Wonder Mahembe¶

We begin by loading the dataset¶

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Set the working directory to the folder containing the dataset
import os
os.chdir(r"C:\Users\dell vostro\Documents\Dalarna Masters_Wonder\Business Intelligence\Project Assignment")
In [22]:
insurance = pd.read_csv("freMTPL2freq.csv")
insurance.head()
Out[22]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72
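
Before engineering any features, it helps to confirm the column types and check for missing values; a minimal sketch:¶

In [ ]:
# Quick structural overview: dtypes, non-null counts and missing values per column
insurance.info()
print(insurance.isna().sum())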

Recoding the claim count into a binary target variable¶

In [23]:
# Create a new column 'Claim' based on the 'ClaimNb' column
insurance['Claim'] = (insurance['ClaimNb'] > 0).astype(int)

insurance.head()
Out[23]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region Claim
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82 1
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82 1
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22 1
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72 1
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72 1
In [12]:
# Let us see the distribution of classes. How many policyholders claimed versus those who did not?

print(insurance['Claim'].value_counts())
0    643953
1     34060
Name: Claim, dtype: int64
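
Roughly 5% of policies have at least one claim, so the classes are heavily imbalanced. The proportions can be confirmed directly; a small sketch:¶

In [ ]:
# Relative class frequencies (roughly 95% / 5% given the counts above)
print(insurance['Claim'].value_counts(normalize=True))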

Some feature engineering to prepare the data for predictive modelling¶

In [32]:
# Creating a list of populous regions
populous_regions = ['R82', 'R52', 'R53', 'R93', 'R11', 'R24']

# Creating the populous_region column
insurance['populous_region'] = insurance['Region'].apply(lambda x: 'Yes' if x in populous_regions else 'No')

# Displaying the updated dataframe
insurance.head()
Out[32]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region Claim populous_region low_miles special brands new_vehicle Claim_txt
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22 1 No Yes Yes Yes Yes
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
In [6]:
# Grouping the data by "populous_region" and "Claim" columns and counting the number of occurrences
grouped_data = insurance.groupby(['populous_region', 'Claim'])['IDpol'].count().reset_index()

# Pivot the data to have "Claim" values as columns
pivot_data = grouped_data.pivot(index='populous_region', columns='Claim', values='IDpol')

# Plotting the clustered column chart
pivot_data.plot(kind='bar', stacked=True)
plt.xlabel('Region')
plt.ylabel('Number of Claims')
plt.title('Number of Claims by Region')
plt.legend(title='Claim', labels=['No Claim', 'Claim'])
plt.show()
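
Raw counts depend on group sizes, so the claim rate within each group is also worth inspecting; a small sketch using the column created above:¶

In [ ]:
# Claim rate within each populous_region group (rows sum to 1)
print(pd.crosstab(insurance['populous_region'], insurance['Claim'], normalize='index'))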
In [33]:
# Creating the low_miles column: flag policies with a BonusMalus score below 109
# (lower bonus-malus scores correspond to a better claims history)
insurance['low_miles'] = insurance['BonusMalus'].apply(lambda x: 'Yes' if x < 109 else 'No')

# Displaying the updated dataframe
insurance.head()
Out[33]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region Claim populous_region low_miles special brands new_vehicle Claim_txt
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22 1 No Yes Yes Yes Yes
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
In [14]:
grouped_data = insurance.groupby(['low_miles', 'Claim'])['IDpol'].count().reset_index()
pivot_data = grouped_data.pivot(index='low_miles', columns='Claim', values='IDpol')

# Plotting the clustered column chart
pivot_data.plot(kind='bar', stacked=True)
plt.xlabel('low_miles')
plt.ylabel('Number of Claims')
plt.title('Number of Claims by bonus-malus level')
plt.legend(title='Claim', labels=['No Claim', 'Claim'])
plt.show()
In [34]:
# Creating a list of special vehicle brands
special_brands = ['B1', 'B12', 'B2', 'B3']

# Creating the special brands column
insurance['special brands'] = insurance['VehBrand'].apply(lambda x: 'Yes' if x in special_brands else 'No')

# Displaying the updated dataframe
insurance.head()
Out[34]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region Claim populous_region low_miles special brands new_vehicle Claim_txt
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22 1 No Yes Yes Yes Yes
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
In [11]:
grouped_data = insurance.groupby(['special brands', 'Claim'])['IDpol'].count().reset_index()
pivot_data = grouped_data.pivot(index='special brands', columns='Claim', values='IDpol')

# Plotting the clustered column chart
pivot_data.plot(kind='bar', stacked=True)
plt.xlabel('Brand of vehicle')
plt.ylabel('Number of Claims')
plt.title('Number of Claims by vehicle brand')
plt.legend(title='Claim', labels=['No Claim', 'Claim'])
plt.show()
In [35]:
# Creating the new_vehicle column
insurance['new_vehicle'] = insurance['VehAge'].apply(lambda x: 'Yes' if x < 30 else 'No')

# Displaying the updated dataframe
insurance.head()
Out[35]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region Claim populous_region low_miles special brands new_vehicle Claim_txt
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22 1 No Yes Yes Yes Yes
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
In [13]:
grouped_data = insurance.groupby(['new_vehicle', 'Claim'])['IDpol'].count().reset_index()
pivot_data = grouped_data.pivot(index='new_vehicle', columns='Claim', values='IDpol')

# Plotting the clustered column chart
pivot_data.plot(kind='bar', stacked=True)
plt.xlabel('new_vehicle')
plt.ylabel('Number of Claims')
plt.title('Number of Claims by vehicle age')
plt.legend(title='Claim', labels=['No Claim', 'Claim'])
plt.show()

Visualising quantitative features¶

In [36]:
# Make a new column with binary values 'Yes' and 'No' for use in boxplots
insurance['Claim_txt'] = pd.Series(['No' if x==0 else 'Yes' for x in insurance['Claim']])
insurance.head()
Out[36]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region Claim populous_region low_miles special brands new_vehicle Claim_txt
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22 1 No Yes Yes Yes Yes
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
In [31]:
# Create a 2x2 grid of subplots
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

# Claims by different driver ages, using a boxplot
sns.boxplot(ax=axs[0, 0], data=insurance, x='Claim_txt', y='DrivAge')
axs[0, 0].set_title('Claims by Different Driver Ages')
axs[0, 0].set_xlabel('Claim')
axs[0, 0].set_ylabel('Driver Age')

# Claims by exposure periods, using a boxplot
sns.boxplot(ax=axs[0, 1], data=insurance, x='Claim_txt', y='Exposure')
axs[0, 1].set_title('Claims by Exposure Periods')
axs[0, 1].set_xlabel('Claim')
axs[0, 1].set_ylabel('Exposure')

# Claims by different population densities, using a boxplot
sns.boxplot(ax=axs[1, 0], data=insurance, x='Claim_txt', y='Density')
axs[1, 0].set_title('Claims by Different Population Densities')
axs[1, 0].set_xlabel('Claim')
axs[1, 0].set_ylabel('Density')

# Claims by different vehicle ages, using a boxplot
sns.boxplot(ax=axs[1, 1], data=insurance, x='Claim_txt', y='VehAge')
axs[1, 1].set_title('Claims by Different VehAge')
axs[1, 1].set_xlabel('Claim')
axs[1, 1].set_ylabel('VehAge')

plt.tight_layout()
plt.show()
In [ ]:
# Remove redundant columns
# Since some features have been re-coded, the original columns will now be removed from the dataframe
In [37]:
# Drop the original 'BonusMalus', 'VehBrand', 'VehAge' and 'Region' columns (now re-coded) and the helper column 'Claim_txt'
insurance.drop(['BonusMalus', 'VehBrand', 'VehAge', 'Region', 'Claim_txt'], axis=1, inplace=True)
insurance.head()
Out[37]:
IDpol ClaimNb Exposure Area VehPower DrivAge VehGas Density Claim populous_region low_miles special brands new_vehicle
0 1 1 0.10 D 5 55 Regular 1217 1 Yes Yes Yes Yes
1 3 1 0.77 D 5 55 Regular 1217 1 Yes Yes Yes Yes
2 5 1 0.75 B 6 52 Diesel 54 1 No Yes Yes Yes
3 10 1 0.09 B 7 46 Diesel 76 1 No Yes Yes Yes
4 11 1 0.84 B 7 46 Diesel 76 1 No Yes Yes Yes

In preparation for predictive analysis, we generate dummy variables for the categorical predictors¶

In [18]:
# Perform one-hot encoding on the categorical variables
insurance2 = pd.get_dummies(insurance, columns=['populous_region', 'VehGas', 'new_vehicle', 'Area', 'special brands', 'low_miles'])
insurance2.head()
Out[18]:
IDpol ClaimNb Exposure VehPower DrivAge Density Claim populous_region_No populous_region_Yes VehGas_Diesel ... Area_A Area_B Area_C Area_D Area_E Area_F special brands_No special brands_Yes low_miles_No low_miles_Yes
0 1 1 0.10 5 55 1217 1 0 1 0 ... 0 0 0 1 0 0 0 1 0 1
1 3 1 0.77 5 55 1217 1 0 1 0 ... 0 0 0 1 0 0 0 1 0 1
2 5 1 0.75 6 52 54 1 1 0 1 ... 0 1 0 0 0 0 0 1 0 1
3 10 1 0.09 7 46 76 1 1 0 1 ... 0 1 0 0 0 0 0 1 0 1
4 11 1 0.84 7 46 76 1 1 0 1 ... 0 1 0 0 0 0 0 1 0 1

5 rows × 23 columns

In [ ]:
# Some of the dummy columns are strongly correlated with one another; the redundant ones will be dropped later, but before that ...
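
One way to avoid the redundant dummy pairs in the first place is to drop one level per variable during encoding; a sketch (insurance2_alt is only an illustrative name):¶

In [ ]:
# Alternative encoding: drop_first=True removes one dummy per variable,
# so complementary columns such as populous_region_No / populous_region_Yes cannot co-occur
insurance2_alt = pd.get_dummies(
    insurance,
    columns=['populous_region', 'VehGas', 'new_vehicle', 'Area', 'special brands', 'low_miles'],
    drop_first=True
)
insurance2_alt.head()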

We'll use the ANOVA F-test to select the 10 best features¶

In [20]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

X = insurance2.drop(['IDpol', 'ClaimNb', 'Claim'], axis=1)
y = insurance2['Claim']

selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X, y)

# Get the selected feature indices and names
selected_indices = selector.get_support(indices=True)
selected_names = X.columns[selected_indices]

# Create a new dataset with the selected features
insurance_selected = insurance2[selected_names].join(insurance2[['Claim']])
insurance_selected.head()
Out[20]:
Exposure DrivAge populous_region_No populous_region_Yes VehGas_Diesel VehGas_Regular new_vehicle_No new_vehicle_Yes low_miles_No low_miles_Yes Claim
0 0.10 55 0 1 0 1 0 1 0 1 1
1 0.77 55 0 1 0 1 0 1 0 1 1
2 0.75 52 1 0 1 0 0 1 0 1 1
3 0.09 46 1 0 1 0 0 1 0 1 1
4 0.84 46 1 0 1 0 0 1 0 1 1
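
The selector also exposes the ANOVA F-score of every candidate feature, which makes the ranking behind the selection transparent; a sketch using the fitted selector:¶

In [ ]:
# ANOVA F-scores for all candidate features, highest first
anova_scores = pd.Series(selector.scores_, index=X.columns).sort_values(ascending=False)
print(anova_scores)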

Check for correlations in the set of selected variables¶

In [22]:
# Drop the "Claim" column from the dataset before calculating the correlation matrix
insurance_selected_without_claim = insurance_selected.drop('Claim', axis=1)

# Create correlation matrix
cor_matrix = insurance_selected_without_claim.corr()

# Create correlation heatmap
sns.set(style='white')
mask = np.triu(np.ones_like(cor_matrix, dtype=bool))
cmap = sns.diverging_palette(20, 220, as_cmap=True)
plt.figure(figsize=(10, 8))
sns.heatmap(cor_matrix, mask=mask, cmap=cmap, center=0, annot=True, fmt='.2f', square=True,
            cbar_kws={'shrink': .75}, linewidths=.5, annot_kws={'size': 10})
plt.title('Correlation Heatmap of Numeric Variables in selected Dataset')
plt.show()

The redundant dummy columns, which are highly correlated with their complementary counterparts, will be removed¶

In [23]:
# Drop the redundant complementary dummy columns from insurance_selected
insurance_selected.drop(['populous_region_No', 'VehGas_Diesel', 'new_vehicle_No', 'low_miles_No'], axis=1, inplace=True)
insurance_selected.head()
Out[23]:
Exposure DrivAge populous_region_Yes VehGas_Regular new_vehicle_Yes low_miles_Yes Claim
0 0.10 55 1 1 1 1 1
1 0.77 55 1 1 1 1 1
2 0.75 52 0 0 1 1 1
3 0.09 46 0 0 1 1 1
4 0.84 46 0 0 1 1 1

We previously established that there is a major imbalance in classes. Now we will perform resampling to ensure class balance before predictive analysis.¶

In [ ]:
# Since the dataset is relatively large, I decided to downsample the majority class as opposed to 
# oversampling the minority class
In [24]:
# Separate minority and majority classes
minority_class = insurance_selected[insurance_selected['Claim'] == 1]
majority_class = insurance_selected[insurance_selected['Claim'] == 0]

# Determine the size of the minority class
minority_size = len(minority_class)

# Randomly sample majority class observations
downsampled_majority = majority_class.sample(n=minority_size, random_state=42)

# Combine minority class with downsampled majority class
insurance_downsampled = pd.concat([minority_class, downsampled_majority])

# Shuffle the new DataFrame
insurance_downsampled = insurance_downsampled.sample(frac=1, random_state=42).reset_index(drop=True)
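
A quick check that the downsampled dataset is now balanced; a small sketch:¶

In [ ]:
# Both classes should now contain the same number of policies
print(insurance_downsampled['Claim'].value_counts())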

Algorithm 1: Logistic Regression¶

In [25]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X = insurance_downsampled.drop('Claim', axis=1)
y = insurance_downsampled['Claim']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the logistic regression model
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Out[25]:
LogisticRegression()
In [26]:
# Make predictions on the test set
y_pred = logreg.predict(X_test)

# Compute the confusion matrix, accuracy, precision, recall, and F1-score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

confusion = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Confusion matrix:\n', confusion)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)
Confusion matrix:
 [[6070 4148]
 [ 281 9937]]
Accuracy: 0.7832746134272852
Precision: 0.7055023074192404
Recall: 0.9724995106674496
F1-score: 0.8177591243879355
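
Since the 0.5 classification threshold is arbitrary, a threshold-independent metric such as ROC AUC is a useful complement; a sketch using the fitted model:¶

In [ ]:
from sklearn.metrics import roc_auc_score

# Predicted probability of a claim for each test observation
y_proba = logreg.predict_proba(X_test)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, y_proba))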
In [27]:
# Perform 5-fold cross validation and print the validation scores
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(logreg, X, y, cv=5)
print('Cross-validation scores:', cv_scores)
print('Mean cross-validation score:', cv_scores.mean())
Cross-validation scores: [0.78075455 0.78317675 0.78985614 0.78134175 0.78280975]
Mean cross-validation score: 0.783587786259542

Algorithm 2: Random Forest¶

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

# Fit Random Forest model
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_train, y_train)
Out[31]:
RandomForestClassifier(n_estimators=50, random_state=42)
In [32]:
# Make predictions on the test set
y_pred = rf.predict(X_test)

# Compute evaluation metrics
confusion_mtx = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print("Confusion Matrix:\n", confusion_mtx)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
Confusion Matrix:
 [[6876 3342]
 [1834 8384]]
Accuracy: 0.7467214719123116
Precision: 0.7149923247484223
Recall: 0.8205128205128205
F1-score: 0.7641268683922713
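
The random forest also provides impurity-based feature importances, which indicate which predictors carry most of the claims signal; a sketch using the fitted model:¶

In [ ]:
# Feature importances from the fitted forest, highest first
importances = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(importances)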
In [30]:
# Determine optimum number of trees
num_trees = [10, 50, 100, 200, 300]
cv_scores = []
for num_tree in num_trees:
    rf = RandomForestClassifier(n_estimators=num_tree, random_state=42)
    scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='f1')
    cv_scores.append(scores.mean())

# Print cross-validation scores for different number of trees
for num_tree, cv_score in zip(num_trees, cv_scores):
    print("Number of Trees:", num_tree)
    print("Mean CV Score:", cv_score)
    print()
Number of Trees: 10
Mean CV Score: 0.7594888178419236

Number of Trees: 50
Mean CV Score: 0.7651744380799489

Number of Trees: 100
Mean CV Score: 0.7663422673468246

Number of Trees: 200
Mean CV Score: 0.7675760508804794

Number of Trees: 300
Mean CV Score: 0.7682985673515799

In [33]:
# Perform 5-fold cross-validation
cv_scores = cross_val_score(rf, X, y, cv=5)

print("Cross-Validation Scores:", cv_scores)
Cross-Validation Scores: [0.747431   0.7497064  0.75455079 0.74427481 0.7473576 ]
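
For a like-for-like comparison with the logistic regression above, the mean of these scores can be printed as well; a small sketch:¶

In [ ]:
print("Mean cross-validation score:", cv_scores.mean())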

Performance comparisons¶

Comparison of metrics¶

In [34]:
# Create a DataFrame with model names and performance metrics
data = {
    'Model': ['Logit', 'Random Forest'],
    'Accuracy': [0.7833, 0.7467],
    'Precision': [0.7055, 0.7150],
    'Recall': [0.9725, 0.8205],
    'F1-score': [0.8178, 0.7641]
}

df = pd.DataFrame(data)

# Set the figure size
plt.figure(figsize=(10, 6))

# Define the position of each bar on the x-axis
bar_width = 0.15
index = df.index

# Plot the performance metrics as clustered columns
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']
colors = ['#FF7F0E', '#1F77B4', '#FFBB78', '#2CA02C']
offset = -2 * bar_width

for i, metric in enumerate(metrics):
    plt.bar(index + offset, df[metric], bar_width, label=metric, color=colors[i])
    offset += bar_width

# Set the x-axis labels
plt.xlabel('Model')
plt.ylabel('Metric Value')
plt.title('Performance Metrics Comparison')
plt.xticks(index, df['Model'], rotation=45)

# Add a legend
plt.legend(prop={'size': 8})

# Display the plot
plt.tight_layout()
plt.show()

Comparison of cross-validated scores¶

In [3]:
import plotly.graph_objects as go

# Create a DataFrame with model names and cross-validation scores
data = {
    'Model': ['Logit', 'Random Forest'],
    'CV Scores': [
        [0.7808, 0.7831, 0.7899, 0.7813, 0.7828],
        [0.7474, 0.7497, 0.7546, 0.7442, 0.7473]
    ]
}

df = pd.DataFrame(data)

# Convert range(1, 6) to a list
x_values = list(range(1, 6))

# Create the line graph using Plotly
fig = go.Figure()

for i in range(len(df)):
    fig.add_trace(go.Scatter(
        x=x_values,
        y=df['CV Scores'][i],
        mode='lines+markers',
        name=df['Model'][i]
    ))

fig.update_layout(
    xaxis=dict(title='Fold'),
    yaxis=dict(title='Cross-Validation Score'),
    title='Cross-Validation Scores Comparison'
)

fig.show()
In [39]:
# The Logit model outperformed the random forest model in all five folds; hence, it will be used to make the final predictions.

END ~¶