Using the French motor claims dataset to predict the likelihood of a vehicle insurance claim¶

Wonder Mahembe¶

We begin by loading the dataset¶

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Set the working directory to the folder containing the dataset
import os
os.chdir(r"C:\Users\dell vostro\Documents\Dalarna Masters_Wonder\Business Intelligence\Project Assignment")
In [22]:
insurance = pd.read_csv("freMTPL2freq.csv")
insurance.head()
Out[22]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72
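
Before engineering any features, it helps to confirm the column types and check for missing values; a minimal sketch:¶

In [ ]:
# Quick structural overview: dtypes, non-null counts and missing values per column
insurance.info()
print(insurance.isna().sum())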

Recoding the claim count into a binary target variable¶

In [23]:
# Create a new column 'Claim' based on the 'ClaimNb' column
insurance['Claim'] = (insurance['ClaimNb'] > 0).astype(int)

insurance.head()
Out[23]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region Claim
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82 1
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82 1
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22 1
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72 1
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72 1
In [12]:
# Let us see the distribution of classes. How many policyholders claimed versus those who did not?

print(insurance['Claim'].value_counts())
0    643953
1     34060
Name: Claim, dtype: int64
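
Roughly 5% of policies have at least one claim, so the classes are heavily imbalanced. The proportions can be confirmed directly; a small sketch:¶

In [ ]:
# Relative class frequencies (roughly 95% / 5% given the counts above)
print(insurance['Claim'].value_counts(normalize=True))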

Some feature engineering to prepare the data for predictive modelling¶

In [32]:
# Creating a list of populous regions
populous_regions = ['R82', 'R52', 'R53', 'R93', 'R11', 'R24']

# Creating the populous_region column
insurance['populous_region'] = insurance['Region'].apply(lambda x: 'Yes' if x in populous_regions else 'No')

# Displaying the updated dataframe
insurance.head()
Out[32]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region Claim populous_region low_miles special brands new_vehicle Claim_txt
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22 1 No Yes Yes Yes Yes
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
In [6]:
# Grouping the data by "populous_region" and "Claim" columns and counting the number of occurrences
grouped_data = insurance.groupby(['populous_region', 'Claim'])['IDpol'].count().reset_index()

# Pivot the data to have "Claim" values as columns
pivot_data = grouped_data.pivot(index='populous_region', columns='Claim', values='IDpol')

# Plotting the clustered column chart
pivot_data.plot(kind='bar', stacked=True)
plt.xlabel('Region')
plt.ylabel('Number of Claims')
plt.title('Number of Claims by Region')
plt.legend(title='Claim', labels=['No Claim', 'Claim'])
plt.show()
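
Raw counts depend on group sizes, so the claim rate within each group is also worth inspecting; a small sketch using the column created above:¶

In [ ]:
# Claim rate within each populous_region group (rows sum to 1)
print(pd.crosstab(insurance['populous_region'], insurance['Claim'], normalize='index'))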
In [33]:
# Creating the low_miles column: flag policies with a BonusMalus score below 109
# (lower bonus-malus scores correspond to a better claims history)
insurance['low_miles'] = insurance['BonusMalus'].apply(lambda x: 'Yes' if x < 109 else 'No')

# Displaying the updated dataframe
insurance.head()
Out[33]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region Claim populous_region low_miles special brands new_vehicle Claim_txt
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22 1 No Yes Yes Yes Yes
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
In [14]:
grouped_data = insurance.groupby(['low_miles', 'Claim'])['IDpol'].count().reset_index()
pivot_data = grouped_data.pivot(index='low_miles', columns='Claim', values='IDpol')

# Plotting the clustered column chart
pivot_data.plot(kind='bar', stacked=True)
plt.xlabel('low_miles')
plt.ylabel('Number of Claims')
plt.title('Number of Claims by bonus-malus level')
plt.legend(title='Claim', labels=['No Claim', 'Claim'])
plt.show()
In [34]:
# Creating a list of special vehicle brands
special_brands = ['B1', 'B12', 'B2', 'B3']

# Creating the special brands column
insurance['special brands'] = insurance['VehBrand'].apply(lambda x: 'Yes' if x in special_brands else 'No')

# Displaying the updated dataframe
insurance.head()
Out[34]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region Claim populous_region low_miles special brands new_vehicle Claim_txt
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22 1 No Yes Yes Yes Yes
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
In [11]:
grouped_data = insurance.groupby(['special brands', 'Claim'])['IDpol'].count().reset_index()
pivot_data = grouped_data.pivot(index='special brands', columns='Claim', values='IDpol')

# Plotting the clustered column chart
pivot_data.plot(kind='bar', stacked=True)
plt.xlabel('Brand of vehicle')
plt.ylabel('Number of Claims')
plt.title('Number of Claims by vehicle brand')
plt.legend(title='Claim', labels=['No Claim', 'Claim'])
plt.show()
In [35]:
# Creating the new_vehicle column
insurance['new_vehicle'] = insurance['VehAge'].apply(lambda x: 'Yes' if x < 30 else 'No')

# Displaying the updated dataframe
insurance.head()
Out[35]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region Claim populous_region low_miles special brands new_vehicle Claim_txt
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22 1 No Yes Yes Yes Yes
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
In [13]:
grouped_data = insurance.groupby(['new_vehicle', 'Claim'])['IDpol'].count().reset_index()
pivot_data = grouped_data.pivot(index='new_vehicle', columns='Claim', values='IDpol')

# Plotting the clustered column chart
pivot_data.plot(kind='bar', stacked=True)
plt.xlabel('new_vehicle')
plt.ylabel('Number of Claims')
plt.title('Number of Claims by vehicle age')
plt.legend(title='Claim', labels=['No Claim', 'Claim'])
plt.show()

Visualising quantitative features¶

In [36]:
# Make a new column with binary values 'Yes' and 'No' for use in boxplots
insurance['Claim_txt'] = pd.Series(['No' if x==0 else 'Yes' for x in insurance['Claim']])
insurance.head()
Out[36]:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region Claim populous_region low_miles special brands new_vehicle Claim_txt
0 1 1 0.10 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
1 3 1 0.77 D 5 0 55 50 B12 Regular 1217 R82 1 Yes Yes Yes Yes Yes
2 5 1 0.75 B 6 2 52 50 B12 Diesel 54 R22 1 No Yes Yes Yes Yes
3 10 1 0.09 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
4 11 1 0.84 B 7 0 46 50 B12 Diesel 76 R72 1 No Yes Yes Yes Yes
In [31]:
# Create a 2x2 grid of subplots
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

# Claims by different driver ages, using a boxplot
sns.boxplot(ax=axs[0, 0], data=insurance, x='Claim_txt', y='DrivAge')
axs[0, 0].set_title('Claims by Different Driver Ages')
axs[0, 0].set_xlabel('Claim')
axs[0, 0].set_ylabel('Driver Age')

# Claims by exposure periods, using a boxplot
sns.boxplot(ax=axs[0, 1], data=insurance, x='Claim_txt', y='Exposure')
axs[0, 1].set_title('Claims by Exposure Periods')
axs[0, 1].set_xlabel('Claim')
axs[0, 1].set_ylabel('Exposure')

# Claims by different population densities, using a boxplot
sns.boxplot(ax=axs[1, 0], data=insurance, x='Claim_txt', y='Density')
axs[1, 0].set_title('Claims by Different Population Densities')
axs[1, 0].set_xlabel('Claim')
axs[1, 0].set_ylabel('Density')

# Claims by different vehicle ages, using a boxplot
sns.boxplot(ax=axs[1, 1], data=insurance, x='Claim_txt', y='VehAge')
axs[1, 1].set_title('Claims by Different VehAge')
axs[1, 1].set_xlabel('Claim')
axs[1, 1].set_ylabel('VehAge')

plt.tight_layout()
plt.show()
In [ ]:
# Remove redundant columns
# Since some features have been re-coded, the original columns will now be removed from the dataframe
In [37]:
# Drop the original 'BonusMalus', 'VehBrand', 'VehAge' and 'Region' columns (now re-coded) and the helper column 'Claim_txt'
insurance.drop(['BonusMalus', 'VehBrand', 'VehAge', 'Region', 'Claim_txt'], axis=1, inplace=True)
insurance.head()
Out[37]:
IDpol ClaimNb Exposure Area VehPower DrivAge VehGas Density Claim populous_region low_miles special brands new_vehicle
0 1 1 0.10 D 5 55 Regular 1217 1 Yes Yes Yes Yes
1 3 1 0.77 D 5 55 Regular 1217 1 Yes Yes Yes Yes
2 5 1 0.75 B 6 52 Diesel 54 1 No Yes Yes Yes
3 10 1 0.09 B 7 46 Diesel 76 1 No Yes Yes Yes
4 11 1 0.84 B 7 46 Diesel 76 1 No Yes Yes Yes

In preparation for predictive analysis, we generate dummy variables for the categorical predictors¶

In [18]:
# Perform one-hot encoding on the categorical variables
insurance2 = pd.get_dummies(insurance, columns=['populous_region', 'VehGas', 'new_vehicle', 'Area', 'special brands', 'low_miles'])
insurance2.head()
Out[18]:
IDpol ClaimNb Exposure VehPower DrivAge Density Claim populous_region_No populous_region_Yes VehGas_Diesel ... Area_A Area_B Area_C Area_D Area_E Area_F special brands_No special brands_Yes low_miles_No low_miles_Yes
0 1 1 0.10 5 55 1217 1 0 1 0 ... 0 0 0 1 0 0 0 1 0 1
1 3 1 0.77 5 55 1217 1 0 1 0 ... 0 0 0 1 0 0 0 1 0 1
2 5 1 0.75 6 52 54 1 1 0 1 ... 0 1 0 0 0 0 0 1 0 1
3 10 1 0.09 7 46 76 1 1 0 1 ... 0 1 0 0 0 0 0 1 0 1
4 11 1 0.84 7 46 76 1 1 0 1 ... 0 1 0 0 0 0 0 1 0 1

5 rows × 23 columns

In [ ]:
# Some of the dummy columns are strongly correlated with one another; the redundant ones will be dropped later, but before that ...
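
One way to avoid the redundant dummy pairs in the first place is to drop one level per variable during encoding; a sketch (insurance2_alt is only an illustrative name):¶

In [ ]:
# Alternative encoding: drop_first=True removes one dummy per variable,
# so complementary columns such as populous_region_No / populous_region_Yes cannot co-occur
insurance2_alt = pd.get_dummies(
    insurance,
    columns=['populous_region', 'VehGas', 'new_vehicle', 'Area', 'special brands', 'low_miles'],
    drop_first=True
)
insurance2_alt.head()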

We'll use the ANOVA F-test to select the 10 best features¶

In [20]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

X = insurance2.drop(['IDpol', 'ClaimNb', 'Claim'], axis=1)
y = insurance2['Claim']

selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X, y)

# Get the selected feature indices and names
selected_indices = selector.get_support(indices=True)
selected_names = X.columns[selected_indices]

# Create a new dataset with the selected features
insurance_selected = insurance2[selected_names].join(insurance2[['Claim']])
insurance_selected.head()
Out[20]:
Exposure DrivAge populous_region_No populous_region_Yes VehGas_Diesel VehGas_Regular new_vehicle_No new_vehicle_Yes low_miles_No low_miles_Yes Claim
0 0.10 55 0 1 0 1 0 1 0 1 1
1 0.77 55 0 1 0 1 0 1 0 1 1
2 0.75 52 1 0 1 0 0 1 0 1 1
3 0.09 46 1 0 1 0 0 1 0 1 1
4 0.84 46 1 0 1 0 0 1 0 1 1
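
The selector also exposes the ANOVA F-score of every candidate feature, which makes the ranking behind the selection transparent; a sketch using the fitted selector:¶

In [ ]:
# ANOVA F-scores for all candidate features, highest first
anova_scores = pd.Series(selector.scores_, index=X.columns).sort_values(ascending=False)
print(anova_scores)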

Check for correlations in the set of selected variables¶

In [22]:
# Drop the "Claim" column from the dataset before calculating the correlation matrix
insurance_selected_without_claim = insurance_selected.drop('Claim', axis=1)

# Create correlation matrix
cor_matrix = insurance_selected_without_claim.corr()

# Create correlation heatmap
sns.set(style='white')
mask = np.triu(np.ones_like(cor_matrix, dtype=bool))
cmap = sns.diverging_palette(20, 220, as_cmap=True)
plt.figure(figsize=(10, 8))
sns.heatmap(cor_matrix, mask=mask, cmap=cmap, center=0, annot=True, fmt='.2f', square=True,
            cbar_kws={'shrink': .75}, linewidths=.5, annot_kws={'size': 10})
plt.title('Correlation Heatmap of Numeric Variables in selected Dataset')
plt.show()

The redundant dummy columns, which are highly correlated with their complementary counterparts, will be removed¶

In [23]:
# Drop the redundant complementary dummy columns from insurance_selected
insurance_selected.drop(['populous_region_No', 'VehGas_Diesel', 'new_vehicle_No', 'low_miles_No'], axis=1, inplace=True)
insurance_selected.head()
Out[23]:
Exposure DrivAge populous_region_Yes VehGas_Regular new_vehicle_Yes low_miles_Yes Claim
0 0.10 55 1 1 1 1 1
1 0.77 55 1 1 1 1 1
2 0.75 52 0 0 1 1 1
3 0.09 46 0 0 1 1 1
4 0.84 46 0 0 1 1 1

We previously established that there is a major imbalance in classes. Now we will perform resampling to ensure class balance before predictive analysis.¶

In [ ]:
# Since the dataset is relatively large, I decided to downsample the majority class as opposed to 
# oversampling the minority class
In [24]:
# Separate minority and majority classes
minority_class = insurance_selected[insurance_selected['Claim'] == 1]
majority_class = insurance_selected[insurance_selected['Claim'] == 0]

# Determine the size of the minority class
minority_size = len(minority_class)

# Randomly sample majority class observations
downsampled_majority = majority_class.sample(n=minority_size, random_state=42)

# Combine minority class with downsampled majority class
insurance_downsampled = pd.concat([minority_class, downsampled_majority])

# Shuffle the new DataFrame
insurance_downsampled = insurance_downsampled.sample(frac=1, random_state=42).reset_index(drop=True)
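
A quick check that the downsampled dataset is now balanced; a small sketch:¶

In [ ]:
# Both classes should now contain the same number of policies
print(insurance_downsampled['Claim'].value_counts())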

Algorithm 1: Logistic Regression¶

In [25]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X = insurance_downsampled.drop('Claim', axis=1)
y = insurance_downsampled['Claim']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the logistic regression model
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Out[25]:
LogisticRegression()
In [26]:
# Make predictions on the test set
y_pred = logreg.predict(X_test)

# Compute the confusion matrix, accuracy, precision, recall, and F1-score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

confusion = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Confusion matrix:\n', confusion)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)
Confusion matrix:
 [[6070 4148]
 [ 281 9937]]
Accuracy: 0.7832746134272852
Precision: 0.7055023074192404
Recall: 0.9724995106674496
F1-score: 0.8177591243879355
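
Since the 0.5 classification threshold is arbitrary, a threshold-independent metric such as ROC AUC is a useful complement; a sketch using the fitted model:¶

In [ ]:
from sklearn.metrics import roc_auc_score

# Predicted probability of a claim for each test observation
y_proba = logreg.predict_proba(X_test)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, y_proba))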
In [27]:
# Perform 5-fold cross validation and print the validation scores
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(logreg, X, y, cv=5)
print('Cross-validation scores:', cv_scores)
print('Mean cross-validation score:', cv_scores.mean())
Cross-validation scores: [0.78075455 0.78317675 0.78985614 0.78134175 0.78280975]
Mean cross-validation score: 0.783587786259542

Algorithm 2: Random Forest¶

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

# Fit Random Forest model
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_train, y_train)
Out[31]:
RandomForestClassifier(n_estimators=50, random_state=42)
In [32]:
# Make predictions on the test set
y_pred = rf.predict(X_test)

# Compute evaluation metrics
confusion_mtx = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print("Confusion Matrix:\n", confusion_mtx)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
Confusion Matrix:
 [[6876 3342]
 [1834 8384]]
Accuracy: 0.7467214719123116
Precision: 0.7149923247484223
Recall: 0.8205128205128205
F1-score: 0.7641268683922713
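
The random forest also provides impurity-based feature importances, which indicate which predictors carry most of the claims signal; a sketch using the fitted model:¶

In [ ]:
# Feature importances from the fitted forest, highest first
importances = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(importances)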
In [30]:
# Determine optimum number of trees
num_trees = [10, 50, 100, 200, 300]
cv_scores = []
for num_tree in num_trees:
    rf = RandomForestClassifier(n_estimators=num_tree, random_state=42)
    scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='f1')
    cv_scores.append(scores.mean())

# Print cross-validation scores for different number of trees
for num_tree, cv_score in zip(num_trees, cv_scores):
    print("Number of Trees:", num_tree)
    print("Mean CV Score:", cv_score)
    print()
Number of Trees: 10
Mean CV Score: 0.7594888178419236

Number of Trees: 50
Mean CV Score: 0.7651744380799489

Number of Trees: 100
Mean CV Score: 0.7663422673468246

Number of Trees: 200
Mean CV Score: 0.7675760508804794

Number of Trees: 300
Mean CV Score: 0.7682985673515799

In [33]:
# Perform 5-fold cross-validation
cv_scores = cross_val_score(rf, X, y, cv=5)

print("Cross-Validation Scores:", cv_scores)
Cross-Validation Scores: [0.747431   0.7497064  0.75455079 0.74427481 0.7473576 ]
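
For a like-for-like comparison with the logistic regression above, the mean of these scores can be printed as well; a small sketch:¶

In [ ]:
print("Mean cross-validation score:", cv_scores.mean())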

Performance comparisons¶

Comparison of metrics¶

In [34]:
# Create a DataFrame with model names and performance metrics
data = {
    'Model': ['Logit', 'Random Forest'],
    'Accuracy': [0.7833, 0.7467],
    'Precision': [0.7055, 0.7150],
    'Recall': [0.9725, 0.8205],
    'F1-score': [0.8178, 0.7641]
}

df = pd.DataFrame(data)

# Set the figure size
plt.figure(figsize=(10, 6))

# Define the position of each bar on the x-axis
bar_width = 0.15
index = df.index

# Plot the performance metrics as clustered columns
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']
colors = ['#FF7F0E', '#1F77B4', '#FFBB78', '#2CA02C']
offset = -2 * bar_width

for i, metric in enumerate(metrics):
    plt.bar(index + offset, df[metric], bar_width, label=metric, color=colors[i])
    offset += bar_width

# Set the x-axis labels
plt.xlabel('Model')
plt.ylabel('Metric Value')
plt.title('Performance Metrics Comparison')
plt.xticks(index, df['Model'], rotation=45)

# Add a legend
plt.legend(prop={'size': 8})

# Display the plot
plt.tight_layout()
plt.show()

Comparison of cross-validated scores¶

In [3]:
import plotly.graph_objects as go

# Create a DataFrame with model names and cross-validation scores
data = {
    'Model': ['Logit', 'Random Forest'],
    'CV Scores': [
        [0.7808, 0.7831, 0.7899, 0.7813, 0.7828],
        [0.7474, 0.7497, 0.7546, 0.7442, 0.7473]
    ]
}

df = pd.DataFrame(data)

# Convert range(1, 6) to a list
x_values = list(range(1, 6))

# Create the line graph using Plotly
fig = go.Figure()

for i in range(len(df)):
    fig.add_trace(go.Scatter(
        x=x_values,
        y=df['CV Scores'][i],
        mode='lines+markers',
        name=df['Model'][i]
    ))

fig.update_layout(
    xaxis=dict(title='Fold'),
    yaxis=dict(title='Cross-Validation Score'),
    title='Cross-Validation Scores Comparison'
)

fig.show()
In [39]:
# The Logit model outperformed the random forest model in all five folds; hence, it will be used to make the final predictions.

END ~¶