import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Set the working directory to the folder containing the dataset
import os
os.chdir(r"C:\Users\dell vostro\Documents\Dalarna Masters_Wonder\Business Intelligence\Project Assignment")
insurance = pd.read_csv("freMTPL2freq.csv")
insurance.head()
| | IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 |
| 1 | 3 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 |
| 2 | 5 | 1 | 0.75 | B | 6 | 2 | 52 | 50 | B12 | Diesel | 54 | R22 |
| 3 | 10 | 1 | 0.09 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 |
| 4 | 11 | 1 | 0.84 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 |
# Create a new column 'Claim' based on the 'ClaimNb' column
insurance['Claim'] = (insurance['ClaimNb'] > 0).astype(int)
insurance.head()
| | IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | Claim |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 1 |
| 1 | 3 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 1 |
| 2 | 5 | 1 | 0.75 | B | 6 | 2 | 52 | 50 | B12 | Diesel | 54 | R22 | 1 |
| 3 | 10 | 1 | 0.09 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 | 1 |
| 4 | 11 | 1 | 0.84 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 | 1 |
# Let us see the distribution of classes. How many policyholders claimed versus those who did not?
print(insurance['Claim'].value_counts())
0    643953
1     34060
Name: Claim, dtype: int64
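# The classes are heavily imbalanced: roughly 34,060 of about 678,000 policies (about 5%) have a claim.
# A minimal optional check computes that share directly from the new 'Claim' flag:
print(f"Claim rate: {insurance['Claim'].mean():.2%}")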
# Creating a list of populous regions
populous_regions = ['R82', 'R52', 'R53', 'R93', 'R11', 'R24']
# Creating the populous_region column
insurance['populous_region'] = insurance['Region'].apply(lambda x: 'Yes' if x in populous_regions else 'No')
# Displaying the updated dataframe
insurance.head()
| | IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | Claim | populous_region |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 1 | Yes |
| 1 | 3 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 1 | Yes |
| 2 | 5 | 1 | 0.75 | B | 6 | 2 | 52 | 50 | B12 | Diesel | 54 | R22 | 1 | No |
| 3 | 10 | 1 | 0.09 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 | 1 | No |
| 4 | 11 | 1 | 0.84 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 | 1 | No |
# Grouping the data by "populous_region" and "Claim" columns and counting the number of occurrences
grouped_data = insurance.groupby(['populous_region', 'Claim'])['IDpol'].count().reset_index()
# Pivot the data to have "Claim" values as columns
pivot_data = grouped_data.pivot(index='populous_region', columns='Claim', values='IDpol')
# Plotting the clustered column chart
pivot_data.plot(kind='bar', stacked=True)
plt.xlabel('Region')
plt.ylabel('Number of Claims')
plt.title('Number of Claims by Region')
plt.legend(title='Claim', labels=['No Claim', 'Claim'])
plt.show()
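# Because the two groups contain very different numbers of policies, raw counts are hard to compare.
# An optional sketch (the same idea applies to the other engineered flags below): a row-normalised
# crosstab gives the claim rate within each group.
claim_rates = pd.crosstab(insurance['populous_region'], insurance['Claim'], normalize='index')
print(claim_rates)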
# Creating the low_miles column (flags policies with a bonus-malus level below 109)
insurance['low_miles'] = insurance['BonusMalus'].apply(lambda x: 'Yes' if x < 109 else 'No')
# Displaying the updated dataframe
insurance.head()
| | IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | Claim | populous_region | low_miles |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 1 | Yes | Yes |
| 1 | 3 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 1 | Yes | Yes |
| 2 | 5 | 1 | 0.75 | B | 6 | 2 | 52 | 50 | B12 | Diesel | 54 | R22 | 1 | No | Yes |
| 3 | 10 | 1 | 0.09 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 | 1 | No | Yes |
| 4 | 11 | 1 | 0.84 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 | 1 | No | Yes |
grouped_data = insurance.groupby(['low_miles', 'Claim'])['IDpol'].count().reset_index()
pivot_data = grouped_data.pivot(index='low_miles', columns='Claim', values='IDpol')
# Plotting the clustered column chart
pivot_data.plot(kind='bar', stacked=True)
plt.xlabel('low_miles')
plt.ylabel('Number of Claims')
plt.title('Number of Claims by bonus-malus level')
plt.legend(title='Claim', labels=['No Claim', 'Claim'])
plt.show()
# Creating a list of special vehicle brands
special_brands = ['B1', 'B12', 'B2', 'B3']
# Creating the special brands column
insurance['special brands'] = insurance['VehBrand'].apply(lambda x: 'Yes' if x in special_brands else 'No')
# Displaying the updated dataframe
insurance.head()
| | IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | Claim | populous_region | low_miles | special brands |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 1 | Yes | Yes | Yes |
| 1 | 3 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 1 | Yes | Yes | Yes |
| 2 | 5 | 1 | 0.75 | B | 6 | 2 | 52 | 50 | B12 | Diesel | 54 | R22 | 1 | No | Yes | Yes |
| 3 | 10 | 1 | 0.09 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 | 1 | No | Yes | Yes |
| 4 | 11 | 1 | 0.84 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 | 1 | No | Yes | Yes |
grouped_data = insurance.groupby(['special brands', 'Claim'])['IDpol'].count().reset_index()
pivot_data = grouped_data.pivot(index='special brands', columns='Claim', values='IDpol')
# Plotting the clustered column chart
pivot_data.plot(kind='bar', stacked=True)
plt.xlabel('Brand of vehicle')
plt.ylabel('Number of Claims')
plt.title('Number of Claims by vehicle brand')
plt.legend(title='Claim', labels=['No Claim', 'Claim'])
plt.show()
# Creating the new_vehicle column
insurance['new_vehicle'] = insurance['VehAge'].apply(lambda x: 'Yes' if x < 30 else 'No')
# Displaying the updated dataframe
insurance.head()
| | IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | Claim | populous_region | low_miles | special brands | new_vehicle |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 1 | Yes | Yes | Yes | Yes |
| 1 | 3 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 1 | Yes | Yes | Yes | Yes |
| 2 | 5 | 1 | 0.75 | B | 6 | 2 | 52 | 50 | B12 | Diesel | 54 | R22 | 1 | No | Yes | Yes | Yes |
| 3 | 10 | 1 | 0.09 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 | 1 | No | Yes | Yes | Yes |
| 4 | 11 | 1 | 0.84 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 | 1 | No | Yes | Yes | Yes |
grouped_data = insurance.groupby(['new_vehicle', 'Claim'])['IDpol'].count().reset_index()
pivot_data = grouped_data.pivot(index='new_vehicle', columns='Claim', values='IDpol')
# Plotting the clustered column chart
pivot_data.plot(kind='bar', stacked=True)
plt.xlabel('new_vehicle')
plt.ylabel('Number of Claims')
plt.title('Number of Claims by vehicle age')
plt.legend(title='Claim', labels=['No Claim', 'Claim'])
plt.show()
# Make a new column with binary values "Yes" and "No" for use in boxplots
insurance['Claim_txt'] = insurance['Claim'].map({0: 'No', 1: 'Yes'})
insurance.head()
| | IDpol | ClaimNb | Exposure | Area | VehPower | VehAge | DrivAge | BonusMalus | VehBrand | VehGas | Density | Region | Claim | populous_region | low_miles | special brands | new_vehicle | Claim_txt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.10 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 1 | Yes | Yes | Yes | Yes | Yes |
| 1 | 3 | 1 | 0.77 | D | 5 | 0 | 55 | 50 | B12 | Regular | 1217 | R82 | 1 | Yes | Yes | Yes | Yes | Yes |
| 2 | 5 | 1 | 0.75 | B | 6 | 2 | 52 | 50 | B12 | Diesel | 54 | R22 | 1 | No | Yes | Yes | Yes | Yes |
| 3 | 10 | 1 | 0.09 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 | 1 | No | Yes | Yes | Yes | Yes |
| 4 | 11 | 1 | 0.84 | B | 7 | 0 | 46 | 50 | B12 | Diesel | 76 | R72 | 1 | No | Yes | Yes | Yes | Yes |
# Create a 2x2 grid of subplots
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
# Claims by different driver ages, using a boxplot
sns.boxplot(ax=axs[0, 0], data=insurance, x='Claim_txt', y='DrivAge')
axs[0, 0].set_title('Claims by Different Driver Ages')
axs[0, 0].set_xlabel('Claim')
axs[0, 0].set_ylabel('Driver Age')
# Claims by exposure periods, using a boxplot
sns.boxplot(ax=axs[0, 1], data=insurance, x='Claim_txt', y='Exposure')
axs[0, 1].set_title('Claims by Exposure Periods')
axs[0, 1].set_xlabel('Claim')
axs[0, 1].set_ylabel('Exposure')
# Claims by different population densities, using a boxplot
sns.boxplot(ax=axs[1, 0], data=insurance, x='Claim_txt', y='Density')
axs[1, 0].set_title('Claims by Different Population Densities')
axs[1, 0].set_xlabel('Claim')
axs[1, 0].set_ylabel('Density')
# Claims by different vehicle ages, using a boxplot
sns.boxplot(ax=axs[1, 1], data=insurance, x='Claim_txt', y='VehAge')
axs[1, 1].set_title('Claims by Different VehAge')
axs[1, 1].set_xlabel('Claim')
axs[1, 1].set_ylabel('VehAge')
plt.tight_layout()
plt.show()
# Delete some redundant columns
# Since these features have been re-coded into the new flag columns, the originals
# (along with the helper column 'Claim_txt') are removed from the dataframe
insurance.drop(['BonusMalus', 'VehBrand', 'VehAge', 'Region', 'Claim_txt'], axis=1, inplace=True)
insurance.head()
| | IDpol | ClaimNb | Exposure | Area | VehPower | DrivAge | VehGas | Density | Claim | populous_region | low_miles | special brands | new_vehicle |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.10 | D | 5 | 55 | Regular | 1217 | 1 | Yes | Yes | Yes | Yes |
| 1 | 3 | 1 | 0.77 | D | 5 | 55 | Regular | 1217 | 1 | Yes | Yes | Yes | Yes |
| 2 | 5 | 1 | 0.75 | B | 6 | 52 | Diesel | 54 | 1 | No | Yes | Yes | Yes |
| 3 | 10 | 1 | 0.09 | B | 7 | 46 | Diesel | 76 | 1 | No | Yes | Yes | Yes |
| 4 | 11 | 1 | 0.84 | B | 7 | 46 | Diesel | 76 | 1 | No | Yes | Yes | Yes |
# Perform one-hot encoding on the categorical variables
insurance2 = pd.get_dummies(insurance, columns=['populous_region', 'VehGas', 'new_vehicle', 'Area', 'special brands', 'low_miles'])
insurance2.head()
| | IDpol | ClaimNb | Exposure | VehPower | DrivAge | Density | Claim | populous_region_No | populous_region_Yes | VehGas_Diesel | ... | Area_A | Area_B | Area_C | Area_D | Area_E | Area_F | special brands_No | special brands_Yes | low_miles_No | low_miles_Yes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.10 | 5 | 55 | 1217 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| 1 | 3 | 1 | 0.77 | 5 | 55 | 1217 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| 2 | 5 | 1 | 0.75 | 6 | 52 | 54 | 1 | 1 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 3 | 10 | 1 | 0.09 | 7 | 46 | 76 | 1 | 1 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 4 | 11 | 1 | 0.84 | 7 | 46 | 76 | 1 | 1 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
5 rows × 23 columns
# Some of the dummy variables are strongly correlated (each 'No' column is the complement of its 'Yes'
# column), so some will have to be dropped. Before that, select the most informative features.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
X = insurance2.drop(['IDpol', 'ClaimNb', 'Claim'], axis=1)
y = insurance2['Claim']
selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X, y)
# Get the selected feature indices and names
selected_indices = selector.get_support(indices=True)
selected_names = X.columns[selected_indices]
# Create a new dataset with the selected features
insurance_selected = insurance2[selected_names].join(insurance2[['Claim']])
insurance_selected.head()
| | Exposure | DrivAge | populous_region_No | populous_region_Yes | VehGas_Diesel | VehGas_Regular | new_vehicle_No | new_vehicle_Yes | low_miles_No | low_miles_Yes | Claim |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.10 | 55 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 1 |
| 1 | 0.77 | 55 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 1 |
| 2 | 0.75 | 52 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 |
| 3 | 0.09 | 46 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 |
| 4 | 0.84 | 46 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 |
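# To see why these ten features were retained, the ANOVA F-scores computed by the fitted selector
# can be listed (a small optional sketch; higher scores indicate a stronger association with 'Claim'):
f_scores = pd.Series(selector.scores_, index=X.columns)
print(f_scores.sort_values(ascending=False).head(10))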
# Drop the "Claim" column from the dataset before calculating the correlation matrix
insurance_selected_without_claim = insurance_selected.drop('Claim', axis=1)
# Create correlation matrix
cor_matrix = insurance_selected_without_claim.corr()
# Create correlation heatmap
sns.set(style='white')
mask = np.triu(np.ones_like(cor_matrix, dtype=bool))
cmap = sns.diverging_palette(20, 220, as_cmap=True)
plt.figure(figsize=(10, 8))
sns.heatmap(cor_matrix, mask=mask, cmap=cmap, center=0, annot=True, fmt='.2f', square=True,
cbar_kws={'shrink': .75}, linewidths=.5, annot_kws={'size': 10})
plt.title('Correlation Heatmap of Numeric Variables in selected Dataset')
plt.show()
# Drop the complementary dummy columns from insurance_selected; each is perfectly negatively correlated with its retained counterpart
insurance_selected.drop(['populous_region_No', 'VehGas_Diesel', 'new_vehicle_No', 'low_miles_No'], axis=1, inplace=True)
insurance_selected.head()
| | Exposure | DrivAge | populous_region_Yes | VehGas_Regular | new_vehicle_Yes | low_miles_Yes | Claim |
|---|---|---|---|---|---|---|---|
| 0 | 0.10 | 55 | 1 | 1 | 1 | 1 | 1 |
| 1 | 0.77 | 55 | 1 | 1 | 1 | 1 | 1 |
| 2 | 0.75 | 52 | 0 | 0 | 1 | 1 | 1 |
| 3 | 0.09 | 46 | 0 | 0 | 1 | 1 | 1 |
| 4 | 0.84 | 46 | 0 | 0 | 1 | 1 | 1 |
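# As an aside, pandas can drop one level per categorical at encoding time, which avoids creating
# the complementary dummy pairs in the first place. An optional alternative sketch (not used further here):
insurance2_alt = pd.get_dummies(
    insurance,
    columns=['populous_region', 'VehGas', 'new_vehicle', 'Area', 'special brands', 'low_miles'],
    drop_first=True,
)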
# Since the dataset is relatively large, I decided to downsample the majority class as opposed to
# oversampling the minority class
# Separate minority and majority classes
minority_class = insurance_selected[insurance_selected['Claim'] == 1]
majority_class = insurance_selected[insurance_selected['Claim'] == 0]
# Determine the size of the minority class
minority_size = len(minority_class)
# Randomly sample majority class observations
downsampled_majority = majority_class.sample(n=minority_size, random_state=42)
# Combine minority class with downsampled majority class
insurance_downsampled = pd.concat([minority_class, downsampled_majority])
# Shuffle the new DataFrame
insurance_downsampled = insurance_downsampled.sample(frac=1, random_state=42).reset_index(drop=True)
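# Optional sanity check: after downsampling, both classes should contain 34,060 policies each
print(insurance_downsampled['Claim'].value_counts())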
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X = insurance_downsampled.drop('Claim', axis=1)
y = insurance_downsampled['Claim']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
LogisticRegression()
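# For interpretation, the fitted log-odds coefficients can be inspected (a small optional sketch;
# positive values push a policy towards the 'Claim' class):
coefficients = pd.Series(logreg.coef_[0], index=X_train.columns)
print(coefficients.sort_values(ascending=False))
print('Intercept:', logreg.intercept_[0])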
# Make predictions on the test set
y_pred = logreg.predict(X_test)
# Compute the confusion matrix, accuracy, precision, recall, and F1-score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
confusion = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print('Confusion matrix:\n', confusion)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)
Confusion matrix:
 [[6070 4148]
 [ 281 9937]]
Accuracy: 0.7832746134272852
Precision: 0.7055023074192404
Recall: 0.9724995106674496
F1-score: 0.8177591243879355
# Perform 5-fold cross validation and print the validation scores
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(logreg, X, y, cv=5)
print('Cross-validation scores:', cv_scores)
print('Mean cross-validation score:', cv_scores.mean())
Cross-validation scores: [0.78075455 0.78317675 0.78985614 0.78134175 0.78280975]
Mean cross-validation score: 0.783587786259542
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
# Fit Random Forest model
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_train, y_train)
RandomForestClassifier(n_estimators=50, random_state=42)
# Make predictions on the test set
y_pred = rf.predict(X_test)
# Compute evaluation metrics
confusion_mtx = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# Print evaluation metrics
print("Confusion Matrix:\n", confusion_mtx)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
Confusion Matrix:
 [[6876 3342]
 [1834 8384]]
Accuracy: 0.7467214719123116
Precision: 0.7149923247484223
Recall: 0.8205128205128205
F1-score: 0.7641268683922713
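# As a complementary view, the fitted forest exposes impurity-based feature importances
# (an optional sketch; the importances sum to 1 across the features):
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False))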
# Determine optimum number of trees
num_trees = [10, 50, 100, 200, 300]
cv_scores = []
for num_tree in num_trees:
    rf = RandomForestClassifier(n_estimators=num_tree, random_state=42)
    scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='f1')
    cv_scores.append(scores.mean())
# Print cross-validation scores for different numbers of trees
for num_tree, cv_score in zip(num_trees, cv_scores):
    print("Number of Trees:", num_tree)
    print("Mean CV Score:", cv_score)
    print()
Number of Trees: 10
Mean CV Score: 0.7594888178419236

Number of Trees: 50
Mean CV Score: 0.7651744380799489

Number of Trees: 100
Mean CV Score: 0.7663422673468246

Number of Trees: 200
Mean CV Score: 0.7675760508804794

Number of Trees: 300
Mean CV Score: 0.7682985673515799
# Perform 5-fold cross-validation (note: rf now refers to the 300-tree estimator created last in the loop above)
cv_scores = cross_val_score(rf, X, y, cv=5)
print("Cross-Validation Scores:", cv_scores)
Cross-Validation Scores: [0.747431 0.7497064 0.75455079 0.74427481 0.7473576 ]
# Create a DataFrame with model names and performance metrics
data = {
    'Model': ['Logit', 'Random Forest'],
    'Accuracy': [0.7833, 0.7467],
    'Precision': [0.7055, 0.7150],
    'Recall': [0.9725, 0.8205],
    'F1-score': [0.8178, 0.7641]
}
df = pd.DataFrame(data)
# Set the figure size
plt.figure(figsize=(10, 6))
# Define the position of each bar on the x-axis
bar_width = 0.15
index = df.index
# Plot the performance metrics as clustered columns
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']
colors = ['#FF7F0E', '#1F77B4', '#FFBB78', '#2CA02C']
offset = -1.5 * bar_width  # start here so the four bars are centred on each x tick
for i, metric in enumerate(metrics):
    plt.bar(index + offset, df[metric], bar_width, label=metric, color=colors[i])
    offset += bar_width
# Set the x-axis labels
plt.xlabel('Model')
plt.ylabel('Metric Value')
plt.title('Performance Metrics Comparison')
plt.xticks(index, df['Model'], rotation=45)
# Add a legend
plt.legend(prop={'size': 8})
# Display the plot
plt.tight_layout()
plt.show()
import plotly.graph_objects as go
# Create a DataFrame with model names and cross-validation scores
data = {
    'Model': ['Logit', 'Random Forest'],
    'CV Scores': [
        [0.7808, 0.7831, 0.7899, 0.7813, 0.7828],
        [0.7474, 0.7497, 0.7546, 0.7442, 0.7473]
    ]
}
df = pd.DataFrame(data)
# Convert range(1, 6) to a list
x_values = list(range(1, 6))
# Create the line graph using Plotly
fig = go.Figure()
for i in range(len(df)):
    fig.add_trace(go.Scatter(
        x=x_values,
        y=df['CV Scores'][i],
        mode='lines+markers',
        name=df['Model'][i]
    ))
fig.update_layout(
    xaxis=dict(title='Fold'),
    yaxis=dict(title='Cross-Validation Score'),
    title='Cross-Validation Scores Comparison'
)
fig.show()
# The Logit model outperformed the random forest model in all five folds; hence, it will be used to make the final predictions.
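# A possible final step (illustrative sketch only; 'final_logreg', 'full_X' and 'claim_probability'
# are new names introduced here): refit the chosen logit model on the full downsampled data and
# attach predicted claim probabilities to every policy in the selected feature set.
final_logreg = LogisticRegression()
final_logreg.fit(X, y)  # X, y are the downsampled features and target defined earlier
full_X = insurance_selected.drop('Claim', axis=1)
insurance_selected['claim_probability'] = final_logreg.predict_proba(full_X)[:, 1]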