import pandas as pd
import numpy as np
import os
# Work from the lab folder so the relative CSV path below resolves.
# NOTE(review): this is a machine-specific absolute path; adjust it when
# running the notebook anywhere else.
os.chdir(r'C:\Users\dell vostro\Documents\Dalarna Masters_Wonder\Business Intelligence\Lab3 Machine Learning')
# Load the dataset from the CSV file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is what produced the "mixed types"
# DtypeWarning for the columns that contain '?' placeholders.
electricity = pd.read_csv('electricitydata.csv', low_memory=False)
# Display the first few rows of the dataset
electricity.head()
C:\Users\dell vostro\AppData\Local\Temp\ipykernel_8212\3839329021.py:2: DtypeWarning: Columns (9,10,11,14,15,16,17) have mixed types. Specify dtype option on import or set low_memory=False.
electricity = pd.read_csv('electricitydata.csv')
| | DateTime | Holiday | HolidayFlag | DayOfWeek | WeekOfYear | Day | Month | Year | PeriodOfDay | ForecastWindProduction | SystemLoadEA | SMPEA | ORKTemperature | ORKWindspeed | CO2Intensity | ActualWindProduction | SystemLoadEP2 | SMPEP2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01/11/2011 00:00 | None | 0 | 1 | 44 | 1 | 11 | 2011 | 0 | 315.31 | 3388.77 | 49.26 | 6.00 | 9.30 | 600.71 | 356.00 | 3159.60 | 54.32 |
| 1 | 01/11/2011 00:30 | None | 0 | 1 | 44 | 1 | 11 | 2011 | 1 | 321.80 | 3196.66 | 49.26 | 6.00 | 11.10 | 605.42 | 317.00 | 2973.01 | 54.23 |
| 2 | 01/11/2011 01:00 | None | 0 | 1 | 44 | 1 | 11 | 2011 | 2 | 328.57 | 3060.71 | 49.10 | 5.00 | 11.10 | 589.97 | 311.00 | 2834.00 | 54.23 |
| 3 | 01/11/2011 01:30 | None | 0 | 1 | 44 | 1 | 11 | 2011 | 3 | 335.60 | 2945.56 | 48.04 | 6.00 | 9.30 | 585.94 | 313.00 | 2725.99 | 53.47 |
| 4 | 01/11/2011 02:00 | None | 0 | 1 | 44 | 1 | 11 | 2011 | 4 | 342.90 | 2849.34 | 33.75 | 6.00 | 11.10 | 571.52 | 346.00 | 2655.64 | 39.87 |
# The raw file marks unknown readings with '?'; turn those into NaN so that
# pandas recognises them as missing (the frame is modified in place).
electricity.replace(to_replace='?', value=np.nan, inplace=True)
# Per-row tally of missing cells
num_missing = electricity.isnull().sum(axis='columns')
# A row counts as incomplete when its tally is non-zero
rows_with_gaps = num_missing.gt(0)
num_rows_missing = rows_with_gaps.sum()
print(f"Number of rows with at least one missing value: {num_rows_missing}")
Number of rows with at least one missing value: 332
# Discard every row that still has a missing cell (in place)
electricity.dropna(axis=0, how='any', inplace=True)
# Sanity check: the per-column missing count should now be zero everywhere
remaining_nulls = electricity.isna().sum()
print(remaining_nulls)
DateTime 0 Holiday 0 HolidayFlag 0 DayOfWeek 0 WeekOfYear 0 Day 0 Month 0 Year 0 PeriodOfDay 0 ForecastWindProduction 0 SystemLoadEA 0 SMPEA 0 ORKTemperature 0 ORKWindspeed 0 CO2Intensity 0 ActualWindProduction 0 SystemLoadEP2 0 SMPEP2 0 dtype: int64
# Inspect the dtype pandas assigned to every column
column_types = electricity.dtypes
print(column_types)
DateTime object Holiday object HolidayFlag int64 DayOfWeek int64 WeekOfYear int64 Day int64 Month int64 Year int64 PeriodOfDay int64 ForecastWindProduction object SystemLoadEA object SMPEA object ORKTemperature object ORKWindspeed object CO2Intensity object ActualWindProduction object SystemLoadEP2 object SMPEP2 object dtype: object
# Collect the names of all object-dtype columns as a plain list
object_frame = electricity.select_dtypes(include='object')
categorical_cols = list(object_frame.columns)
print(categorical_cols)
['DateTime', 'Holiday', 'ForecastWindProduction', 'SystemLoadEA', 'SMPEA', 'ORKTemperature', 'ORKWindspeed', 'CO2Intensity', 'ActualWindProduction', 'SystemLoadEP2', 'SMPEP2']
# Convert the measurement columns (loaded as text) to numeric dtypes.
# A single vectorised pass over the list is enough; the per-column
# pd.to_numeric calls that used to follow repeated exactly the same work.
numeric_cols = ['ForecastWindProduction', 'SystemLoadEA', 'SMPEA', 'ORKTemperature', 'ORKWindspeed', 'CO2Intensity', 'ActualWindProduction', 'SystemLoadEP2', 'SMPEP2']
electricity[numeric_cols] = electricity[numeric_cols].apply(pd.to_numeric)
# The timestamps are day-first: the sample rows show '01/11/2011' next to
# Day=1 and Month=11. Without dayfirst=True pandas would read that value as
# 11 January instead of 1 November.
electricity['DateTime'] = pd.to_datetime(electricity['DateTime'], dayfirst=True)
# Holiday holds a small set of repeated labels, so store it as a categorical
electricity['Holiday'] = electricity['Holiday'].astype('category')
# Checking the data types of the columns to see if changes have taken effect
print(electricity.dtypes)
DateTime object Holiday object HolidayFlag int64 DayOfWeek int64 WeekOfYear int64 Day int64 Month int64 Year int64 PeriodOfDay int64 ForecastWindProduction float64 SystemLoadEA float64 SMPEA float64 ORKTemperature float64 ORKWindspeed float64 CO2Intensity float64 ActualWindProduction float64 SystemLoadEP2 float64 SMPEP2 float64 dtype: object
# Report how many distinct values each column holds, to spot the discrete ones
distinct_counts = electricity.nunique()
for column, n_distinct in distinct_counts.items():
    print(f"{column}: {n_distinct}")
DateTime: 38014 Holiday: 15 HolidayFlag: 2 DayOfWeek: 7 WeekOfYear: 52 Day: 31 Month: 12 Year: 3 PeriodOfDay: 48 ForecastWindProduction: 27475 SystemLoadEA: 35584 SMPEA: 7339 ORKTemperature: 30 ORKWindspeed: 52 CO2Intensity: 22458 ActualWindProduction: 1535 SystemLoadEP2: 35653 SMPEP2: 7813
import seaborn as sns
import matplotlib.pyplot as plt
# Drop DateTime and Holiday columns: neither is a numeric feature, so neither
# belongs in a correlation matrix.
corr_df = electricity.drop(columns=['DateTime', 'Holiday'])
# Compute correlation matrix. numeric_only is passed explicitly because the
# implicit default is deprecated (see the FutureWarning pandas emits) and will
# change to False in a future version.
corr = corr_df.corr(numeric_only=True)
# Create a heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
C:\Users\dell vostro\AppData\Local\Temp\ipykernel_1912\2848700503.py:8: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. corr = corr_df.corr()
# Summary statistics of the SMPEP2 column.
# `median` is reused further down as the threshold for the binary target.
smpep2 = electricity['SMPEP2']
mean = smpep2.mean()
# mode() returns every most-frequent value; keep the first one
mode = smpep2.mode().iloc[0]
median = smpep2.median()
min_smpep2 = smpep2.min()
max_smpep2 = smpep2.max()

print("Mean of SMPEP2: ", mean)
print("Mode of SMPEP2: ", mode)
print("Median of SMPEP2: ", median)
print("Minimum of SMPEP2: ", min_smpep2)
print("Maximum of SMPEP2: ", max_smpep2)
Mean of SMPEP2: 64.13682258234243 Mode of SMPEP2: 42.04 Median of SMPEP2: 55.545 Minimum of SMPEP2: -47.74 Maximum of SMPEP2: 1000.0
import plotly.express as px
# Distribution of SMPEP2, drawn with at most 30 bins
fig = px.histogram(data_frame=electricity, x="SMPEP2", nbins=30)
fig.show()
# Binary target: 1 when SMPEP2 is strictly above the median computed earlier,
# 0 otherwise
above_median_mask = electricity['SMPEP2'].gt(median)
electricity['SMPEP2_bin'] = above_median_mask.astype(int)
# Peek at the updated dataframe
electricity.head(3)
| | DateTime | Holiday | HolidayFlag | DayOfWeek | WeekOfYear | Day | Month | Year | PeriodOfDay | ForecastWindProduction | SystemLoadEA | SMPEA | ORKTemperature | ORKWindspeed | CO2Intensity | ActualWindProduction | SystemLoadEP2 | SMPEP2 | SMPEP2_bin |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01/11/2011 00:00 | None | 0 | 1 | 44 | 1 | 11 | 2011 | 0 | 315.31 | 3388.77 | 49.26 | 6.0 | 9.3 | 600.71 | 356.0 | 3159.60 | 54.32 | 0 |
| 1 | 01/11/2011 00:30 | None | 0 | 1 | 44 | 1 | 11 | 2011 | 1 | 321.80 | 3196.66 | 49.26 | 6.0 | 11.1 | 605.42 | 317.0 | 2973.01 | 54.23 | 0 |
| 2 | 01/11/2011 01:00 | None | 0 | 1 | 44 | 1 | 11 | 2011 | 2 | 328.57 | 3060.71 | 49.10 | 5.0 | 11.1 | 589.97 | 311.0 | 2834.00 | 54.23 | 0 |
# How many rows fall into each class of the binary target
target_column = electricity['SMPEP2_bin']
counts = target_column.value_counts()
print(counts)
0 18844 1 18838 Name: SMPEP2_bin, dtype: int64
from sklearn.model_selection import train_test_split

# electricity2 is a second name for the same DataFrame object (no copy is made)
electricity2 = electricity

# Predictor columns and the binary target
feature_names = ['PeriodOfDay', 'SystemLoadEA', 'SMPEA', 'SystemLoadEP2', 'DayOfWeek', 'ForecastWindProduction']
X = electricity2[feature_names]
y = electricity2['SMPEP2_bin']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
from sklearn.impute import SimpleImputer

# Mean imputation: the column means are learned from the training split only
# and then applied to both splits
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)
from sklearn.tree import DecisionTreeClassifier
# Fix the seed so repeated runs grow the same tree. Without it the fitted
# model, and every metric and chart derived from it, can change from one
# execution to the next.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
DecisionTreeClassifier() In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier()
from sklearn.metrics import confusion_matrix

# Predict the held-out rows with the decision tree and tabulate
# actual vs predicted classes
y_pred = dtc.predict(X_test)
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_mat)
[[3105 668] [ 695 3069]]
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Score the current predictions (y_pred) against the held-out labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)
print("F1 score:", f1)
print("Precision:", precision)
print("Recall:", recall)
Accuracy: 0.8191588165052408 F1 score: 0.8182908945473937 Precision: 0.8212469895638213 Recall: 0.815356004250797
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the decision tree on the full feature table
cv_scores = cross_val_score(estimator=dtc, X=X, y=y, cv=5)
print("Cross-validation scores:", cv_scores)
mean_cv_score = cv_scores.mean()
print("Mean CV score:", mean_cv_score)
Cross-validation scores: [0.69842112 0.68714343 0.69612527 0.69307325 0.72332803] Mean CV score: 0.699618217501374
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
# Split the data using the same splits as for the decision tree
# (same inputs, test_size and random_state give the same partition)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create an SVM instance and fit it to the training data.
# SVMs are not scale invariant and the features here differ by orders of
# magnitude (e.g. PeriodOfDay vs SystemLoadEA), so standardise them first.
# Keeping the scaler inside a pipeline means it is fitted on training data
# only, including within each fold if `svm` is later cross-validated.
svm = make_pipeline(StandardScaler(), SVC())
svm.fit(X_train, y_train)
# Make predictions on the test data
y_pred = svm.predict(X_test)
# Compute performance metrics
conf_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Print the performance metrics
print("Confusion matrix:\n", conf_mat)
print("Accuracy:", accuracy)
print("F1 score:", f1)
print("Precision:", precision)
print("Recall:", recall)
Confusion matrix: [[2650 1123] [ 758 3006]] Accuracy: 0.7504312060501526 F1 score: 0.7616875712656785 Precision: 0.7280213126665052 Recall: 0.7986184909670563
# 5-fold cross-validation of the SVM on the full feature table
cv_scores = cross_val_score(estimator=svm, X=X, y=y, cv=5)
print("Cross-validation scores:", cv_scores)
mean_cv_score = cv_scores.mean()
print("Mean CV score:", mean_cv_score)
Cross-validation scores: [0.67215072 0.71394454 0.83346603 0.72518577 0.77760085] Mean CV score: 0.7444695834590401
import plotly.graph_objects as go
# Clustered bar chart comparing the confusion-matrix cells of both models.
# The counts are computed from the fitted models rather than typed in by hand,
# so the chart cannot drift from the metrics the notebook actually produced.
# For labels [0, 1], ravel() yields the cells in the order TN, FP, FN, TP,
# which is the order of the label lists below.
dtc_labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
# The tree is given a plain array because it was fitted on the imputer's
# output; presumably that is a NumPy array without column names - confirm if
# the imputer's output format is ever changed.
dtc_values = confusion_matrix(
    y_test, dtc.predict(np.asarray(X_test)), labels=[0, 1]
).ravel().tolist()
svm_labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
svm_values = confusion_matrix(
    y_test, svm.predict(X_test), labels=[0, 1]
).ravel().tolist()
fig = go.Figure(data=[
    go.Bar(name='Decision Tree', x=dtc_labels, y=dtc_values),
    go.Bar(name='SVM', x=svm_labels, y=svm_values)
])
# Change the bar mode to group to create a clustered bar chart
fig.update_layout(barmode='group')
fig.show()
# Create a grouped bar chart of the headline metrics for both models.
# Scores are computed from the fitted models (rounded to 3 decimals for
# display) instead of being copied in by hand, so they always match the
# models currently in memory.
metric_names = ['Accuracy', 'F1 Score', 'Precision', 'Recall']
metric_functions = [accuracy_score, f1_score, precision_score, recall_score]
# The tree is given a plain array because it was fitted on the imputer's
# output; presumably that is a NumPy array without column names - confirm.
dtc_predictions = dtc.predict(np.asarray(X_test))
svm_predictions = svm.predict(X_test)
dtc_scores = [round(float(score(y_test, dtc_predictions)), 3) for score in metric_functions]
svm_scores = [round(float(score(y_test, svm_predictions)), 3) for score in metric_functions]
fig = go.Figure(data=[
    go.Bar(name='Decision Tree', x=metric_names, y=dtc_scores),
    go.Bar(name='SVM', x=metric_names, y=svm_scores)
])
# Update layout
fig.update_layout(barmode='group', title='Performance Comparison of Decision Tree and SVM')
# Show chart
fig.show()
# Line chart comparing the per-fold cross-validation scores of both models.
# NOTE(review): these scores are typed in by hand. The decision-tree values
# (y1) do not match the scores printed by the decision-tree cross-validation
# cell in this notebook - confirm which run they came from.
x = [f'Fold {fold}' for fold in range(1, 6)]
y1 = [0.70850471, 0.69245058, 0.70700637, 0.69426752, 0.72120488]
y2 = [0.67215072, 0.71394454, 0.83346603, 0.72518577, 0.77760085]
# One line per model
trace1 = go.Scatter(x=x, y=y1, name='Decision Tree')
trace2 = go.Scatter(x=x, y=y2, name='SVM')
# Chart title and axis captions
layout = go.Layout(
    title='Cross-validation scores comparison',
    xaxis={'title': 'Fold'},
    yaxis={'title': 'Score'},
)
# Assemble and display the figure
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()
import plotly.express as px
# One scatter plot of SMPEP2 against each predictor, shown in this order.
# Each pair is (x-axis column, chart title).
scatter_specs = [
    ('PeriodOfDay', 'Electricity Price vs Period of Day'),
    ('SystemLoadEA', 'Electricity Price vs forecasted System Load'),
    ('SMPEA', 'Electricity Price vs the forecasted price'),
    ('SystemLoadEP2', 'Electricity Price vs the actual System Load'),
    ('ForecastWindProduction', 'Electricity Price vs forecasted wind production'),
]
for x_column, chart_title in scatter_specs:
    fig = px.scatter(electricity, x=x_column, y='SMPEP2', title=chart_title)
    fig.show()
# Box plots of every model feature, split by the binary target class.
# The two class subsets do not depend on the feature, so select them once.
below_median_rows = electricity2[electricity2['SMPEP2_bin'] == 0]
above_median_rows = electricity2[electricity2['SMPEP2_bin'] == 1]

for feature in ['PeriodOfDay', 'SystemLoadEA', 'SMPEA', 'SystemLoadEP2', 'DayOfWeek', 'ForecastWindProduction']:
    # One box per class; the feature values go on the x-axis
    class_boxes = [
        go.Box(x=below_median_rows[feature], name='Below Median'),
        go.Box(x=above_median_rows[feature], name='Above Median'),
    ]
    # Title and axis captions for this feature
    box_layout = go.Layout(
        title=f'Distribution of {feature} based on SMPEP2_bin',
        xaxis={'title': feature},
        yaxis={'title': 'Value'},
    )
    fig = go.Figure(data=class_boxes, layout=box_layout)
    fig.show()