import pandas as pd
import numpy as np


import os
os.chdir(r'C:\Users\dell vostro\Documents\Dalarna Masters_Wonder\Business Intelligence\Lab3 Machine Learning')


# Load the dataset from the CSV file
electricity = pd.read_csv('electricitydata.csv')

# Display the first few rows of the dataset
electricity.head()

C:\Users\dell vostro\AppData\Local\Temp\ipykernel_8212\3839329021.py:2: DtypeWarning: Columns (9,10,11,14,15,16,17) have mixed types. Specify dtype option on import or set low_memory=False.
  electricity = pd.read_csv('electricitydata.csv')


# Replace '?' with NaN values
electricity.replace('?', np.nan, inplace=True)

# Count the number of NaN values in each row
num_missing = electricity.isna().sum(axis=1)
# Count the number of rows with at least one missing value
num_rows_missing = (num_missing > 0).sum()
print(f"Number of rows with at least one missing value: {num_rows_missing}")

Number of rows with at least one missing value: 332


# Drop all null values
electricity.dropna(inplace=True)

# Verify that there are no more null values
print(electricity.isnull().sum())

DateTime                  0
Holiday                   0
HolidayFlag               0
DayOfWeek                 0
WeekOfYear                0
Day                       0
Month                     0
Year                      0
PeriodOfDay               0
ForecastWindProduction    0
SystemLoadEA              0
SMPEA                     0
ORKTemperature            0
ORKWindspeed              0
CO2Intensity              0
ActualWindProduction      0
SystemLoadEP2             0
SMPEP2                    0
dtype: int64


# Checking the data types of the columns
print(electricity.dtypes)

DateTime                  object
Holiday                   object
HolidayFlag                int64
DayOfWeek                  int64
WeekOfYear                 int64
Day                        int64
Month                      int64
Year                       int64
PeriodOfDay                int64
ForecastWindProduction    object
SystemLoadEA              object
SMPEA                     object
ORKTemperature            object
ORKWindspeed              object
CO2Intensity              object
ActualWindProduction      object
SystemLoadEP2             object
SMPEP2                    object
dtype: object


# Identify categorical features
categorical_cols = electricity.select_dtypes(include=['object']).columns.tolist()
print(categorical_cols)

['DateTime', 'Holiday', 'ForecastWindProduction', 'SystemLoadEA', 'SMPEA', 'ORKTemperature', 'ORKWindspeed', 'CO2Intensity', 'ActualWindProduction', 'SystemLoadEP2', 'SMPEP2']


# Convert columns to numeric data types
numeric_cols = ['ForecastWindProduction', 'SystemLoadEA', 'SMPEA', 'ORKTemperature', 'ORKWindspeed', 'CO2Intensity', 'ActualWindProduction', 'SystemLoadEP2', 'SMPEP2']
electricity[numeric_cols] = electricity[numeric_cols].apply(pd.to_numeric)


electricity['DateTime'] = pd.to_datetime(electricity['DateTime'])
electricity['Holiday'] = electricity['Holiday'].astype('category')
electricity['ForecastWindProduction'] = pd.to_numeric(electricity['ForecastWindProduction'])
electricity['SystemLoadEA'] = pd.to_numeric(electricity['SystemLoadEA'])
electricity['SMPEA'] = pd.to_numeric(electricity['SMPEA'])
electricity['ORKTemperature'] = pd.to_numeric(electricity['ORKTemperature'])
electricity['ORKWindspeed'] = pd.to_numeric(electricity['ORKWindspeed'])
electricity['CO2Intensity'] = pd.to_numeric(electricity['CO2Intensity'])
electricity['ActualWindProduction'] = pd.to_numeric(electricity['ActualWindProduction'])
electricity['SystemLoadEP2'] = pd.to_numeric(electricity['SystemLoadEP2'])
electricity['SMPEP2'] = pd.to_numeric(electricity['SMPEP2'])


# Checking the data types of the columns to see if changes have taken effect
print(electricity.dtypes)

DateTime                   object
Holiday                    object
HolidayFlag                 int64
DayOfWeek                   int64
WeekOfYear                  int64
Day                         int64
Month                       int64
Year                        int64
PeriodOfDay                 int64
ForecastWindProduction    float64
SystemLoadEA              float64
SMPEA                     float64
ORKTemperature            float64
ORKWindspeed              float64
CO2Intensity              float64
ActualWindProduction      float64
SystemLoadEP2             float64
SMPEP2                    float64
dtype: object


# Check which columns contain discrete values

for column in electricity.columns:
    print(f"{column}: {electricity[column].nunique()}")

DateTime: 38014
Holiday: 15
HolidayFlag: 2
DayOfWeek: 7
WeekOfYear: 52
Day: 31
Month: 12
Year: 3
PeriodOfDay: 48
ForecastWindProduction: 27475
SystemLoadEA: 35584
SMPEA: 7339
ORKTemperature: 30
ORKWindspeed: 52
CO2Intensity: 22458
ActualWindProduction: 1535
SystemLoadEP2: 35653
SMPEP2: 7813


import seaborn as sns
import matplotlib.pyplot as plt

# Drop DateTime and Holiday columns
corr_df = electricity.drop(['Holiday'], axis=1)

# Compute correlation matrix
corr = corr_df.corr()

# Create a heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

C:\Users\dell vostro\AppData\Local\Temp\ipykernel_1912\2848700503.py:8: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  corr = corr_df.corr()


# Mean of SMPEP2
mean = electricity['SMPEP2'].mean()
print("Mean of SMPEP2: ", mean)

# Mode of SMPEP2
mode = electricity['SMPEP2'].mode()[0]
print("Mode of SMPEP2: ", mode)

# Median of SMPEP2
median = electricity['SMPEP2'].median()
print("Median of SMPEP2: ", median)

# Get the minimum and maximum values of SMPEP2
min_smpep2 = electricity['SMPEP2'].min()
max_smpep2 = electricity['SMPEP2'].max()

print("Minimum of SMPEP2: ", min_smpep2)
print("Maximum of SMPEP2: ", max_smpep2)

Mean of SMPEP2:  64.13682258234243
Mode of SMPEP2:  42.04
Median of SMPEP2:  55.545
Minimum of SMPEP2:  -47.74
Maximum of SMPEP2:  1000.0


import plotly.express as px

fig = px.histogram(electricity, x="SMPEP2", nbins=30)
fig.show()


# Use median calculated a few cells ago
electricity['SMPEP2_bin'] = (electricity['SMPEP2'] > median).astype(int)

# Peek at the updated dataframe
electricity.head(3)


counts = electricity['SMPEP2_bin'].value_counts()
print(counts)

0    18844
1    18838
Name: SMPEP2_bin, dtype: int64


electricity2 = electricity


from sklearn.model_selection import train_test_split

X = electricity2[['PeriodOfDay', 'SystemLoadEA', 'SMPEA', 'SystemLoadEP2', 'DayOfWeek', 'ForecastWindProduction']]
y = electricity2['SMPEP2_bin']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


from sklearn.impute import SimpleImputer

# create an instance of SimpleImputer with 'mean' strategy
imputer = SimpleImputer(strategy='mean')

# fill in the missing values in X_train and X_test
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)


from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

DecisionTreeClassifier()

DecisionTreeClassifier()


y_pred = dtc.predict(X_test)
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

[[3105  668]
 [ 695 3069]]


from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1 score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.8191588165052408
F1 score: 0.8182908945473937
Precision: 0.8212469895638213
Recall: 0.815356004250797


from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(dtc, X, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.69842112 0.68714343 0.69612527 0.69307325 0.72332803]
Mean CV score: 0.699618217501374


from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

# Split the data using the same splits as for the decision tree
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Create an SVM instance and fit it to the training data
svm = SVC()
svm.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm.predict(X_test)


# Compute performance metrics
conf_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the performance metrics
print("Confusion matrix:\n", conf_mat)
print("Accuracy:", accuracy)
print("F1 score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Confusion matrix:
 [[2650 1123]
 [ 758 3006]]
Accuracy: 0.7504312060501526
F1 score: 0.7616875712656785
Precision: 0.7280213126665052
Recall: 0.7986184909670563


# Perform cross-validation
cv_scores = cross_val_score(svm, X, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.67215072 0.71394454 0.83346603 0.72518577 0.77760085]
Mean CV score: 0.7444695834590401


import plotly.graph_objects as go

dtc_labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
dtc_values = [3105, 663, 675, 3094]

svm_labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
svm_values = [2642, 1126, 760, 3009]

fig = go.Figure(data=[
    go.Bar(name='Decision Tree', x=dtc_labels, y=dtc_values),
    go.Bar(name='SVM', x=svm_labels, y=svm_values)
])
# Change the bar mode to group to create a clustered bar chart
fig.update_layout(barmode='group')
fig.show()


# Create a grouped bar chart
fig = go.Figure(data=[
    go.Bar(name='Decision Tree', x=['Accuracy', 'F1 Score', 'Precision', 'Recall'], 
           y=[0.822, 0.822, 0.824, 0.821]),
    go.Bar(name='SVM', x=['Accuracy', 'F1 Score', 'Precision', 'Recall'], 
           y=[0.750, 0.761, 0.728, 0.798])
])

# Update layout
fig.update_layout(barmode='group', title='Performance Comparison of Decision Tree and SVM')

# Show chart
fig.show()


# Define data for the line chart
x = ['Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5']
y1 = [0.70850471, 0.69245058, 0.70700637, 0.69426752, 0.72120488]
y2 = [0.67215072, 0.71394454, 0.83346603, 0.72518577, 0.77760085]

# Define the layout of the chart
layout = go.Layout(title='Cross-validation scores comparison', xaxis=dict(title='Fold'),
                   yaxis=dict(title='Score'))

# Define the traces for the line chart
trace1 = go.Scatter(x=x, y=y1, name='Decision Tree')
trace2 = go.Scatter(x=x, y=y2, name='SVM')

# Combine the layout and traces to create a figure
fig = go.Figure(data=[trace1, trace2], layout=layout)

# Show the figure
fig.show()


import plotly.express as px

fig = px.scatter(electricity, x='PeriodOfDay', y='SMPEP2',
                 title='Electricity Price vs Period of Day')
fig.show()


fig = px.scatter(electricity, x='SystemLoadEA', y='SMPEP2',
                 title='Electricity Price vs forecasted System Load')
fig.show()


fig = px.scatter(electricity, x='SMPEA', y='SMPEP2',
                 title='Electricity Price vs the forecasted price')
fig.show()


fig = px.scatter(electricity, x='SystemLoadEP2', y='SMPEP2',
                 title='Electricity Price vs the actual System Load')
fig.show()


fig = px.scatter(electricity, x='ForecastWindProduction', y='SMPEP2',
                 title='Electricity Price vs forecasted wind production')
fig.show()


# create boxplots for each feature variable
for feature in ['PeriodOfDay', 'SystemLoadEA', 'SMPEA', 'SystemLoadEP2', 'DayOfWeek', 'ForecastWindProduction']:
    
    # create traces for the boxplots
    trace0 = go.Box(
        x=electricity2[electricity2['SMPEP2_bin']==0][feature],
        name='Below Median'
    )
    trace1 = go.Box(
        x=electricity2[electricity2['SMPEP2_bin']==1][feature],
        name='Above Median'
    )

    # create layout for the plot
    layout = go.Layout(
        title=f'Distribution of {feature} based on SMPEP2_bin',
        xaxis=dict(title=feature),
        yaxis=dict(title='Value')
    )

    # combine the traces and layout into a figure
    fig = go.Figure(data=[trace0, trace1], layout=layout)

    # show the figure
    fig.show()

	DateTime	Holiday	DayOfWeek	WeekOfYear	Day	Month	Year	PeriodOfDay	ForecastWindProduction	SystemLoadEA	SMPEA	ORKTemperature	ORKWindspeed	CO2Intensity	ActualWindProduction	SystemLoadEP2	SMPEP2
0	01/11/2011 00:00	None	1	44	1	11	2011	0	315.31	3388.77	49.26	6.00	9.30	600.71	356.00	3159.60	54.32
1	01/11/2011 00:30	None	1	44	1	11	2011	1	321.80	3196.66	49.26	6.00	11.10	605.42	317.00	2973.01	54.23
2	01/11/2011 01:00	None	1	44	1	11	2011	2	328.57	3060.71	49.10	5.00	11.10	589.97	311.00	2834.00	54.23
3	01/11/2011 01:30	None	1	44	1	11	2011	3	335.60	2945.56	48.04	6.00	9.30	585.94	313.00	2725.99	53.47
4	01/11/2011 02:00	None	1	44	1	11	2011	4	342.90	2849.34	33.75	6.00	11.10	571.52	346.00	2655.64	39.87

	DateTime	Holiday	DayOfWeek	WeekOfYear	Day	Month	Year	PeriodOfDay	ForecastWindProduction	SystemLoadEA	SMPEA	ORKTemperature	ORKWindspeed	CO2Intensity	ActualWindProduction	SystemLoadEP2	SMPEP2
0	01/11/2011 00:00	None	1	44	1	11	2011	0	315.31	3388.77	49.26	6.0	9.3	600.71	356.0	3159.60	54.32
1	01/11/2011 00:30	None	1	44	1	11	2011	1	321.80	3196.66	49.26	6.0	11.1	605.42	317.0	2973.01	54.23
2	01/11/2011 01:00	None	1	44	1	11	2011	2	328.57	3060.71	49.10	5.0	11.1	589.97	311.0	2834.00	54.23

Task 2: Predictive Analysis, Supervised Learning – Electricity Price Prediction¶

Wonder Mahembe¶

v23wonma@du.se¶

Data exploration and cleaning processes¶

Changing data types¶

Running some correlations to identify which features are best to use in the analysis¶

Exploring the distribution of the target variable before making predictions. In this case I want to convert the target variable SMPEP2 to a binary value, with the price split into two; one below and another above a certain threshold. I will start by checking how the variable is distributed.¶

Check value counts to determine if the two classess are balanced¶

Begin by splitting the data into train and test sets.¶

Now to make predictions on the test set and compute the confusion matrix:¶

Now we will compute other performance metrics: accuracy, F1 score, precision, and recall using functions from scikit-learn.¶

Finally, we can perform cross-validation to get a better estimate of the model's performance using the cross_val_score function from scikit-learn.¶

We will now implement a Support Vector Machine to predict the price of electricity based on the consumption data.¶

Fit the SVM model¶

Compute the performance metrics for SVM¶

Now we will perform cross validations on the SVM¶

Time to make some visuals and compare the performances of the two models¶

Let us compare the raw performances based on the confusion matrices. Interpretations in the word report.¶

The following is a comparison of other metrics 'Accuracy', 'F1 Score', 'Precision', and 'Recall'¶

Comparison of the two models based on cross-validation scores¶

We can visualize the results and gain insights into the predictions made by the classifiers to provide actionable business insights.¶

Here i am only using the variables entered into the models to perform some visualisations on the relationship between the price of electricity and selected metrics.¶

We begin with the relationship between electricity price and time of day.¶

Relationship between price and forecasted system load¶

Relationship between price and forecasted price¶

Electricity price versus actual system load.¶

Electricity Price vs forecasted wind production¶

We can also use boxplots to analyse how the distribution of feature variables affect whether the price of electricity was above or below the median.¶

The attached report shows my interpretations of selected visualisations as well as an overall conclusion providing actionable insights for businesses.¶