import os

import pandas as pd

# Switch to the folder that holds the yearly CSV datasets so that the
# relative file names passed to read_csv resolve.
os.chdir(r"C:\Users\Waneda\Documents\Personal documents\Projects\World happiness index_Python")

# Load the 2019 data into a Pandas dataframe and preview its first five rows
df2019 = pd.read_csv(filepath_or_buffer="2019.csv")
df2019.head(n=5)
| | Overall rank | Country or region | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Finland | 7.769 | 1.340 | 1.587 | 0.986 | 0.596 | 0.153 | 0.393 |
| 1 | 2 | Denmark | 7.600 | 1.383 | 1.573 | 0.996 | 0.592 | 0.252 | 0.410 |
| 2 | 3 | Norway | 7.554 | 1.488 | 1.582 | 1.028 | 0.603 | 0.271 | 0.341 |
| 3 | 4 | Iceland | 7.494 | 1.380 | 1.624 | 1.026 | 0.591 | 0.354 | 0.118 |
| 4 | 5 | Netherlands | 7.488 | 1.396 | 1.522 | 0.999 | 0.557 | 0.322 | 0.298 |
# Load the 2018 data into a Pandas dataframe and preview its first five rows
df2018 = pd.read_csv(filepath_or_buffer="2018.csv")
df2018.head(n=5)
| | Overall rank | Country or region | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Finland | 7.632 | 1.305 | 1.592 | 0.874 | 0.681 | 0.202 | 0.393 |
| 1 | 2 | Norway | 7.594 | 1.456 | 1.582 | 0.861 | 0.686 | 0.286 | 0.340 |
| 2 | 3 | Denmark | 7.555 | 1.351 | 1.590 | 0.868 | 0.683 | 0.284 | 0.408 |
| 3 | 4 | Iceland | 7.495 | 1.343 | 1.644 | 0.914 | 0.677 | 0.353 | 0.138 |
| 4 | 5 | Switzerland | 7.487 | 1.420 | 1.549 | 0.927 | 0.660 | 0.256 | 0.357 |
# Print the column labels and the data types of both dataframes, one after
# the other, so they can be compared by eye before the frames are combined.
for heading, value in (
    ("Columns in 2019 dataframe: ", df2019.columns),
    ("\nColumns in 2018 dataframe: ", df2018.columns),
    ("\nData types in 2019 dataframe: ", df2019.dtypes),
    ("\nData types in 2018 dataframe: ", df2018.dtypes),
):
    print(heading)
    print(value)
Columns in 2019 dataframe:
Index(['Overall rank', 'Country or region', 'Score', 'GDP per capita',
'Social support', 'Healthy life expectancy',
'Freedom to make life choices', 'Generosity',
'Perceptions of corruption'],
dtype='object')
Columns in 2018 dataframe:
Index(['Overall rank', 'Country or region', 'Score', 'GDP per capita',
'Social support', 'Healthy life expectancy',
'Freedom to make life choices', 'Generosity',
'Perceptions of corruption'],
dtype='object')
Data types in 2019 dataframe:
Overall rank int64
Country or region object
Score float64
GDP per capita float64
Social support float64
Healthy life expectancy float64
Freedom to make life choices float64
Generosity float64
Perceptions of corruption float64
dtype: object
Data types in 2018 dataframe:
Overall rank int64
Country or region object
Score float64
GDP per capita float64
Social support float64
Healthy life expectancy float64
Freedom to make life choices float64
Generosity float64
Perceptions of corruption float64
dtype: object
# Tag each yearly dataframe with the year it describes so that rows stay
# distinguishable once the two frames are stacked.
df2018 = df2018.assign(Year=2018)
df2019 = df2019.assign(Year=2019)
# Stack the 2019 rows on top of the 2018 rows to create the combined
# happiness dataframe. The Year column already exists on both frames, so it
# is not assigned a second time here. Each frame keeps its own row labels,
# which means the labels repeat across the two years.
happiness = pd.concat([df2019, df2018], sort=False)
# Preview the first few rows of the combined dataframe to verify the result
happiness.head()
| | Overall rank | Country or region | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | Year |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Finland | 7.769 | 1.340 | 1.587 | 0.986 | 0.596 | 0.153 | 0.393 | 2019 |
| 1 | 2 | Denmark | 7.600 | 1.383 | 1.573 | 0.996 | 0.592 | 0.252 | 0.410 | 2019 |
| 2 | 3 | Norway | 7.554 | 1.488 | 1.582 | 1.028 | 0.603 | 0.271 | 0.341 | 2019 |
| 3 | 4 | Iceland | 7.494 | 1.380 | 1.624 | 1.026 | 0.591 | 0.354 | 0.118 | 2019 |
| 4 | 5 | Netherlands | 7.488 | 1.396 | 1.522 | 0.999 | 0.557 | 0.322 | 0.298 | 2019 |
# Summarise the combined dataframe: number of entries, column names,
# non-null counts per column and data types.
happiness.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 312 entries, 0 to 155
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   Overall rank                  312 non-null    int64
 1   Country or region             312 non-null    object
 2   Score                         312 non-null    float64
 3   GDP per capita                312 non-null    float64
 4   Social support                312 non-null    float64
 5   Healthy life expectancy       312 non-null    float64
 6   Freedom to make life choices  312 non-null    float64
 7   Generosity                    312 non-null    float64
 8   Perceptions of corruption     311 non-null    float64
 9   Year                          312 non-null    int64
dtypes: float64(7), int64(2), object(1)
memory usage: 26.8+ KB
# Load the 2016 data into a Pandas dataframe
df2016 = pd.read_csv(filepath_or_buffer="2016.csv")
# Preview the first three rows of the 2016 dataframe
df2016.head(n=3)
| | Country | Region | Happiness Rank | Happiness Score | Lower Confidence Interval | Upper Confidence Interval | Economy (GDP per Capita) | Family | Health (Life Expectancy) | Freedom | Trust (Government Corruption) | Generosity | Dystopia Residual |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Denmark | Western Europe | 1 | 7.526 | 7.460 | 7.592 | 1.44178 | 1.16374 | 0.79504 | 0.57941 | 0.44453 | 0.36171 | 2.73939 |
| 1 | Switzerland | Western Europe | 2 | 7.509 | 7.428 | 7.590 | 1.52733 | 1.14524 | 0.86303 | 0.58557 | 0.41203 | 0.28083 | 2.69463 |
| 2 | Iceland | Western Europe | 3 | 7.501 | 7.333 | 7.669 | 1.42666 | 1.18326 | 0.86733 | 0.56624 | 0.14975 | 0.47678 | 2.83137 |
# Shorten the 'Country or region' column label to 'Country' in the happiness dataset
column_renames = {'Country or region': 'Country'}
happiness = happiness.rename(columns=column_renames)
# Preview the first five rows to confirm that the column label has changed
happiness.head(n=5)
| | Overall rank | Country | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | Year |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Finland | 7.769 | 1.340 | 1.587 | 0.986 | 0.596 | 0.153 | 0.393 | 2019 |
| 1 | 2 | Denmark | 7.600 | 1.383 | 1.573 | 0.996 | 0.592 | 0.252 | 0.410 | 2019 |
| 2 | 3 | Norway | 7.554 | 1.488 | 1.582 | 1.028 | 0.603 | 0.271 | 0.341 | 2019 |
| 3 | 4 | Iceland | 7.494 | 1.380 | 1.624 | 1.026 | 0.591 | 0.354 | 0.118 | 2019 |
| 4 | 5 | Netherlands | 7.488 | 1.396 | 1.522 | 0.999 | 0.557 | 0.322 | 0.298 | 2019 |
# Build a country -> region lookup from the 2016 dataframe
region_dict = dict(zip(df2016['Country'], df2016['Region']))
# Look up each country's region; countries absent from the lookup get NaN
happiness['Region'] = happiness['Country'].map(region_dict)
# Preview the first five rows to confirm that the regions were assigned
happiness.head(n=5)
| | Overall rank | Country | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | Year | Region |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Finland | 7.769 | 1.340 | 1.587 | 0.986 | 0.596 | 0.153 | 0.393 | 2019 | Western Europe |
| 1 | 2 | Denmark | 7.600 | 1.383 | 1.573 | 0.996 | 0.592 | 0.252 | 0.410 | 2019 | Western Europe |
| 2 | 3 | Norway | 7.554 | 1.488 | 1.582 | 1.028 | 0.603 | 0.271 | 0.341 | 2019 | Western Europe |
| 3 | 4 | Iceland | 7.494 | 1.380 | 1.624 | 1.026 | 0.591 | 0.354 | 0.118 | 2019 | Western Europe |
| 4 | 5 | Netherlands | 7.488 | 1.396 | 1.522 | 0.999 | 0.557 | 0.322 | 0.298 | 2019 | Western Europe |
# Collect the distinct countries whose Region is still missing (NaN)
missing_regions = happiness.loc[happiness['Region'].isna(), 'Country'].unique()
# Print the list of countries with missing regions, one per line
print("Countries with missing regions:")
for country in missing_regions:
    print(country)
Countries with missing regions:
Trinidad & Tobago
Northern Cyprus
North Macedonia
Gambia
Mozambique
Swaziland
Lesotho
Central African Republic
# Count how many rows fall into each region (rows without a region are
# left out by groupby)
region_counts = (
    happiness
    .groupby('Region')['Country']
    .count()
)
print(region_counts)
Region
Australia and New Zealand           4
Central and Eastern Europe         57
Eastern Asia                       12
Latin America and Caribbean        41
Middle East and Northern Africa    38
North America                       4
Southeastern Asia                  18
Southern Asia                      14
Sub-Saharan Africa                 71
Western Europe                     40
Name: Country, dtype: int64
# Hand-written country -> region lookup for the countries that had no
# match in the 2016 data
region_dict = {
    'Trinidad & Tobago': 'Latin America and Caribbean',
    'Northern Cyprus': 'Middle East and Northern Africa',
    'North Macedonia': 'Central and Eastern Europe',
    **dict.fromkeys(
        [
            'Gambia',
            'Mozambique',
            'Swaziland',
            'Lesotho',
            'Central African Republic',
        ],
        'Sub-Saharan Africa',
    ),
}
# Fill only the missing Region values from the lookup; regions that are
# already set are left untouched
happiness['Region'] = happiness['Region'].fillna(
    happiness['Country'].map(region_dict)
)
# To check that the assignment worked, list the countries that still have
# no region assigned (nothing is printed after the heading when none remain).
missing_regions1 = happiness.loc[happiness['Region'].isna(), 'Country'].unique()
# Print the list of countries with missing regions, one per line
print("Countries with missing regions:")
for country in missing_regions1:
    print(country)
Countries with missing regions:
# Recount the rows per region now that the missing regions are filled in
region_counts1 = (
    happiness
    .groupby('Region')['Country']
    .count()
)
print(region_counts1)
Region
Australia and New Zealand           4
Central and Eastern Europe         58
Eastern Asia                       12
Latin America and Caribbean        43
Middle East and Northern Africa    40
North America                       4
Southeastern Asia                  18
Southern Asia                      14
Sub-Saharan Africa                 79
Western Europe                     40
Name: Country, dtype: int64
# Display the last five rows of the happiness dataframe
happiness.tail(n=5)
| | Overall rank | Country | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | Year | Region |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 151 | 152 | Yemen | 3.355 | 0.442 | 1.073 | 0.343 | 0.244 | 0.083 | 0.064 | 2018 | Middle East and Northern Africa |
| 152 | 153 | Tanzania | 3.303 | 0.455 | 0.991 | 0.381 | 0.481 | 0.270 | 0.097 | 2018 | Sub-Saharan Africa |
| 153 | 154 | South Sudan | 3.254 | 0.337 | 0.608 | 0.177 | 0.112 | 0.224 | 0.106 | 2018 | Sub-Saharan Africa |
| 154 | 155 | Central African Republic | 3.083 | 0.024 | 0.000 | 0.010 | 0.305 | 0.218 | 0.038 | 2018 | Sub-Saharan Africa |
| 155 | 156 | Burundi | 2.905 | 0.091 | 0.627 | 0.145 | 0.065 | 0.149 | 0.076 | 2018 | Sub-Saharan Africa |
# Order the rows alphabetically by country name
happiness = happiness.sort_values('Country', ascending=True)
happiness.head(n=5)
| | Overall rank | Country | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | Year | Region |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 153 | 154 | Afghanistan | 3.203 | 0.350 | 0.517 | 0.361 | 0.000 | 0.158 | 0.025 | 2019 | Southern Asia |
| 144 | 145 | Afghanistan | 3.632 | 0.332 | 0.537 | 0.255 | 0.085 | 0.191 | 0.036 | 2018 | Southern Asia |
| 111 | 112 | Albania | 4.586 | 0.916 | 0.817 | 0.790 | 0.419 | 0.149 | 0.032 | 2018 | Central and Eastern Europe |
| 106 | 107 | Albania | 4.719 | 0.947 | 0.848 | 0.874 | 0.383 | 0.178 | 0.027 | 2019 | Central and Eastern Europe |
| 83 | 84 | Algeria | 5.295 | 0.979 | 1.154 | 0.687 | 0.077 | 0.055 | 0.135 | 2018 | Middle East and Northern Africa |
# Re-order the rows from the highest GDP per capita to the lowest
happiness = happiness.sort_values('GDP per capita', ascending=False)
happiness.head(n=5)
| | Overall rank | Country | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | Year | Region |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 19 | 20 | United Arab Emirates | 6.774 | 2.096 | 0.776 | 0.670 | 0.284 | 0.186 | NaN | 2018 | Middle East and Northern Africa |
| 28 | 29 | Qatar | 6.374 | 1.684 | 1.313 | 0.871 | 0.555 | 0.220 | 0.167 | 2019 | Middle East and Northern Africa |
| 31 | 32 | Qatar | 6.374 | 1.649 | 1.303 | 0.748 | 0.654 | 0.256 | 0.171 | 2018 | Middle East and Northern Africa |
| 13 | 14 | Luxembourg | 7.090 | 1.609 | 1.479 | 1.012 | 0.526 | 0.194 | 0.316 | 2019 | Western Europe |
| 16 | 17 | Luxembourg | 6.910 | 1.576 | 1.520 | 0.896 | 0.632 | 0.196 | 0.321 | 2018 | Western Europe |
# Pull out the rows for my home country, Zimbabwe, and print them
is_zimbabwe = happiness['Country'].eq('Zimbabwe')
zimbabwe_data = happiness.loc[is_zimbabwe]
print(zimbabwe_data)
Overall rank Country Score GDP per capita Social support \
145 146 Zimbabwe 3.663 0.366 1.114
143 144 Zimbabwe 3.692 0.357 1.094
Healthy life expectancy Freedom to make life choices Generosity \
145 0.433 0.361 0.151
143 0.248 0.406 0.132
Perceptions of corruption Year Region
145 0.089 2019 Sub-Saharan Africa
143 0.099 2018 Sub-Saharan Africa
# Let's produce a table showing the GDP per capita and Healthy life expectancy
# of the 10 rows with the highest Perceptions of corruption score.
# Each row is one country in one year, so the same country can appear twice.
# Sort by Perceptions of corruption score in descending order
# (sort_values places missing scores last by default)
sorted_happiness = happiness.sort_values(by='Perceptions of corruption', ascending=False)
# Select the top 10 rows with the highest Perceptions of corruption score
top_10 = sorted_happiness.head(10)
# Select only the columns we need
top_10_gdp_hle = top_10[['Country', 'GDP per capita', 'Healthy life expectancy']]
# Display the table
print(top_10_gdp_hle)
         Country  GDP per capita  Healthy life expectancy
33     Singapore           1.529                    1.008
33     Singapore           1.572                    1.141
150       Rwanda           0.332                    0.400
151       Rwanda           0.359                    0.614
1        Denmark           1.383                    0.996
2        Denmark           1.351                    0.868
0        Finland           1.305                    0.874
0        Finland           1.340                    0.986
7    New Zealand           1.268                    0.876
8         Sweden           1.355                    0.913
import matplotlib.pyplot as plt

# Map each region name to its own colour. The keys must match the values in
# the Region column exactly: a region without a matching key falls back to
# the grey default used in the loop and cannot be told apart from any other
# unmatched region. 'Southeastern Asia' and 'Southern Asia' are therefore
# spelled exactly as they appear in the data.
colors = {
    'Western Europe': '#1f77b4',
    'North America': '#ff7f0e',
    'Australia and New Zealand': '#2ca02c',
    'Middle East and Northern Africa': '#d62728',
    'Latin America and Caribbean': '#9467bd',
    'Southeastern Asia': '#8c564b',
    'Central and Eastern Europe': '#e377c2',
    'Eastern Asia': '#7f7f7f',
    'Sub-Saharan Africa': '#bcbd22',
    'Southern Asia': '#17becf',
}

# Create a scatter plot of Happiness Score versus GDP per Capita, colored by Region
fig, ax = plt.subplots(figsize=(10, 6))
# Draw one scatter series per region so that each region gets its own
# colour and its own legend entry
for region, group in happiness.groupby('Region'):
    color = colors.get(region, '#999999')  # grey for any region not in the mapping
    ax.scatter(group['GDP per capita'], group['Score'], label=region, color=color, alpha=0.7)

# Add axis labels and title
ax.set_xlabel('GDP per Capita')
ax.set_ylabel('Happiness Score')
ax.set_title('Happiness Score vs GDP per Capita by Region')
# Add a legend
ax.legend()
# Display the plot
plt.show()
# Write the happiness dataframe to happiness.csv in the working directory.
# index=False keeps the dataframe index from being written as an extra column.
happiness.to_csv(path_or_buf='happiness.csv', index=False)