In [1]:
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
# Research question: is the Covid-19 death rate correlated with GDP per capita?
# In other words, do more people die in poorer countries?
#
# Data sources:
#   * GDP per capita per country - https://www.focus-economics.com/economic-indicator/gdp-per-capita
#   * Covid-19 death statistics per country - https://ourworldindata.org/explorers/coronavirus-data-explorer?facet=none&pickerSort=desc&pickerMetric=location&Metric=Confirmed+deaths&Interval=Cumulative&Relative+to+Population=true&Color+by+test+positivity=false
#
# NOTE(review): both paths are absolute and machine-specific, so this cell only runs
# on a machine where these exact files exist.
GDP_CSV_PATH = r"C:\Users\Jonny\Documents\Projects\covidToGDP\gdpPerCapita.csv"
COVID_CSV_PATH = r"C:\Users\Jonny\Documents\Projects\covidToGDP\covidDeathsPerCountry.csv"

# Read each CSV into its own DataFrame.
gdpPerCapita = pd.read_csv(GDP_CSV_PATH)
covidDeathRates = pd.read_csv(COVID_CSV_PATH)
In [3]:
#Let's take a quick look at the first few rows of the GDP data to see whether it needs any cleanup
gdpPerCapita.head()
#After looking at the first several rows, we don't have any extra rows. Everything is in USD ($) and you could read a row like this:
#Luxembourg's GDP per capita is $101,207, i.e. its economic output averages $101,207 per resident annually
Out[3]:
Country | gdpPerCapita | |
---|---|---|
0 | Luxembourg | 101207 |
1 | Switzerland | 85682 |
2 | Cayman Islands | 83536 |
3 | Ireland | 78558 |
4 | Norway | 75059 |
In [4]:
#The Covid data needs a little more cleaning than the GDP data.
#We only need 'location' and 'total_deaths_per_million'. The figure is cumulative, so taking the
#MAX of 'total_deaths_per_million' for each location gives that location's highest reported death rate.
#NOTE: this cell overwrites covidDeathRates (the raw column no longer exists afterwards),
#so re-run the load cell before re-running this one.
covidDeathRates = (
    covidDeathRates
    .groupby('location')['total_deaths_per_million']
    .max()
    #Name the column while building the frame. Writing into columns.values[0] mutates the
    #Index's backing array in place, which pandas does not support and may reject as read-only.
    .to_frame(name="Covid Death Rate Per Million Rate")
)
In [5]:
covidDeathRates.head()
#Perfect, now we have each location and its max total deaths per million people living there (its death rate).
#Note that the locations also include aggregates such as 'Africa', not only individual countries.
Out[5]:
Covid Death Rate Per Million Rate | |
---|---|
location | |
Afghanistan | 193.220 |
Africa | 184.726 |
Albania | 1217.223 |
Algeria | 154.091 |
Andorra | 2003.775 |
In [6]:
#Let's sort this data (ascending) so that we can get a quick look at who has the lowest reported death rate
covidDeathRates.sort_values('Covid Death Rate Per Million Rate').head()
#Okay cool, so we can now see that North Korea has the lowest reported death rate :/
Out[6]:
Covid Death Rate Per Million Rate | |
---|---|
location | |
North Korea | 2.395 |
Burundi | 3.101 |
China | 3.616 |
Chad | 11.410 |
South Sudan | 12.125 |
In [7]:
#Now let's merge our two datasets together. This gives us GDP Per Capita and Covid Death Rate side by side for each country.
#The inner join keeps only names present in BOTH datasets: aggregates such as 'Africa' drop out,
#but so does any country whose name is spelled differently in the two sources.
#The result is sorted by death rate (ascending) so that the plot below reads left to right from lowest to highest.
mergedData = (
    covidDeathRates
    .merge(gdpPerCapita, how="inner", left_on="location", right_on="Country")
    .sort_values('Covid Death Rate Per Million Rate')
)
#The preview must be the last expression in the cell, otherwise the notebook does not display it.
#Now we have a quick view of each country with its GDP Per Capita and Covid Death Rate.
mergedData.head()
In [8]:
#Let's plot this data out and see what we can visualize.
#Countries run along the x-axis in mergedData's row order, with the Covid death rate on the
#left y-axis (blue) and GDP per capita on the right y-axis (red).
fig, ax1 = plt.subplots(figsize=(14, 6))
ax2 = ax1.twinx()

deathLines = ax1.plot(mergedData['Country'], mergedData['Covid Death Rate Per Million Rate'],
                      label="Covid deaths per million", color='blue')
gdpLines = ax2.plot(mergedData['Country'], mergedData['gdpPerCapita'],
                    label="GDP per capita (USD)", color='red')

ax1.set_title('Covid-19 death rate vs. GDP per capita by country')
ax1.set_xlabel('Country')
#The plotted value is deaths PER million people, not deaths counted in millions
ax1.set_ylabel('Covid deaths per million', color='blue')
ax2.set_ylabel('GDP per capita (USD)', color='red')

#There is one tick label per country: rotate and shrink them so they don't overlap
ax1.tick_params(axis='x', labelrotation=90, labelsize=6)

#The two lines live on different axes, so combine their handles into a single legend
allLines = deathLines + gdpLines
ax1.legend(allLines, [line.get_label() for line in allLines], loc='upper left')

fig.tight_layout()
plt.show()
In [9]:
#There seems to be some correlation between GDP and death rates: at the start of the plot, countries with low death rates also have low GDP per capita.
#Let's calculate the correlation to check.
#Only the two numeric columns are passed to corr(): pandas >= 2.0 raises on the text 'Country'
#column instead of silently skipping it.
correlation_matrix = mergedData[['Covid Death Rate Per Million Rate', 'gdpPerCapita']].corr()
print(correlation_matrix)
                                   Covid Death Rate Per Million Rate  \
Covid Death Rate Per Million Rate                           1.000000
gdpPerCapita                                                0.161727

                                   gdpPerCapita
Covid Death Rate Per Million Rate      0.161727
gdpPerCapita                           1.000000
In [10]:
#Okay, so the correlation between GDP Per Capita and Covid Death Rates is not great: we have a correlation of 0.1617.
#This indicates a weak positive correlation, i.e. richer countries report slightly higher death rates, so this data does not show that more people die in poorer countries.
#The sources of this data are solid, but differences in reporting across countries may be affecting the data.
#Is there any way that we could get better data?