Last Updated on February 15, 2021 by David Vause
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 12 16:28:46 2020
@author: dave
"""
import pandas as pd
#https://ourworldindata.org/coronavirus
covid_data = pd.read_csv('owid-covid-data.csv',
parse_dates=['date'],
usecols=['iso_code','continent','location',
'date','total_cases_per_million', 'total_deaths_per_million',
'total_tests_per_thousand','stringency_index', 'population', 'population_density' ])
#https://ourworldindata.org/urbanization#share-of-populations-living-in-urban-areas
urbanization = pd.read_csv('share-of-population-urban.csv',
names=['iso_code', 'Year', 'urbanization'],
header=0)
urbanization.reset_index(drop=True, inplace=True)
# get rid of countries with unusable iso_codes
covid_data.dropna(subset=['iso_code', 'population_density'], inplace=True)
covid_data = covid_data[covid_data['iso_code']!='OWID_WRL']
covid_data = covid_data[covid_data['iso_code']!='OWID_KOS']
covid_data = covid_data[covid_data['iso_code']!='']
# for each country pick the row with the most
# recent date. covid_data contains all the data,
# country_list is the most recent subset of covid data
country_groups = covid_data.groupby('location')
row_indeces = country_groups['date'].idxmax()
row_indeces = row_indeces.values
# indeces from country_groups
country_list = covid_data.loc[row_indeces]
#print('country_list: \n', country_list.iloc[1])
# do the same for urbanization
urbanizationization_groups = urbanization.groupby('iso_code')
row_indeces = urbanizationization_groups['Year'].idxmax()
row_indeces = row_indeces.values
urbanization_list = urbanization.loc[row_indeces]
urbanization_list = urbanization_list.drop(columns=['Year'])
urbanization_list = urbanization_list[urbanization_list['iso_code']!='OWID_WRL']
urbanization_list = urbanization_list[urbanization_list['iso_code']!='OWID_CIS']
#
country_list = country_list.merge(urbanization_list,
on=['iso_code'],
how='inner')
# sorted ascending index
country_list = country_list[country_list['total_deaths_per_million']!=0]
country_list = country_list.dropna(axis='index', subset=['total_deaths_per_million'])
#print(cimport numpy as npountry_list[['iso_code','total_deaths_per_million'] ])#['total_deaths_per_million']) #.head(5)) #['iso_code']['total_deaths_per_million'])
# https://matplotlib.org/3.2.2/gallery/mplot3d/surface3d.html#sphx-glr-gallery-mplot3d-surface3d-py
# https://matplotlib.org/3.2.2/gallery/color/colorbar_basics.html#sphx-glr-gallery-color-colorbar-basics-py
import matplotlib.pyplot as plt
fig, (ax1, ax2) = plt.subplots(2, 1)
#plt.figure(figsize=(30,20))
plt.subplots_adjust(hspace=0.4)
ax1.set_ylabel('population density')
ax1.set_xlabel('total_deaths_per_million')
country_list.sort_values(axis=0,
by='total_deaths_per_million',
inplace=True,
ascending = False,
ignore_index=True)
# label the deadliest countries and set up colors (country_list is sorted in
# descending order on total_deaths_per_million)
rows, columns = country_list.shape
max_index = rows - 1
# note that color1 will presume that the data is sorted on
# total_deaths_per_million, descending
#colors1 = ['royalblue' for i in range(rows)]
country_list['colors'] = ['royalblue' for i in range(rows)]
country_list['alpha'] = [0.4 for i in range(rows)]
# get column number for 'colors'
colors_loc = country_list.columns.get_loc('colors')
alpha_loc = country_list.columns.get_loc('alpha')
for i in range(3):
country_list.iloc[i,colors_loc] = 'red'
country_list.iloc[i,alpha_loc] = 1.0
country=country_list.iloc[i,2]
xy1 = (float(country_list['total_deaths_per_million'].iloc[i]),
float(country_list['population_density'].iloc[i]))
xy2 = (float(country_list['total_deaths_per_million'].iloc[i]),
float(country_list['urbanization'].iloc[i]))
#print(country, xy1, xy2)
ax1.annotate(s= country,
xy=xy1,
xytext =(xy1[0]-80, xy1[1] + 5000),
arrowprops={'arrowstyle':'->'})
ax2.annotate(s= country, xy=xy2,
xytext =(xy2[0]-80, xy2[1] - 30),
arrowprops={'arrowstyle':'->'})
# label the least deadly (country_list is sorted in
# descending order on total_deaths_per_million)
displacement = 0.5
for i in range(max_index, max_index-2, -1):
country_list.iloc[i,colors_loc] = 'red'
country_list.iloc[i,alpha_loc] = 1.0
country=country_list.iloc[i,2]
xy1 = (float(country_list['total_deaths_per_million'].iloc[i]),
float(country_list['population_density'].iloc[i]))
xy2 = (float(country_list['total_deaths_per_million'].iloc[i]),
float(country_list['urbanization'].iloc[i]))
#print(country, xy1, xy2)
ax1.annotate(s= country,
xy=xy1,
xytext =(xy1[0]+200 - 50*displacement, xy1[1] + displacement*4000),
arrowprops={'arrowstyle':'->'})
ax2.annotate(s= country, xy=xy2,
xytext =(xy2[0]+100, xy2[1] + 10),
arrowprops={'arrowstyle':'->'})
displacement = displacement+1
# annotate the highest population density points in ax1
country_list.sort_values(axis=0,
by='population_density',
inplace=True,
ascending = False)
for i in range(2):
country_list.iloc[i,alpha_loc] = 1.0
xy1 = (float(country_list['total_deaths_per_million'].iloc[i]),
float(country_list['population_density'].iloc[i]))
country=country_list.iloc[i,2]
ax1.annotate(s= country_list.iloc[i,2],
xy=xy1,
xytext=(xy1[0]+50 , xy1[1]-5000 + i*8000),
arrowprops={'arrowstyle':'->'})
country_list.iloc[i,colors_loc] = 'red'
xy2 = (float(country_list['total_deaths_per_million'].iloc[i]),
float(country_list['urbanization'].iloc[i]))
ax2.annotate(s= country, xy=xy2,
xytext =(xy2[0] + 150, xy2[1]-10-i*15),
arrowprops={'arrowstyle':'->'})
#fig, (ax1, ax2, ax3) = plt.subplots(figsize=(13, 3), ncols=3)
#plt.scatter(country_list['population_density'], country_list['total_deaths_per_million'])
#ax1.hexbin(country_list['population_density'], country_list['total_deaths_per_million'],
# gridsize=(10,10))
plt.suptitle('Urbanization vs. Population Density in Driving COVID Death Rate')
#ax2.hexbin(country_list['population_density'],
# country_list['urbanization population (% of total) (% of total)'],
# gridsize=(10,10))
ax2.set_xlabel('total_deaths_per_million')
ax2.set_ylabel('urbanization population \n (% of total)')
colors_loc = country_list.columns.get_loc('colors')
alpha_loc = country_list.columns.get_loc('alpha')
total_loc = country_list.columns.get_loc('total_deaths_per_million')
urban_loc = country_list.columns.get_loc('urbanization')
pop_loc = country_list.columns.get_loc('population_density')
for i in range(max_index):
ax1.scatter(
country_list.iloc[i, total_loc],
country_list.iloc[i, pop_loc],
marker='.', c=country_list.iloc[i, colors_loc],
alpha=country_list.iloc[i, alpha_loc])
ax2.scatter(country_list.iloc[i, total_loc],
country_list.iloc[i, urban_loc],
marker='.', c=country_list.iloc[i, colors_loc],
alpha=country_list.iloc[i, alpha_loc])