Segmentation of contagion zones in New York State

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup  # this module helps with web scraping
import requests  # this module helps us download a web page
from functools import reduce  # used to chain the metric merges
from geopy.geocoders import Nominatim  # used to geocode addresses

# Matplotlib and associated plotting modules
from termcolor import colored as cl # text customization
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors

# Clustering
from sklearn.cluster import KMeans

A - Phase 1: Extract and load all the datasets

Import the variables that capture factors able to speed up the spread of the virus:

  • Number of bus stops and metro stations (the more trips, the greater the risk of transmission)
  • Number of cinemas (the more gatherings, the greater the risk of transmission)
  • Number of schools (children are vectors of transmission)
  • Number of hospitals
  • Number of shopping centers

Data will be obtained from:

  • US Zip Code Latitude and Longitude
  • Foursquare location data
  • Statistical data on the COVID-19 situation in each county

US Zip Code Latitude and Longitude: Link to the dataset

In [2]:
# Retrieve the county-boundaries CSV for New York State
url = "https://public.opendatasoft.com/explore/dataset/us-county-boundaries/download/?format=csv&disjunctive.statefp=true&disjunctive.countyfp=true&disjunctive.name=true&disjunctive.namelsad=true&disjunctive.stusab=true&disjunctive.state_name=true&refine.state_name=New+York&refine.statefp=36&timezone=Europe/Berlin&lang=fr&use_labels_for_header=true&csv_separator=%3B"

# Get the contents of the page in text format and split it into rows
data = requests.get(url).text.split('\r\n')

# Create the dataframe with five columns: 'State', 'County', 'Latitude', 'Longitude' and 'geometry'
NY_table = pd.DataFrame(columns=['State','County','Latitude','Longitude','geometry'])

for row in data[1:]:
    if len(row) < 6:
        continue
    fields = row.split(';')
    County = fields[6]
    Latitude = fields[18]
    Longitude = fields[19]
    geometry = fields[1]
    NY_table = NY_table.append({"State": 'New York',
                                "County": County.replace('New York', 'Manhattan'),
                                "Latitude": Latitude,
                                "Longitude": Longitude,
                                "geometry": geometry}, ignore_index=True)

NY_table.head(10)

NY_table.head(10)
Out[2]:
State County Latitude Longitude geometry
0 New York St. Lawrence +44.4881125 -075.0743110 "{""type"": ""Polygon"", ""coordinates"": [[[-...
1 New York Onondaga +43.0065163 -076.1961336 "{""type"": ""Polygon"", ""coordinates"": [[[-...
2 New York Monroe +43.4644839 -077.6646584 "{""type"": ""Polygon"", ""coordinates"": [[[-...
3 New York Schoharie +42.5912940 -074.4381718 "{""type"": ""Polygon"", ""coordinates"": [[[-...
4 New York Kings +40.6350451 -073.9506398 "{""type"": ""Polygon"", ""coordinates"": [[[-...
5 New York Nassau +40.7296118 -073.5894144 "{""type"": ""Polygon"", ""coordinates"": [[[-...
6 New York Rensselaer +42.7104206 -073.5138454 "{""type"": ""Polygon"", ""coordinates"": [[[-...
7 New York Oswego +43.4614431 -076.2092618 "{""type"": ""Polygon"", ""coordinates"": [[[-...
8 New York Otsego +42.6297762 -075.0288410 "{""type"": ""Polygon"", ""coordinates"": [[[-...
9 New York Clinton +44.7527120 -073.7056429 "{""type"": ""Polygon"", ""coordinates"": [[[-...
In [3]:
NY_table.shape
Out[3]:
(62, 5)

Define Foursquare Credentials and Version

In [4]:
CLIENT_ID = '0VR1AJZLYJCZNFLJDIADATXW53FYRD32YIN0VCO3P2BOKJR4' # Foursquare ID
CLIENT_SECRET = 'EDC3DCINXRN50O5UYHGBBLYTT55RH1UU05Z2GJMKFZVV2T2I' # Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET: ' + CLIENT_SECRET)
Your credentials:
CLIENT_ID: 0VR1AJZLYJCZNFLJDIADATXW53FYRD32YIN0VCO3P2BOKJR4
CLIENT_SECRET: EDC3DCINXRN50O5UYHGBBLYTT55RH1UU05Z2GJMKFZVV2T2I
In [5]:
# Define function to get geo location of address : 
def geo_location(address):
    # get geo location of address
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    return latitude,longitude


# Define a function for getting the venues : 
def get_venues(lat,lng):    
    #set variables
    radius=1000
    LIMIT=100
   
    #url to fetch data from foursquare api
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
    
    # get all the data
    results = requests.get(url).json()
    venue_data=results["response"]['groups'][0]['items']
    venue_details=[]
    for row in venue_data:
        try:
            venue_id=row['venue']['id']
            venue_name=row['venue']['name']
            venue_category=row['venue']['categories'][0]['name']
            venue_details.append([venue_id,venue_name,venue_category])
        except KeyError:
            pass
        
    column_names=['ID','Name','Category']
    df = pd.DataFrame(venue_details,columns=column_names)
    return df



# Define a function to retrieve the category, likes, rating and tips of a single venue
def get_venue_details(venue_id):
    # url to fetch the details of one venue from the Foursquare API
    url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(
            venue_id,
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION)

    # get all the data
    results = requests.get(url).json()
    venue_data = results['response']['venue']
    venue_details = []
    try:
        venue_name = venue_data['name']
        venue_category = venue_data['categories'][0]['name']
        venue_likes = venue_data['likes']['count']
        venue_rating = venue_data['rating']
        venue_tips = venue_data['tips']['count']
        venue_details.append([venue_id, venue_name, venue_category,
                              venue_likes, venue_rating, venue_tips])
    except KeyError:
        pass

    column_names=['ID','Name','Categories','Likes','Rating','Tips']
    df = pd.DataFrame(venue_details, columns=column_names)
    return df


# Function to group and aggregate all metrics by city
def compute_metrics(data):
    grouped_metrics = data.groupby('City').agg(
            Likes_metrics=('Likes', 'mean'),   # mean of the Likes column for each group
            Rating_metrics=('Rating', 'mean'), # mean of the Rating column for each group
            Tips_metrics=('Tips', 'mean')      # mean of the Tips column for each group
        )
    return grouped_metrics.reset_index()  # keep City as a column so the frames can be merged on it


# Function to collect, for every county, the venues of a given category and their metrics
def get_rest_detail_dataframe(geo_frame, venue_category):
    column_names=['State','City','ID','Name','Categories','Likes','Rating','Tips']
    list_venues_data = pd.DataFrame(columns=column_names)
    for row in geo_frame[['State','County','Latitude','Longitude']].values.tolist():
        State, City, Latitude, Longitude = row
        venues = get_venues(Latitude, Longitude)
        list_venues = venues[venues['Category'] == venue_category]
        for venue_id in list_venues['ID'].tolist():
            details = get_venue_details(venue_id)
            if details.empty:
                continue
            _, name, categories, likes, rating, tips = details.values.tolist()[0]
            list_venues_data = list_venues_data.append({'State': State,
                                                        'City': City,
                                                        'ID': venue_id,
                                                        'Name': name,
                                                        'Categories': categories,
                                                        'Likes': likes,
                                                        'Rating': rating,
                                                        'Tips': tips,
                                                       }, ignore_index=True)

    return compute_metrics(list_venues_data)


# Function to merge all the metric datasets
def get_merge(NY_table):
    # Metrics for all restaurants in NY State
    list_resto_NY = get_rest_detail_dataframe(NY_table,'Restaurant')
    # Metrics for all schools in NY State
    list_school_NY = get_rest_detail_dataframe(NY_table,'Schools')
    # Metrics for all bus stops in NY State
    list_bus_NY = get_rest_detail_dataframe(NY_table,'Bus Stops')
    # Metrics for all metro stations in NY State
    list_metro_NY = get_rest_detail_dataframe(NY_table,'Metro Stations')
    # Metrics for all shopping centers in NY State
    list_shop_NY = get_rest_detail_dataframe(NY_table,'Shopping Centers')
    # Metrics for all hospitals in NY State
    list_hospital_NY = get_rest_detail_dataframe(NY_table,'Hospital')
    all_list_metrics = [list_resto_NY, list_school_NY, list_bus_NY,
                        list_metro_NY, list_shop_NY, list_hospital_NY]
    # each frame carries a 'City' column after compute_metrics, so merge on that key
    metric_merged = reduce(lambda left, right: pd.merge(left, right, on=['City'], how='outer'),
                           all_list_metrics)
    return metric_merged
      
In [6]:
# Override: simulate the Foursquare Likes/Rating/Tips metrics for each county
def get_merge(geo_frame):
    n_list = geo_frame.shape[0]
    n_variable = 6  # one column per venue type: Resto, Schools, Bus, Metro, Shopping, Hospital

    NY_city = geo_frame[['County']]
    Likes_metrics = np.random.randint(low=2, high=100, size=(n_list, n_variable), dtype='int')
    Likes_table = pd.DataFrame(Likes_metrics, columns=['Likes_Resto','Likes_Schools','Likes_Bus','Likes_Metro','Likes_Shopping','Likes_Hospital'])

    Rating_metrics = np.random.uniform(low=2, high=10, size=(n_list, n_variable)).round(1)
    Rating_table = pd.DataFrame(Rating_metrics, columns=['Rating_Resto','Rating_Schools','Rating_Bus','Rating_Metro','Rating_Shopping','Rating_Hospital'])

    Tips_metrics = np.random.randint(low=2, high=100, size=(n_list, n_variable), dtype='int')
    Tips_table = pd.DataFrame(Tips_metrics, columns=['Tips_Resto','Tips_Schools','Tips_Bus','Tips_Metro','Tips_Shopping','Tips_Hospital'])

    return pd.concat([NY_city, Likes_table, Rating_table, Tips_table], axis=1)
    

As Foursquare information and metrics, we consider the following (a toy extraction sketch follows the list):

  • Tips : the total count of tips, with groups such as friends and others as group types. Groups may change over time.
  • Likes : the count of users who have liked this venue, and groups containing any friends and others who have liked it. The groups included are subject to change.
  • Rating : a numerical rating of the venue (0 through 10), returned as part of an explore result but excluded from search results. Not all venues have a rating.
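
As a toy extraction sketch, the three metrics sit at fixed paths inside a venue-details object. The payload below is hypothetical, not a live API response:

In [ ]:
# hypothetical venue-details payload, showing where Likes, Rating and Tips live
sample_venue = {
    'name': 'Example Deli',
    'likes': {'count': 87},
    'rating': 8.4,
    'tips': {'count': 23},
}

likes = sample_venue['likes']['count']   # total likes
rating = sample_venue.get('rating')      # 0-10 scale; absent for unrated venues
tips = sample_venue['tips']['count']     # total tips
print(likes, rating, tips)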
In [7]:
metric_merged =  get_merge(NY_table)
metric_merged.head(20)
Out[7]:
County Likes_Resto Likes_Schools Likes_Bus Likes_Metro Likes_Shopping Likes_Hospital Rating_Resto Rating_Schools Rating_Bus Rating_Metro Rating_Shopping Rating_Hospital Tips_Resto Tips_Schools Tips_Bus Tips_Metro Tips_Shopping Tips_Hospital
0 St. Lawrence 63 97 24 87 50 73 7.6 3.0 5.7 3.3 4.5 4.0 96 36 19 96 40 7
1 Onondaga 42 19 49 85 35 91 5.0 6.1 4.2 5.8 7.8 6.1 61 77 46 77 61 58
2 Monroe 78 53 86 83 38 23 4.9 9.4 2.1 4.3 8.5 2.0 76 33 39 91 5 38
3 Schoharie 79 9 59 15 19 15 5.5 2.3 7.4 8.0 2.6 3.8 57 51 56 4 57 86
4 Kings 17 97 83 55 14 42 4.9 4.2 4.0 8.6 5.3 5.7 57 68 58 80 56 98
5 Nassau 7 26 28 91 49 48 2.6 5.1 6.5 4.0 9.1 6.5 65 54 82 2 4 99
6 Rensselaer 36 54 6 30 62 23 4.4 6.9 8.0 9.1 9.1 2.2 88 21 87 2 73 78
7 Oswego 92 40 55 84 18 70 5.6 7.4 7.5 8.8 2.8 2.8 87 28 11 70 95 79
8 Otsego 31 39 95 40 47 52 6.6 4.6 3.7 3.0 4.6 3.0 73 62 98 21 47 59
9 Clinton 11 87 93 85 56 76 6.6 2.8 5.8 9.6 3.7 3.8 32 75 99 94 4 22
10 Erie 48 13 35 31 32 35 5.8 2.7 4.7 9.3 3.7 5.7 74 36 45 77 9 36
11 Chautauqua 87 57 85 40 31 35 7.0 4.3 3.7 8.4 9.6 2.1 35 67 91 59 80 5
12 Dutchess 42 50 19 13 16 30 9.6 8.2 3.1 5.7 7.3 6.2 24 70 82 38 75 34
13 Cortland 85 30 73 71 95 80 2.1 3.5 4.7 3.3 5.0 8.6 26 22 78 93 37 21
14 Richmond 58 38 22 95 79 99 6.4 2.7 7.8 2.2 4.9 4.4 98 79 2 22 72 83
15 Saratoga 31 72 81 74 34 62 3.6 3.2 2.0 7.9 5.2 3.0 88 43 63 91 10 62
16 Hamilton 41 86 9 46 26 97 6.8 8.6 6.8 8.7 5.4 5.0 42 46 81 4 72 72
17 Yates 52 57 14 96 29 42 2.4 9.9 7.0 7.1 7.5 9.3 33 42 20 18 85 62
18 Tioga 75 10 69 65 52 18 7.0 5.3 3.3 8.3 5.3 9.2 28 57 24 91 62 96
19 Tompkins 15 44 51 66 76 98 7.2 2.5 3.6 7.3 5.8 8.9 62 36 76 8 17 95
In [8]:
metric_merged.shape
Out[8]:
(62, 19)

Statistical data on the COVID-19 situation in each county: Link to the dataset

Import updated statistical data on the COVID-19 situation in each county:

  • Number of new cases (as of the previous day),
  • Number of deaths,
  • Number of cases per million inhabitants.

Data will be obtained by webscraping: Link to the dataset

In [9]:
# Retrieve the URL of the Wikipedia page to scrape
url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_in_New_York_(state)"

# Get the contents of the webpage in text format and store it in a variable called data
data = requests.get(url).text

# Create the dataframe with eight columns for the September statistics
Stat_table_sep = pd.DataFrame(columns=['County','Cases_sep','Deaths_sep','Recov_sep','Pop_sep','Cases_100k_sep','Deaths_100k_sep','Ratio_Deaths_Cases_sep'])
In [10]:
# Create a BeautifulSoup object using the BeautifulSoup constructor
soup = BeautifulSoup(data, 'lxml')

# Find all html tables of interest in the web page
table = soup.findAll('table', {'class': 'wikitable plainrowheaders sortable'})

# strip thousands separators once, then turn cells into ';'-separated rows
row_data_sep = [tr.text.strip().replace(',', '').replace('\n\n', ';') for tr in table[0].find_all('tr')]

for row in row_data_sep[2:64]:
    if len(row) < 2:
        continue
    fields = row.split(';')
    County = fields[0]
    Cases = fields[1]
    Deaths = fields[2]
    Recov = fields[3]  # '–' marks a missing recovery count; replaced by a placeholder below
    Pop = fields[4]
    Cases_100k = fields[5]
    Deaths_100k = fields[6]
    Ratio_Deaths_Cases = fields[7]
    Stat_table_sep = Stat_table_sep.append({"County": County,
                                            "Cases_sep": float(Cases),
                                            "Deaths_sep": float(Deaths),
                                            "Recov_sep": float(Recov.replace('–', '36')),
                                            "Pop_sep": float(Pop),
                                            "Cases_100k_sep": float(Cases_100k),
                                            "Deaths_100k_sep": float(Deaths_100k),
                                            "Ratio_Deaths_Cases_sep": float(Ratio_Deaths_Cases)}, ignore_index=True)

Stat_table_sep
Out[10]:
County Cases_sep Deaths_sep Recov_sep Pop_sep Cases_100k_sep Deaths_100k_sep Ratio_Deaths_Cases_sep
0 Albany 1986.0 108.0 1457.0 305506.0 556.5 30.4 5.47
1 Allegany 58.0 2.0 41.0 46430.0 124.9 4.3 3.45
2 Bronx 46778.0 3295.0 36.0 1418207.0 3242.1 229.7 7.08
3 Broome 657.0 56.0 489.0 190488.0 236.8 20.5 8.65
4 Cattaraugus 145.0 4.0 100.0 76117.0 161.6 5.3 3.25
... ... ... ... ... ... ... ... ...
57 Washington 250.0 14.0 212.0 61204.0 362.7 22.9 6.31
58 Wayne 137.0 2.0 74.0 89918.0 115.7 2.2 1.92
59 Westchester 34385.0 1407.0 21427.0 967506.0 3509.4 143.9 4.10
60 Wyoming 95.0 5.0 63.0 39859.0 238.3 12.5 5.26
61 Yates 54.0 7.0 45.0 24913.0 188.7 28.1 14.89

62 rows × 8 columns

Data is publicly reported by the New York State Department of Health, updated on September 2, 2020 (a quick consistency check on these columns is sketched after the list):

  • Cases_sep : the cumulative number of confirmed human cases reported to date. The actual number of infections is likely higher than reported.
  • Deaths_sep : the cumulative number of confirmed human deaths reported to date. Reporting criteria vary between locations.
  • Recov_sep : the cumulative total of recoveries. May not correspond to actual current figures, and not all recoveries may be reported.
  • Pop_sep : the total population of the county reported to date.
  • Cases_100k_sep : the cumulative number of confirmed human cases reported per 100,000 population.
  • Deaths_100k_sep : the cumulative number of confirmed human deaths reported per 100,000 population.
  • Ratio_Deaths_Cases_sep : the ratio between the cumulative number of deaths and the cumulative number of cases.
  • County : the county where individuals with a positive case were diagnosed.
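
A quick consistency check, as a sketch: the deaths/cases ratio can be recomputed from the raw columns. Small gaps between the reported and recomputed values would hint at different snapshot dates in the source table.

In [ ]:
# recompute the deaths/cases ratio (%) from the raw columns and compare with the reported one
check = Stat_table_sep.copy()
check['Ratio_check'] = (check['Deaths_sep'] / check['Cases_sep'] * 100).round(2)
print((check['Ratio_Deaths_Cases_sep'] - check['Ratio_check']).abs().describe())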

In [11]:
Stat_table_may = pd.DataFrame(columns=['County','Cases_may','Deaths_may','Cases_mil_may','Deaths_mil_may','Ratio_Deaths_case_may','Area_km2','Case_dens_km2_may','Pop_dens_km2_may'])

row_data_may = [tr.text.strip().replace('\n\n', ';').replace(' County', '').replace(' (Brooklyn)', '') for tr in table[1].find_all('tr')]
for row in row_data_may[2:64]:
    if len(row) < 2:
        continue
    fields = row.split(';')
    County = fields[0]
    Cases_may = fields[1]
    Deaths_may = fields[2]
    Cases_mil_may = fields[3]
    Deaths_mil_may = fields[4]
    Ratio_Deaths_case_may = fields[5]
    Area_km2 = fields[13]
    Case_dens_km2_may = fields[11]
    Pop_dens_km2_may = fields[12]
    Stat_table_may = Stat_table_may.append({'County': County.replace('New York (Manhattan)', 'Manhattan'),
                                            'Cases_may': float(Cases_may.replace(',', '')),
                                            'Deaths_may': float(Deaths_may.replace(',', '')),
                                            'Cases_mil_may': float(Cases_mil_may.replace(',', '')),
                                            'Deaths_mil_may': float(Deaths_mil_may.replace(',', '')),
                                            'Ratio_Deaths_case_may': float(Ratio_Deaths_case_may.replace(',', '')),
                                            'Area_km2': float(Area_km2.replace(',', '')),
                                            'Case_dens_km2_may': float(Case_dens_km2_may.replace(',', '')),
                                            'Pop_dens_km2_may': float(Pop_dens_km2_may.replace(',', ''))}, ignore_index=True)

Stat_table_may
Out[11]:
County Cases_may Deaths_may Cases_mil_may Deaths_mil_may Ratio_Deaths_case_may Area_km2 Case_dens_km2_may Pop_dens_km2_may
0 Albany 22440.0 366.0 5937.69 324.1 5.46 1380.0 0.91 221.0
1 Allegany 2956.0 81.0 759.40 43.4 5.71 2678.0 0.01 17.0
2 Bronx 161925.0 6272.0 31712.00 2264.0 7.14 149.0 261.18 9518.0
3 Broome 16070.0 330.0 1721.90 115.5 6.71 1852.0 0.18 103.0
4 Cattaraugus 4441.0 92.0 696.30 26.3 3.77 3393.0 0.02 22.0
... ... ... ... ... ... ... ... ... ...
57 Washington 1953.0 36.0 3039.00 81.7 2.69 2191.0 0.08 28.0
58 Wayne 3979.0 40.0 834.10 11.1 1.33 3585.0 0.02 25.0
59 Westchester 113204.0 2324.0 34411.00 1404.6 4.10 1295.0 22.88 747.0
60 Wyoming 2539.0 48.0 1731.10 125.4 7.25 1544.0 0.04 26.0
61 Yates 1091.0 25.0 722.50 40.1 5.56 974.0 0.02 26.0

62 rows × 9 columns

B - Phase 2: Data preparation and feature engineering

  1. Data preparation:
    • Normalization (min-max scaling)
    • Join and merge the datasets
    • Group by and compute metrics
  2. Data visualisation
In [12]:
def normalize(df, feature):
    # min-max normalization; '-' placeholders are replaced by the column mean first
    result = df.copy()
    for feature_name in feature:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        mean_value = df[feature_name].mean()
        result[feature_name] = (df[feature_name].replace('-', mean_value) - min_value) / (max_value - min_value)
    return result
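
A toy usage sketch (hypothetical values) shows the min-max scaling:

In [ ]:
# toy demonstration of the normalize helper: values are mapped onto [0, 1]
toy = pd.DataFrame({'Cases': [10.0, 55.0, 100.0]})
print(normalize(toy, ['Cases']))
#    Cases
# 0    0.0
# 1    0.5
# 2    1.0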
In [13]:
Firstmerge = pd.merge(NY_table,metric_merged,on='County',how='left')
Secondmerge = pd.merge(Firstmerge,Stat_table_sep,on='County',how='left')
Lastmerge =  pd.merge(Secondmerge,Stat_table_may,on='County',how='left')
feature_used = ['Cases_may','Deaths_may','Ratio_Deaths_case_may','Area_km2','Cases_sep','Deaths_sep','Recov_sep','Pop_sep','Ratio_Deaths_Cases_sep']


Lastmerge = normalize(Lastmerge, feature_used)
Lastmerge.shape
Out[13]:
(62, 38)
In [14]:
Lastmerge.head()
Out[14]:
State County Latitude Longitude geometry Likes_Resto Likes_Schools Likes_Bus Likes_Metro Likes_Shopping ... Deaths_100k_sep Ratio_Deaths_Cases_sep Cases_may Deaths_may Cases_mil_may Deaths_mil_may Ratio_Deaths_case_may Area_km2 Case_dens_km2_may Pop_dens_km2_may
0 New York St. Lawrence +44.4881125 -075.0743110 "{""type"": ""Polygon"", ""coordinates"": [[[-... 63 97 24 87 50 ... 5.9 0.037077 0.022976 0.007998 5203.4 58.8 0.025428 1.000000 0.02 5.000
1 New York Onondaga +43.0065163 -076.1961336 "{""type"": ""Polygon"", ""coordinates"": [[[-... 42 19 49 85 35 ... 20.2 0.188985 0.133111 0.070345 2273.5 78.2 0.077408 0.279965 0.50 221.000
2 New York Monroe +43.4644839 -077.6646584 "{""type"": ""Polygon"", ""coordinates"": [[[-... 78 53 86 83 38 ... 32.8 0.237941 0.248051 0.125513 2111.2 157.7 0.168092 0.480051 0.44 210.000
3 New York Schoharie +42.5912940 -074.4381718 "{""type"": ""Polygon"", ""coordinates"": [[[-... 79 9 59 15 19 ... 1.3 0.131030 0.002703 0.000410 289.8 6.4 0.049955 0.215523 0.03 96.000
4 New York Kings +40.6350451 -073.9506398 "{""type"": ""Polygon"", ""coordinates"": [[[-... 17 97 83 55 14 ... 192.5 0.310655 1.000000 1.000000 21659.4 1664.9 0.204770 0.026476 186.61 10.199

5 rows × 38 columns
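
Since all three joins above are left merges on County, any naming mismatch between the sources (for instance 'New York' vs 'Manhattan') would leave rows with missing values; a quick audit sketch:

In [ ]:
# quick join audit (sketch): counties that failed to match in a merge show up with NaNs
incomplete = Lastmerge[Lastmerge.isna().any(axis=1)]['County'].tolist()
print(len(incomplete), 'counties with incomplete rows:', incomplete)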

In [ ]:
# Basemap import (assumed available; it is not installed with matplotlib by default)
from mpl_toolkits.basemap import Basemap

# Extract the data we're interested in (latitude/longitude were scraped as strings)
lat = Lastmerge['Latitude'].astype(float).values
lon = Lastmerge['Longitude'].astype(float).values
population = Lastmerge['Pop_dens_km2_may'].values
area = Lastmerge['Cases_mil_may'].values

# 1. Draw the map background, centred on New York State
fig = plt.figure(figsize=(8, 8))
m = Basemap(projection='lcc', resolution='h', lat_0=43.0, lon_0=-75.5, width=1E6, height=1.2E6)
m.shadedrelief()
m.drawcoastlines(color='gray')
m.drawcountries(color='gray')
m.drawstates(color='gray')

# 2. Scatter county data, with colour reflecting population density
# and size reflecting cases per million
m.scatter(lon, lat, latlon=True, c=np.log10(population), s=area, cmap='Reds', alpha=0.5)

# 3. Create colorbar and legend (clim matched to the log10 range of county densities)
plt.colorbar(label=r'$\log_{10}({\rm population\ density})$')
plt.clim(0, 4)

# make legend with dummy points
for a in [100, 300, 500]:
    plt.scatter([], [], c='k', alpha=0.5, s=a, label=str(a) + ' cases per million')
plt.legend(scatterpoints=1, frameon=False, labelspacing=1, loc='lower left');

In [15]:
Lastmerge_sorted_1 = Lastmerge.sort_values(["Likes_Resto", "Likes_Shopping"], ascending=False)
Lastmerge_sorted_1.plot(x="County", y=["Likes_Resto", "Likes_Shopping"], kind="bar", figsize=(17,9))
plt.title('Average number of likes for restaurants and shopping venues in each county', fontsize = 20)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.ylabel('Avg. number of likes', fontsize=15)
plt.show()
In [16]:
Lastmerge_sorted_2 = Lastmerge.sort_values(["Cases_sep", "Cases_may"], ascending=False)
Lastmerge_sorted_2.plot(x="County", y=["Cases_sep", "Cases_may"], kind="bar", figsize=(17,9))
plt.title('Number of cases in May and in September', fontsize = 20)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.ylabel('Number of cases (normalized)', fontsize=15)
plt.show()
In [17]:
Lastmerge_sorted_3 = Lastmerge.sort_values(["Deaths_sep", "Deaths_may"], ascending=False)
Lastmerge_sorted_3.plot(x="County", y=["Deaths_sep", "Deaths_may"], kind="bar", figsize=(17,9))
plt.title('Number of deaths in May and in September', fontsize = 20)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.ylabel('Number of deaths (normalized)', fontsize=15)
plt.show()
In [21]:
plt.figure(figsize=(22,16), dpi = 100)
plt.scatter('County', 'Case_dens_km2_may', s='Pop_dens_km2_may', alpha=1, data=Lastmerge)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(rotation='vertical')
plt.ylabel('Case density per km²', fontsize=15)
plt.title("Bubble plot of case density per km² (y) and population density per km² (size) for each county", size=22)
Out[21]:
Text(0.5, 1.0, 'Bubble plot of case density per km² (y) and population density per km² (size) for each county')
In [22]:
plt.figure(figsize=(22,16), dpi = 100)
plt.scatter('County', 'Cases_mil_may', s='Deaths_mil_may', alpha=1, data=Lastmerge)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(rotation='vertical')
plt.ylabel('Cases per million', fontsize=15)
plt.title("Bubble plot of cases per million (y) and deaths per million (size) for each county", size=20)
Out[22]:
Text(0.5, 1.0, 'Bubble plot of cases per million (y) and deaths per million (size) for each county')
In [23]:
plt.figure(figsize=(22,16), dpi = 100)
plt.scatter('County', 'Cases_mil_may', s='Deaths_mil_may', c='Pop_dens_km2_may', alpha=1, data=Lastmerge)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(rotation='vertical')
plt.ylabel('Cases per million', fontsize=15)
plt.title("Bubble plot of cases per million (y), deaths per million (size) and population density per km² (colour) for each county", size=20)
Out[23]:
Text(0.5, 1.0, 'Bubble plot of cases per million (y), deaths per million (size) and population density per km² (colour) for each county')

C - Phase 3: Segmentation and clustering

Modeling: under these updated metrics, micro-cluster zones will now be determined using K-means.

We can build the K-means model in Python using the KMeans class provided by the scikit-learn package. The KMeans class has many parameters, but we will use these three:

  1. init : the initialization method for the centroids. The value will be 'k-means++', which selects initial cluster centers in a smart way to speed up convergence.
  2. n_clusters : the number of clusters to form, as well as the number of centroids to generate. The value will be 5; a quick elbow check, sketched below after the feature matrix X is built, can help validate this choice.
  3. n_init : the number of times the k-means algorithm is run with different centroid seeds. The final result is the best output of n_init consecutive runs in terms of inertia. The value will be 12.

After building the model, we fit it and define a variable 'labels' to store the cluster labels. Let's do it in Python!

In [24]:
feature_fit = ['Pop_dens_km2_may','Case_dens_km2_may','Deaths_mil_may','Cases_mil_may']
Lastmerge = normalize(Lastmerge, feature_fit)
Lastmerge.shape


X = Lastmerge.iloc[:, 5:]
X
Out[24]:
Likes_Resto Likes_Schools Likes_Bus Likes_Metro Likes_Shopping Likes_Hospital Rating_Resto Rating_Schools Rating_Bus Rating_Metro ... Deaths_100k_sep Ratio_Deaths_Cases_sep Cases_may Deaths_may Cases_mil_may Deaths_mil_may Ratio_Deaths_case_may Area_km2 Case_dens_km2_may Pop_dens_km2_may
0 63 97 24 87 50 73 7.6 3.0 5.7 3.3 ... 5.9 0.037077 0.022976 0.007998 0.124010 0.025972 0.025428 1.000000 0.000048 0.000145
1 42 19 49 85 35 91 5.0 6.1 4.2 5.8 ... 20.2 0.188985 0.133111 0.070345 0.050539 0.034541 0.077408 0.279965 0.001198 0.007987
2 78 53 86 83 38 23 4.9 9.4 2.1 4.3 ... 32.8 0.237941 0.248051 0.125513 0.046469 0.069655 0.168092 0.480051 0.001054 0.007588
3 79 9 59 15 19 15 5.5 2.3 7.4 8.0 ... 1.3 0.131030 0.002703 0.000410 0.000795 0.002827 0.049955 0.215523 0.000072 0.003449
4 17 97 83 55 14 42 4.9 4.2 4.0 8.6 ... 192.5 0.310655 1.000000 1.000000 0.536665 0.735380 0.204770 0.026476 0.447088 0.000334
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
57 60 56 38 55 77 49 8.1 2.8 8.1 2.5 ... 8.1 0.186825 0.010571 0.009434 0.022057 0.035910 0.160666 0.138387 0.000120 0.001634
58 71 25 19 49 83 35 8.7 4.2 9.3 8.7 ... 9.9 0.085673 0.013310 0.008203 0.057507 0.031140 0.062106 0.228495 0.000264 0.001452
59 81 20 31 28 4 48 8.6 8.8 7.6 2.0 ... 6.4 0.091433 0.011244 0.007178 0.046115 0.009364 0.022727 0.313083 0.000096 0.000690
60 59 40 26 39 3 77 5.7 5.0 9.2 7.6 ... 42.2 0.403168 0.008518 0.005742 0.067255 0.069081 0.119712 0.302733 0.000192 0.000980
61 44 55 7 49 13 28 6.8 5.7 4.3 8.2 ... 11.2 0.138589 0.005348 0.006153 0.056898 0.049602 0.099910 0.108029 0.000120 0.000726

62 rows × 33 columns
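
Before fixing n_clusters = 5, the elbow check promised above can be run over the same feature matrix X. A minimal sketch; random_state is added here for reproducibility and is not part of the original model:

In [ ]:
# elbow check (sketch): inertia for k = 1..10 on the same feature matrix X
inertias = []
for k in range(1, 11):
    km = KMeans(init='k-means++', n_clusters=k, n_init=12, random_state=0)
    km.fit(X)
    inertias.append(km.inertia_)

plt.figure(figsize=(8, 5))
plt.plot(range(1, 11), inertias, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.title('Elbow curve for choosing k')
plt.show()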

In [25]:
clusters = 5
model = KMeans(init = 'k-means++', n_clusters = clusters, n_init = 12)
model.fit(X)

labels = model.labels_ + 1  # shift labels so clusters are numbered 1..5
print(cl(labels[:100], attrs = ['bold']))
[4 4 4 1 2 2 4 1 1 1 4 1 3 1 2 4 1 1 1 1 5 1 4 4 1 4 1 1 1 3 5 3 4 3 4 1 4
 2 4 1 4 1 2 1 1 4 4 2 4 1 1 1 1 1 1 5 1 1 4 1 4 1]
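
As a quick follow-up sketch, the distribution of the 62 counties across the five clusters:

In [ ]:
# how many counties fall in each cluster (labels are numbered 1..5)
print(pd.Series(labels).value_counts().sort_index())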
In [26]:
# Function to rescale selected (normalized) columns by a fixed factor for plotting
def scalePercentage(df, feature):
    result = df.copy()
    for feature_name in feature:
        value = 300  # fixed scale factor so bubble sizes stay visible
        result[feature_name] = df[feature_name] * value
    return result
In [27]:
feature_scale = ['Deaths_mil_may','Cases_mil_may']
feature_final = ['County','Deaths_mil_may','Cases_mil_may', 'Pop_dens_km2_may','Cluster_class']
Lastmerge['Cluster_class'] = labels
inter_dataframe = Lastmerge[feature_final]

final_dataframe = scalePercentage(inter_dataframe,feature_scale)
final_dataframe.head()
Out[27]:
County Deaths_mil_may Cases_mil_may Pop_dens_km2_may Cluster_class
0 St. Lawrence 7.791519 37.202931 0.000145 4
1 Onondaga 10.362191 15.161625 0.007987 4
2 Monroe 20.896643 13.940661 0.007588 4
3 Schoharie 0.848057 0.238476 0.003449 1
4 Kings 220.613958 160.999551 0.000334 2
In [28]:
final_dataframe.head(20)
Out[28]:
County Deaths_mil_may Cases_mil_may Pop_dens_km2_may Cluster_class
0 St. Lawrence 7.791519 37.202931 0.000145 4
1 Onondaga 10.362191 15.161625 0.007987 4
2 Monroe 20.896643 13.940661 0.007588 4
3 Schoharie 0.848057 0.238476 0.003449 1
4 Kings 220.613958 160.999551 0.000334 2
5 Nassau 172.844523 201.969191 0.041969 2
6 Rensselaer 11.687279 13.320775 0.003304 4
7 Oswego 3.392226 2.297490 0.001198 1
8 Otsego 8.904594 6.530618 0.000799 1
9 Clinton 6.585689 3.853222 0.000980 1
10 Erie 38.652827 28.438016 0.010456 4
11 Chautauqua 1.046820 0.192586 0.001162 1
12 Dutchess 33.776502 76.964395 0.004974 3
13 Cortland 2.782686 3.117484 0.001307 1
14 Richmond 201.492933 210.642477 0.065205 2
15 Saratoga 15.993816 23.404452 0.001743 4
16 Hamilton 0.000000 3.168640 0.000000 1
17 Yates 5.313604 3.493628 0.000908 1
18 Tioga 21.996466 12.104324 0.001271 1
19 Tompkins 0.000000 7.555987 0.002977 1
In [29]:
plt.figure(figsize=(19,20))
plt.scatter('County', 'Cluster_class', s='Cases_mil_may', c='Deaths_mil_may', alpha=1, data=final_dataframe)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(rotation='vertical')
plt.ylabel('Cluster', fontsize=15)
plt.title("Clustering", size=20)
Out[29]:
Text(0.5, 1.0, 'Clustering')

Let’s look at the distribution of the clusters based on their cases and deaths using a bubble plot, where the colour represents the cluster value.

In [31]:
plt.figure(figsize=(26,17))
plt.scatter('County', 'Cases_mil_may', s='Deaths_mil_may', c='Cluster_class', alpha=1, data=final_dataframe)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(rotation='vertical')
plt.ylabel('Cases per million (scaled)', fontsize=15)
plt.title("Clustering", size=20)
Out[31]:
Text(0.5, 1.0, 'Clustering')
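
Since NY_table carried each county's polygon in its geometry column, the cluster labels can also be drawn as a choropleth. A minimal sketch, assuming the CSV-escaped GeoJSON strings unescape cleanly with json and shapely (both assumptions; neither step appears in the cells above):

In [ ]:
import json
from shapely.geometry import shape

# parse the CSV-escaped GeoJSON polygons scraped earlier ("" -> ") and build a GeoDataFrame
geo = Lastmerge[['County', 'geometry']].copy()
geo['geometry'] = geo['geometry'].apply(lambda g: shape(json.loads(g.strip('"').replace('""', '"'))))
gdf = gpd.GeoDataFrame(geo, geometry='geometry')
gdf['Cluster_class'] = labels

# one colour per cluster: a county-level map of the contagion zones
gdf.plot(column='Cluster_class', categorical=True, legend=True, figsize=(12, 12), cmap='Set1')
plt.title('K-means contagion clusters across New York State counties', size=20)
plt.show()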

We hope this study will inspire readers to new reflections on the COVID-19 pandemic.

In [ ]: