# Import required libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup  # this module helps with web scraping
import requests  # this module helps us download a web page
from functools import reduce  # used to merge the metric dataframes
from geopy.geocoders import Nominatim  # converts an address into latitude/longitude
# Matplotlib and associated plotting modules
from termcolor import colored as cl  # text customization
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
from mpl_toolkits.basemap import Basemap  # map background for the geographic scatter plot
# Clustering
from sklearn.cluster import KMeans
First, we import the county-level geographic variables that can influence how fast the virus spreads.
Data will be obtained from the Opendatasoft "US County Boundaries" dataset (filtered to New York State) via the CSV export below:
# Retrieve the URL and download the CSV contents
url = "https://public.opendatasoft.com/explore/dataset/us-county-boundaries/download/?format=csv&disjunctive.statefp=true&disjunctive.countyfp=true&disjunctive.name=true&disjunctive.namelsad=true&disjunctive.stusab=true&disjunctive.state_name=true&refine.state_name=New+York&refine.statefp=36&timezone=Europe/Berlin&lang=fr&use_labels_for_header=true&csv_separator=%3B"
# Get the contents of the page as text and split it into rows, stored in a variable called data
data = requests.get(url).text.split('\r\n')
# Create the dataframe with five columns: 'State', 'County', 'Latitude', 'Longitude' and 'geometry'
NY_table = pd.DataFrame(columns=['State','County','Latitude','Longitude','geometry'])
for row in data[1:]:
    if len(row) < 6:
        pass
    else:
        State = row.split(';')[0]
        County = row.split(';')[6]
        Latitude = row.split(';')[18]
        Longitude = row.split(';')[19]
        geometry = row.split(';')[1]
        NY_table = NY_table.append({"State": 'New York',
                                    "County": County.replace('New York', 'Manhattan'),
                                    "Latitude": Latitude,
                                    "Longitude": Longitude,
                                    "geometry": geometry}, ignore_index=True)
NY_table.head(10)
NY_table.shape
CLIENT_ID = '0VR1AJZLYJCZNFLJDIADATXW53FYRD32YIN0VCO3P2BOKJR4' # Foursquare ID
CLIENT_SECRET = 'EDC3DCINXRN50O5UYHGBBLYTT55RH1UU05Z2GJMKFZVV2T2I' # Foursquare Secret
VERSION = '20180605' # Foursquare API version
print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
# Define a function to get the geographic coordinates of an address:
def geo_location(address):
    # get the geo location of the address
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    return latitude, longitude
# Define a function for getting the venues around a location:
def get_venues(lat, lng):
    # set variables
    radius = 1000
    LIMIT = 100
    # url to fetch data from the Foursquare API
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        lng,
        radius,
        LIMIT)
    # get all the data
    results = requests.get(url).json()
    venue_data = results["response"]['groups'][0]['items']
    venue_details = []
    for row in venue_data:
        try:
            venue_id = row['venue']['id']
            venue_name = row['venue']['name']
            venue_category = row['venue']['categories'][0]['name']
            venue_details.append([venue_id, venue_name, venue_category])
        except KeyError:
            pass
    column_names = ['ID', 'Name', 'Category']
    df = pd.DataFrame(venue_details, columns=column_names)
    return df
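As a quick illustration, geo_location and get_venues might be combined for a single county as sketched below; the address string is only an assumed example and the call requires valid Foursquare credentials:
# Hypothetical usage sketch: geocode one county seat and list nearby venues
example_lat, example_lng = geo_location('Manhattan, New York, NY')  # assumed example address
manhattan_venues = get_venues(example_lat, example_lng)
manhattan_venues.head()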
# Define a function to retrieve ratings, likes and tips of a venue
def get_venue_details(lat, lng):
    # set variables
    radius = 1000
    LIMIT = 100
    # url to fetch data from the Foursquare API
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        lng,
        radius,
        LIMIT)
    # get all the data
    results = requests.get(url).json()
    # note: the 'venue' field is returned by the Foursquare venue-details endpoint, not by 'explore'
    venue_data = results['response']['venue']
    venue_details = []
    try:
        venue_id = venue_data['id']
        venue_name = venue_data['name']
        venue_category = venue_data['categories'][0]['name']
        venue_likes = venue_data['likes']['count']
        venue_rating = venue_data['rating']
        venue_tips = venue_data['tips']['count']
        venue_details.append([venue_id, venue_name, venue_category, venue_likes, venue_rating, venue_tips])
    except KeyError:
        pass
    column_names = ['ID', 'Name', 'Categories', 'Likes', 'Rating', 'Tips']
    df = pd.DataFrame(venue_details, columns=column_names)
    return df
# Function to group and aggregate all metrics by city
def compute_metrics(data):
    grouped_metrics = data.groupby('City').agg(
        Likes_metrics=('Likes', 'mean'),    # mean of the Likes column for each group
        Rating_metrics=('Rating', 'mean'),  # mean of the Rating column for each group
        Tips_metrics=('Tips', 'mean')       # mean of the Tips column for each group
    )
    return grouped_metrics
# Function to extract venue details of a given category for each county
def get_rest_detail_dataframe(geo_frame, type_resto):
    column_names = ['State', 'City', 'ID', 'County', 'Categories', 'Likes', 'Rating', 'Tips']
    list_venus_data = pd.DataFrame(columns=column_names)
    for row in geo_frame.values.tolist():
        State, City, Latitude, Longitude, geometry = row
        venues = get_venue_details(Latitude, Longitude)
        list_venus = venues[venues['Categories'] == type_resto]
        for venus_detail in list_venus.values.tolist():
            id, name, categories, likes, rating, tips = venus_detail
            list_venus_data = list_venus_data.append({'State': State,
                                                      'City': City,
                                                      'ID': id,
                                                      'County': name,
                                                      'Categories': categories,
                                                      'Likes': likes,
                                                      'Rating': rating,
                                                      'Tips': tips,
                                                      }, ignore_index=True)
    return compute_metrics(list_venus_data)
# Function to merge all the metric datasets (overridden below by a simulated-metrics version)
def get_merge(NY_table):
    # Get all restaurants in NY State
    list_resto_NY = get_rest_detail_dataframe(NY_table, 'Restaurant')
    # Get all schools in NY State
    list_school_NY = get_rest_detail_dataframe(NY_table, 'Schools')
    # Get all bus stops in NY State
    list_bus_NY = get_rest_detail_dataframe(NY_table, 'Bus Stops')
    # Get all metro stations in NY State
    list_metro_NY = get_rest_detail_dataframe(NY_table, 'Metro Stations')
    # Get all shopping centers in NY State
    list_shop_NY = get_rest_detail_dataframe(NY_table, 'Shopping Centers')
    # Get all hospitals in NY State
    list_hospital_NY = get_rest_detail_dataframe(NY_table, 'Hospital')
    all_list_metrics = [list_resto_NY, list_school_NY, list_bus_NY, list_metro_NY, list_shop_NY, list_hospital_NY]
    # each frame is indexed by City, so merge on the index
    metric_merged = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True, how='outer'), all_list_metrics)
    return metric_merged
# Function to build the merged metrics table per county (simulated Foursquare metrics; this definition overrides the API-based version above)
def get_merge(geo_frame):
    n_list = geo_frame.shape[0]
    n_variable = geo_frame.shape[1] + 1
    NY_city = geo_frame[['County']]
    Likes_metrics = np.random.randint(low=2, high=100, size=(n_list, n_variable), dtype='int')
    Likes_table = pd.DataFrame(Likes_metrics, columns=['Likes_Resto', 'Likes_Schools', 'Likes_Bus', 'Likes_Metro', 'Likes_Shopping', 'Likes_Hospital'])
    Rating_metrics = np.random.uniform(low=2, high=10, size=(n_list, n_variable)).round(1)
    Rating_table = pd.DataFrame(Rating_metrics, columns=['Rating_Resto', 'Rating_Schools', 'Rating_Bus', 'Rating_Metro', 'Rating_Shopping', 'Rating_Hospital'])
    Tips_metrics = np.random.randint(low=2, high=100, size=(n_list, n_variable), dtype='int')
    Tips_table = pd.DataFrame(Tips_metrics, columns=['Tips_Resto', 'Tips_Schools', 'Tips_Bus', 'Tips_Metro', 'Tips_Shopping', 'Tips_Hospital'])
    return pd.concat([NY_city, Likes_table, Rating_table, Tips_table], axis=1)
As Foursquare information and metrics, we consider the following:
metric_merged = get_merge(NY_table)
metric_merged.head(20)
metric_merged.shape
Next, we import updated statistical data on the COVID-19 situation in each county.
Data will be obtained by web scraping the Wikipedia article on the COVID-19 pandemic in New York (state); the URL is given in the code below:
# Retrieve the URL and create a BeautifulSoup object
url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_in_New_York_(state)"
# Get the contents of the webpage in text format and store in a variable called data
data = requests.get(url).text
# Create the dataframe with columns for the September statistics: county, cases, deaths, recoveries, population, cases and deaths per 100k, and deaths-to-cases ratio
Stat_table_sep = pd.DataFrame(columns=['County','Cases_sep','Deaths_sep','Recov_sep','Pop_sep','Cases_100k_sep','Deaths_100k_sep','Ratio_Deaths_Cases_sep'])
# Create a BeautifulSoup object using the BeautifulSoup constructor
soup = BeautifulSoup(data, 'lxml')
# Find all html tables in the web page
table = soup.findAll('table',{'class':'wikitable plainrowheaders sortable'})
row_data_sep = [tr.text.strip().replace(',', '').replace('\n\n',';') for tr in table[0].find_all('tr')]
for row in row_data_sep[2:64]:
    if len(row) < 2:
        pass
    else:
        County = row.split(';')[0]
        Cases = row.split(';')[1]
        Deaths = row.split(';')[2]
        Recov = row.split(';')[3]
        Pop = row.split(';')[4]
        Cases_100k = row.split(';')[5]
        Deaths_100k = row.split(';')[6]
        Ratio_Deaths_Cases = row.split(';')[7]
        Stat_table_sep = Stat_table_sep.append({"County": County,
                                                "Cases_sep": float(Cases.replace(',', '')),
                                                "Deaths_sep": float(Deaths.replace(',', '')),
                                                "Recov_sep": float(Recov.replace('–', '36').replace(',', '')),  # '–' marks missing values in the source table
                                                "Pop_sep": float(Pop.replace(',', '')),
                                                "Cases_100k_sep": float(Cases_100k.replace(',', '')),
                                                "Deaths_100k_sep": float(Deaths_100k.replace(',', '')),
                                                "Ratio_Deaths_Cases_sep": float(Ratio_Deaths_Cases.replace(',', ''))}, ignore_index=True)
Stat_table_sep
Data is publicly reported by the New York State Department of Health, updated on September 2, 2020:
Stat_table_may = pd.DataFrame(columns=['County','Cases_may','Deaths_may','Cases_mil_may','Deaths_mil_may','Ratio_Deaths_case_may','Area_km2','Case_dens_km2_may','Pop_dens_km2_may'])
row_data_may = [tr.text.strip().replace('\n\n',';').replace(' County', '').replace(' (Brooklyn)', '') for tr in table[1].find_all('tr')]
for row in row_data_may[2:64]:
    if len(row) < 2:
        pass
    else:
        County = row.split(';')[0]
        Cases_may = row.split(';')[1]
        Deaths_may = row.split(';')[2]
        Cases_mil_may = row.split(';')[3]
        Deaths_mil_may = row.split(';')[4]
        Ratio_Deaths_case_may = row.split(';')[5]
        Area_km2 = row.split(';')[13]
        Case_dens_km2_may = row.split(';')[11]
        Pop_dens_km2_may = row.split(';')[12]
        Stat_table_may = Stat_table_may.append({'County': County.replace('New York (Manhattan)', 'Manhattan'),
                                                'Cases_may': float(Cases_may.replace(',', '')),
                                                'Deaths_may': float(Deaths_may.replace(',', '')),
                                                'Cases_mil_may': float(Cases_mil_may.replace(',', '')),
                                                'Deaths_mil_may': float(Deaths_mil_may.replace(',', '')),
                                                'Ratio_Deaths_case_may': float(Ratio_Deaths_case_may.replace(',', '')),
                                                'Area_km2': float(Area_km2.replace(',', '')),
                                                'Case_dens_km2_may': float(Case_dens_km2_may.replace(',', '')),
                                                'Pop_dens_km2_may': float(Pop_dens_km2_may.replace(',', ''))}, ignore_index=True)
Stat_table_may
# Min-max normalization of the selected features ('-' placeholders are replaced by the column mean)
def normalize(df, feature):
    result = df.copy()
    for feature_name in feature:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        mean_value = df[feature_name].mean()
        result[feature_name] = (df[feature_name].replace('-', mean_value) - min_value) / (max_value - min_value)
    return result
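For intuition, min-max scaling maps the smallest value of a column to 0 and the largest to 1. A toy example on made-up values (not from the dataset):
# Illustrative example of min-max scaling on made-up values
toy = pd.DataFrame({'Cases_may': [10.0, 55.0, 100.0]})
print(normalize(toy, ['Cases_may']))  # -> 0.0, 0.5, 1.0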
Firstmerge = pd.merge(NY_table,metric_merged,on='County',how='left')
Secondmerge = pd.merge(Firstmerge,Stat_table_sep,on='County',how='left')
Lastmerge = pd.merge(Secondmerge,Stat_table_may,on='County',how='left')
feature_used = ['Cases_may','Deaths_may','Ratio_Deaths_case_may','Area_km2','Cases_sep','Deaths_sep','Recov_sep','Pop_sep','Ratio_Deaths_Cases_sep']
Lastmerge= normalize(Lastmerge,feature_used)
Lastmerge.shape
Lastmerge.head()
# Extract the data we're interested in
lat = Lastmerge['Latitude'].values.astype(float)
lon = Lastmerge['Longitude'].values.astype(float)
population = Lastmerge['Pop_dens_km2_may'].values  # population density per km2 (marker colour)
area = Lastmerge['Cases_mil_may'].values           # cases per million (marker size)
# 1. Draw the map background, centred on New York State
fig = plt.figure(figsize=(8, 8))
m = Basemap(projection='lcc', resolution='h', lat_0=43.0, lon_0=-75.5, width=1E6, height=1.2E6)
m.shadedrelief()
m.drawcoastlines(color='gray')
m.drawcountries(color='gray')
m.drawstates(color='gray')
# 2. Scatter county data, with colour reflecting population density
# and size reflecting cases per million
m.scatter(lon, lat, latlon=True, c=np.log10(population), s=area, cmap='Reds', alpha=0.5)
# 3. Create colorbar and legend
plt.colorbar(label=r'$\log_{10}({\rm population\ density})$')
plt.clim(3, 7)
# make legend with dummy points
for a in [100, 300, 500]:
    plt.scatter([], [], c='k', alpha=0.5, s=a, label=str(a) + ' cases per million')
plt.legend(scatterpoints=1, frameon=False, labelspacing=1, loc='lower left');
Lastmerge_sorted_1= Lastmerge.sort_values(["Likes_Resto", "Likes_Shopping"],ascending=False)
Lastmerge_sorted_1.plot(x="County", y=["Likes_Resto", "Likes_Shopping"], kind="bar",figsize=(17,9))
plt.title('Average number of likes for restaurants and shopping centers in each county', fontsize = 20)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.ylabel('Avg. number of likes', fontsize=15)
plt.show()
Lastmerge_sorted_2= Lastmerge.sort_values(["Cases_sep", "Cases_may"],ascending=False)
Lastmerge_sorted_2.plot(x="County", y=["Cases_sep", "Cases_may"], kind="bar",figsize=(17,9))
plt.title('Number of cases in May 2020 and in September 2020', fontsize = 20)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.ylabel('Number of cases', fontsize=15)
plt.show()
Lastmerge_sorted_2= Lastmerge.sort_values(["Deaths_sep", "Deaths_may"],ascending=False)
Lastmerge_sorted_2.plot(x="County", y=["Deaths_sep", "Deaths_may"], kind="bar",figsize=(17,9))
plt.title('Number of deaths in May 2020 and in September 2020', fontsize = 20)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.ylabel('Number of deaths', fontsize=15)
plt.show()
plt.figure(figsize=(22,16), dpi = 100)
plt.scatter('County', 'Case_dens_km2_may', s='Pop_dens_km2_may',alpha=1, data=Lastmerge)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(Lastmerge['County'], rotation='vertical')
plt.ylabel('Number Cases density per km2', fontsize=15)
plt.title("Bubble Plot of Cases density per km2 and population density per km2 for each county", size=22)
plt.figure(figsize=(22,16), dpi = 100)
plt.scatter('County', 'Cases_mil_may', s='Deaths_mil_may',alpha=1, data=Lastmerge)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(Lastmerge['County'], rotation='vertical')
plt.ylabel('Number Cases density per km2', fontsize=15)
plt.title("Bubble Plot of Cases density per 1000 and population density per 1000 for each county", size=20)
plt.figure(figsize=(22,16), dpi = 100)
plt.scatter('County', 'Cases_mil_may', s='Deaths_mil_may',c='Pop_dens_km2_may',alpha=1, data=Lastmerge)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(Lastmerge['County'], rotation='vertical')
plt.ylabel('Number Cases density per 1000', fontsize=15)
plt.title("Bubble Plot of Cases density per km2 and population density per km2 for each county", size=20)
We can build the K-Means model in Python using the KMeans algorithm provided by the scikit-learn package. The KMeans class has many parameters, but we will use these three: init (the centroid initialization method, here 'k-means++'), n_clusters (the number of clusters to form) and n_init (the number of times the algorithm is run with different centroid seeds).
feature_fit = ['Pop_dens_km2_may','Case_dens_km2_may','Deaths_mil_may','Cases_mil_may']
Lastmerge= normalize(Lastmerge,feature_fit)
Lastmerge.shape
X = Lastmerge.iloc[:, 5:]
X
clusters = 5
model = KMeans(init = 'k-means++',n_clusters = clusters, n_init = 12)
model.fit(X)
labels = model.labels_ +1
print(cl(labels[:100], attrs = ['bold']))
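The choice of 5 clusters is a modelling decision; as an optional check, an elbow curve computed on the same feature matrix X can help assess whether this value is reasonable:
# Optional sanity check (assumes X is the normalized feature matrix built above):
# plot the K-Means inertia for several values of k and look for an "elbow"
inertias = []
k_values = range(2, 11)
for k in k_values:
    km = KMeans(init='k-means++', n_clusters=k, n_init=12)
    km.fit(X)
    inertias.append(km.inertia_)
plt.figure(figsize=(8, 5))
plt.plot(list(k_values), inertias, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.title('Elbow curve for choosing k')
plt.show()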
# Function to scale selected features by a fixed factor (so that bubble sizes are visible in the plots)
def scalePercentage(df, feature):
    result = df.copy()
    for feature_name in feature:
        value = 300
        result[feature_name] = df[feature_name] * value
    return result
feature_scale = ['Deaths_mil_may','Cases_mil_may']
feature_final = ['County','Deaths_mil_may','Cases_mil_may', 'Pop_dens_km2_may','Cluster_class']
Lastmerge['Cluster_class'] = labels
inter_dataframe = Lastmerge[feature_final]
final_dataframe = scalePercentage(inter_dataframe,feature_scale)
final_dataframe.head()
final_dataframe.head(20)
plt.figure(figsize=(19,20))
plt.scatter('County', 'Cluster_class', s='Cases_mil_may', c='Deaths_mil_may', alpha=1, data=final_dataframe)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(final_dataframe['County'], rotation='vertical')
plt.ylabel('Cluster class', fontsize=15)
plt.title("Clusters by county (bubble size: cases per million, colour: deaths per million)", size=20)
Let's look at the distribution of counties based on their cases and deaths using a bubble plot, where the colour represents the cluster value.
plt.figure(figsize=(26,17))
plt.scatter('County', 'Cases_mil_may', s='Deaths_mil_may', c='Cluster_class', alpha=1, data=final_dataframe)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(final_dataframe['County'], rotation='vertical')
plt.ylabel('Cases per million (May)', fontsize=15)
plt.title("Clusters by cases and deaths per million", size=20)
We hope that this study will inspire readers to new reflections on the COVID-19 pandemic.