# Import required libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup  # this module helps with web scraping
import requests  # this module helps us download a web page
from functools import reduce  # used below to merge the metric datasets
from geopy.geocoders import Nominatim  # geocoding of addresses
# Matplotlib and associated plotting modules
from termcolor import colored as cl  # text customization
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
from mpl_toolkits.basemap import Basemap  # map backgrounds
# Clustering
from sklearn.cluster import KMeans
Import the variables that can influence the spread of the virus. The data will be obtained from the Opendatasoft US county boundaries dataset:
# Retrieve the CSV export of the dataset
url = "https://public.opendatasoft.com/explore/dataset/us-county-boundaries/download/?format=csv&disjunctive.statefp=true&disjunctive.countyfp=true&disjunctive.name=true&disjunctive.namelsad=true&disjunctive.stusab=true&disjunctive.state_name=true&refine.state_name=New+York&refine.statefp=36&timezone=Europe/Berlin&lang=fr&use_labels_for_header=true&csv_separator=%3B"
# Get the contents of the webpage in text format, split into rows, and store in a variable called data
data = requests.get(url).text.split('\r\n')
# Create the dataframe with five columns: 'State', 'County', 'Latitude', 'Longitude' and 'geometry'
rows = []
for row in data[1:]:
    if len(row) < 6:
        continue
    fields = row.split(';')
    County = fields[6]
    Latitude = fields[18]
    Longitude = fields[19]
    geometry = fields[1]
    # 'New York County' is relabelled 'Manhattan' to match the other data sources
    rows.append({'State': 'New York',
                 'County': County.replace('New York', 'Manhattan'),
                 'Latitude': Latitude,
                 'Longitude': Longitude,
                 'geometry': geometry})
NY_table = pd.DataFrame(rows, columns=['State', 'County', 'Latitude', 'Longitude', 'geometry'])
NY_table.head(10)
NY_table.shape
CLIENT_ID = 'YOUR_FOURSQUARE_CLIENT_ID'  # Foursquare ID (placeholder; substitute your own credentials)
CLIENT_SECRET = 'YOUR_FOURSQUARE_CLIENT_SECRET'  # Foursquare Secret (placeholder)
VERSION = '20180605' # Foursquare API version
print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET: ' + CLIENT_SECRET)
# Define a function to get the geographic coordinates of an address:
def geo_location(address):
    # get geo location of address
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    return latitude,longitude
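A quick usage sketch (the address string is illustrative; note that Nominatim returns None for addresses it cannot resolve, in which case geo_location would raise an AttributeError):
# Illustrative call; requires network access to the Nominatim service
latitude, longitude = geo_location('New York, NY')
print('Coordinates of New York, NY: {}, {}'.format(latitude, longitude))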
# Define a function to get the venues near a given coordinate:
def get_venues(lat,lng):    
    #set variables
    radius=1000
    LIMIT=100
   
    #url to fetch data from foursquare api
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
    
    # get all the data
    results = requests.get(url).json()
    venue_data=results["response"]['groups'][0]['items']
    venue_details=[]
    for row in venue_data:
        try:
            venue_id=row['venue']['id']
            venue_name=row['venue']['name']
            venue_category=row['venue']['categories'][0]['name']
            venue_details.append([venue_id,venue_name,venue_category])
        except KeyError:
            pass
        
    column_names=['ID','Name','Category']
    df = pd.DataFrame(venue_details,columns=column_names)
    return df
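A minimal usage sketch, assuming valid Foursquare credentials (the coordinates below are illustrative, roughly midtown Manhattan):
# Illustrative call: venues within 1 km of a sample point
sample_venues = get_venues(40.7549, -73.9840)
sample_venues.head()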
# Define a function to retrieve the ratings, likes and tips of a single venue.
# The original version queried the explore endpoint but parsed a venue-details
# response; it is fixed here to call the venue details endpoint by venue ID.
def get_venue_details(venue_id):
    #url to fetch the details of one venue from the foursquare api
    url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(
            venue_id,
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION)

    # get all the data
    results = requests.get(url).json()
    venue_data = results['response']['venue']
    venue_details = []
    try:
        venue_id = venue_data['id']
        venue_name = venue_data['name']
        venue_category = venue_data['categories'][0]['name']
        venue_likes = venue_data['likes']['count']
        venue_rating = venue_data['rating']
        venue_tips = venue_data['tips']['count']
        venue_details.append([venue_id, venue_name, venue_category,
                              venue_likes, venue_rating, venue_tips])
    except KeyError:
        pass

    column_names = ['ID', 'Name', 'Categories', 'Likes', 'Rating', 'Tips']
    df = pd.DataFrame(venue_details, columns=column_names)
    return df
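A usage sketch; 'SOME_VENUE_ID' is a placeholder, not a real ID (substitute an ID returned by get_venues):
# Illustrative call with a placeholder venue ID
venue_detail = get_venue_details('SOME_VENUE_ID')
venue_detail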
# Function to group and aggregate all metrics by city
def compute_metrics(data):
    grouped_metrics = data.groupby('City').agg(
            Likes_metrics=('Likes', 'mean'),    # mean of the Likes column for each group
            Rating_metrics=('Rating', 'mean'),  # mean of the Rating column for each group
            Tips_metrics=('Tips', 'mean')       # mean of the Tips column for each group
        )
    return grouped_metrics
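To illustrate what compute_metrics returns, a toy example with invented values:
# Toy frame: two venues in city A, one in city B (values invented for the example)
toy = pd.DataFrame({'City': ['A', 'A', 'B'],
                    'Likes': [10, 20, 30],
                    'Rating': [7.0, 9.0, 8.0],
                    'Tips': [1, 3, 5]})
compute_metrics(toy)  # A -> Likes 15.0, Rating 8.0, Tips 2.0; B -> 30.0, 8.0, 5.0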
# Function to collect venue metrics of a given type for each county
def get_rest_detail_dataframe(geo_frame, venue_type):
    column_names = ['State', 'City', 'ID', 'Name', 'Categories', 'Likes', 'Rating', 'Tips']
    detail_rows = []
    for row in geo_frame[['State', 'County', 'Latitude', 'Longitude']].values.tolist():
        State, County, Latitude, Longitude = row
        # venues of the requested type near the county coordinates
        venues = get_venues(Latitude, Longitude)
        matching = venues[venues['Category'] == venue_type]
        for venue_id in matching['ID'].values.tolist():
            details = get_venue_details(venue_id)
            for _, name, categories, likes, rating, tips in details.values.tolist():
                detail_rows.append({'State': State,
                                    'City': County,
                                    'ID': venue_id,
                                    'Name': name,
                                    'Categories': categories,
                                    'Likes': likes,
                                    'Rating': rating,
                                    'Tips': tips})
    list_venues_data = pd.DataFrame(detail_rows, columns=column_names)
    return compute_metrics(list_venues_data)
# Function to merge all the metric datasets
def get_merge(NY_table):
    # All restaurants in NY State
    list_resto_NY = get_rest_detail_dataframe(NY_table, 'Restaurant')
    # All schools in NY State
    list_school_NY = get_rest_detail_dataframe(NY_table, 'Schools')
    # All bus stops in NY State
    list_bus_NY = get_rest_detail_dataframe(NY_table, 'Bus Stops')
    # All metro stations in NY State
    list_metro_NY = get_rest_detail_dataframe(NY_table, 'Metro Stations')
    # All shopping centers in NY State
    list_shop_NY = get_rest_detail_dataframe(NY_table, 'Shopping Centers')
    # All hospitals in NY State
    list_hospital_NY = get_rest_detail_dataframe(NY_table, 'Hospital')
    all_list_metrics = [list_resto_NY, list_school_NY, list_bus_NY,
                        list_metro_NY, list_shop_NY, list_hospital_NY]
    # The metric frames are indexed by 'City', so merge them on their index
    metric_merged = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True, how='outer'), all_list_metrics)
    return metric_merged
      
# Mock version of get_merge: generates random placeholder metrics for each county
# instead of calling the Foursquare API (this definition overrides the one above)
def get_merge(geo_frame):
    n_list = geo_frame.shape[0]
    n_variable = geo_frame.shape[1] + 1  # NY_table has 5 columns, giving the 6 metric columns per table

    NY_city = geo_frame[['County']]
    Likes_metrics = np.random.randint(low=2, high=100, size=(n_list, n_variable), dtype='int')
    Likes_table = pd.DataFrame(Likes_metrics, columns=['Likes_Resto', 'Likes_Schools', 'Likes_Bus', 'Likes_Metro', 'Likes_Shopping', 'Likes_Hospital'])
    Rating_metrics = np.random.uniform(low=2, high=10, size=(n_list, n_variable)).round(1)
    Rating_table = pd.DataFrame(Rating_metrics, columns=['Rating_Resto', 'Rating_Schools', 'Rating_Bus', 'Rating_Metro', 'Rating_Shopping', 'Rating_Hospital'])
    Tips_metrics = np.random.randint(low=2, high=100, size=(n_list, n_variable), dtype='int')
    Tips_table = pd.DataFrame(Tips_metrics, columns=['Tips_Resto', 'Tips_Schools', 'Tips_Bus', 'Tips_Metro', 'Tips_Shopping', 'Tips_Hospital'])
    return pd.concat([NY_city, Likes_table, Rating_table, Tips_table], axis=1)
    
As Foursquare information and metrics, we consider the following:
metric_merged =  get_merge(NY_table)
metric_merged.head(20)
metric_merged.shape
Import updated statistical data on the COVID-19 situation in each county:
Data will be obtained by web scraping; the dataset is the Wikipedia page below:
# Retrieve the URL of the Wikipedia page on the COVID-19 pandemic in New York State
url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_in_New_York_(state)"
# Get the contents of the webpage in text format and store in a variable called data
data = requests.get(url).text
# Create the dataframe with eight columns for the September statistics
Stat_table_sep = pd.DataFrame(columns=['County','Cases_sep','Deaths_sep','Recov_sep','Pop_sep','Cases_100k_sep','Deaths_100k_sep','Ratio_Deaths_Cases_sep'])
# Create a BeautifulSoup object using the BeautifulSoup constructor
soup = BeautifulSoup(data, 'lxml') 
# Find all html tables in the web page
table = soup.findAll('table',{'class':'wikitable plainrowheaders sortable'})
row_data_sep = [tr.text.strip().replace(',', '').replace('\n\n', ';') for tr in table[0].find_all('tr')]
rows_sep = []
for row in row_data_sep[2:64]:
    if len(row) < 2:
        continue
    fields = row.split(';')
    County, Cases, Deaths, Recov, Pop, Cases_100k, Deaths_100k, Ratio_Deaths_Cases = fields[:8]
    rows_sep.append({"County": County,
                     "Cases_sep": float(Cases),
                     "Deaths_sep": float(Deaths),
                     "Recov_sep": float(Recov.replace('–', 'nan')),  # the en dash marks missing values
                     "Pop_sep": float(Pop),
                     "Cases_100k_sep": float(Cases_100k),
                     "Deaths_100k_sep": float(Deaths_100k),
                     "Ratio_Deaths_Cases_sep": float(Ratio_Deaths_Cases)})
Stat_table_sep = pd.DataFrame(rows_sep, columns=Stat_table_sep.columns)
                
Stat_table_sep
The data is publicly reported by the New York State Department of Health, updated on September 2, 2020:
Stat_table_may = pd.DataFrame(columns=['County','Cases_may','Deaths_may','Cases_mil_may','Deaths_mil_may','Ratio_Deaths_case_may','Area_km2','Case_dens_km2_may','Pop_dens_km2_may'])
row_data_may = [tr.text.strip().replace('\n\n', ';').replace(' County', '').replace(' (Brooklyn)', '') for tr in table[1].find_all('tr')]
rows_may = []
for row in row_data_may[2:64]:
    if len(row) < 2:
        continue
    fields = row.split(';')
    rows_may.append({'County': fields[0].replace('New York (Manhattan)', 'Manhattan'),
                     'Cases_may': float(fields[1].replace(',', '')),
                     'Deaths_may': float(fields[2].replace(',', '')),
                     'Cases_mil_may': float(fields[3].replace(',', '')),
                     'Deaths_mil_may': float(fields[4].replace(',', '')),
                     'Ratio_Deaths_case_may': float(fields[5].replace(',', '')),
                     'Area_km2': float(fields[13].replace(',', '')),
                     'Case_dens_km2_may': float(fields[11].replace(',', '')),
                     'Pop_dens_km2_may': float(fields[12].replace(',', ''))})
Stat_table_may = pd.DataFrame(rows_may, columns=Stat_table_may.columns)
Stat_table_may 
# Min-max normalization of the selected feature columns
def normalize(df, feature):
    result = df.copy()
    for feature_name in feature:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        mean_value = df[feature_name].mean()
        # '-' entries mark missing values and are imputed with the column mean
        result[feature_name] = (df[feature_name].replace('-', mean_value) - min_value) / (max_value - min_value)
    return result
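A toy demonstration of the min-max scaling (values invented for the example):
# Each column is mapped linearly onto [0, 1]
toy_norm = pd.DataFrame({'x': [2.0, 4.0, 6.0]})
normalize(toy_norm, ['x'])  # x becomes 0.0, 0.5, 1.0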
# Merge the geo table, the Foursquare metrics, and the two COVID tables on 'County'
Firstmerge = pd.merge(NY_table, metric_merged, on='County', how='left')
Secondmerge = pd.merge(Firstmerge, Stat_table_sep, on='County', how='left')
Lastmerge = pd.merge(Secondmerge, Stat_table_may, on='County', how='left')
# Normalize the epidemiological features
feature_used = ['Cases_may','Deaths_may','Ratio_Deaths_case_may','Area_km2','Cases_sep','Deaths_sep','Recov_sep','Pop_sep','Ratio_Deaths_Cases_sep']
Lastmerge = normalize(Lastmerge, feature_used)
Lastmerge.shape
Lastmerge.head()
# Extract the data we're interested in (the coordinates were scraped as strings)
lat = Lastmerge['Latitude'].astype(float).values
lon = Lastmerge['Longitude'].astype(float).values
population = Lastmerge['Pop_dens_km2_may'].values
area = Lastmerge['Cases_mil_may'].values
# 1. Draw the map background, centered on New York State
fig = plt.figure(figsize=(8, 8))
m = Basemap(projection='lcc', resolution='h', lat_0=43.0, lon_0=-75.5, width=1E6, height=1.2E6)
m.shadedrelief()
m.drawcoastlines(color='gray')
m.drawcountries(color='gray')
m.drawstates(color='gray')
# 2. Scatter county data, with color reflecting population density
# and size reflecting cases per million
m.scatter(lon, lat, latlon=True, c=np.log10(population), s=area, cmap='Reds', alpha=0.5)
# 3. Create colorbar and legend
plt.colorbar(label=r'$\log_{10}({\rm population\ density})$')
plt.clim(3, 7)
# make legend with dummy points
for a in [100, 300, 500]:
    plt.scatter([], [], c='k', alpha=0.5, s=a, label=str(a) + ' cases per million')
plt.legend(scatterpoints=1, frameon=False, labelspacing=1, loc='lower left');
Lastmerge_sorted_1= Lastmerge.sort_values(["Likes_Resto", "Likes_Shopping"],ascending=False)
Lastmerge_sorted_1.plot(x="County", y=["Likes_Resto", "Likes_Shopping"], kind="bar",figsize=(17,9))
plt.title('Average number of likes for restaurants and shopping centers per county', fontsize=20)
plt.xlabel('62 counties in New York', fontsize=15)
plt.ylabel('Avg. number of likes', fontsize=15)
plt.show()
Lastmerge_sorted_2= Lastmerge.sort_values(["Cases_sep", "Cases_may"],ascending=False)
Lastmerge_sorted_2.plot(x="County", y=["Cases_sep", "Cases_may"], kind="bar",figsize=(17,9))
plt.title('Number of cases in May 2020 and in September 2020', fontsize=20)
plt.xlabel('62 counties in New York', fontsize=15)
plt.ylabel('Number of cases', fontsize=15)
plt.show()
Lastmerge_sorted_2= Lastmerge.sort_values(["Deaths_sep", "Deaths_may"],ascending=False)
Lastmerge_sorted_2.plot(x="County", y=["Deaths_sep", "Deaths_may"], kind="bar",figsize=(17,9))
plt.title('Number of deaths in May 2020 and in September 2020', fontsize=20)
plt.xlabel('62 counties in New York', fontsize=15)
plt.ylabel('Number of deaths', fontsize=15)
plt.show()
plt.figure(figsize=(22,16), dpi = 100)
plt.scatter('County', 'Case_dens_km2_may', s='Pop_dens_km2_may',alpha=1, data=Lastmerge) 
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(Lastmerge['County'], rotation='vertical')
plt.ylabel('Case density per km2', fontsize=15)
plt.title("Bubble plot of case density per km2 (bubble size: population density per km2) for each county", size=22)
plt.figure(figsize=(22,16), dpi = 100)
plt.scatter('County', 'Cases_mil_may', s='Deaths_mil_may',alpha=1, data=Lastmerge)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(Lastmerge['County'], rotation='vertical')
plt.ylabel('Cases per million', fontsize=15)
plt.title("Bubble plot of cases per million (bubble size: deaths per million) for each county", size=20)
plt.figure(figsize=(22,16), dpi = 100)
plt.scatter('County', 'Cases_mil_may', s='Deaths_mil_may',c='Pop_dens_km2_may',alpha=1, data=Lastmerge)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(Lastmerge['County'], rotation='vertical')
plt.ylabel('Cases per million', fontsize=15)
plt.title("Bubble plot of cases per million (size: deaths per million, color: population density per km2) for each county", size=20)
We can build the K-Means model in Python using the KMeans class provided by scikit-learn. The class has many parameters, but we will use three of them: init, n_clusters and n_init:
feature_fit = ['Pop_dens_km2_may','Case_dens_km2_may','Deaths_mil_may','Cases_mil_may']
Lastmerge= normalize(Lastmerge,feature_fit)
Lastmerge.shape
X = Lastmerge.iloc[:, 5:]  # drop the identifier and geometry columns, keep the numeric metrics
X
clusters = 5
model = KMeans(init = 'k-means++',n_clusters = clusters, n_init = 12)
model.fit(X)
labels = model.labels_ + 1  # shift cluster labels so they start at 1
print(cl(labels[:100], attrs = ['bold']))
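The choice of five clusters is not derived above; as a hedged sanity check, one could inspect the model inertia over a range of k values (the classic elbow heuristic):
# Elbow heuristic (illustrative; not part of the original analysis)
inertias = []
for k in range(2, 11):
    km = KMeans(init='k-means++', n_clusters=k, n_init=12).fit(X)
    inertias.append(km.inertia_)
plt.plot(range(2, 11), inertias, marker='o')
plt.xlabel('Number of clusters k', fontsize=15)
plt.ylabel('Inertia', fontsize=15)
plt.show()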
# Scale the selected features by a constant factor so the bubbles are visible
def scalePercentage(df, feature):
    result = df.copy()
    scale_factor = 300
    for feature_name in feature:
        result[feature_name] = df[feature_name] * scale_factor
    return result
feature_scale = ['Deaths_mil_may','Cases_mil_may']
feature_final = ['County','Deaths_mil_may','Cases_mil_may', 'Pop_dens_km2_may','Cluster_class']
Lastmerge['Cluster_class'] = labels
inter_dataframe = Lastmerge[feature_final]
final_dataframe = scalePercentage(inter_dataframe,feature_scale)
final_dataframe.head(20)
plt.figure(figsize=(19,20))
plt.scatter('County', 'Cluster_class', s='Cases_mil_may', c='Deaths_mil_may', alpha=1, data=final_dataframe)
plt.xlabel('62 counties in New York', fontsize=15)
plt.xticks(final_dataframe['County'], rotation='vertical')
plt.ylabel('Cluster', fontsize=15)
plt.title("Clustering", size=20)
Let's look at the distribution of the clusters based on their cases and deaths using a bubble plot, where the color represents the cluster value.
plt.figure(figsize=(26,17))
plt.scatter('County', 'Cases_mil_may', s='Deaths_mil_may', c='Cluster_class', alpha=1, data=final_dataframe)
plt.xlabel('62 counties in New York', fontsize=15)
plt.xticks(final_dataframe['County'], rotation='vertical')
plt.ylabel('Cases per million', fontsize=15)
plt.title("Clustering", size=20)
We hope that this study inspires readers to new reflections on the COVID-19 pandemic.