# Import required libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup  # this module helps with web scraping
import requests  # this module helps us download a web page
from functools import reduce  # used to merge the metric dataframes
from geopy.geocoders import Nominatim  # converts an address into latitude/longitude
# Matplotlib and associated plotting modules
from termcolor import colored as cl  # text customization
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
from mpl_toolkits.basemap import Basemap  # map background for the geographic scatter plot
# Clustering
from sklearn.cluster import KMeans
First, we import the county-level geographic variables that can influence how fast the virus spreads.
Data will be obtained from the Opendatasoft "US County Boundaries" dataset (filtered to New York State) via the CSV export below:
# Retrieve the URL and download the CSV contents
url = "https://public.opendatasoft.com/explore/dataset/us-county-boundaries/download/?format=csv&disjunctive.statefp=true&disjunctive.countyfp=true&disjunctive.name=true&disjunctive.namelsad=true&disjunctive.stusab=true&disjunctive.state_name=true&refine.state_name=New+York&refine.statefp=36&timezone=Europe/Berlin&lang=fr&use_labels_for_header=true&csv_separator=%3B"
# Get the contents of the page as text and split it into rows, stored in a variable called data
data = requests.get(url).text.split('\r\n')
# Create the dataframe with five columns: 'State', 'County', 'Latitude', 'Longitude' and 'geometry'
NY_table = pd.DataFrame(columns=['State','County','Latitude','Longitude','geometry'])
for row in data[1:]:
    if len(row) < 6:
        pass
    else:
        State = row.split(';')[0]
        County = row.split(';')[6]
        Latitude = row.split(';')[18]
        Longitude = row.split(';')[19]
        geometry = row.split(';')[1]
        NY_table = NY_table.append({"State": 'New York',
                                    "County": County.replace('New York', 'Manhattan'),
                                    "Latitude": Latitude,
                                    "Longitude": Longitude,
                                    "geometry": geometry}, ignore_index=True)
NY_table.head(10)
NY_table.shape
CLIENT_ID = '0VR1AJZLYJCZNFLJDIADATXW53FYRD32YIN0VCO3P2BOKJR4' # Foursquare ID
CLIENT_SECRET = 'EDC3DCINXRN50O5UYHGBBLYTT55RH1UU05Z2GJMKFZVV2T2I' # Foursquare Secret
VERSION = '20180605' # Foursquare API version
print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
# Define a function to get the geographic coordinates of an address:
def geo_location(address):
    # get the geo location of the address
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    return latitude, longitude
# Define a function for getting the venues around a location:
def get_venues(lat, lng):
    # set variables
    radius = 1000
    LIMIT = 100
    # url to fetch data from the Foursquare API
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        lng,
        radius,
        LIMIT)
    # get all the data
    results = requests.get(url).json()
    venue_data = results["response"]['groups'][0]['items']
    venue_details = []
    for row in venue_data:
        try:
            venue_id = row['venue']['id']
            venue_name = row['venue']['name']
            venue_category = row['venue']['categories'][0]['name']
            venue_details.append([venue_id, venue_name, venue_category])
        except KeyError:
            pass
    column_names = ['ID', 'Name', 'Category']
    df = pd.DataFrame(venue_details, columns=column_names)
    return df
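As a quick illustration, geo_location and get_venues might be combined for a single county as sketched below; the address string is only an assumed example and the call requires valid Foursquare credentials:
# Hypothetical usage sketch: geocode one county seat and list nearby venues
example_lat, example_lng = geo_location('Manhattan, New York, NY')  # assumed example address
manhattan_venues = get_venues(example_lat, example_lng)
manhattan_venues.head()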
# Define a function to retrieve ratings, likes and tips of a venue
def get_venue_details(lat, lng):
    # set variables
    radius = 1000
    LIMIT = 100
    # url to fetch data from the Foursquare API
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        lng,
        radius,
        LIMIT)
    # get all the data
    results = requests.get(url).json()
    # note: the 'venue' field is returned by the Foursquare venue-details endpoint, not by 'explore'
    venue_data = results['response']['venue']
    venue_details = []
    try:
        venue_id = venue_data['id']
        venue_name = venue_data['name']
        venue_category = venue_data['categories'][0]['name']
        venue_likes = venue_data['likes']['count']
        venue_rating = venue_data['rating']
        venue_tips = venue_data['tips']['count']
        venue_details.append([venue_id, venue_name, venue_category, venue_likes, venue_rating, venue_tips])
    except KeyError:
        pass
    column_names = ['ID', 'Name', 'Categories', 'Likes', 'Rating', 'Tips']
    df = pd.DataFrame(venue_details, columns=column_names)
    return df
# Function to group and aggregate all metrics by city
def compute_metrics(data):
    grouped_metrics = data.groupby('City').agg(
        Likes_metrics=('Likes', 'mean'),    # mean of the Likes column for each group
        Rating_metrics=('Rating', 'mean'),  # mean of the Rating column for each group
        Tips_metrics=('Tips', 'mean')       # mean of the Tips column for each group
    )
    return grouped_metrics
# Function to extract venue details of a given category for each county
def get_rest_detail_dataframe(geo_frame, type_resto):
    column_names = ['State', 'City', 'ID', 'County', 'Categories', 'Likes', 'Rating', 'Tips']
    list_venus_data = pd.DataFrame(columns=column_names)
    for row in geo_frame.values.tolist():
        State, City, Latitude, Longitude, geometry = row
        venues = get_venue_details(Latitude, Longitude)
        list_venus = venues[venues['Categories'] == type_resto]
        for venus_detail in list_venus.values.tolist():
            id, name, categories, likes, rating, tips = venus_detail
            list_venus_data = list_venus_data.append({'State': State,
                                                      'City': City,
                                                      'ID': id,
                                                      'County': name,
                                                      'Categories': categories,
                                                      'Likes': likes,
                                                      'Rating': rating,
                                                      'Tips': tips,
                                                      }, ignore_index=True)
    return compute_metrics(list_venus_data)
# Function to merge all the metric datasets (overridden below by a simulated-metrics version)
def get_merge(NY_table):
    # Get all restaurants in NY State
    list_resto_NY = get_rest_detail_dataframe(NY_table, 'Restaurant')
    # Get all schools in NY State
    list_school_NY = get_rest_detail_dataframe(NY_table, 'Schools')
    # Get all bus stops in NY State
    list_bus_NY = get_rest_detail_dataframe(NY_table, 'Bus Stops')
    # Get all metro stations in NY State
    list_metro_NY = get_rest_detail_dataframe(NY_table, 'Metro Stations')
    # Get all shopping centers in NY State
    list_shop_NY = get_rest_detail_dataframe(NY_table, 'Shopping Centers')
    # Get all hospitals in NY State
    list_hospital_NY = get_rest_detail_dataframe(NY_table, 'Hospital')
    all_list_metrics = [list_resto_NY, list_school_NY, list_bus_NY, list_metro_NY, list_shop_NY, list_hospital_NY]
    # each frame is indexed by City, so merge on the index
    metric_merged = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True, how='outer'), all_list_metrics)
    return metric_merged
# Function to build the merged metrics table per county (simulated Foursquare metrics; this definition overrides the API-based version above)
def get_merge(geo_frame):
    n_list = geo_frame.shape[0]
    n_variable = geo_frame.shape[1] + 1
    NY_city = geo_frame[['County']]
    Likes_metrics = np.random.randint(low=2, high=100, size=(n_list, n_variable), dtype='int')
    Likes_table = pd.DataFrame(Likes_metrics, columns=['Likes_Resto', 'Likes_Schools', 'Likes_Bus', 'Likes_Metro', 'Likes_Shopping', 'Likes_Hospital'])
    Rating_metrics = np.random.uniform(low=2, high=10, size=(n_list, n_variable)).round(1)
    Rating_table = pd.DataFrame(Rating_metrics, columns=['Rating_Resto', 'Rating_Schools', 'Rating_Bus', 'Rating_Metro', 'Rating_Shopping', 'Rating_Hospital'])
    Tips_metrics = np.random.randint(low=2, high=100, size=(n_list, n_variable), dtype='int')
    Tips_table = pd.DataFrame(Tips_metrics, columns=['Tips_Resto', 'Tips_Schools', 'Tips_Bus', 'Tips_Metro', 'Tips_Shopping', 'Tips_Hospital'])
    return pd.concat([NY_city, Likes_table, Rating_table, Tips_table], axis=1)
As Foursquare information and metrics, we consider the following:
metric_merged = get_merge(NY_table)
metric_merged.head(20)
metric_merged.shape
Next, we import updated statistical data on the COVID-19 situation in each county.
Data will be obtained by web scraping the Wikipedia article on the COVID-19 pandemic in New York (state); the URL is given in the code below:
# Retrieve the URL and create a BeautifulSoup object
url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_in_New_York_(state)"
# Get the contents of the webpage in text format and store in a variable called data
data = requests.get(url).text
# Create the dataframe with columns for the September statistics: county, cases, deaths, recoveries, population, cases and deaths per 100k, and deaths-to-cases ratio
Stat_table_sep = pd.DataFrame(columns=['County','Cases_sep','Deaths_sep','Recov_sep','Pop_sep','Cases_100k_sep','Deaths_100k_sep','Ratio_Deaths_Cases_sep'])
# Create a BeautifulSoup object using the BeautifulSoup constructor
soup = BeautifulSoup(data, 'lxml')
# Find all html tables in the web page
table = soup.findAll('table',{'class':'wikitable plainrowheaders sortable'})
row_data_sep = [tr.text.strip().replace(',', '').replace('\n\n',';') for tr in table[0].find_all('tr')]
for row in row_data_sep[2:64]:
    if len(row) < 2:
        pass
    else:
        County = row.split(';')[0]
        Cases = row.split(';')[1]
        Deaths = row.split(';')[2]
        Recov = row.split(';')[3]
        Pop = row.split(';')[4]
        Cases_100k = row.split(';')[5]
        Deaths_100k = row.split(';')[6]
        Ratio_Deaths_Cases = row.split(';')[7]
        Stat_table_sep = Stat_table_sep.append({"County": County,
                                                "Cases_sep": float(Cases.replace(',', '')),
                                                "Deaths_sep": float(Deaths.replace(',', '')),
                                                "Recov_sep": float(Recov.replace('–', '36').replace(',', '')),  # '–' marks missing values in the source table
                                                "Pop_sep": float(Pop.replace(',', '')),
                                                "Cases_100k_sep": float(Cases_100k.replace(',', '')),
                                                "Deaths_100k_sep": float(Deaths_100k.replace(',', '')),
                                                "Ratio_Deaths_Cases_sep": float(Ratio_Deaths_Cases.replace(',', ''))}, ignore_index=True)
Stat_table_sep
Data is publicly reported by the New York State Department of Health, updated on September 2, 2020:
Stat_table_may = pd.DataFrame(columns=['County','Cases_may','Deaths_may','Cases_mil_may','Deaths_mil_may','Ratio_Deaths_case_may','Area_km2','Case_dens_km2_may','Pop_dens_km2_may'])
row_data_may = [tr.text.strip().replace('\n\n',';').replace(' County', '').replace(' (Brooklyn)', '') for tr in table[1].find_all('tr')]
for row in row_data_may[2:64]:
    if len(row) < 2:
        pass
    else:
        County = row.split(';')[0]
        Cases_may = row.split(';')[1]
        Deaths_may = row.split(';')[2]
        Cases_mil_may = row.split(';')[3]
        Deaths_mil_may = row.split(';')[4]
        Ratio_Deaths_case_may = row.split(';')[5]
        Area_km2 = row.split(';')[13]
        Case_dens_km2_may = row.split(';')[11]
        Pop_dens_km2_may = row.split(';')[12]
        Stat_table_may = Stat_table_may.append({'County': County.replace('New York (Manhattan)', 'Manhattan'),
                                                'Cases_may': float(Cases_may.replace(',', '')),
                                                'Deaths_may': float(Deaths_may.replace(',', '')),
                                                'Cases_mil_may': float(Cases_mil_may.replace(',', '')),
                                                'Deaths_mil_may': float(Deaths_mil_may.replace(',', '')),
                                                'Ratio_Deaths_case_may': float(Ratio_Deaths_case_may.replace(',', '')),
                                                'Area_km2': float(Area_km2.replace(',', '')),
                                                'Case_dens_km2_may': float(Case_dens_km2_may.replace(',', '')),
                                                'Pop_dens_km2_may': float(Pop_dens_km2_may.replace(',', ''))}, ignore_index=True)
Stat_table_may
# Min-max normalization of the selected features ('-' placeholders are replaced by the column mean)
def normalize(df, feature):
    result = df.copy()
    for feature_name in feature:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        mean_value = df[feature_name].mean()
        result[feature_name] = (df[feature_name].replace('-', mean_value) - min_value) / (max_value - min_value)
    return result
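For intuition, min-max scaling maps the smallest value of a column to 0 and the largest to 1. A toy example on made-up values (not from the dataset):
# Illustrative example of min-max scaling on made-up values
toy = pd.DataFrame({'Cases_may': [10.0, 55.0, 100.0]})
print(normalize(toy, ['Cases_may']))  # -> 0.0, 0.5, 1.0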
Firstmerge = pd.merge(NY_table,metric_merged,on='County',how='left')
Secondmerge = pd.merge(Firstmerge,Stat_table_sep,on='County',how='left')
Lastmerge = pd.merge(Secondmerge,Stat_table_may,on='County',how='left')
feature_used = ['Cases_may','Deaths_may','Ratio_Deaths_case_may','Area_km2','Cases_sep','Deaths_sep','Recov_sep','Pop_sep','Ratio_Deaths_Cases_sep']
Lastmerge= normalize(Lastmerge,feature_used)
Lastmerge.shape
Lastmerge.head()
# Extract the data we're interested in
lat = Lastmerge['Latitude'].values.astype(float)
lon = Lastmerge['Longitude'].values.astype(float)
population = Lastmerge['Pop_dens_km2_may'].values  # population density per km2 (marker colour)
area = Lastmerge['Cases_mil_may'].values           # cases per million (marker size)
# 1. Draw the map background, centred on New York State
fig = plt.figure(figsize=(8, 8))
m = Basemap(projection='lcc', resolution='h', lat_0=43.0, lon_0=-75.5, width=1E6, height=1.2E6)
m.shadedrelief()
m.drawcoastlines(color='gray')
m.drawcountries(color='gray')
m.drawstates(color='gray')
# 2. Scatter county data, with colour reflecting population density
# and size reflecting cases per million
m.scatter(lon, lat, latlon=True, c=np.log10(population), s=area, cmap='Reds', alpha=0.5)
# 3. Create colorbar and legend
plt.colorbar(label=r'$\log_{10}({\rm population\ density})$')
plt.clim(3, 7)
# make legend with dummy points
for a in [100, 300, 500]:
    plt.scatter([], [], c='k', alpha=0.5, s=a, label=str(a) + ' cases per million')
plt.legend(scatterpoints=1, frameon=False, labelspacing=1, loc='lower left');
Lastmerge_sorted_1= Lastmerge.sort_values(["Likes_Resto", "Likes_Shopping"],ascending=False)
Lastmerge_sorted_1.plot(x="County", y=["Likes_Resto", "Likes_Shopping"], kind="bar",figsize=(17,9))
plt.title('Average number of likes for restaurants and shopping centers in each county', fontsize = 20)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.ylabel('Avg. number of likes', fontsize=15)
plt.show()
Lastmerge_sorted_2= Lastmerge.sort_values(["Cases_sep", "Cases_may"],ascending=False)
Lastmerge_sorted_2.plot(x="County", y=["Cases_sep", "Cases_may"], kind="bar",figsize=(17,9))
plt.title('Number of cases in May 2020 and in September 2020', fontsize = 20)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.ylabel('Number of cases', fontsize=15)
plt.show()
Lastmerge_sorted_2= Lastmerge.sort_values(["Deaths_sep", "Deaths_may"],ascending=False)
Lastmerge_sorted_2.plot(x="County", y=["Deaths_sep", "Deaths_may"], kind="bar",figsize=(17,9))
plt.title('Number of deaths in May 2020 and in September 2020', fontsize = 20)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.ylabel('Number of deaths', fontsize=15)
plt.show()
plt.figure(figsize=(22,16), dpi = 100)
plt.scatter('County', 'Case_dens_km2_may', s='Pop_dens_km2_may',alpha=1, data=Lastmerge)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(Lastmerge['County'], rotation='vertical')
plt.ylabel('Number Cases density per km2', fontsize=15)
plt.title("Bubble Plot of Cases density per km2 and population density per km2 for each county", size=22)
plt.figure(figsize=(22,16), dpi = 100)
plt.scatter('County', 'Cases_mil_may', s='Deaths_mil_may',alpha=1, data=Lastmerge)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(Lastmerge['County'], rotation='vertical')
plt.ylabel('Number Cases density per km2', fontsize=15)
plt.title("Bubble Plot of Cases density per 1000 and population density per 1000 for each county", size=20)
plt.figure(figsize=(22,16), dpi = 100)
plt.scatter('County', 'Cases_mil_may', s='Deaths_mil_may',c='Pop_dens_km2_may',alpha=1, data=Lastmerge)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(Lastmerge['County'], rotation='vertical')
plt.ylabel('Number Cases density per 1000', fontsize=15)
plt.title("Bubble Plot of Cases density per km2 and population density per km2 for each county", size=20)
We can build the K-Means model in Python using the KMeans algorithm provided by the scikit-learn package. The KMeans class has many parameters, but we will use these three: init (the centroid initialization method, here 'k-means++'), n_clusters (the number of clusters to form) and n_init (the number of times the algorithm is run with different centroid seeds).
feature_fit = ['Pop_dens_km2_may','Case_dens_km2_may','Deaths_mil_may','Cases_mil_may']
Lastmerge= normalize(Lastmerge,feature_fit)
Lastmerge.shape
X = Lastmerge.iloc[:, 5:]
X
clusters = 5
model = KMeans(init = 'k-means++',n_clusters = clusters, n_init = 12)
model.fit(X)
labels = model.labels_ +1
print(cl(labels[:100], attrs = ['bold']))
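The choice of 5 clusters is a modelling decision; as an optional check, an elbow curve computed on the same feature matrix X can help assess whether this value is reasonable:
# Optional sanity check (assumes X is the normalized feature matrix built above):
# plot the K-Means inertia for several values of k and look for an "elbow"
inertias = []
k_values = range(2, 11)
for k in k_values:
    km = KMeans(init='k-means++', n_clusters=k, n_init=12)
    km.fit(X)
    inertias.append(km.inertia_)
plt.figure(figsize=(8, 5))
plt.plot(list(k_values), inertias, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.title('Elbow curve for choosing k')
plt.show()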
# Function to scale selected features by a fixed factor (so that bubble sizes are visible in the plots)
def scalePercentage(df, feature):
    result = df.copy()
    for feature_name in feature:
        value = 300
        result[feature_name] = df[feature_name] * value
    return result
feature_scale = ['Deaths_mil_may','Cases_mil_may']
feature_final = ['County','Deaths_mil_may','Cases_mil_may', 'Pop_dens_km2_may','Cluster_class']
Lastmerge['Cluster_class'] = labels
inter_dataframe = Lastmerge[feature_final]
final_dataframe = scalePercentage(inter_dataframe,feature_scale)
final_dataframe.head()
final_dataframe.head(20)
plt.figure(figsize=(19,20))
plt.scatter('County', 'Cluster_class', s='Cases_mil_may', c='Deaths_mil_may', alpha=1, data=final_dataframe)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(final_dataframe['County'], rotation='vertical')
plt.ylabel('Cluster class', fontsize=15)
plt.title("Clusters by county (bubble size: cases per million, colour: deaths per million)", size=20)
Let's look at the distribution of counties based on their cases and deaths using a bubble plot, where the colour represents the cluster value.
plt.figure(figsize=(26,17))
plt.scatter('County', 'Cases_mil_may', s='Deaths_mil_may', c='Cluster_class', alpha=1, data=final_dataframe)
plt.xlabel('62 counties in New York', fontsize = 15)
plt.xticks(final_dataframe['County'], rotation='vertical')
plt.ylabel('Cases per million (May)', fontsize=15)
plt.title("Clusters by cases and deaths per million", size=20)
We hope that this study will inspire readers to new reflections on the COVID-19 pandemic.