# ---------------------------------------------------------
# IMPORT PACKAGES
# ---------------------------------------------------------
# Point SPARK_HOME at the local Spark installation (needed if we want to use files in HDFS)
import os
spark_path = "/usr/local/Cellar/spark-2.4.3-bin-hadoop2.7"
os.environ["SPARK_HOME"] = spark_path
import math
import pandas as pd
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
pd.plotting.register_matplotlib_converters()
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
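# Note: this wildcard import shadows Python's built-in min, max and round with
# their Spark column equivalents, which is what the column aggregations below rely on.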
# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# -----------------------------------------------------------------
# Set global font parameters for matplotlib
# -----------------------------------------------------------------
font = {'family' : 'Verdana','weight' : 15 ,'size' : 14}
plt.rc('font', **font)
# -------------------------------------------
# sparkSession
# -------------------------------------------
spark = SparkSession \
.builder \
.appName("Attrition") \
.config("spark.memory.fraction", 0.8) \
.config("spark.executor.memory", "18g") \
.config("spark.driver.memory", "18g")\
.config("spark.sql.shuffle.partitions" , "800") \
.getOrCreate()
sparkcontext = spark.sparkContext
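# Optional sanity check (a quick sketch, assuming the session started cleanly):
# confirm the version and that the requested configuration was picked up.
print("Spark version :", spark.version)
print("Shuffle partitions :", spark.conf.get("spark.sql.shuffle.partitions"))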
# -------------------------------------------
# Read and load Dataframe
# -------------------------------------------
# 1- AAPL pricing data
AAPL_path= "/tmp/incoming/AAPL_pricing/AAPL_prices_parquet"
AAPLRaw = spark.read.parquet(AAPL_path)
AAPLRaw.show(4)
AAPLRaw.agg(min(col("Date")), max(col("Date"))).show()
AAPL_stat_desc = AAPLRaw.describe()
for col_name in AAPL_stat_desc.columns[1:6]:
AAPL_stat_desc = AAPL_stat_desc.withColumn(col_name, round(col_name, 3))
AAPL_stat_desc.show()
# conversion to pandas dataframe
AAPLRaw_pd = AAPLRaw.toPandas()
AAPLRaw_pd.isna().any()
AAPLRaw_pd.info()
AAPLRaw_pd.reset_index(inplace=True)
AAPLRaw_pd.set_index("Date", inplace=True)
# plot the Apple Adj. Close Price History
plt.figure(figsize=(15, 8))
plt.plot(AAPLRaw_pd['Adj_Close'],label = 'APPL_adj')
plt.title('Apple Adj. Close Price History', pad =10, c = 'r', fontweight='bold')
plt.xlabel('Jan. 04, 2010 - Apr. 28, 2020', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Adj. Close Price USD ($)', fontsize=18 , labelpad=11, fontweight='bold')
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.legend(loc ='upper left', fontsize=15)
# Get the timeseries. This now returns a Pandas Series object indexed by date.
AAPL_Close = AAPLRaw_pd.loc[:, 'Close']
# Calculate the 15-, 30- and 100-day moving averages of the closing prices
short_rolling_AAPL = AAPL_Close.rolling(window=15).mean()
middle_rolling_AAPL = AAPL_Close.rolling(window=30).mean()
long_rolling_AAPL = AAPL_Close.rolling(window=100).mean()
# Plot everything by leveraging the very powerful matplotlib package
fig, ax1 = plt.subplots(figsize=(16,9))
ax1.plot(AAPL_Close.index, AAPL_Close, label='AAPL_Close')
ax1.plot(short_rolling_AAPL.index, short_rolling_AAPL, label='15 days rolling')
ax1.plot(middle_rolling_AAPL.index, middle_rolling_AAPL, label='30 days rolling')
ax1.plot(long_rolling_AAPL.index, long_rolling_AAPL, label='100 days rolling')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Apple Close Price History', pad =10, c = 'r', fontweight='bold')
ax1.set_xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Close price ($)', fontsize=18 , labelpad=11, fontweight='bold')
ax1.legend(loc ='upper left', fontsize=18)
# Get the AAPL adjusted-close timeseries. This returns a Pandas Series object indexed by date.
AAPL_Adj_Close = AAPLRaw_pd.loc[:, 'Adj_Close']
# Calculate the 10-, 30- and 100-day moving averages of the adjusted closing price
short_rolling_AAPL_Adj = AAPL_Adj_Close.rolling(window=10).mean()
middle_rolling_AAPL_Adj = AAPL_Adj_Close.rolling(window=30).mean()
long_rolling_AAPL_Adj = AAPL_Adj_Close.rolling(window=100).mean()
# Plot everything by leveraging the very powerful matplotlib package
fig, ax1 = plt.subplots(figsize=(16,9))
ax1.plot(AAPL_Adj_Close.index, AAPL_Adj_Close, label='AAPL_Adj_Close')
ax1.plot(short_rolling_AAPL_Adj.index, short_rolling_AAPL_Adj, label='10 days rolling')
ax1.plot(middle_rolling_AAPL_Adj.index, middle_rolling_AAPL_Adj, label='30 days rolling')
ax1.plot(long_rolling_AAPL_Adj.index, long_rolling_AAPL_Adj, label='100 days rolling')
ax1.set_xlabel('Jan. 04, 2010 - Apr. 28, 2020', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Adjusted closing price ($)', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Apple Adj. Close Price History', pad =10, c = 'r', fontweight='bold')
ax1.legend()
fig, (ax1, ax2) = plt.subplots(2, 1,figsize=(15,10))
AAPL_Adj_Close.plot(ax=ax1, label='AAPL_Adj_Close', legend=True)
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_xlabel('AAPL 10-year history of trading days', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Adjusted closing price', fontsize=18 , labelpad=11, fontweight='bold')
AAPL_Adj_Close.pct_change().hist(bins=50, color = 'b', ax=ax2)
ax2.set_xlabel('Daily return (percentage change)', fontsize=18 , labelpad=11, fontweight='bold')
ax2.set_ylabel('Frequency', fontsize=18 , labelpad=11, fontweight='bold')
# -------------------------------------------
# Conversion to pandas dataframe
# -------------------------------------------
AAPLRaw_pd = AAPLRaw.toPandas()
AAPLRaw_pd.reset_index(inplace=True)
AAPLRaw_pd.set_index("Date", inplace=True)
# ------------------------------------------------------------
# Relative strength and Relative Strength Index (RSI)
# ------------------------------------------------------------
def RSI(stock, column="Close", period=14):
# Wilder's RSI
close = stock[column]
delta = close.diff()
up, down = delta.copy(), delta.copy()
up[up < 0] = 0
down[down > 0] = 0
# Calculate the exponential moving averages (EWMA)
roll_up = up.ewm(com=period - 1, adjust=False).mean()
roll_down = down.ewm(com=period - 1, adjust=False).mean().abs()
# Calculate RS based on exponential moving average (EWMA)
rs = roll_up / roll_down # relative strength = average gain/average loss
rsi = 100-(100/(1+rs))
stock['RSI'] = rsi
return stock
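# Quick sanity check of the RSI helper on a synthetic, monotonically rising
# series (illustrative only, not part of the analysis). Since ewm(com=period-1)
# uses alpha = 1/period (Wilder's smoothing) and every daily change here is a
# gain, the 14-day RSI should converge to 100.
_rsi_demo = RSI(pd.DataFrame({'Close': np.arange(1.0, 41.0)}))
print(_rsi_demo['RSI'].tail(3))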
AAPLRaw_pd = RSI(AAPLRaw_pd)
fig,ax=plt.subplots(figsize=(15,5))
AAPLRaw_pd.RSI.plot(ax=ax, label='RSI', legend=True, color='b');
ax.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax.set_xlabel('Trading days', fontsize=18 , labelpad=11, fontweight='bold')
ax.set_ylabel('14-day RSI', fontsize=18 , labelpad=11, fontweight='bold')
ax.set_title('14-day RSI of the Close price AAPL', pad =10, c = 'r', fontweight='bold')
# ------------------------------------------------------------
# Define Predictor/Independent Variables
# ------------------------------------------------------------
AAPLRaw_pd['middle_MA'] = AAPLRaw_pd['Close'].rolling(window=30).mean()
AAPLRaw_pd['Corr'] = AAPLRaw_pd['Close'].rolling(window=30).corr(AAPLRaw_pd['middle_MA'])
AAPLRaw_pd['Open-Close'] = AAPLRaw_pd['Open'] - AAPLRaw_pd['Close'].shift(1)
AAPLRaw_pd['Open-Open'] = AAPLRaw_pd['Open'] - AAPLRaw_pd['Open'].shift(1)
AAPLRaw_pd_reduice = AAPLRaw_pd.dropna().drop(['index','Adj_Close'], axis=1)
AAPLRaw_pd_reduice.head()
# ------------------------------------------------------------
# Define Target/Dependent Variable
# ------------------------------------------------------------
# If tomorrow’s closing price is higher than today’s closing price,
# then the price is going up (1), else the price is going down (-1).
AAPL_y = np.where(AAPLRaw_pd_reduice['Close'].shift(-1) > AAPLRaw_pd_reduice['Close'],1,-1)
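# Tiny illustration of the labelling rule on made-up prices: 10 -> 11 is a rise
# (label 1), 11 -> 10.5 is a fall (label -1), and the last day has no "tomorrow",
# so its comparison against NaN is False and it defaults to -1.
print(np.where(pd.Series([10.0, 11.0, 10.5]).shift(-1) > pd.Series([10.0, 11.0, 10.5]), 1, -1))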
# ------------------------------------------------------------
# Split The Dataset
# ------------------------------------------------------------
split = int(0.8*len(AAPLRaw_pd_reduice))
X = AAPLRaw_pd_reduice
y = AAPL_y
X_train, X_test, y_train, y_test = X[:split], X[split:], y[:split], y[split:]
dates_test = X_test.index.values
y_test
print("X dimension:",X.shape)
print("Y dimension:",y.shape)
print("X_train dimension:",X_train.shape)
print("X_train dimension:",X_test.shape)
print("y_train dimension:",y_train.shape)
print("y_test dimension:",y_test.shape)
# ------------------------------------------------------------
# A- Logistic Regression
# ------------------------------------------------------------
# Step 1 : Instantiate the model
# ------------------------------------------------------------
LR = LogisticRegression(solver='lbfgs')
model_LR = LR.fit (X_train,y_train)
# ------------------------------------------------------------
# Examine The Coefficients
# ------------------------------------------------------------
print('Result of the Logistic Regression :')
print('Coefficients :')
Coeff_model_LR = pd.DataFrame(list(zip(X.columns, np.transpose(model_LR.coef_))), columns=['Feature', 'Coefficient'])
print(Coeff_model_LR)
print('\n')
# ------------------------------------------------------------
# Calculate Class Probabilities
# ------------------------------------------------------------
print('Class Probabilities :')
probability_LR = model_LR.predict_proba(X_test)
print(probability_LR)
print('\n')
# ------------------------------------------------------------
# Step 2 : Predict Class Labels
# ------------------------------------------------------------
predicted_LR = model_LR.predict(X_test)
# ------------------------------------------------------------
# Step 3 : Evaluate The Model
# ------------------------------------------------------------
print('Confusion Matrix :')
confusion_matrix_LR = metrics.confusion_matrix(y_test, predicted_LR)
print(confusion_matrix_LR)
print('\n')
print('Classification Report :')
classification_report_LR = metrics.classification_report(y_test, predicted_LR)
print(classification_report_LR)
print('\n')
print('Model Accuracy :')
score_LR = model_LR.score(X_test,y_test)
print(score_LR)
print('\n')
# ------------------------------------------------------------
# Cross-Validation
# ------------------------------------------------------------
cross_val_LR = cross_val_score(LR, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_LR)
print('\n')
print('Cross-Validation mean :',cross_val_LR.mean())
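# Note: with an integer cv, cross_val_score splits the data without regard to
# the time ordering of the observations, which can be optimistic for a price
# series. A walk-forward alternative (a sketch using scikit-learn's
# TimeSeriesSplit) would look like this:
from sklearn.model_selection import TimeSeriesSplit
cross_val_LR_ts = cross_val_score(LR, X, y, scoring='accuracy', cv=TimeSeriesSplit(n_splits=5))
print('Walk-forward CV mean :', cross_val_LR_ts.mean())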
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_LR, ".", color='g', alpha=1, label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('Logistic Regression model', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show()
# ------------------------------------------------------------
# B- SVM Classifier
# ------------------------------------------------------------
# Step 1 : Instantiate the model
# ------------------------------------------------------------
# Create a svm Classifier
clf = svm.SVC(kernel='rbf',gamma=0.3, max_iter=-1, probability=True,random_state=None, shrinking=True, tol=0.01)
model_svc = clf.fit(X_train, y_train)
# ------------------------------------------------------------
# Step 2 : Predict Class Labels
# ------------------------------------------------------------
predicted_svc = model_svc.predict(X_test)
# ------------------------------------------------------------
# Step 3 : Evaluate The Model
# ------------------------------------------------------------
print('Result of the SVM Classifier :')
print('Confusion Matrix for SVC:')
confusion_matrix_svc = metrics.confusion_matrix(y_test, predicted_svc)
print(confusion_matrix_svc)
print('\n')
print('Classification Report for SVC :')
classification_report_svc = metrics.classification_report(y_test, predicted_svc)
print(classification_report_svc)
print('\n')
print('Model Accuracy for SVC:')
score_svc = model_svc.score(X_test,y_test)
print(score_svc)
# ------------------------------------------------------------
# Cross-Validation
# ------------------------------------------------------------
cross_val_svc = cross_val_score(clf, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_svc)
print('\n')
print('Cross-Validation mean :',cross_val_svc.mean())
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_svc, ".", color='g', alpha=1, label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('SVM Classifier model', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show()
# ------------------------------------------------------------
# C - Lasso Classifier
# ------------------------------------------------------------
# Step 1 : Instantiate the model
# ------------------------------------------------------------
# Create an L1-penalised (lasso) logistic regression classifier
ls = LogisticRegression(penalty='l1', solver='saga')
model_ls =ls.fit(X_train, y_train)
# ------------------------------------------------------------
# Step 2 : Predict Class Labels
# ------------------------------------------------------------
predicted_ls = model_ls.predict(X_test)
# ------------------------------------------------------------
# Step 3 : Evaluate The Model
# ------------------------------------------------------------
print('Result of the Lasso Classifier :')
print('Confusion Matrix for Lasso:')
confusion_matrix_ls = metrics.confusion_matrix(y_test, predicted_ls)
print(confusion_matrix_ls)
print('\n')
print('Classification Report for Lasso :')
classification_report_ls = metrics.classification_report(y_test, predicted_ls)
print(classification_report_ls)
print('\n')
print('Model Accuracy for Lasso:')
score_ls = model_ls.score(X_test,y_test)
print(score_ls)
# ------------------------------------------------------------
# Cross-Validation
# ------------------------------------------------------------
cross_val_ls = cross_val_score(ls, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_ls)
print('\n')
print('Cross-Validation mean :',cross_val_ls.mean())
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_ls, ".", color='g', alpha=1, label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('Lasso Classifier model', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show()
# ------------------------------------------------------------
# D- SVM Classifier with StandardScaler
# ------------------------------------------------------------
# Step 1 : Instantiate the model
# ------------------------------------------------------------
# Create an SVM classifier inside a pipeline that standardises the features first (the RBF kernel is sensitive to feature scale)
pipe_lrSVC = Pipeline([('scaler', StandardScaler()), ('clf', svm.SVC(kernel='rbf',gamma=0.1, max_iter=-1, probability=False,random_state=None, shrinking=True, tol=0.001))])
pipe_lrSVC.fit(X_train, y_train)
# ------------------------------------------------------------
# Step 2 : Predict Class Labels
# ------------------------------------------------------------
predicted_svc_scale = pipe_lrSVC.predict(X_test)
# ------------------------------------------------------------
# Step 3 : Evaluate The Model
# ------------------------------------------------------------
print('Result of the SVM Classifier with StandardScaler :')
print('Confusion Matrix for SVC:')
confusion_matrix_svc_scale = metrics.confusion_matrix(y_test, predicted_svc_scale)
print(confusion_matrix_svc_scale)
print('\n')
print('Classification Report for SVC :')
classification_report_svc_scale = metrics.classification_report(y_test, predicted_svc_scale)
print(classification_report_svc_scale)
print('\n')
print('Model Accuracy for SVC:')
score_svc_scale = pipe_lrSVC.score(X_test,y_test)
print(score_svc_scale)
# ------------------------------------------------------------
# Cross-Validation
# ------------------------------------------------------------
cross_val_svc_scale = cross_val_score(pipe_lrSVC, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_svc_scale)
print('\n')
print('Cross-Validation mean :',cross_val_svc_scale.mean())
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_svc_scale, "o", color='g', alpha=1, label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('SVM Classifier with StandardScaler', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show()
# ------------------------------------------------------------
# E- Lasso Classifier with StandardScaler
# ------------------------------------------------------------
# Step 1 : Instantiate the model
# ------------------------------------------------------------
pipe_ls = Pipeline([('scaler', StandardScaler()), ('pipe_ls', LogisticRegression(penalty='l1', solver='saga',max_iter =5000))])
pipe_ls.fit(X_train, y_train)
# ------------------------------------------------------------
# Step 2 : Predict Class Labels
# ------------------------------------------------------------
predicted_ls_scale = pipe_ls.predict(X_test)
# ------------------------------------------------------------
# Step 3 : Evaluate The Model
# ------------------------------------------------------------
print('Results from Lasso Classifier with StandardScaler:')
print('Confusion Matrix for Lasso:')
confusion_matrix_ls_scale = metrics.confusion_matrix(y_test, predicted_ls_scale)
print(confusion_matrix_ls_scale)
print('\n')
print('Classification Report for Lasso :')
classification_report_ls_scale = metrics.classification_report(y_test, predicted_ls_scale)
print(classification_report_ls_scale)
print('\n')
print('Model Accuracy for Lasso:')
score_ls_scale = pipe_ls.score(X_test,y_test)
print(score_ls_scale)
# ------------------------------------------------------------
# Cross-Validation
# ------------------------------------------------------------
cross_val_ls_scale = cross_val_score(pipe_ls, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_ls_scale)
print('\n')
print('Cross-Validation mean :',cross_val_ls_scale.mean())
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_ls_scale, "o", color='g', alpha=1, label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('Lasso Classifier with StandardScaler', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show()
# ------------------------------------------------------------
# F- Logistic Regression with StandardScaler
# ------------------------------------------------------------
# Step 1 : Instantiate the model
# ------------------------------------------------------------
pipe_lR = Pipeline([('scaler', StandardScaler()), ('pipe_lR', LogisticRegression(solver='lbfgs'))])
pipe_lR.fit(X_train, y_train)
# ------------------------------------------------------------
# Step 2 : Predict Class Labels
# ------------------------------------------------------------
predicted_lR_scale = pipe_lR.predict(X_test)
# ------------------------------------------------------------
# Step 3 : Evaluate The Model
# ------------------------------------------------------------
print('Results from Logistic Regression with StandardScaler:')
print('Confusion Matrix for Logistic Regression:')
confusion_matrix_lR_scale = metrics.confusion_matrix(y_test, predicted_lR_scale)
print(confusion_matrix_lR_scale)
print('\n')
print('Classification Report for Logistic Regression :')
classification_report_lR_scale = metrics.classification_report(y_test, predicted_lR_scale)
print(classification_report_lR_scale)
print('\n')
print('Model Accuracy for Logistic Regression:')
score_lR_scale = pipe_lR.score(X_test,y_test)
print(score_lR_scale)
# ------------------------------------------------------------
# Cross-Validation
# ------------------------------------------------------------
cross_val_lR_scale = cross_val_score(pipe_lR, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_lR_scale)
print('\n')
print('Cross-Validation mean :',cross_val_lR_scale.mean())
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_lR_scale, "o", color='g', alpha=1, label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('Logistic Regression with StandardScaler', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show()
# importing the libraries
from bs4 import BeautifulSoup
from urllib import request
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import spacy
# import sys
# !{sys.executable} -m pip install -U vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
pd.set_option('display.max_rows', None)
# URL of the 10-K filing to parse (here a local file:// URL)
url = "file:///Users/leafanirisoa/Documents/data_test/10-K/2016-10-26.html"
# ------------------------------------------------------------
# Step 0 : Read the .html
# ------------------------------------------------------------
# fetch the raw HTML content and Parse the html content
def read_html(url):
response = request.urlopen(url)
html_content = response.read().decode('utf8')
return BeautifulSoup(html_content, "lxml")
data_init = read_html(url)
# ------------------------------------------------------------
# Step 1 : Transform to text
# ------------------------------------------------------------
def extract_text(date, data_parsed):
if int(date) <= 2015 :
all_p = data_parsed.find_all('p')
else:
all_p = data_parsed.find_all('div')
ls = [] # Create empty list
for l in all_p:
# keep only the text content of each tag
ls.append(l.text)
return ls
# ------------------------------------------------------------
# Step 2 : Removing line breaks
# ------------------------------------------------------------
def remove_line_breaks(data_extracted):
ls = [] # Create empty list
for l in data_extracted:
# collapse runs of whitespace and line breaks into single spaces
ls.append(" ".join(l.split()))
return ls
# ------------------------------------------------------------
# Step 3 : Removing accented characters
# ------------------------------------------------------------
def remove_accented_chars(data_extracted):
ls = [] # Create empty list
for l in data_extracted:
# drop characters that cannot be encoded as ASCII
ls.append(l.encode('ascii', 'ignore').decode('utf-8', 'ignore'))
return ls
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}
# ------------------------------------------------------------
# Step 4 : Expanding Contractions
# ------------------------------------------------------------
def text_expand(text, contraction_mapping=CONTRACTION_MAP):
contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
flags=re.IGNORECASE|re.DOTALL)
def expand_match(contraction):
match = contraction.group(0)
first_char = match[0]
expanded_contraction = contraction_mapping.get(match)\
if contraction_mapping.get(match)\
else contraction_mapping.get(match.lower())
expanded_contraction = first_char+expanded_contraction[1:]
return expanded_contraction
expanded_text = contractions_pattern.sub(expand_match, text)
expanded_text = re.sub("'", "", expanded_text)
return expanded_text
# expand_contractions("Y'all can't expand contractions I'd think")
# ===> 'You all cannot expand contractions I would think'
def expand_contractions(data_extracted):
ls = [] # Create empty list
for l in data_extracted:
# expand the contractions in each paragraph
ls.append(text_expand(l))
return ls
# ------------------------------------------------------------
# Step 5 : Lemmatization
# ------------------------------------------------------------
# Install spacy (run in terminal/prompt)
# import sys
# !{sys.executable} -m pip install spacy
# Download spacy's 'en' Model
# !{sys.executable} -m spacy download en
nlp = spacy.load('en', parse=True, tag=True, entity=True)  # with spaCy v3+ use spacy.load('en_core_web_sm') instead
def text_lemmatize(data_extracted):
ls = [] # Create empty list
for l in data_extracted:
text = nlp(l)
ls.append(' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text]))
return ls
# ------------------------------------------------------------
# Step 6 : Removing Stopwords
# ------------------------------------------------------------
from spacy.lang.en.stop_words import STOP_WORDS
def delete_stopwords(data_extracted):
ls = [] # Create empty list
for l in data_extracted:
text = l.lower()
ls.append(' '.join(w for w in text.split() if w not in STOP_WORDS))
return ls
# ------------------------------------------------------------
# Step 7 : Removing Special Characters
# ------------------------------------------------------------
def delete_characters(data_extracted):
data_reduiced = [s for s in data_extracted if len(s.split()) > 10]
ls = [] # Create empty list
for l in data_reduiced:
ls.append(' '.join(w for w in l.split() if w.isalpha()))
return ls
ls_1 = extract_text('2016',data_init)
ls_2 = remove_line_breaks(ls_1)
ls_3 = remove_accented_chars(ls_2)
ls_4 = expand_contractions(ls_3)
ls_5 = text_lemmatize(ls_4)
ls_6 = delete_stopwords(ls_5)
ls_7 = delete_characters(ls_6)
# ------------------------------------------------------------
# Step 8 : Scoring
# ------------------------------------------------------------
def Scoring_sentences(sentence):
score = analyser.polarity_scores(sentence)
return score
def sentiment_analyzer_scores(data_extracted):
ls = [] # Create empty list
for l in data_extracted:
l_score = Scoring_sentences(l)
ls.append(l_score)
return ls
def scoring_dataset(data_extracted):
df = pd.DataFrame(sentiment_analyzer_scores(data_extracted))
df = df.rename(columns={'neg': 'Negative', 'neu': 'Neutral', 'pos': 'Positive', 'compound': 'Compound'})  # explicit mapping, independent of column order
return df
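# Quick illustration of a single VADER score on a made-up sentence: the analyser
# returns a dict with 'neg', 'neu', 'pos' and 'compound' entries.
print(Scoring_sentences("Revenue increased significantly during the quarter."))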
ls_8 = scoring_dataset(ls_7)
ls_8.head()
# percentile list
perc =[ .60, .80 ,.90]
# ------------------------------------------------------------
# Step 9 : Metrics
# ------------------------------------------------------------
def metrics_values(df, perc):
include =['object', 'float', 'int']
desc_data = df.describe(percentiles = perc, include = include)
return desc_data
ls_9 = metrics_values(ls_8,perc)
ls_9
# ------------------------------------------------------------
# Final Step : in one step
# ------------------------------------------------------------
def metrics_extraction(date,data_init,perc):
ls_1 = extract_text(date,data_init) # Step 1 : Transform to text
ls_2 = remove_line_breaks(ls_1) # Step 2 : Removing line breaks
ls_3 = remove_accented_chars(ls_2) # Step 3 : Removing accented characters
ls_4 = expand_contractions(ls_3) # Step 4 : Expanding Contractions
ls_5 = text_lemmatize(ls_4) # Step 5 : Lemmatization
ls_6 = delete_stopwords(ls_5) # Step 6 : Removing Stopwords
ls_7 = delete_characters(ls_6) # Step 7 : Removing Special Characters
ls_8 = scoring_dataset(ls_7) # Step 8 : Scoring
ls_9 = metrics_values(ls_8,perc) # Step 9 : Metrics
return ls_9
desc_data = metrics_extraction('2016',data_init,perc)
desc_data
import seaborn as sns
sns.set_style("white")
x1 = ls_8.Compound
x2 = ls_8.Negative
x3 = ls_8.Neutral
x4 = ls_8.Positive
# plot
fig, axes = plt.subplots(1, 4, figsize=(10, 4), sharey=True, dpi=100)
sns.distplot(x1 , color="dodgerblue", ax=axes[0], axlabel='Compound')
sns.distplot(x2 , color="deeppink", ax=axes[1], axlabel='Negative')
sns.distplot(x3 , color="orange", ax=axes[2], axlabel='Neutral')
sns.distplot(x4 , color="green", ax=axes[3], axlabel='Positive')
# Draw Plot
plt.figure(figsize=(16,10), dpi= 80)
sns.kdeplot(x1, shade=True, color="dodgerblue", label="Compound", alpha=.7)
sns.kdeplot(x2, shade=True, color="deeppink", label="Negative", alpha=.7)
sns.kdeplot(x3, shade=True, color="orange", label="Neutral", alpha=.7)
sns.kdeplot(x4, shade=True, color="green", label="Positive", alpha=.7)
# Decoration
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.title('Density plot of each category', fontsize=22)
plt.legend(loc ='best', fontsize=15)
plt.show()
# importing the libraries
from bs4 import BeautifulSoup
from urllib import request
import csv
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)
# Step 1: Sending an HTTP request to a URL (here a local file:// URL)
url = "file:///Users/leafanirisoa/Documents/data_test/10-K/2011-10-26.html"
# Make a GET request to fetch the raw HTML content
response = request.urlopen(url)
html_content = response.read().decode('utf8')
# Step 2: Parse the html content
soup = BeautifulSoup(html_content, "lxml")
def table_generator(data, k):
# Get the table at index k in the document
# Parameters of the table :
gdp_table = data.find_all('table')[k]
gdp_table_data = gdp_table.find_all("tr")
num_cols = len(gdp_table_data[0].find_all("td"))
num_rows =len(gdp_table_data)
# Save the table in pd.dataframe
new_table = pd.DataFrame(columns=range(0,num_cols), index = range(0,num_rows)) # I know the size
for i in range(0,num_rows):
column_marker = 0
columns = gdp_table.find_all("tr")[i].find_all('td')
for column in columns:
new_table.iat[i,column_marker] = column.get_text(strip = True).replace('\n', ' ').replace('$', '').replace('(', '').replace(')', '').strip()
column_marker += 1
# Define the header of the dataset
headings = []
for td in gdp_table_data[1].find_all("td"):
# remove any newlines and extra spaces from left and right
headings.append(td.get_text(strip = True).replace('\n', ' ').strip())
headings_table = [x for x in headings if x]
# Consolidation of the dataset
new_table_1 = new_table
new_table_1.replace('', np.nan, inplace=True)
last_table = new_table_1.dropna(how='all').reset_index(drop=True).iloc[1:].dropna(axis=1, how='all').dropna(axis=0, how='all')
if len(headings_table) == len(last_table.columns) :
last_table.columns = headings_table
elif len(headings_table) == len(last_table.columns) -1 :
last_table.columns = ["Attributes"] +[x for x in headings if x]
else:
last_table = new_table_1.dropna(how='all').reset_index(drop=True).iloc[2:].dropna(axis=1, how='all').dropna(axis=0, how='all')
last_table.columns = ["Attributes"] + ['Col_'+ str(x) for x in range(1, len(last_table.columns))]
return last_table.fillna('').reset_index(drop=True)
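# Example usage (sketch): the table index k has to be found by inspecting the
# filing; the indices used further below (68 for the 2011 filing, 74 for the
# 2013 filing) were identified manually.
# table_generator(soup, 68).head()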
# ------------------------------------------------------------
# Load Packages
# ------------------------------------------------------------
import os
import math
import pandas as pd
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
pd.plotting.register_matplotlib_converters()
import import_ipynb
# ------------------------------------------------------------
# set Parameters
# ------------------------------------------------------------
source_dir = "file:///Users/leafanirisoa/Documents/data_test/10-K/"
dates = ["2011-10-26", "2012-10-31", "2013-10-30","2014-10-27","2015-10-28","2016-10-26","2017-11-03","2018-11-05","2019-10-31"]
# percentile list
perc =[ .60, .80 ,.90]
# ------------------------------------------------------------
# Read the .html
# ------------------------------------------------------------
def one_date_data(source_dir, dates):
url = source_dir + dates + ".html"
response = request.urlopen(url)
html_content = response.read().decode('utf8')
soup = BeautifulSoup(html_content, "lxml")
return soup
def get_dico_data(source_dir, dates):
list_of_url = {t[0:4]: source_dir + t + ".html" for t in dates}
data_html = {}
for t in dates:
k = t[0:4]
data_html[k] = read_html(list_of_url[k])
return data_html
data_used = get_dico_data(source_dir, dates)
# ------------------------------------------------------------
# Generate Metric matrix
# ------------------------------------------------------------
def Matrix_metric(dates,data_used,perc):
dico_metrics = {}
for t in dates:
k = t[0:4]
data = data_used[k]
dico_metrics[k] = metrics_extraction(k,data,perc)
return dico_metrics
dico_metrics = Matrix_metric(dates,data_used,perc)
def Union_metric(dates,dico_metrics,metric):
ls = []
for t in dates:
k = t[0:4]
dico_metrics[k]['Year'] = k
ls.append(dico_metrics[k].loc[dico_metrics[k].index == metric,:])
return pd.concat(ls,ignore_index=True).drop_duplicates().reset_index(drop=True)
Union_metric(dates,dico_metrics,'mean')
df= Union_metric(dates,dico_metrics,'90%')
fig, ax1 = plt.subplots(figsize=(16,9))
ax1.plot(df.Year, df.Negative, label='Negative', color='tab:blue')
#ax1.plot(df.Year, df.Compound, label='Compound', color='tab:red')
#ax1.plot(df.Year, df.Neutral, label='Neutral', color='tab:green')
ax1.plot(df.Year, df.Positive, label='Positive', color='tab:orange')
ax1.set_xlabel('Fiscal year of the 10-K filing (2011 - 2019)', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Metric Values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Sentiment analysis metric', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)
df2= Union_metric(dates,dico_metrics,'std')
fig, ax1 = plt.subplots(figsize=(16,9))
# ax1.plot(df.Year, df2.Compound, label='Compound', color='tab:red')
ax1.plot(df.Year, df2.Negative, label='Negative', color='tab:blue')
ax1.plot(df.Year, df2.Neutral, label='Neutral', color='tab:green')
ax1.plot(df.Year, df2.Positive, label='Positive', color='tab:orange')
ax1.set_xlabel('Fiscal year of the 10-K filing (2011 - 2019)', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Metric Values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Sentiment analysis metric (Std)', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)
df= Union_metric(dates,dico_metrics,'mean')
fig, ax1 = plt.subplots(figsize=(16,9))
ax1.plot(df.Year, df.Negative, label='Negative', color='tab:blue')
ax1.plot(df.Year, df.Compound, label='Compound', color='tab:red')
ax1.plot(df.Year, df.Neutral, label='Neutral', color='tab:green')
ax1.plot(df.Year, df.Positive, label='Positive', color='tab:orange')
ax1.set_xlabel('Fiscal year of the 10-K filing (2011 - 2019)', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Metric Values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Sentiment analysis metric', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)
df_2013 = one_date_data(source_dir, '2013-10-30')
table_2013=table_generator(df_2013, 74)
table_2013
df_2011 = one_date_data(source_dir, '2011-10-26')
table_2011= table_generator(df_2011, 68)
table_2011
df_cd = pd.merge(table_2011, table_2013, how='inner', on = ['Attributes','2011'])
data_last = df_cd[['Attributes', '2009', '2010', '2011', '2012', '2013']]
data_last
data_transposed = data_last.set_index('Attributes').T
data_transposed
data_transposed.columns = ['C_1', 'C_2', 'C_3', 'C_4', 'C_5', 'C_6', 'C_7']
data_transposed['C_1'] = data_transposed['C_1'].str.replace(',', '').astype(float)
data_transposed['C_7'] = data_transposed['C_7'].str.replace(',', '').astype(float)
data_transposed
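# Convert the remaining columns to numbers where possible; errors='ignore'
# leaves columns that cannot be fully parsed as numeric untouched.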
for i in data_transposed.columns:
data_transposed[i] = pd.to_numeric(data_transposed[i], errors='ignore')
fig, ax1 = plt.subplots(figsize=(16,9))
# ax1.plot(df.Year, df2.Compound, label='Compound', color='tab:red')
ax1.plot(data_transposed.index, data_transposed.C_1, label='Beginning Balance', color='tab:blue')
ax1.plot(data_transposed.index, data_transposed.C_7, label='Ending Balance', color='tab:green')
ax1.set_xlabel('Jan. 2009 - Dec. 2013', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Balance values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('The aggregate changes in the balance of gross unrecognized tax benefits', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)
fig, ax1 = plt.subplots(figsize=(16,9))
# ax1.plot(df.Year, df2.Compound, label='Compound', color='tab:red')
ax1.plot(data_transposed.index, data_transposed.C_7 - data_transposed.C_1, label='C_7-C_1', color='tab:blue')
ax1.plot(data_transposed.index, data_transposed.C_2, label='C_2', color='tab:green')
ax1.plot(data_transposed.index, data_transposed.C_3, label='C_3', color='tab:red')
ax1.plot(data_transposed.index, data_transposed.C_4, label='C_4', color='tab:orange')
ax1.set_xlabel('Jan. 2009 - Dec. 2013', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Balance values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('The aggregate changes in the balance of gross unrecognized tax benefits', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)