In [1]:
# ---------------------------------------------------------
#                   IMPORT DES PACKAGES 
# ---------------------------------------------------------

# If we want to use files in HDFS
import os

spark_path = "/usr/local/Cellar/spark-2.4.3-bin-hadoop2.7"
os.environ["SPARK_HOME"] = spark_path

import math
import pandas as pd
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
pd.plotting.register_matplotlib_converters()

from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
In [2]:
# -----------------------------------------------------------------
#                   Set parameters for the plt
# -----------------------------------------------------------------

font = {'family' : 'Verdana','weight' : 15 ,'size'   : 14}
plt.rc('font', **font)
In [3]:
# -------------------------------------------
#                   sparkSession 
# -------------------------------------------

spark = SparkSession \
  .builder \
  .appName("Attrition") \
  .config("spark.memory.fraction", 0.8) \
  .config("spark.executor.memory", "18g") \
  .config("spark.driver.memory", "18g")\
  .config("spark.sql.shuffle.partitions" , "800") \
  .getOrCreate()

sparkcontext =  spark.sparkContext
In [4]:
# -------------------------------------------
#              Read and load Dataframe
# -------------------------------------------
# 1- Data eBoutique : Sales
AAPL_path= "/tmp/incoming/AAPL_pricing/AAPL_prices_parquet"
AAPLRaw = spark.read.parquet(AAPL_path)
AAPLRaw.show(4)
+-------------------+---------+---------+---------+---------+---------+---------+
|               Date|     Open|     High|      Low|    Close|Adj_Close|   Volume|
+-------------------+---------+---------+---------+---------+---------+---------+
|2010-01-04 00:00:00|    30.49|30.642857|    30.34|30.572857|26.538483|123432400|
|2010-01-05 00:00:00|30.657143|30.798571|30.464285|30.625713|26.584366|150476200|
|2010-01-06 00:00:00|30.625713|30.747143|30.107143|30.138571|26.161509|138040000|
|2010-01-07 00:00:00|    30.25|30.285715|29.864286|30.082857|26.113146|119282800|
+-------------------+---------+---------+---------+---------+---------+---------+
only showing top 4 rows

In [5]:
AAPLRaw.agg(min(col("Date")), max(col("Date"))).show()
+-------------------+-------------------+
|          min(Date)|          max(Date)|
+-------------------+-------------------+
|2010-01-04 00:00:00|2020-04-28 00:00:00|
+-------------------+-------------------+

In [6]:
AAPL_stat_desc = AAPLRaw.describe()

for col_name in  AAPL_stat_desc.columns[1:6]:
    AAPL_stat_desc = AAPL_stat_desc.withColumn(col_name, round(col_name, 3))
    
AAPL_stat_desc.show()
+-------+-------+-------+-------+-------+---------+-------------------+
|summary|   Open|   High|    Low|  Close|Adj_Close|             Volume|
+-------+-------+-------+-------+-------+---------+-------------------+
|  count| 2597.0| 2597.0| 2597.0| 2597.0|   2597.0|               2597|
|   mean|115.853|116.965|114.758|115.905|  109.571|7.402348221024258E7|
| stddev| 63.435| 64.192| 62.844| 63.578|   65.231|5.821870515136323E7|
|    min| 27.481|   28.0| 27.179| 27.436|   23.815|           11362000|
|    max| 324.74| 327.85| 323.35|  327.2|    327.2|          470249500|
+-------+-------+-------+-------+-------+---------+-------------------+

In [7]:
# conversion to pandas dataframe
AAPLRaw_pd = AAPLRaw.toPandas()
AAPLRaw_pd.isna().any()
AAPLRaw_pd.info()
AAPLRaw_pd.reset_index(inplace=True)
AAPLRaw_pd.set_index("Date", inplace=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2597 entries, 0 to 2596
Data columns (total 7 columns):
Date         2597 non-null datetime64[ns]
Open         2597 non-null float64
High         2597 non-null float64
Low          2597 non-null float64
Close        2597 non-null float64
Adj_Close    2597 non-null float64
Volume       2597 non-null int32
dtypes: datetime64[ns](1), float64(5), int32(1)
memory usage: 132.0 KB
In [8]:
#  plot the Apple Adj. Close Price History
plt.figure(figsize=(15, 8))
plt.plot(AAPLRaw_pd['Adj_Close'],label = 'APPL_adj')
plt.title('Apple Adj. Close Price History', pad =10, c = 'r', fontweight='bold')
plt.xlabel('Jan. 04, 2010 - Apr. 28, 2020', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Adj. Close Price USD ($)', fontsize=18 , labelpad=11, fontweight='bold')
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.legend(loc ='upper left', fontsize=15)
Out[8]:
<matplotlib.legend.Legend at 0x7fb8692a7358>
In [9]:
# Get the timeseries. This now returns a Pandas Series object indexed by date.
AAPL_Close = AAPLRaw_pd.loc[:, 'Close']

# Calculate the 20 and 100 days moving averages of the closing prices
short_rolling_AAPL = AAPL_Close.rolling(window=15).mean()
middle_rolling_AAPL = AAPL_Close.rolling(window=30).mean()
long_rolling_AAPL = AAPL_Close.rolling(window=100).mean()

# Plot everything by leveraging the very powerful matplotlib package
fig, ax1 = plt.subplots(figsize=(16,9))
ax1.plot(AAPL_Close.index, AAPL_Close, label='AAPL_Close')
ax1.plot(short_rolling_AAPL.index, short_rolling_AAPL, label='10 days rolling')
ax1.plot(middle_rolling_AAPL.index, middle_rolling_AAPL, label='30 days rolling')
ax1.plot(long_rolling_AAPL.index, long_rolling_AAPL, label='100 days rolling')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Apple Close Price History', pad =10, c = 'r', fontweight='bold')
ax1.set_xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Close price ($)', fontsize=18 , labelpad=11, fontweight='bold')
ax1.legend(loc ='upper left', fontsize=18)
Out[9]:
<matplotlib.legend.Legend at 0x7fb869341e80>
In [10]:
# Get the MSFT timeseries. This now returns a Pandas Series object indexed by date.
AAPL_Adj_Close = AAPLRaw_pd.loc[:, 'Adj_Close']

# Calculate the 20 and 100 days moving averages of the closing prices
short_rolling_AAPL_Adj = AAPL_Adj_Close.rolling(window=10).mean()
middle_rolling_AAPL_Adj = AAPL_Adj_Close.rolling(window=30).mean()
long_rolling_AAPL_Adj = AAPL_Adj_Close.rolling(window=100).mean()

# Plot everything by leveraging the very powerful matplotlib package
fig, ax1 = plt.subplots(figsize=(16,9))
ax1.plot(AAPL_Adj_Close.index, AAPL_Adj_Close, label='AAPL_Adj_Close')
ax1.plot(short_rolling_AAPL_Adj.index, short_rolling_AAPL_Adj, label='10 days rolling')
ax1.plot(middle_rolling_AAPL_Adj.index, middle_rolling_AAPL_Adj, label='30 days rolling')
ax1.plot(long_rolling_AAPL_Adj.index, long_rolling_AAPL_Adj, label='100 days rolling')
ax1.set_xlabel('Jan. 04, 2010 - Apr. 28, 2020', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Adjusted closing price ($)', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Apple Adj. Close Price History', pad =10, c = 'r', fontweight='bold')
ax1.legend()
Out[10]:
<matplotlib.legend.Legend at 0x7fb879550828>
In [11]:
fig, (ax1, ax2) = plt.subplots(2, 1,figsize=(15,10))

AAPL_Adj_Close.plot(x='AAPL_Adj_Close', y='unemployment', ax=ax1, legend=True)
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_xlabel('AAPL 10-year history of trading days', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Adjusted closing price', fontsize=18 , labelpad=11, fontweight='bold')

AAPL_Adj_Close.pct_change().hist(bins=50, color = 'b', ax=ax2)
ax2.set_xlabel('AAPL 10-year history of trading days', fontsize=18 , labelpad=11, fontweight='bold')
ax2.set_ylabel('Adjusted closing price', fontsize=18 , labelpad=11, fontweight='bold')
Out[11]:
Text(0, 0.5, 'Adjusted closing price')
In [12]:
# -------------------------------------------
#           Conversion to pandas dataframe
# -------------------------------------------
AAPLRaw_pd = AAPLRaw.toPandas()
AAPLRaw_pd.reset_index(inplace=True)
AAPLRaw_pd.set_index("Date", inplace=True)
In [13]:
# ------------------------------------------------------------
#           Relative strength and Relative Strength Index (RSI)
# ------------------------------------------------------------

def RSI(stock, column="Close", period=14):
    # Wilder's RSI
    close = stock[column]
    delta = close.diff() 
    up, down = delta.copy(), delta.copy()

    up[up < 0] = 0
    down[down > 0] = 0
    
    # Calculate the exponential moving averages (EWMA)
    roll_up = up.ewm(com=period - 1, adjust=False).mean()
    roll_down = down.ewm(com=period - 1, adjust=False).mean().abs()
    
    # Calculate RS based on exponential moving average (EWMA)
    rs = roll_up / roll_down   # relative strength =  average gain/average loss

    rsi = 100-(100/(1+rs))
    stock['RSI'] = rsi
    
    return stock

AAPLRaw_pd = RSI(AAPLRaw_pd)
In [14]:
fig,ax=plt.subplots(figsize=(15,5))
AAPLRaw_pd.RSI.plot(ax=ax, label='RSI', legend=True, color='b');
ax.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax.set_xlabel('Trading days', fontsize=18 , labelpad=11, fontweight='bold')
ax.set_ylabel('14-day RSI', fontsize=18 , labelpad=11, fontweight='bold')
ax.set_title('14-day RSI of the Close price AAPL', pad =10, c = 'r', fontweight='bold')
Out[14]:
Text(0.5, 1.0, '14-day RSI of the Close price AAPL')
In [15]:
# ------------------------------------------------------------
#             Define Predictor/Independent Variables
# ------------------------------------------------------------
AAPLRaw_pd['middle_MA'] = AAPLRaw_pd['Close'].rolling(window=30).mean()
AAPLRaw_pd['Corr'] = AAPLRaw_pd['Close'].rolling(window=30).corr(AAPLRaw_pd['middle_MA'])
AAPLRaw_pd['Open-Close'] = AAPLRaw_pd['Open'] - AAPLRaw_pd['Close'].shift(1)
AAPLRaw_pd['Open-Open'] = AAPLRaw_pd['Open'] - AAPLRaw_pd['Open'].shift(1)
AAPLRaw_pd_reduice = AAPLRaw_pd.dropna().drop(['index','Adj_Close'], axis=1)
AAPLRaw_pd_reduice.head()
Out[15]:
Open High Low Close Volume RSI middle_MA Corr Open-Close Open-Open
Date
2010-03-29 33.285713 33.410000 33.088570 33.198570 135186100 71.676475 30.871000 0.745134 0.299999 0.578571
2010-03-30 33.799999 33.925713 33.464287 33.692856 131827500 74.703099 31.025524 0.775770 0.601429 0.514286
2010-03-31 33.641430 33.801430 33.494286 33.571430 107664900 72.649288 31.180047 0.795911 -0.051426 -0.158569
2010-04-01 33.915714 34.104286 33.250000 33.709999 150786300 73.543207 31.337381 0.811838 0.344284 0.274284
2010-04-05 33.568573 34.072857 33.538570 34.070000 171126900 75.759809 31.512714 0.830812 -0.141426 -0.347141
In [16]:
# ------------------------------------------------------------
#           Define Target/Dependent Variable
# ------------------------------------------------------------
# If tomorrow’s closing price is higher than today’s closing price, 
# then the price is going up (1), else the price is going down (-1).
AAPL_y = np.where(AAPLRaw_pd_reduice['Close'].shift(-1) > AAPLRaw_pd_reduice['Close'],1,-1)

# ------------------------------------------------------------
#                   Split The Dataset
# ------------------------------------------------------------

split = int(0.8*len(AAPLRaw_pd_reduice))
X = AAPLRaw_pd_reduice
y = AAPL_y
X_train, X_test, y_train, y_test = X[:split], X[split:], y[:split], y[split:]
dates_test = X_test.index.values
In [17]:
y_test
Out[17]:
array([-1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1,  1,
       -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1,
        1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,
        1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1, -1,
        1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1,
        1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1, -1,
       -1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1, -1,
       -1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,
       -1, -1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1, -1,
        1, -1,  1,  1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1,  1,  1,  1,
       -1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1, -1, -1, -1,  1, -1,
        1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1,
        1,  1,  1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
        1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1, -1, -1, -1,  1,
       -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,  1, -1,
       -1, -1, -1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1, -1, -1, -1,  1,
        1, -1,  1, -1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1,  1, -1,
        1,  1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1,
        1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,
        1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1,  1, -1, -1,  1,
       -1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1, -1, -1,
        1,  1,  1, -1,  1,  1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,
        1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1,  1,
        1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1, -1,  1,
        1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,
       -1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1,
       -1, -1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1,
       -1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1, -1,
        1,  1,  1,  1, -1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1])
In [18]:
print("X dimension:",X.shape)
print("Y dimension:",y.shape)
print("X_train dimension:",X_train.shape)
print("X_train dimension:",X_test.shape)
print("y_train dimension:",y_train.shape)
print("y_test dimension:",y_test.shape)
X dimension: (2539, 10)
Y dimension: (2539,)
X_train dimension: (2031, 10)
X_train dimension: (508, 10)
y_train dimension: (2031,)
y_test dimension: (508,)
In [19]:
# ------------------------------------------------------------
#                A- Logistic Regression
# ------------------------------------------------------------
#         Step 1 : Instantiate the model
# ------------------------------------------------------------
LR = LogisticRegression(solver='lbfgs')
model_LR = LR.fit (X_train,y_train)

# ------------------------------------------------------------
#          Examine The Coefficients 
# ------------------------------------------------------------
print('Result of the Logistic Regression  :')

print('Coefficients  :')
Coeff_model_LR = pd.DataFrame(zip(X.columns, np.transpose(model_LR.coef_)))
print(Coeff_model_LR)

print('\n')
# ------------------------------------------------------------
#           Calculate Class Probabilities
# ------------------------------------------------------------
print('Class Probabilities  :')
probability_LR = model_LR.predict_proba(X_test)
print(probability_LR)
print('\n')

# ------------------------------------------------------------
#            Step 2 :   Predict Class Labels
# ------------------------------------------------------------
predicted_LR = model_LR.predict(X_test)

# ------------------------------------------------------------
#            Step 3 :   Evaluate The Model
# ------------------------------------------------------------
print('Confusion Matrix :')
confusion_matrix_LR = metrics.confusion_matrix(y_test, predicted_LR)
print(confusion_matrix_LR)

print('\n')

print('Classification Report :')
classification_report_LR = metrics.classification_report(y_test, predicted_LR)
print(classification_report_LR)

print('\n')

print('Model Accuracy :')
score_LR = model_LR.score(X_test,y_test)
print(score_LR)
print('\n')

# ------------------------------------------------------------
#               Cross-Validation
# ------------------------------------------------------------
cross_val_LR = cross_val_score(LR, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_LR)

print('\n')

print('Cross-Validation mean :',cross_val_LR.mean())
Result of the Logistic Regression  :
Coefficients  :
            0                         1
0        Open   [7.954088776729342e-16]
1        High   [7.918867184166374e-16]
2         Low   [7.864993930545724e-16]
3       Close   [7.800735206278794e-16]
4      Volume    [5.98726795794646e-10]
5         RSI     [9.7205283093297e-16]
6   middle_MA    [7.35516390436568e-16]
7        Corr  [2.4905056529809057e-18]
8  Open-Close   [5.518848519987871e-18]
9   Open-Open  [1.0151231664843978e-17]


Class Probabilities  :
[[0.49453452 0.50546548]
 [0.4949571  0.5050429 ]
 [0.49575182 0.50424818]
 ...
 [0.49526613 0.50473387]
 [0.49561864 0.50438136]
 [0.49581733 0.50418267]]


Confusion Matrix :
[[  0 232]
 [  0 276]]


Classification Report :
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       232
           1       0.54      1.00      0.70       276

    accuracy                           0.54       508
   macro avg       0.27      0.50      0.35       508
weighted avg       0.30      0.54      0.38       508



Model Accuracy :
0.5433070866141733


Cross-Validation result : [0.5254902  0.5254902  0.52362205 0.52362205 0.52362205 0.52362205
 0.52362205 0.5256917  0.5256917  0.5256917 ]


Cross-Validation mean : 0.5246165727191564
/Users/leafanirisoa/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
In [20]:
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_LR ,"ro",marker=".",alpha=1, color='g', label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('Logistic Regression model', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show() 
In [21]:
# ------------------------------------------------------------
#                B- SVM Classifier
# ------------------------------------------------------------
#         Step 1 : Instantiate the model
# ------------------------------------------------------------
# Create a svm Classifier
clf = svm.SVC(kernel='rbf',gamma=0.3, max_iter=-1, probability=True,random_state=None, shrinking=True, tol=0.01) 
model_svc = clf.fit(X_train, y_train)

# ------------------------------------------------------------
#            Step 2 :   Predict Class Labels
# ------------------------------------------------------------
predicted_svc = model_svc.predict(X_test)

# ------------------------------------------------------------
#            Step 3 :   Evaluate The Model
# ------------------------------------------------------------
print('Result of the SVM Classifier :')

print('Confusion Matrix for SVC:')
confusion_matrix_svc = metrics.confusion_matrix(y_test, predicted_svc)
print(confusion_matrix_svc)

print('\n')

print('Classification Report for SVC :')
classification_report_svc = metrics.classification_report(y_test, predicted_svc)
print(classification_report_svc)

print('\n')

print('Model Accuracy for SVC:')
score_svc = model_svc.score(X_test,y_test)
print(score_svc)


# ------------------------------------------------------------
#               Cross-Validation
# ------------------------------------------------------------

cross_val_svc = cross_val_score(clf, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_svc)

print('\n')

print('Cross-Validation mean :',cross_val_svc.mean())
Result of the SVM Classifier :
Confusion Matrix for SVC:
[[  0 232]
 [  0 276]]


Classification Report for SVC :
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       232
           1       0.54      1.00      0.70       276

    accuracy                           0.54       508
   macro avg       0.27      0.50      0.35       508
weighted avg       0.30      0.54      0.38       508



Model Accuracy for SVC:
0.5433070866141733
/Users/leafanirisoa/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
Cross-Validation result : [0.5254902  0.5254902  0.52362205 0.52362205 0.52362205 0.52362205
 0.52362205 0.5256917  0.5256917  0.5256917 ]


Cross-Validation mean : 0.5246165727191564
In [22]:
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_svc ,"ro",marker=".",alpha=1, color='g', label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('SVM Classifier model', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show() 
In [23]:
# ------------------------------------------------------------
#                C - Lasso Classifier
# ------------------------------------------------------------
#         Step 1 : Instantiate the model
# ------------------------------------------------------------
# Create a svm Classifier
ls =  LogisticRegression(penalty='l1', solver='saga')
model_ls =ls.fit(X_train, y_train)

# ------------------------------------------------------------
#            Step 2 :   Predict Class Labels
# ------------------------------------------------------------
predicted_ls = model_ls.predict(X_test)

# ------------------------------------------------------------
#            Step 3 :   Evaluate The Model
# ------------------------------------------------------------
print('Result of the Lasso Classifier :')

print('Confusion Matrix for SVC:')
confusion_matrix_ls = metrics.confusion_matrix(y_test, predicted_ls)
print(confusion_matrix_ls)

print('\n')

print('Classification Report for SVC :')
classification_report_ls = metrics.classification_report(y_test, predicted_ls)
print(classification_report_ls)

print('\n')

print('Model Accuracy for SVC:')
score_ls = model_ls.score(X_test,y_test)
print(score_ls)


# ------------------------------------------------------------
#               Cross-Validation
# ------------------------------------------------------------

cross_val_ls = cross_val_score(ls, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_ls)

print('\n')

print('Cross-Validation mean :',cross_val_ls.mean())
Result of the Lasso Classifier :
Confusion Matrix for SVC:
[[  0 232]
 [  0 276]]


Classification Report for SVC :
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       232
           1       0.54      1.00      0.70       276

    accuracy                           0.54       508
   macro avg       0.27      0.50      0.35       508
weighted avg       0.30      0.54      0.38       508



Model Accuracy for SVC:
0.5433070866141733
Cross-Validation result : [0.5254902  0.5254902  0.52362205 0.52362205 0.52362205 0.52362205
 0.52362205 0.5256917  0.5256917  0.5256917 ]


Cross-Validation mean : 0.5246165727191564
/Users/leafanirisoa/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
In [24]:
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_svc ,"ro",marker=".",alpha=1, color='g', label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('Lasso Classifier model', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show() 
In [25]:
# ------------------------------------------------------------
#          D- SVM Classifier with StandardScaler
# ------------------------------------------------------------
#         Step 1 : Instantiate the model
# ------------------------------------------------------------
# Create a svm Classifier
pipe_lrSVC = Pipeline([('scaler', StandardScaler()), ('clf', svm.SVC(kernel='rbf',gamma=0.1, max_iter=-1, probability=False,random_state=None, shrinking=True, tol=0.001))])
pipe_lrSVC.fit(X_train, y_train)


# ------------------------------------------------------------
#            Step 2 :   Predict Class Labels
# ------------------------------------------------------------
predicted_svc_scale = pipe_lrSVC.predict(X_test)

# ------------------------------------------------------------
#            Step 3 :   Evaluate The Model
# ------------------------------------------------------------
print('Result of the SVM Classifier :')

print('Confusion Matrix for SVC:')
confusion_matrix_svc_scale = metrics.confusion_matrix(y_test, predicted_svc_scale)
print(confusion_matrix_svc_scale)

print('\n')

print('Classification Report for SVC :')
classification_report_svc_scale = metrics.classification_report(y_test, predicted_svc_scale)
print(classification_report_svc_scale)

print('\n')

print('Model Accuracy for SVC:')
score_svc_scale = pipe_lrSVC.score(X_test,y_test)
print(score_svc_scale)


# ------------------------------------------------------------
#               Cross-Validation
# ------------------------------------------------------------
cross_val_svc_scale = cross_val_score(pipe_lrSVC, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_svc_scale)

print('\n')

print('Cross-Validation mean :',cross_val_svc_scale.mean())
Result of the SVM Classifier :
Confusion Matrix for SVC:
[[135  97]
 [183  93]]


Classification Report for SVC :
              precision    recall  f1-score   support

          -1       0.42      0.58      0.49       232
           1       0.49      0.34      0.40       276

    accuracy                           0.45       508
   macro avg       0.46      0.46      0.45       508
weighted avg       0.46      0.45      0.44       508



Model Accuracy for SVC:
0.44881889763779526
Cross-Validation result : [0.53333333 0.49411765 0.48425197 0.50393701 0.50787402 0.48425197
 0.53149606 0.41501976 0.50197628 0.52964427]


Cross-Validation mean : 0.49859023202197383
In [26]:
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_svc_scale ,"ro",marker="o",alpha=1, color='g', label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('SVM Classifier with StandardScaler', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show() 
In [27]:
# ------------------------------------------------------------
#          D- Lasso Classifier with StandardScaler
# ------------------------------------------------------------
#         Step 1 : Instantiate the model
# ------------------------------------------------------------
pipe_ls = Pipeline([('scaler', StandardScaler()), ('pipe_ls', LogisticRegression(penalty='l1', solver='saga',max_iter =5000))])
pipe_ls.fit(X_train, y_train)


# ------------------------------------------------------------
#            Step 2 :   Predict Class Labels
# ------------------------------------------------------------
predicted_ls_scale = pipe_ls.predict(X_test)

# ------------------------------------------------------------
#            Step 3 :   Evaluate The Model
# ------------------------------------------------------------
print('Results from Lasso Classifier with StandardScaler:')

print('Confusion Matrix for SVC:')
confusion_matrix_ls_scale = metrics.confusion_matrix(y_test, predicted_ls_scale)
print(confusion_matrix_ls_scale)

print('\n')

print('Classification Report for SVC :')
classification_report_ls_scale = metrics.classification_report(y_test, predicted_ls_scale)
print(classification_report_ls_scale)

print('\n')

print('Model Accuracy for SVC:')
score_ls_scale = pipe_ls.score(X_test,y_test)
print(score_ls_scale)


# ------------------------------------------------------------
#               Cross-Validation
# ------------------------------------------------------------
cross_val_ls_scale = cross_val_score(pipe_ls, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_ls_scale)

print('\n')

print('Cross-Validation mean :',cross_val_ls_scale.mean())
Results from Lasso Classifier with StandardScaler:
Confusion Matrix for SVC:
[[231   1]
 [276   0]]


Classification Report for SVC :
              precision    recall  f1-score   support

          -1       0.46      1.00      0.63       232
           1       0.00      0.00      0.00       276

    accuracy                           0.45       508
   macro avg       0.23      0.50      0.31       508
weighted avg       0.21      0.45      0.29       508



Model Accuracy for SVC:
0.4547244094488189
Cross-Validation result : [0.5254902  0.52941176 0.51574803 0.51968504 0.51181102 0.5
 0.53149606 0.46640316 0.53359684 0.50988142]


Cross-Validation mean : 0.514352354118953
In [28]:
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_ls_scale ,"ro",marker="o",alpha=1, color='g', label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('Lasso Classifier with StandardScaler', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show() 
In [29]:
# ------------------------------------------------------------
#          E- Logistic Regression with StandardScaler
# ------------------------------------------------------------
#         Step 1 : Instantiate the model
# ------------------------------------------------------------
pipe_lR = Pipeline([('scaler', StandardScaler()), ('pipe_lR', LogisticRegression(solver='lbfgs'))])
pipe_lR.fit(X_train, y_train)


# ------------------------------------------------------------
#            Step 2 :   Predict Class Labels
# ------------------------------------------------------------
predicted_lR_scale = pipe_lR.predict(X_test)

# ------------------------------------------------------------
#            Step 3 :   Evaluate The Model
# ------------------------------------------------------------
print('Results from Lasso Classifier with StandardScaler:')

print('Confusion Matrix for SVC:')
confusion_matrix_lR_scale = metrics.confusion_matrix(y_test, predicted_lR_scale)
print(confusion_matrix_lR_scale)

print('\n')

print('Classification Report for SVC :')
classification_report_lR_scale = metrics.classification_report(y_test, predicted_lR_scale)
print(classification_report_lR_scale)

print('\n')

print('Model Accuracy for SVC:')
score_lR_scale = pipe_lR.score(X_test,y_test)
print(score_lR_scale)


# ------------------------------------------------------------
#               Cross-Validation
# ------------------------------------------------------------
cross_val_lR_scale = cross_val_score(pipe_lR, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_lR_scale)

print('\n')

print('Cross-Validation mean :',cross_val_lR_scale.mean())
Results from Lasso Classifier with StandardScaler:
Confusion Matrix for SVC:
[[231   1]
 [275   1]]


Classification Report for SVC :
              precision    recall  f1-score   support

          -1       0.46      1.00      0.63       232
           1       0.50      0.00      0.01       276

    accuracy                           0.46       508
   macro avg       0.48      0.50      0.32       508
weighted avg       0.48      0.46      0.29       508



Model Accuracy for SVC:
0.4566929133858268
Cross-Validation result : [0.5254902  0.53333333 0.52362205 0.51968504 0.51181102 0.5
 0.53543307 0.46245059 0.54940711 0.53359684]


Cross-Validation mean : 0.5194829255968673
In [30]:
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_lR_scale ,"ro",marker="o",alpha=1, color='g', label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('Logistic Regression with StandardScaler', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show() 
In [31]:
# importing the libraries
from bs4 import BeautifulSoup
from urllib import request
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re

from nltk.tokenize import sent_tokenize, word_tokenize
import spacy

# import sys
# !{sys.executable} -m pip install -U vaderSentiment

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

pd.set_option('display.max_rows', None)

# Sending a HTTP request to a URL
url = "file:///Users/leafanirisoa/Documents/data_test/10-K/2016-10-26.html"


# ------------------------------------------------------------
#            Step 0 :  Read the .html
# ------------------------------------------------------------
# fetch the raw HTML content and Parse the html content

def read_html(url):
    response = request.urlopen(url)
    html_content = response.read().decode('utf8')
    return BeautifulSoup(html_content, "lxml")


data_init = read_html(url)
In [32]:
# ------------------------------------------------------------ font style="font-family
#            Step 1 :  Transforme to text
# ------------------------------------------------------------
def extract_text(date, data_parsed):
    if int(date) <= 2015  :
        all_p = data_parsed.find_all('p')
    else:
        all_p = data_parsed.find_all('div')
        
    ls = [] # Create empty list
    for l in all_p: 
      #Find all data structure that is ‘div’
      ls.append(l.text)    
    return ls
In [33]:
# ------------------------------------------------------------
#            Step 2 :   Removing line breaks
# ------------------------------------------------------------
def remove_line_breaks(data_extracted):
    ls = [] # Create empty list
    for l in data_extracted: 
    #Find all data structure that is ‘div’
        ls.append(" ".join(l.split()))    
    return ls
In [34]:
# ------------------------------------------------------------
#            Step 3 :   Removing accented characters
# ------------------------------------------------------------

def remove_accented_chars(data_extracted):
    ls = [] # Create empty list
    for l in data_extracted: 
    #Find all data structure that is ‘div’
        ls.append(l.encode('ascii', 'ignore').decode('utf-8', 'ignore'))    
    return ls

    text = unicodedata.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
In [35]:
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}
In [36]:
# ------------------------------------------------------------
#            Step 4 :   Expanding Contractions
# ------------------------------------------------------------

def text_expand(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

#  expand_contractions("Y'all can't expand contractions I'd think")
#  ===> 'You all cannot expand contractions I would think'


def expand_contractions(data_extracted):
    ls = [] # Create empty list
    for l in data_extracted: 
    #Find all data structure that is ‘div’
        ls.append(text_expand(l))    
    return ls
In [37]:
# ------------------------------------------------------------
#            Step 5 :   Expanding Contractions
# ------------------------------------------------------------

# Install spacy (run in terminal/prompt)
# import sys
# !{sys.executable} -m pip install spacy
# Download spacy's  'en' Model
# !{sys.executable} -m spacy download en

nlp = spacy.load('en', parse=True, tag=True, entity=True)

def text_lemmatize(data_extracted):
    ls = [] # Create empty list
    for l in data_extracted: 
        text = nlp(l)
        ls.append(' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text]))    
    return ls
In [38]:
# ------------------------------------------------------------
#            Step 6 :   Removing Stopwords
# ------------------------------------------------------------
from spacy.lang.en.stop_words import STOP_WORDS

def delete_stopwords(data_extracted):
    ls = [] # Create empty list
    for l in data_extracted: 
        text = l.lower()
        ls.append(' '.join(w for w in text.split() if w not in STOP_WORDS))    
    return ls

    
In [39]:
# ------------------------------------------------------------
#            Step 7 :   Removing Special Characters
# ------------------------------------------------------------
def delete_characters(data_extracted):
    data_reduiced = [s for s in data_extracted if len(s.split()) > 10]
    ls = [] # Create empty list
    for l in data_reduiced: 
        ls.append(' '.join(w for w in l.split() if w.isalpha()))    
    return ls

ls_1 = extract_text('2016',data_init)
ls_2 = remove_line_breaks(ls_1)
ls_3 = remove_accented_chars(ls_2)
ls_4 = expand_contractions(ls_3)
ls_5 = text_lemmatize(ls_4)
ls_6 = delete_stopwords(ls_5)
ls_7 = delete_characters(ls_6)
In [40]:
# ------------------------------------------------------------
#            Step 8 :   Scoring 
# ------------------------------------------------------------
def Scoring_sentences(sentence):
    score = analyser.polarity_scores(sentence)
    return score
    
def sentiment_analyzer_scores(data_extracted):
    ls = [] # Create empty list
    for l in data_extracted: 
        l_score = Scoring_sentences(l)
        ls.append(l_score)    
    return ls

    
def scoring_dataset(data_extracted):
    df = pd.DataFrame(sentiment_analyzer_scores(data_extracted)) 
    df.columns = ['Compound' , 'Negative', 'Neutral','Positive'] 
    return df
In [41]:
ls_8 = scoring_dataset(ls_7)
ls_8.head()
Out[41]:
Compound Negative Neutral Positive
0 0.2960 0.0 0.732 0.268
1 0.2960 0.0 0.732 0.268
2 0.0000 0.0 1.000 0.000
3 0.0000 0.0 1.000 0.000
4 0.5574 0.0 0.911 0.089
In [42]:
# percentile list 
perc =[ .60, .80 ,.90] 

# ------------------------------------------------------------
#            Step 9 :   Metrics
# ------------------------------------------------------------
def metrics_values(df, perc):
    include =['object', 'float', 'int'] 
    desc_data  = df.describe(percentiles = perc, include = include) 
    return desc_data
    
ls_9  = metrics_values(ls_8,perc)
ls_9
Out[42]:
Compound Negative Neutral Positive
count 631.000000 631.000000 631.000000 631.000000
mean 0.364839 0.066746 0.765173 0.168090
std 0.495390 0.101599 0.181888 0.152131
min -0.996800 0.000000 0.245000 0.000000
50% 0.421500 0.022000 0.772000 0.156000
60% 0.624900 0.057000 0.820000 0.186000
80% 0.868900 0.126000 1.000000 0.290000
90% 0.937100 0.171000 1.000000 0.383000
max 0.998500 0.581000 1.000000 0.704000
In [43]:
# ------------------------------------------------------------
#            Final Step :   in one step 
# ------------------------------------------------------------

def metrics_extraction(date,data_init,perc):
        
    ls_1 = extract_text(date,data_init)        # Step 1 :   Transforme to text
    ls_2 = remove_line_breaks(ls_1)       # Step 2 :   Removing line breaks
    ls_3 = remove_accented_chars(ls_2)    # Step 3 :   Removing accented characters
    ls_4 = expand_contractions(ls_3)      # Step 4 :   Expanding Contractions
    ls_5 = text_lemmatize(ls_4)           # Step 5 :   Expanding Contractions
    ls_6 = delete_stopwords(ls_5)         # Step 6 :   Removing Stopwords   
    ls_7 = delete_characters(ls_6)        # Step 7 :   Removing Special Characters 
    ls_8 = scoring_dataset(ls_7)          # Step 8 :   Scoring   
    ls_9 = metrics_values(ls_8,perc)      # Step 9 :   Metrics      
    return ls_9

desc_data = metrics_extraction('2016',data_init,perc)
desc_data
Out[43]:
Compound Negative Neutral Positive
count 631.000000 631.000000 631.000000 631.000000
mean 0.364839 0.066746 0.765173 0.168090
std 0.495390 0.101599 0.181888 0.152131
min -0.996800 0.000000 0.245000 0.000000
50% 0.421500 0.022000 0.772000 0.156000
60% 0.624900 0.057000 0.820000 0.186000
80% 0.868900 0.126000 1.000000 0.290000
90% 0.937100 0.171000 1.000000 0.383000
max 0.998500 0.581000 1.000000 0.704000
In [44]:
import seaborn as sns
sns.set_style("white")

x1 = ls_8.Compound
x2 = ls_8.Negative
x3 = ls_8.Neutral
x4 = ls_8.Positive

# plot
fig, axes = plt.subplots(1, 4, figsize=(10, 4), sharey=True, dpi=100)
sns.distplot(x1 , color="dodgerblue", ax=axes[0], axlabel='Compound')
sns.distplot(x2 , color="deeppink", ax=axes[1], axlabel='Negative')
sns.distplot(x3 , color="orange", ax=axes[2], axlabel='Neutral')
sns.distplot(x4 , color="green", ax=axes[3], axlabel='Positive')
Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb8282cdf98>
In [45]:
# Draw Plot
plt.figure(figsize=(16,10), dpi= 80)
sns.kdeplot(x1, shade=True, color="dodgerblue", label="Compound", alpha=.7)
sns.kdeplot(x2, shade=True, color="deeppink", label="Negative", alpha=.7)
sns.kdeplot(x3, shade=True, color="orange", label="Neutral", alpha=.7)
sns.kdeplot(x4, shade=True, color="green", label="Positive", alpha=.7)

# Decoration
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.title('Density Plot of each categories ', fontsize=22)
plt.legend(loc ='best', fontsize=15)
plt.show()
In [46]:
# importing the libraries
from bs4 import BeautifulSoup
from urllib import request
import csv
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', None)


# Step 1: Sending a HTTP request to a URL
url = "file:///Users/leafanirisoa/Documents/data_test/10-K/2011-10-26.html"

# Make a GET request to fetch the raw HTML content

response = request.urlopen(url)
html_content = response.read().decode('utf8')

# Step 2: Parse the html content
soup = BeautifulSoup(html_content, "lxml")
In [47]:
def table_generator(data, k):
    #Get the table having in the rank num_table = k
    # Parameters of the table : 
    gdp_table = data.find_all('table')[k]
    gdp_table_data = gdp_table.find_all("tr") 
    num_cols = len(gdp_table_data[0].find_all("td"))
    num_rows =len(gdp_table_data)
    
    # Save the table in pd.dataframe
    new_table = pd.DataFrame(columns=range(0,num_cols), index = range(0,num_rows)) # I know the size 
    for i in range(0,num_rows):
        column_marker = 0
        columns = gdp_table.find_all("tr")[i].find_all('td')
        for column in columns:
            new_table.iat[i,column_marker] = column.get_text(strip = True).replace('\n', ' ').replace('$', '').replace('(', '').replace(')', '').strip()
            column_marker += 1
            
    # Define the header of the dataset
    headings = []
    for td in gdp_table_data[1].find_all("td"):
        # remove any newlines and extra spaces from left and right
        headings.append(td.get_text(strip = True).replace('\n', ' ').strip())
     
    headings_table = [x for x in headings if x] 
    # Consolidation of the dataset    
    new_table_1 = new_table
    new_table_1.replace('', np.nan, inplace=True)
    last_table = new_table_1.dropna(how='all').reset_index(drop=True).iloc[1:].dropna(1,how='all').dropna(0,how='all')
    
    if len(headings_table) == len(last_table.columns) :
        last_table.columns = headings_table 
    elif len(headings_table) == len(last_table.columns) -1 :
        last_table.columns = ["Attributes"] +[x for x in headings if x] 
    else:
        last_table = new_table_1.dropna(how='all').reset_index(drop=True).iloc[2:].dropna(1,how='all').dropna(0,how='all')
        last_table.columns =  ["Attributes"] + ['Col_'+ str(x)  for x in  range(1, len(last_table.columns))]
        
        
    return last_table.fillna('').reset_index(drop=True)
In [48]:
# ------------------------------------------------------------
#                  Load Packages
# ------------------------------------------------------------
import os

import math
import pandas as pd
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
pd.plotting.register_matplotlib_converters()
import import_ipynb


# ------------------------------------------------------------
#                   set Parameters
# ------------------------------------------------------------
source_dir = "file:///Users/leafanirisoa/Documents/data_test/10-K/"
dates = ["2011-10-26", "2012-10-31", "2013-10-30","2014-10-27","2015-10-28","2016-10-26","2017-11-03","2018-11-05","2019-10-31"]

# percentile list 
perc =[ .60, .80 ,.90] 
In [49]:
# ------------------------------------------------------------
#                 Read the .html
# ------------------------------------------------------------
def one_date_data(source_dir, dates):
    url =  source_dir + dates + ".html"
    response = request.urlopen(url)
    html_content = response.read().decode('utf8')
    soup = BeautifulSoup(html_content, "lxml")   
    return soup


def get_dico_data(source_dir, dates):
    list_of_url = {t[0:4]: source_dir + t + ".html" for t in dates}
    data_html = {}
    for t in dates:
        k = t[0:4]
        data_html[k] = read_html(list_of_url[k])
        
    return data_html

data_used = get_dico_data(source_dir, dates)
In [50]:
# ------------------------------------------------------------
#              Generate Metric matrix
# ------------------------------------------------------------

def Matrix_metric(dates,data_used,perc):
    dico_metrics = {}
    for t in dates:
        k = t[0:4]
        data = data_used[k]
        dico_metrics[k] = metrics_extraction(k,data,perc)    
    return dico_metrics


dico_metrics = Matrix_metric(dates,data_used,perc)

def Union_metric(dates,dico_metrics,metric):
    ls = []
    for t in dates:
        k = t[0:4]
        dico_metrics[k]['Year'] = k
        ls.append(dico_metrics[k].loc[dico_metrics[k].index == metric,:])
            
    return pd.concat(ls,ignore_index=True).drop_duplicates().reset_index(drop=True)


Union_metric(dates,dico_metrics,'mean')
Out[50]:
Compound Negative Neutral Positive Year
0 0.407927 0.061582 0.762789 0.175604 2011
1 0.422097 0.059376 0.763211 0.177393 2012
2 0.407107 0.060912 0.764631 0.174452 2013
3 0.418464 0.062530 0.768780 0.168701 2014
4 0.418649 0.065114 0.765486 0.169409 2015
5 0.364839 0.066746 0.765173 0.168090 2016
6 0.356237 0.066175 0.768222 0.165621 2017
7 0.353941 0.066981 0.765862 0.167170 2018
8 0.326360 0.073405 0.756443 0.170162 2019
In [51]:
df= Union_metric(dates,dico_metrics,'90%')

fig, ax1 = plt.subplots(figsize=(16,9))
ax1.plot(df.Year, df.Negative, label='Negative', color='tab:blue')
#ax1.plot(df.Year, df.Compound, label='Compound', color='tab:red')
#ax1.plot(df.Year, df.Neutral, label='Neutral', color='tab:green')
ax1.plot(df.Year, df.Positive, label='Positive', color='tab:orange')
ax1.set_xlabel('Jan. 04, 2010 - Apr. 28, 2020', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Metric Values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Sentiment analysis metric', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)
Out[51]:
<matplotlib.legend.Legend at 0x7fb848a09860>
In [52]:
df2= Union_metric(dates,dico_metrics,'std')

fig, ax1 = plt.subplots(figsize=(16,9))
# ax1.plot(df.Year, df2.Compound, label='Compound', color='tab:red')
ax1.plot(df.Year, df2.Negative, label='Negative', color='tab:blue')
ax1.plot(df.Year, df2.Neutral, label='Neutral', color='tab:green')
ax1.plot(df.Year, df2.Positive, label='Positive', color='tab:orange')
ax1.set_xlabel('Jan. 04, 2010 - Apr. 28, 2020', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Metric Values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Sentiment analysis metric (Std)', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)
Out[52]:
<matplotlib.legend.Legend at 0x7fb82a856f98>
In [53]:
df= Union_metric(dates,dico_metrics,'mean')

fig, ax1 = plt.subplots(figsize=(16,9))
ax1.plot(df.Year, df.Negative, label='Negative', color='tab:blue')
ax1.plot(df.Year, df.Compound, label='Compound', color='tab:red')
ax1.plot(df.Year, df.Neutral, label='Neutral', color='tab:green')
ax1.plot(df.Year, df.Positive, label='Positive', color='tab:orange')
ax1.set_xlabel('Jan. 04, 2010 - Apr. 28, 2020', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Metric Values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Sentiment analysis metric', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)
Out[53]:
<matplotlib.legend.Legend at 0x7fb7dac81358>
In [54]:
df_2013 = one_date_data(source_dir, '2013-10-30')
table_2013=table_generator(df_2013, 74)
table_2013
Out[54]:
Attributes 2013 2012 2011
0 Beginning Balance 2,062 1,375 943
1 Increases related to tax positions taken durin... 745 340 49
2 Decreases related to tax positions taken durin... 118 107 39
3 Increases related to tax positions taken durin... 626 467 425
4 Decreases related to settlements with taxing a... 592 3 0
5 Decreases related to expiration of statute of ... 9 10 3
6 Ending Balance 2,714 2,062 1,375
In [55]:
df_2011 = one_date_data(source_dir, '2011-10-26')
table_2011= table_generator(df_2011, 68)
table_2011
Out[55]:
Attributes 2011 2010 2009
0 Beginning Balance 943 971 506
1 Increases related to tax positions taken durin... 49 61 341
2 Decreases related to tax positions taken durin... 39 224 24
3 Increases related to tax positions taken durin... 425 240 151
4 Decreases related to settlements with taxing a... 0 102 0
5 Decreases related to expiration of statute of ... 3 3 3
6 Ending Balance 1,375 943 971
In [56]:
df_cd = pd.merge(table_2011, table_2013, how='inner', on = ['Attributes','2011'])
data_last = df_cd[['Attributes', '2009', '2010', '2011', '2012', '2013']]
data_last
Out[56]:
Attributes 2009 2010 2011 2012 2013
0 Beginning Balance 506 971 943 1,375 2,062
1 Increases related to tax positions taken durin... 341 61 49 340 745
2 Decreases related to tax positions taken durin... 24 224 39 107 118
3 Increases related to tax positions taken durin... 151 240 425 467 626
4 Decreases related to settlements with taxing a... 0 102 0 3 592
5 Decreases related to expiration of statute of ... 3 3 3 10 9
6 Ending Balance 971 943 1,375 2,062 2,714
In [57]:
data_transposed = data_last.set_index('Attributes').T
data_transposed
Out[57]:
Attributes Beginning Balance Increases related to tax positions taken during a prior year Decreases related to tax positions taken during a prior year Increases related to tax positions taken during the current year Decreases related to settlements with taxing authorities Decreases related to expiration of statute of limitations Ending Balance
2009 506 341 24 151 0 3 971
2010 971 61 224 240 102 3 943
2011 943 49 39 425 0 3 1,375
2012 1,375 340 107 467 3 10 2,062
2013 2,062 745 118 626 592 9 2,714
In [58]:
data_transposed.columns = ['C_1', 'C_2', 'C_3', 'C_4', 'C_5', 'C_6', 'C_7']
data_transposed['C_1'] = data_transposed['C_1'].str.replace(',', '').astype(float)
data_transposed['C_7'] = data_transposed['C_7'].str.replace(',', '').astype(float)
data_transposed

for i in data_transposed.columns:
    data_transposed[i] = pd.to_numeric(data_transposed[i], errors='ignore')
In [59]:
fig, ax1 = plt.subplots(figsize=(16,9))
# ax1.plot(df.Year, df2.Compound, label='Compound', color='tab:red')
ax1.plot(data_transposed.index, data_transposed.C_1, label='Beginning Balance', color='tab:blue')
ax1.plot(data_transposed.index, data_transposed.C_7, label='Ending Balance', color='tab:green')
ax1.set_xlabel('Jan. 2009 - Dec. 2013', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Balance values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('The aggregate changes in the balance of gross unrecognized tax benefits', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)
Out[59]:
<matplotlib.legend.Legend at 0x7fb85c06be10>
In [60]:
fig, ax1 = plt.subplots(figsize=(16,9))
# ax1.plot(df.Year, df2.Compound, label='Compound', color='tab:red')
ax1.plot(data_transposed.index, data_transposed.C_7 - data_transposed.C_1, label='C_7-C_1', color='tab:blue')
ax1.plot(data_transposed.index, data_transposed.C_2, label='C_2', color='tab:green')
ax1.plot(data_transposed.index, data_transposed.C_3, label='C_3', color='tab:red')
ax1.plot(data_transposed.index, data_transposed.C_4, label='C_4', color='tab:orange')
ax1.set_xlabel('Jan. 2009 - Dec. 2013', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Balance values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('The aggregate changes in the balance of gross unrecognized tax benefits', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)
Out[60]:
<matplotlib.legend.Legend at 0x7fb7c919b048>
In [ ]: