# ---------------------------------------------------------
# IMPORT PACKAGES
# ---------------------------------------------------------
# Point SPARK_HOME at the local Spark installation (needed if we want to use files in HDFS)
import os
spark_path = "/usr/local/Cellar/spark-2.4.3-bin-hadoop2.7"
os.environ["SPARK_HOME"] = spark_path
import math
import pandas as pd
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
pd.plotting.register_matplotlib_converters()
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
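# Note: this wildcard import shadows Python's built-in min, max and round with
# their Spark column equivalents, which is what the column aggregations below rely on.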
# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# -----------------------------------------------------------------
# Set global font parameters for matplotlib
# -----------------------------------------------------------------
font = {'family' : 'Verdana','weight' : 15 ,'size' : 14}
plt.rc('font', **font)
# -------------------------------------------
# sparkSession
# -------------------------------------------
spark = SparkSession \
.builder \
.appName("Attrition") \
.config("spark.memory.fraction", 0.8) \
.config("spark.executor.memory", "18g") \
.config("spark.driver.memory", "18g")\
.config("spark.sql.shuffle.partitions" , "800") \
.getOrCreate()
sparkcontext = spark.sparkContext
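# Optional sanity check (a quick sketch, assuming the session started cleanly):
# confirm the version and that the requested configuration was picked up.
print("Spark version :", spark.version)
print("Shuffle partitions :", spark.conf.get("spark.sql.shuffle.partitions"))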
# -------------------------------------------
# Read and load Dataframe
# -------------------------------------------
# 1- AAPL pricing data
AAPL_path= "/tmp/incoming/AAPL_pricing/AAPL_prices_parquet"
AAPLRaw = spark.read.parquet(AAPL_path)
AAPLRaw.show(4)
AAPLRaw.agg(min(col("Date")), max(col("Date"))).show()
AAPL_stat_desc = AAPLRaw.describe()
for col_name in AAPL_stat_desc.columns[1:6]:
AAPL_stat_desc = AAPL_stat_desc.withColumn(col_name, round(col_name, 3))
AAPL_stat_desc.show()
# conversion to pandas dataframe
AAPLRaw_pd = AAPLRaw.toPandas()
AAPLRaw_pd.isna().any()
AAPLRaw_pd.info()
AAPLRaw_pd.reset_index(inplace=True)
AAPLRaw_pd.set_index("Date", inplace=True)
# plot the Apple Adj. Close Price History
plt.figure(figsize=(15, 8))
plt.plot(AAPLRaw_pd['Adj_Close'],label = 'APPL_adj')
plt.title('Apple Adj. Close Price History', pad =10, c = 'r', fontweight='bold')
plt.xlabel('Jan. 04, 2010 - Apr. 28, 2020', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Adj. Close Price USD ($)', fontsize=18 , labelpad=11, fontweight='bold')
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.legend(loc ='upper left', fontsize=15)
# Get the timeseries. This now returns a Pandas Series object indexed by date.
AAPL_Close = AAPLRaw_pd.loc[:, 'Close']
# Calculate the 15-, 30- and 100-day moving averages of the closing prices
short_rolling_AAPL = AAPL_Close.rolling(window=15).mean()
middle_rolling_AAPL = AAPL_Close.rolling(window=30).mean()
long_rolling_AAPL = AAPL_Close.rolling(window=100).mean()
# Plot everything by leveraging the very powerful matplotlib package
fig, ax1 = plt.subplots(figsize=(16,9))
ax1.plot(AAPL_Close.index, AAPL_Close, label='AAPL_Close')
ax1.plot(short_rolling_AAPL.index, short_rolling_AAPL, label='15 days rolling')
ax1.plot(middle_rolling_AAPL.index, middle_rolling_AAPL, label='30 days rolling')
ax1.plot(long_rolling_AAPL.index, long_rolling_AAPL, label='100 days rolling')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Apple Close Price History', pad =10, c = 'r', fontweight='bold')
ax1.set_xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Close price ($)', fontsize=18 , labelpad=11, fontweight='bold')
ax1.legend(loc ='upper left', fontsize=18)
# Get the AAPL adjusted-close timeseries. This returns a Pandas Series object indexed by date.
AAPL_Adj_Close = AAPLRaw_pd.loc[:, 'Adj_Close']
# Calculate the 10-, 30- and 100-day moving averages of the adjusted closing price
short_rolling_AAPL_Adj = AAPL_Adj_Close.rolling(window=10).mean()
middle_rolling_AAPL_Adj = AAPL_Adj_Close.rolling(window=30).mean()
long_rolling_AAPL_Adj = AAPL_Adj_Close.rolling(window=100).mean()
# Plot everything by leveraging the very powerful matplotlib package
fig, ax1 = plt.subplots(figsize=(16,9))
ax1.plot(AAPL_Adj_Close.index, AAPL_Adj_Close, label='AAPL_Adj_Close')
ax1.plot(short_rolling_AAPL_Adj.index, short_rolling_AAPL_Adj, label='10 days rolling')
ax1.plot(middle_rolling_AAPL_Adj.index, middle_rolling_AAPL_Adj, label='30 days rolling')
ax1.plot(long_rolling_AAPL_Adj.index, long_rolling_AAPL_Adj, label='100 days rolling')
ax1.set_xlabel('Jan. 04, 2010 - Apr. 28, 2020', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Adjusted closing price ($)', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Apple Adj. Close Price History', pad =10, c = 'r', fontweight='bold')
ax1.legend()
fig, (ax1, ax2) = plt.subplots(2, 1,figsize=(15,10))
AAPL_Adj_Close.plot(ax=ax1, label='AAPL_Adj_Close', legend=True)
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_xlabel('AAPL 10-year history of trading days', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Adjusted closing price', fontsize=18 , labelpad=11, fontweight='bold')
AAPL_Adj_Close.pct_change().hist(bins=50, color = 'b', ax=ax2)
ax2.set_xlabel('Daily return (percentage change)', fontsize=18 , labelpad=11, fontweight='bold')
ax2.set_ylabel('Frequency', fontsize=18 , labelpad=11, fontweight='bold')
# -------------------------------------------
# Conversion to pandas dataframe
# -------------------------------------------
AAPLRaw_pd = AAPLRaw.toPandas()
AAPLRaw_pd.reset_index(inplace=True)
AAPLRaw_pd.set_index("Date", inplace=True)
# ------------------------------------------------------------
# Relative strength and Relative Strength Index (RSI)
# ------------------------------------------------------------
def RSI(stock, column="Close", period=14):
# Wilder's RSI
close = stock[column]
delta = close.diff()
up, down = delta.copy(), delta.copy()
up[up < 0] = 0
down[down > 0] = 0
# Calculate the exponential moving averages (EWMA)
roll_up = up.ewm(com=period - 1, adjust=False).mean()
roll_down = down.ewm(com=period - 1, adjust=False).mean().abs()
# Calculate RS based on exponential moving average (EWMA)
rs = roll_up / roll_down # relative strength = average gain/average loss
rsi = 100-(100/(1+rs))
stock['RSI'] = rsi
return stock
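# Quick sanity check of the RSI helper on a synthetic, monotonically rising
# series (illustrative only, not part of the analysis). Since ewm(com=period-1)
# uses alpha = 1/period (Wilder's smoothing) and every daily change here is a
# gain, the 14-day RSI should converge to 100.
_rsi_demo = RSI(pd.DataFrame({'Close': np.arange(1.0, 41.0)}))
print(_rsi_demo['RSI'].tail(3))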
AAPLRaw_pd = RSI(AAPLRaw_pd)
fig,ax=plt.subplots(figsize=(15,5))
AAPLRaw_pd.RSI.plot(ax=ax, label='RSI', legend=True, color='b');
ax.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax.set_xlabel('Trading days', fontsize=18 , labelpad=11, fontweight='bold')
ax.set_ylabel('14-day RSI', fontsize=18 , labelpad=11, fontweight='bold')
ax.set_title('14-day RSI of the Close price AAPL', pad =10, c = 'r', fontweight='bold')
# ------------------------------------------------------------
# Define Predictor/Independent Variables
# ------------------------------------------------------------
AAPLRaw_pd['middle_MA'] = AAPLRaw_pd['Close'].rolling(window=30).mean()
AAPLRaw_pd['Corr'] = AAPLRaw_pd['Close'].rolling(window=30).corr(AAPLRaw_pd['middle_MA'])
AAPLRaw_pd['Open-Close'] = AAPLRaw_pd['Open'] - AAPLRaw_pd['Close'].shift(1)
AAPLRaw_pd['Open-Open'] = AAPLRaw_pd['Open'] - AAPLRaw_pd['Open'].shift(1)
AAPLRaw_pd_reduice = AAPLRaw_pd.dropna().drop(['index','Adj_Close'], axis=1)
AAPLRaw_pd_reduice.head()
# ------------------------------------------------------------
# Define Target/Dependent Variable
# ------------------------------------------------------------
# If tomorrow’s closing price is higher than today’s closing price,
# then the price is going up (1), else the price is going down (-1).
AAPL_y = np.where(AAPLRaw_pd_reduice['Close'].shift(-1) > AAPLRaw_pd_reduice['Close'],1,-1)
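# Tiny illustration of the labelling rule on made-up prices: 10 -> 11 is a rise
# (label 1), 11 -> 10.5 is a fall (label -1), and the last day has no "tomorrow",
# so its comparison against NaN is False and it defaults to -1.
print(np.where(pd.Series([10.0, 11.0, 10.5]).shift(-1) > pd.Series([10.0, 11.0, 10.5]), 1, -1))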
# ------------------------------------------------------------
# Split The Dataset
# ------------------------------------------------------------
split = int(0.8*len(AAPLRaw_pd_reduice))
X = AAPLRaw_pd_reduice
y = AAPL_y
X_train, X_test, y_train, y_test = X[:split], X[split:], y[:split], y[split:]
dates_test = X_test.index.values
y_test
print("X dimension:",X.shape)
print("Y dimension:",y.shape)
print("X_train dimension:",X_train.shape)
print("X_train dimension:",X_test.shape)
print("y_train dimension:",y_train.shape)
print("y_test dimension:",y_test.shape)
# ------------------------------------------------------------
# A- Logistic Regression
# ------------------------------------------------------------
# Step 1 : Instantiate the model
# ------------------------------------------------------------
LR = LogisticRegression(solver='lbfgs')
model_LR = LR.fit (X_train,y_train)
# ------------------------------------------------------------
# Examine The Coefficients
# ------------------------------------------------------------
print('Result of the Logistic Regression :')
print('Coefficients :')
Coeff_model_LR = pd.DataFrame(list(zip(X.columns, np.transpose(model_LR.coef_))), columns=['Feature', 'Coefficient'])
print(Coeff_model_LR)
print('\n')
# ------------------------------------------------------------
# Calculate Class Probabilities
# ------------------------------------------------------------
print('Class Probabilities :')
probability_LR = model_LR.predict_proba(X_test)
print(probability_LR)
print('\n')
# ------------------------------------------------------------
# Step 2 : Predict Class Labels
# ------------------------------------------------------------
predicted_LR = model_LR.predict(X_test)
# ------------------------------------------------------------
# Step 3 : Evaluate The Model
# ------------------------------------------------------------
print('Confusion Matrix :')
confusion_matrix_LR = metrics.confusion_matrix(y_test, predicted_LR)
print(confusion_matrix_LR)
print('\n')
print('Classification Report :')
classification_report_LR = metrics.classification_report(y_test, predicted_LR)
print(classification_report_LR)
print('\n')
print('Model Accuracy :')
score_LR = model_LR.score(X_test,y_test)
print(score_LR)
print('\n')
# ------------------------------------------------------------
# Cross-Validation
# ------------------------------------------------------------
cross_val_LR = cross_val_score(LR, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_LR)
print('\n')
print('Cross-Validation mean :',cross_val_LR.mean())
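# Note: with an integer cv, cross_val_score splits the data without regard to
# the time ordering of the observations, which can be optimistic for a price
# series. A walk-forward alternative (a sketch using scikit-learn's
# TimeSeriesSplit) would look like this:
from sklearn.model_selection import TimeSeriesSplit
cross_val_LR_ts = cross_val_score(LR, X, y, scoring='accuracy', cv=TimeSeriesSplit(n_splits=5))
print('Walk-forward CV mean :', cross_val_LR_ts.mean())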
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_LR, ".", color='g', alpha=1, label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('Logistic Regression model', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show()
# ------------------------------------------------------------
# B- SVM Classifier
# ------------------------------------------------------------
# Step 1 : Instantiate the model
# ------------------------------------------------------------
# Create a svm Classifier
clf = svm.SVC(kernel='rbf',gamma=0.3, max_iter=-1, probability=True,random_state=None, shrinking=True, tol=0.01)
model_svc = clf.fit(X_train, y_train)
# ------------------------------------------------------------
# Step 2 : Predict Class Labels
# ------------------------------------------------------------
predicted_svc = model_svc.predict(X_test)
# ------------------------------------------------------------
# Step 3 : Evaluate The Model
# ------------------------------------------------------------
print('Result of the SVM Classifier :')
print('Confusion Matrix for SVC:')
confusion_matrix_svc = metrics.confusion_matrix(y_test, predicted_svc)
print(confusion_matrix_svc)
print('\n')
print('Classification Report for SVC :')
classification_report_svc = metrics.classification_report(y_test, predicted_svc)
print(classification_report_svc)
print('\n')
print('Model Accuracy for SVC:')
score_svc = model_svc.score(X_test,y_test)
print(score_svc)
# ------------------------------------------------------------
# Cross-Validation
# ------------------------------------------------------------
cross_val_svc = cross_val_score(clf, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_svc)
print('\n')
print('Cross-Validation mean :',cross_val_svc.mean())
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_svc, ".", color='g', alpha=1, label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('SVM Classifier model', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show()
# ------------------------------------------------------------
# C - Lasso Classifier
# ------------------------------------------------------------
# Step 1 : Instantiate the model
# ------------------------------------------------------------
# Create an L1-penalised (lasso) logistic regression classifier
ls = LogisticRegression(penalty='l1', solver='saga')
model_ls =ls.fit(X_train, y_train)
# ------------------------------------------------------------
# Step 2 : Predict Class Labels
# ------------------------------------------------------------
predicted_ls = model_ls.predict(X_test)
# ------------------------------------------------------------
# Step 3 : Evaluate The Model
# ------------------------------------------------------------
print('Result of the Lasso Classifier :')
print('Confusion Matrix for Lasso:')
confusion_matrix_ls = metrics.confusion_matrix(y_test, predicted_ls)
print(confusion_matrix_ls)
print('\n')
print('Classification Report for Lasso :')
classification_report_ls = metrics.classification_report(y_test, predicted_ls)
print(classification_report_ls)
print('\n')
print('Model Accuracy for Lasso:')
score_ls = model_ls.score(X_test,y_test)
print(score_ls)
# ------------------------------------------------------------
# Cross-Validation
# ------------------------------------------------------------
cross_val_ls = cross_val_score(ls, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_ls)
print('\n')
print('Cross-Validation mean :',cross_val_ls.mean())
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_ls, ".", color='g', alpha=1, label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('Lasso Classifier model', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show()
# ------------------------------------------------------------
# D- SVM Classifier with StandardScaler
# ------------------------------------------------------------
# Step 1 : Instantiate the model
# ------------------------------------------------------------
# Create an SVM classifier inside a pipeline that standardises the features first (the RBF kernel is sensitive to feature scale)
pipe_lrSVC = Pipeline([('scaler', StandardScaler()), ('clf', svm.SVC(kernel='rbf',gamma=0.1, max_iter=-1, probability=False,random_state=None, shrinking=True, tol=0.001))])
pipe_lrSVC.fit(X_train, y_train)
# ------------------------------------------------------------
# Step 2 : Predict Class Labels
# ------------------------------------------------------------
predicted_svc_scale = pipe_lrSVC.predict(X_test)
# ------------------------------------------------------------
# Step 3 : Evaluate The Model
# ------------------------------------------------------------
print('Result of the SVM Classifier with StandardScaler :')
print('Confusion Matrix for SVC:')
confusion_matrix_svc_scale = metrics.confusion_matrix(y_test, predicted_svc_scale)
print(confusion_matrix_svc_scale)
print('\n')
print('Classification Report for SVC :')
classification_report_svc_scale = metrics.classification_report(y_test, predicted_svc_scale)
print(classification_report_svc_scale)
print('\n')
print('Model Accuracy for SVC:')
score_svc_scale = pipe_lrSVC.score(X_test,y_test)
print(score_svc_scale)
# ------------------------------------------------------------
# Cross-Validation
# ------------------------------------------------------------
cross_val_svc_scale = cross_val_score(pipe_lrSVC, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_svc_scale)
print('\n')
print('Cross-Validation mean :',cross_val_svc_scale.mean())
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_svc_scale, "o", color='g', alpha=1, label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('SVM Classifier with StandardScaler', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show()
# ------------------------------------------------------------
# E- Lasso Classifier with StandardScaler
# ------------------------------------------------------------
# Step 1 : Instantiate the model
# ------------------------------------------------------------
pipe_ls = Pipeline([('scaler', StandardScaler()), ('pipe_ls', LogisticRegression(penalty='l1', solver='saga',max_iter =5000))])
pipe_ls.fit(X_train, y_train)
# ------------------------------------------------------------
# Step 2 : Predict Class Labels
# ------------------------------------------------------------
predicted_ls_scale = pipe_ls.predict(X_test)
# ------------------------------------------------------------
# Step 3 : Evaluate The Model
# ------------------------------------------------------------
print('Results from Lasso Classifier with StandardScaler:')
print('Confusion Matrix for Lasso:')
confusion_matrix_ls_scale = metrics.confusion_matrix(y_test, predicted_ls_scale)
print(confusion_matrix_ls_scale)
print('\n')
print('Classification Report for Lasso :')
classification_report_ls_scale = metrics.classification_report(y_test, predicted_ls_scale)
print(classification_report_ls_scale)
print('\n')
print('Model Accuracy for Lasso:')
score_ls_scale = pipe_ls.score(X_test,y_test)
print(score_ls_scale)
# ------------------------------------------------------------
# Cross-Validation
# ------------------------------------------------------------
cross_val_ls_scale = cross_val_score(pipe_ls, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_ls_scale)
print('\n')
print('Cross-Validation mean :',cross_val_ls_scale.mean())
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_ls_scale, "o", color='g', alpha=1, label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('Lasso Classifier with StandardScaler', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show()
# ------------------------------------------------------------
# F- Logistic Regression with StandardScaler
# ------------------------------------------------------------
# Step 1 : Instantiate the model
# ------------------------------------------------------------
pipe_lR = Pipeline([('scaler', StandardScaler()), ('pipe_lR', LogisticRegression(solver='lbfgs'))])
pipe_lR.fit(X_train, y_train)
# ------------------------------------------------------------
# Step 2 : Predict Class Labels
# ------------------------------------------------------------
predicted_lR_scale = pipe_lR.predict(X_test)
# ------------------------------------------------------------
# Step 3 : Evaluate The Model
# ------------------------------------------------------------
print('Results from Logistic Regression with StandardScaler:')
print('Confusion Matrix for Logistic Regression:')
confusion_matrix_lR_scale = metrics.confusion_matrix(y_test, predicted_lR_scale)
print(confusion_matrix_lR_scale)
print('\n')
print('Classification Report for Logistic Regression :')
classification_report_lR_scale = metrics.classification_report(y_test, predicted_lR_scale)
print(classification_report_lR_scale)
print('\n')
print('Model Accuracy for Logistic Regression:')
score_lR_scale = pipe_lR.score(X_test,y_test)
print(score_lR_scale)
# ------------------------------------------------------------
# Cross-Validation
# ------------------------------------------------------------
cross_val_lR_scale = cross_val_score(pipe_lR, X, y, scoring='accuracy', cv=10)
print('Cross-Validation result :', cross_val_lR_scale)
print('\n')
print('Cross-Validation mean :',cross_val_lR_scale.mean())
plt.figure(figsize=(20, 10))
plt.ylim(-1.5, 1.5)
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.plot(dates_test, predicted_lR_scale, "o", color='g', alpha=1, label="y_predicted")
plt.stem(dates_test, y_test, markerfmt='', use_line_collection = True, basefmt=None ,label="y_test")
plt.xlabel('Date', fontsize=18 , labelpad=11, fontweight='bold')
plt.ylabel('Price', fontsize=18 , labelpad=11, fontweight='bold')
plt.title('Logistic Regression with StandardScaler', pad =10, c = 'r', fontweight='bold')
plt.legend()
plt.show()
# importing the libraries
from bs4 import BeautifulSoup
from urllib import request
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import spacy
# import sys
# !{sys.executable} -m pip install -U vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
pd.set_option('display.max_rows', None)
# URL of the 10-K filing to parse (here a local file:// URL)
url = "file:///Users/leafanirisoa/Documents/data_test/10-K/2016-10-26.html"
# ------------------------------------------------------------
# Step 0 : Read the .html
# ------------------------------------------------------------
# fetch the raw HTML content and Parse the html content
def read_html(url):
response = request.urlopen(url)
html_content = response.read().decode('utf8')
return BeautifulSoup(html_content, "lxml")
data_init = read_html(url)
# ------------------------------------------------------------
# Step 1 : Transform to text
# ------------------------------------------------------------
def extract_text(date, data_parsed):
if int(date) <= 2015 :
all_p = data_parsed.find_all('p')
else:
all_p = data_parsed.find_all('div')
ls = [] # Create empty list
for l in all_p:
# keep only the text content of each tag
ls.append(l.text)
return ls
# ------------------------------------------------------------
# Step 2 : Removing line breaks
# ------------------------------------------------------------
def remove_line_breaks(data_extracted):
ls = [] # Create empty list
for l in data_extracted:
# collapse runs of whitespace and line breaks into single spaces
ls.append(" ".join(l.split()))
return ls
# ------------------------------------------------------------
# Step 3 : Removing accented characters
# ------------------------------------------------------------
def remove_accented_chars(data_extracted):
ls = [] # Create empty list
for l in data_extracted:
# drop characters that cannot be encoded as ASCII
ls.append(l.encode('ascii', 'ignore').decode('utf-8', 'ignore'))
return ls
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}
# ------------------------------------------------------------
# Step 4 : Expanding Contractions
# ------------------------------------------------------------
def text_expand(text, contraction_mapping=CONTRACTION_MAP):
contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
flags=re.IGNORECASE|re.DOTALL)
def expand_match(contraction):
match = contraction.group(0)
first_char = match[0]
expanded_contraction = contraction_mapping.get(match)\
if contraction_mapping.get(match)\
else contraction_mapping.get(match.lower())
expanded_contraction = first_char+expanded_contraction[1:]
return expanded_contraction
expanded_text = contractions_pattern.sub(expand_match, text)
expanded_text = re.sub("'", "", expanded_text)
return expanded_text
# expand_contractions("Y'all can't expand contractions I'd think")
# ===> 'You all cannot expand contractions I would think'
def expand_contractions(data_extracted):
ls = [] # Create empty list
for l in data_extracted:
# expand the contractions in each paragraph
ls.append(text_expand(l))
return ls
# ------------------------------------------------------------
# Step 5 : Lemmatization
# ------------------------------------------------------------
# Install spacy (run in terminal/prompt)
# import sys
# !{sys.executable} -m pip install spacy
# Download spacy's 'en' Model
# !{sys.executable} -m spacy download en
nlp = spacy.load('en', parse=True, tag=True, entity=True)  # with spaCy v3+ use spacy.load('en_core_web_sm') instead
def text_lemmatize(data_extracted):
ls = [] # Create empty list
for l in data_extracted:
text = nlp(l)
ls.append(' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text]))
return ls
# ------------------------------------------------------------
# Step 6 : Removing Stopwords
# ------------------------------------------------------------
from spacy.lang.en.stop_words import STOP_WORDS
def delete_stopwords(data_extracted):
ls = [] # Create empty list
for l in data_extracted:
text = l.lower()
ls.append(' '.join(w for w in text.split() if w not in STOP_WORDS))
return ls
# ------------------------------------------------------------
# Step 7 : Removing Special Characters
# ------------------------------------------------------------
def delete_characters(data_extracted):
data_reduiced = [s for s in data_extracted if len(s.split()) > 10]
ls = [] # Create empty list
for l in data_reduiced:
ls.append(' '.join(w for w in l.split() if w.isalpha()))
return ls
ls_1 = extract_text('2016',data_init)
ls_2 = remove_line_breaks(ls_1)
ls_3 = remove_accented_chars(ls_2)
ls_4 = expand_contractions(ls_3)
ls_5 = text_lemmatize(ls_4)
ls_6 = delete_stopwords(ls_5)
ls_7 = delete_characters(ls_6)
# ------------------------------------------------------------
# Step 8 : Scoring
# ------------------------------------------------------------
def Scoring_sentences(sentence):
score = analyser.polarity_scores(sentence)
return score
def sentiment_analyzer_scores(data_extracted):
ls = [] # Create empty list
for l in data_extracted:
l_score = Scoring_sentences(l)
ls.append(l_score)
return ls
def scoring_dataset(data_extracted):
df = pd.DataFrame(sentiment_analyzer_scores(data_extracted))
df = df.rename(columns={'neg': 'Negative', 'neu': 'Neutral', 'pos': 'Positive', 'compound': 'Compound'})  # explicit mapping, independent of column order
return df
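# Quick illustration of a single VADER score on a made-up sentence: the analyser
# returns a dict with 'neg', 'neu', 'pos' and 'compound' entries.
print(Scoring_sentences("Revenue increased significantly during the quarter."))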
ls_8 = scoring_dataset(ls_7)
ls_8.head()
# percentile list
perc =[ .60, .80 ,.90]
# ------------------------------------------------------------
# Step 9 : Metrics
# ------------------------------------------------------------
def metrics_values(df, perc):
include =['object', 'float', 'int']
desc_data = df.describe(percentiles = perc, include = include)
return desc_data
ls_9 = metrics_values(ls_8,perc)
ls_9
# ------------------------------------------------------------
# Final Step : in one step
# ------------------------------------------------------------
def metrics_extraction(date,data_init,perc):
ls_1 = extract_text(date,data_init) # Step 1 : Transform to text
ls_2 = remove_line_breaks(ls_1) # Step 2 : Removing line breaks
ls_3 = remove_accented_chars(ls_2) # Step 3 : Removing accented characters
ls_4 = expand_contractions(ls_3) # Step 4 : Expanding Contractions
ls_5 = text_lemmatize(ls_4) # Step 5 : Lemmatization
ls_6 = delete_stopwords(ls_5) # Step 6 : Removing Stopwords
ls_7 = delete_characters(ls_6) # Step 7 : Removing Special Characters
ls_8 = scoring_dataset(ls_7) # Step 8 : Scoring
ls_9 = metrics_values(ls_8,perc) # Step 9 : Metrics
return ls_9
desc_data = metrics_extraction('2016',data_init,perc)
desc_data
import seaborn as sns
sns.set_style("white")
x1 = ls_8.Compound
x2 = ls_8.Negative
x3 = ls_8.Neutral
x4 = ls_8.Positive
# plot
fig, axes = plt.subplots(1, 4, figsize=(10, 4), sharey=True, dpi=100)
sns.distplot(x1 , color="dodgerblue", ax=axes[0], axlabel='Compound')
sns.distplot(x2 , color="deeppink", ax=axes[1], axlabel='Negative')
sns.distplot(x3 , color="orange", ax=axes[2], axlabel='Neutral')
sns.distplot(x4 , color="green", ax=axes[3], axlabel='Positive')
# Draw Plot
plt.figure(figsize=(16,10), dpi= 80)
sns.kdeplot(x1, shade=True, color="dodgerblue", label="Compound", alpha=.7)
sns.kdeplot(x2, shade=True, color="deeppink", label="Negative", alpha=.7)
sns.kdeplot(x3, shade=True, color="orange", label="Neutral", alpha=.7)
sns.kdeplot(x4, shade=True, color="green", label="Positive", alpha=.7)
# Decoration
plt.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
plt.title('Density plot of each category', fontsize=22)
plt.legend(loc ='best', fontsize=15)
plt.show()
# importing the libraries
from bs4 import BeautifulSoup
from urllib import request
import csv
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)
# Step 1: Sending an HTTP request to a URL (here a local file:// URL)
url = "file:///Users/leafanirisoa/Documents/data_test/10-K/2011-10-26.html"
# Make a GET request to fetch the raw HTML content
response = request.urlopen(url)
html_content = response.read().decode('utf8')
# Step 2: Parse the html content
soup = BeautifulSoup(html_content, "lxml")
def table_generator(data, k):
# Get the table at index k in the document
# Parameters of the table :
gdp_table = data.find_all('table')[k]
gdp_table_data = gdp_table.find_all("tr")
num_cols = len(gdp_table_data[0].find_all("td"))
num_rows =len(gdp_table_data)
# Save the table in pd.dataframe
new_table = pd.DataFrame(columns=range(0,num_cols), index = range(0,num_rows)) # I know the size
for i in range(0,num_rows):
column_marker = 0
columns = gdp_table.find_all("tr")[i].find_all('td')
for column in columns:
new_table.iat[i,column_marker] = column.get_text(strip = True).replace('\n', ' ').replace('$', '').replace('(', '').replace(')', '').strip()
column_marker += 1
# Define the header of the dataset
headings = []
for td in gdp_table_data[1].find_all("td"):
# remove any newlines and extra spaces from left and right
headings.append(td.get_text(strip = True).replace('\n', ' ').strip())
headings_table = [x for x in headings if x]
# Consolidation of the dataset
new_table_1 = new_table
new_table_1.replace('', np.nan, inplace=True)
last_table = new_table_1.dropna(how='all').reset_index(drop=True).iloc[1:].dropna(axis=1, how='all').dropna(axis=0, how='all')
if len(headings_table) == len(last_table.columns) :
last_table.columns = headings_table
elif len(headings_table) == len(last_table.columns) -1 :
last_table.columns = ["Attributes"] +[x for x in headings if x]
else:
last_table = new_table_1.dropna(how='all').reset_index(drop=True).iloc[2:].dropna(axis=1, how='all').dropna(axis=0, how='all')
last_table.columns = ["Attributes"] + ['Col_'+ str(x) for x in range(1, len(last_table.columns))]
return last_table.fillna('').reset_index(drop=True)
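# Example usage (sketch): the table index k has to be found by inspecting the
# filing; the indices used further below (68 for the 2011 filing, 74 for the
# 2013 filing) were identified manually.
# table_generator(soup, 68).head()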
# ------------------------------------------------------------
# Load Packages
# ------------------------------------------------------------
import os
import math
import pandas as pd
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
pd.plotting.register_matplotlib_converters()
import import_ipynb
# ------------------------------------------------------------
# set Parameters
# ------------------------------------------------------------
source_dir = "file:///Users/leafanirisoa/Documents/data_test/10-K/"
dates = ["2011-10-26", "2012-10-31", "2013-10-30","2014-10-27","2015-10-28","2016-10-26","2017-11-03","2018-11-05","2019-10-31"]
# percentile list
perc =[ .60, .80 ,.90]
# ------------------------------------------------------------
# Read the .html
# ------------------------------------------------------------
def one_date_data(source_dir, dates):
url = source_dir + dates + ".html"
response = request.urlopen(url)
html_content = response.read().decode('utf8')
soup = BeautifulSoup(html_content, "lxml")
return soup
def get_dico_data(source_dir, dates):
list_of_url = {t[0:4]: source_dir + t + ".html" for t in dates}
data_html = {}
for t in dates:
k = t[0:4]
data_html[k] = read_html(list_of_url[k])
return data_html
data_used = get_dico_data(source_dir, dates)
# ------------------------------------------------------------
# Generate Metric matrix
# ------------------------------------------------------------
def Matrix_metric(dates,data_used,perc):
dico_metrics = {}
for t in dates:
k = t[0:4]
data = data_used[k]
dico_metrics[k] = metrics_extraction(k,data,perc)
return dico_metrics
dico_metrics = Matrix_metric(dates,data_used,perc)
def Union_metric(dates,dico_metrics,metric):
ls = []
for t in dates:
k = t[0:4]
dico_metrics[k]['Year'] = k
ls.append(dico_metrics[k].loc[dico_metrics[k].index == metric,:])
return pd.concat(ls,ignore_index=True).drop_duplicates().reset_index(drop=True)
Union_metric(dates,dico_metrics,'mean')
df= Union_metric(dates,dico_metrics,'90%')
fig, ax1 = plt.subplots(figsize=(16,9))
ax1.plot(df.Year, df.Negative, label='Negative', color='tab:blue')
#ax1.plot(df.Year, df.Compound, label='Compound', color='tab:red')
#ax1.plot(df.Year, df.Neutral, label='Neutral', color='tab:green')
ax1.plot(df.Year, df.Positive, label='Positive', color='tab:orange')
ax1.set_xlabel('Fiscal year of the 10-K filing (2011 - 2019)', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Metric Values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Sentiment analysis metric', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)
df2= Union_metric(dates,dico_metrics,'std')
fig, ax1 = plt.subplots(figsize=(16,9))
# ax1.plot(df.Year, df2.Compound, label='Compound', color='tab:red')
ax1.plot(df.Year, df2.Negative, label='Negative', color='tab:blue')
ax1.plot(df.Year, df2.Neutral, label='Neutral', color='tab:green')
ax1.plot(df.Year, df2.Positive, label='Positive', color='tab:orange')
ax1.set_xlabel('Fiscal year of the 10-K filing (2011 - 2019)', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Metric Values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Sentiment analysis metric (Std)', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)
df= Union_metric(dates,dico_metrics,'mean')
fig, ax1 = plt.subplots(figsize=(16,9))
ax1.plot(df.Year, df.Negative, label='Negative', color='tab:blue')
ax1.plot(df.Year, df.Compound, label='Compound', color='tab:red')
ax1.plot(df.Year, df.Neutral, label='Neutral', color='tab:green')
ax1.plot(df.Year, df.Positive, label='Positive', color='tab:orange')
ax1.set_xlabel('Fiscal year of the 10-K filing (2011 - 2019)', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Metric Values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('Sentiment analysis metric', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)
df_2013 = one_date_data(source_dir, '2013-10-30')
table_2013=table_generator(df_2013, 74)
table_2013
df_2011 = one_date_data(source_dir, '2011-10-26')
table_2011= table_generator(df_2011, 68)
table_2011
df_cd = pd.merge(table_2011, table_2013, how='inner', on = ['Attributes','2011'])
data_last = df_cd[['Attributes', '2009', '2010', '2011', '2012', '2013']]
data_last
data_transposed = data_last.set_index('Attributes').T
data_transposed
data_transposed.columns = ['C_1', 'C_2', 'C_3', 'C_4', 'C_5', 'C_6', 'C_7']
data_transposed['C_1'] = data_transposed['C_1'].str.replace(',', '').astype(float)
data_transposed['C_7'] = data_transposed['C_7'].str.replace(',', '').astype(float)
data_transposed
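# Convert the remaining columns to numbers where possible; errors='ignore'
# leaves columns that cannot be fully parsed as numeric untouched.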
for i in data_transposed.columns:
data_transposed[i] = pd.to_numeric(data_transposed[i], errors='ignore')
fig, ax1 = plt.subplots(figsize=(16,9))
# ax1.plot(df.Year, df2.Compound, label='Compound', color='tab:red')
ax1.plot(data_transposed.index, data_transposed.C_1, label='Beginning Balance', color='tab:blue')
ax1.plot(data_transposed.index, data_transposed.C_7, label='Ending Balance', color='tab:green')
ax1.set_xlabel('Jan. 2009 - Dec. 2013', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Balance values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('The aggregate changes in the balance of gross unrecognized tax benefits', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)
fig, ax1 = plt.subplots(figsize=(16,9))
# ax1.plot(df.Year, df2.Compound, label='Compound', color='tab:red')
ax1.plot(data_transposed.index, data_transposed.C_7 - data_transposed.C_1, label='C_7-C_1', color='tab:blue')
ax1.plot(data_transposed.index, data_transposed.C_2, label='C_2', color='tab:green')
ax1.plot(data_transposed.index, data_transposed.C_3, label='C_3', color='tab:red')
ax1.plot(data_transposed.index, data_transposed.C_4, label='C_4', color='tab:orange')
ax1.set_xlabel('Jan. 2009 - Dec. 2013', fontsize=18 , labelpad=11, fontweight='bold')
ax1.set_ylabel('Balance values ', fontsize=18 , labelpad=11, fontweight='bold')
ax1.grid(which="major", color='k', linestyle='-.', linewidth=0.5)
ax1.set_title('The aggregate changes in the balance of gross unrecognized tax benefits', pad =10, c = 'r', fontweight='bold',fontsize=22)
ax1.legend(loc ='best', fontsize=15)