Reference: https://www.kaggle.com/c/web-traffic-time-series-forecasting/data
Original data: train_1.csv
-----------------------------
rows = 145,063
columns = 551
first column = Page
date columns = 2015-07-01, 2015-07-02, ..., 2016-12-31 (550 columns)
file size: 284.6 MB
Data for modelling: Prince (musician)
-------------------------------------------------------
timeseries  : 2016 page visits for the Prince Wikipedia page
lag columns : lag1 to lag7
bias        : constant bias column
For ARIMA   : we use a single timeseries (one column).
For sklearn : linear regressors and ensemble learners can use many feature columns; a minimal sketch of this reshaping follows.
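The reshaping is easiest to see on a toy example. Below is a minimal sketch with hypothetical values (the real lag features are built later in this notebook):

import pandas as pd

# toy daily series (hypothetical numbers, for illustration only)
s = pd.Series([10, 12, 11, 13, 15, 14, 16],
              index=pd.date_range('2016-01-01', periods=7))

# ARIMA consumes the single column `s` directly; for sklearn we turn it
# into a supervised table where each row predicts visits from its lags
df_toy = pd.DataFrame({'visits': s})
for lag in (1, 2):
    df_toy['lag' + str(lag)] = df_toy['visits'].shift(lag)
df_toy['bias'] = 1
print(df_toy.dropna())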
import time
time_start_notebook = time.time()
!mkdir -p ~/.kaggle
# !echo '' > ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
# after we have ~/.kaggle/kaggle.json file in colab, we can install kaggle module.
!head -c 20 ~/.kaggle/kaggle.json
{"username":"bhishan
%%capture
# capture will not print in notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install watermark
    !pip install tsfresh

    ## create project-like folders
    !mkdir -p ../data ../outputs ../images ../reports ../html ../models
    !pip install kaggle
# !kaggle competitions files -c web-traffic-time-series-forecasting
# !kaggle competitions download -c web-traffic-time-series-forecasting -f train_1.csv.zip -p ../data/
!ls ../data
most_visited_2016.csv train_1.csv.zip train_1_01?raw=true train_1_02?raw=true train_1_03?raw=true
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('fivethirtyeight') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8
import os
import time
# random state
SEED=100
np.random.seed(SEED)
# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)
import gc
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost
from xgboost import XGBRegressor
import watermark
# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark
Bhishan Poudel 2020-10-14

CPython 3.7.7
IPython 7.18.1

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

watermark  2.0.2
matplotlib 3.2.1
numpy      1.18.4
autopep8   1.5.2
xgboost    1.2.0
seaborn    0.11.0
pandas     1.1.0
sklearn    0.23.1
json       2.0.9
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
def show_method_attributes(method, ncols=7):
""" Show all the attributes of a given method.
Example:
========
show_method_attributes(list)
"""
x = [i for i in dir(method) if i[0].islower()]
return pd.DataFrame(np.array_split(x,ncols)).T.fillna('')
def safe_median(s):
    """Median of a sequence, ignoring NaN values."""
    return np.median([x for x in s if not np.isnan(x)])
MAPE - Mean Absolute Percentage Error: $$ MAPE = \frac{100}{n} \sum_{i=1}^{n} \frac{\left|y_{i}-\hat{y}_{i}\right|}{y_{i}} $$
SMAPE - Symmetric Mean Absolute Percentage Error:
$$ SMAPE = \frac{100\%}{n} \sum_{i=1}^{n} \frac{\left|y_{i} - \hat{y}_{i}\right|}{\left(\left|y_{i}\right| + \left|\hat{y}_{i}\right|\right) / 2} = \frac{200\%}{n} \sum_{i=1}^{n} \frac{\left|y_{i} - \hat{y}_{i}\right|}{\left|y_{i}\right| + \left|\hat{y}_{i}\right|} $$

def timeseries_train_test_split(df, target, train_size=0.8):
"""Train test split for time series dataframe.
Parameters
-----------
df -- dataframe
target -- name of target
train_size -- (float) for train size
Returns
-------
    Xtrain, Xtest, ytrain, ytest : numpy arrays (features cast to int32)
    """
    frac = int(len(df) * train_size)
    Xtrain = df.drop(columns=[target]).iloc[:frac, :].astype(np.int32).to_numpy()
    Xtest = df.drop(columns=[target]).iloc[frac:, :].astype(np.int32).to_numpy()
ytrain = df[target].iloc[:frac].to_numpy()
ytest = df[target].iloc[frac:].to_numpy()
return Xtrain, Xtest, ytrain, ytest
def get_mape(y_true, y_pred):
"Mean Absolute Percentage Error"
return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
def get_smape(y_true, y_pred):
"Symmetric Mean Absolute Percentage Error"
denominator = (np.abs(y_true) + np.abs(y_pred))
diff = np.abs(y_true - y_pred) / denominator
diff[denominator == 0] = 0.0
return 200 * np.mean(diff)
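A quick sanity check of the two metrics on hypothetical numbers, chosen so the answers are easy to verify by hand:

y_true = np.array([100.0, 200.0])
y_pred = np.array([110.0, 180.0])

# MAPE = mean(10/100, 20/200) * 100 = 10.0
print(get_mape(y_true, y_pred))   # 10.0

# SMAPE = 200 * mean(10/210, 20/380) ≈ 10.03
print(get_smape(y_true, y_pred))  # ~10.03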
def timeseries_evaluation(model_name, desc, ytest, ypreds, df_eval=None,
show=False,sort_col='SMAPE'):
from statsmodels.tsa.stattools import acf
if df_eval is None:
df_eval = pd.DataFrame({'Model': [],
'Description':[],
'MAPE': [],
'SMAPE': [],
'RMSE': [],
'ME': [],
'MAE': [],
'MPE': [],
'CORR': [],
'MINMAX': [],
'ACF1': [],
})
mape = get_mape(ytest,ypreds) # MAPE
smape = get_smape(ytest,ypreds) # SMAPE
rmse = np.mean((ytest - ypreds)**2)**.5 # RMSE
me = np.mean(ytest - ypreds) # ME
mae = np.mean(np.abs(ytest - ypreds)) # MAE
mpe = np.mean((ytest - ypreds)/ytest) # MPE
corr = np.corrcoef(ytest, ypreds)[0, 1] # corr
mins = np.amin(np.hstack([ytest[:, None],
ypreds[:, None]]), axis=1)
maxs = np.amax(np.hstack([ytest[:, None],
ypreds[:, None]]), axis=1)
minmax = 1 - np.mean(mins/maxs) # minmax
acf1 = acf(ytest-ypreds, fft=False)[1] # ACF1 (autocorrelation function)
row = [model_name, desc, mape, smape,rmse, me,mae,mpe,corr,minmax,acf1]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(['Model','Description'])
asc, cmap = True, 'Greens_r'
if sort_col == 'RMSE':
asc = False
cmap = 'Greens'
if sort_col == 'SMAPE':
asc = True
cmap = 'Greens_r'
df_eval = df_eval.sort_values(sort_col,ascending=asc)
df_eval = df_eval.reset_index(drop=True)
if show:
df_eval_style = (df_eval.style
.format({'MAPE': "{:,.0f}",
'SMAPE': "{:,.4f}",
'RMSE': "{:,.0f}",
'ME': "{:,.0f}",
'MAE': "{:,.0f}",
'MPE': "{:,.0f}",
'CORR': "{:,.4f}",
'MINMAX': "{:,.4f}",
'ACF1': "{:,.4f}"
})
.background_gradient(subset=[sort_col],cmap=cmap)
)
display(df_eval_style)
return df_eval
df_eval = None
def plot_model_results(model, Xtrain, Xtest, ytest,
plot_intervals=False, plot_anomalies=False):
"""Plot model performance for timeseries data.
"""
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
    # get_mape and get_smape are reused from the module level (defined above)
tscv = TimeSeriesSplit(n_splits=3)
ypreds = model.predict(Xtest)
plt.figure(figsize=(15, 7))
plt.plot(ypreds, "g", label="ypreds", linewidth=2.0)
plt.plot(ytest, label="actual", linewidth=2.0)
    if plot_intervals:
        # NOTE: ytrain is read from the enclosing (global) scope; it is not a parameter
        cv = cross_val_score(model, Xtrain, ytrain,
                             cv=tscv,
                             scoring="neg_mean_absolute_error")
mae = cv.mean() * (-1)
deviation = cv.std()
scale = 1.96
lower = ypreds - (mae + scale * deviation)
upper = ypreds + (mae + scale * deviation)
        plt.plot(lower, "r--", label="upper bound / lower bound", alpha=0.5)
plt.plot(upper, "r--", alpha=0.5)
    if plot_anomalies:
        # requires plot_intervals=True, since `lower` and `upper` are defined there
        anomalies = np.array([np.NaN] * len(ytest))
anomalies[ytest<lower] = ytest[ytest<lower]
anomalies[ytest>upper] = ytest[ytest>upper]
plt.plot(anomalies, "o", markersize=10, label = "Anomalies")
mape_error = get_mape(ytest, ypreds)
smape_error = get_smape(ytest, ypreds)
mape_error = round(mape_error,2)
smape_error = round(smape_error,2)
plt.title("MAPE: "+str(mape_error)+"\n"+"SMAPE: "+str(smape_error))
plt.legend(loc="best")
plt.tight_layout()
plt.grid(True);
def plot_coefficients(model, Xtrain_columns):
"""Plot of coefficients
"""
coefs = pd.DataFrame(model.coef_, Xtrain_columns)
coefs.columns = ["coef"]
coefs["abs"] = coefs.coef.apply(np.abs)
coefs = coefs.sort_values(by="abs", ascending=False).drop(["abs"], axis=1)
plt.figure(figsize=(15, 7))
coefs.coef.plot(kind='bar')
plt.grid(True, axis='y')
plt.hlines(y=0, xmin=0, xmax=len(coefs), linestyles='dashed');
df = pd.read_csv('../data/train_1.csv.zip',compression='zip')
print(df.shape)
df.head()
(145063, 551)
| | Page | 2015-07-01 | 2015-07-02 | ... | 2016-12-30 | 2016-12-31 |
|---|---|---|---|---|---|---|
| 0 | 2NE1_zh.wikipedia.org_all-access_spider | 18.0 | 11.0 | ... | 18.0 | 20.0 |
| 1 | 2PM_zh.wikipedia.org_all-access_spider | 11.0 | 14.0 | ... | 26.0 | 20.0 |
| 2 | 3C_zh.wikipedia.org_all-access_spider | 1.0 | 0.0 | ... | 4.0 | 17.0 |
| 3 | 4minute_zh.wikipedia.org_all-access_spider | 35.0 | 13.0 | ... | 10.0 | 11.0 |
| 4 | 52_Hz_I_Love_You_zh.wikipedia.org_all-access_spider | NaN | NaN | ... | 36.0 | 10.0 |

5 rows × 551 columns
cond = df['Page'] == "Prince_(musician)_en.wikipedia.org_all-access_all-agents"
df = df.loc[cond]
df.head()
| | Page | 2015-07-01 | 2015-07-02 | ... | 2016-12-30 | 2016-12-31 |
|---|---|---|---|---|---|---|
| 40563 | Prince_(musician)_en.wikipedia.org_all-access_all-agents | 9529.0 | 13627.0 | ... | 19956.0 | 31446.0 |

1 rows × 551 columns
df = df.filter(regex="Page|2016")
df.iloc[:5, np.r_[0, 1,2,-2,-1]]
| | Page | 2016-01-01 | 2016-01-02 | 2016-12-30 | 2016-12-31 |
|---|---|---|---|---|---|
| 40563 | Prince_(musician)_en.wikipedia.org_all-access_all-agents | 20947.0 | 19466.0 | 19956.0 | 31446.0 |
df = df.melt(id_vars=['Page'],var_name='date',value_name='visits').drop('Page',axis=1)
print(df.shape)
df.head()
(366, 2)
| | date | visits |
|---|---|---|
| 0 | 2016-01-01 | 20947.0 |
| 1 | 2016-01-02 | 19466.0 |
| 2 | 2016-01-03 | 8587.0 |
| 3 | 2016-01-04 | 7386.0 |
| 4 | 2016-01-05 | 7719.0 |
df['visits'].isna().sum()
0
df['visits'].plot.line()
df['visits'].plot.line()
plt.ylim(0,1e5)
for lag in range(1, 8):
    df['lag' + str(lag)] = df['visits'].shift(lag)

# the seven shifts introduce 1+2+...+7 = 28 NaN cells at the head
df.isna().sum().sum()
28
df = df.dropna(how='any')
df.head()
| | date | visits | lag1 | lag2 | lag3 | lag4 | lag5 | lag6 | lag7 |
|---|---|---|---|---|---|---|---|---|---|
| 7 | 2016-01-08 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 8587.0 | 19466.0 | 20947.0 |
| 8 | 2016-01-09 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 8587.0 | 19466.0 |
| 9 | 2016-01-10 | 8855.0 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 8587.0 |
| 10 | 2016-01-11 | 9034.0 | 8855.0 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 |
| 11 | 2016-01-12 | 8435.0 | 9034.0 | 8855.0 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 |
df['bias'] = 1
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month # 1 to 12
df['day'] = df['date'].dt.day # 1 to 31
df['quarter'] = df['date'].dt.quarter # 1 to 4
df['dayofweek'] = df['date'].dt.dayofweek # 0 to 6
df['weekend'] = ((df['date'].dt.dayofweek) // 5 == 1).astype(np.int8)
df = df.drop('date',axis=1)
print('nans = ',df.isna().sum().sum())
df.head()
nans = 0
| | visits | lag1 | lag2 | lag3 | lag4 | lag5 | lag6 | lag7 | bias | month | day | quarter | dayofweek | weekend |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 8587.0 | 19466.0 | 20947.0 | 1 | 1 | 8 | 1 | 4 | 0 |
| 8 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 8587.0 | 19466.0 | 1 | 1 | 9 | 1 | 5 | 1 |
| 9 | 8855.0 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 8587.0 | 1 | 1 | 10 | 1 | 6 | 1 |
| 10 | 9034.0 | 8855.0 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 1 | 1 | 11 | 1 | 0 | 0 |
| 11 | 8435.0 | 9034.0 | 8855.0 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 1 | 1 | 12 | 1 | 1 | 0 |
Xtrain, Xtest, ytrain, ytest = timeseries_train_test_split(df, 'visits', 0.7)
print(Xtrain.shape, Xtest.shape)
(251, 13) (108, 13)
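The split is chronological: the first 70% of days form the train set and the last 30% the test set, so the model never trains on the future. `TimeSeriesSplit` (used below for the cross-validated intervals) follows the same principle; a small sketch on a hypothetical 10-point index:

from sklearn.model_selection import TimeSeriesSplit

# each fold trains on a prefix and validates on the block right after it
for tr_idx, te_idx in TimeSeriesSplit(n_splits=3).split(np.arange(10)):
    print('train:', tr_idx, 'test:', te_idx)
# train: [0 1 2 3] test: [4 5]
# train: [0 1 2 3 4 5] test: [6 7]
# train: [0 1 2 3 4 5 6 7] test: [8 9]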
target = 'visits'
df_Xtrain = pd.DataFrame(Xtrain, columns=df.columns.drop(target))
df_Xtest = pd.DataFrame(Xtest, columns=df.columns.drop(target))
df_Xtrain.head(2)
| | lag1 | lag2 | lag3 | lag4 | lag5 | lag6 | lag7 | bias | month | day | quarter | dayofweek | weekend |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7713 | 7156 | 7719 | 7386 | 8587 | 19466 | 20947 | 1 | 1 | 8 | 1 | 4 | 0 |
| 1 | 8658 | 7713 | 7156 | 7719 | 7386 | 8587 | 19466 | 1 | 1 | 9 | 1 | 5 | 1 |
ts = df['visits']
ts.plot()
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)
Xtest_scaled = scaler.transform(Xtest)
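A safer pattern than scaling by hand is to wrap the scaler and the estimator in a single object, so the same transform is applied at fit and predict time. A minimal sketch (not used in the runs below):

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# the pipeline fits the scaler on Xtrain only and re-applies it
# automatically inside predict, so Xtest is passed unscaled
pipe = make_pipeline(StandardScaler(), LinearRegression())
pipe.fit(Xtrain, ytrain)
ypreds_pipe = pipe.predict(Xtest)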
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
print(Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape)
(251, 13) (251,) (108, 13) (108,)
# Linear Regression
model = LinearRegression()
model.fit(Xtrain, ytrain)
plot_model_results(model, Xtrain,Xtest,ytest,plot_intervals=True)
plot_coefficients(model, df.columns.drop('visits'))
ypreds = model.predict(Xtest)
model_name = 'LinearRegression'
desc = 'default'
df_eval = timeseries_evaluation(model_name, desc, ytest, ypreds,
df_eval=df_eval,show=True)
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/statsmodels/tsa/stattools.py:657: FutureWarning: The default number of lags is changing from 40 to min(int(10 * np.log10(nobs)), nobs - 1) after 0.12 is released. Set the number of lags to an integer to silence this warning.
| | Model | Description | MAPE | SMAPE | RMSE | ME | MAE | MPE | CORR | MINMAX | ACF1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LinearRegression | default | 365 | 135.3122 | 43,579 | -17,255 | 35,357 | -2 | -0.1236 | 1.2735 | 0.6457 |
# Linear Regression with scaled data
model = LinearRegression()
model.fit(Xtrain_scaled, ytrain)
plot_model_results(model, Xtrain_scaled,Xtest_scaled,ytest, plot_intervals=False)
plot_coefficients(model, df.columns.drop('visits'))
ypreds = model.predict(Xtest)  # NOTE: the model was fit on scaled data but predicts on unscaled Xtest here; Xtest_scaled would be the consistent choice
model_name = 'LinearRegression'
desc = 'scaled'
df_eval = timeseries_evaluation(model_name, desc, ytest, ypreds,
df_eval=df_eval,show=True)
| | Model | Description | MAPE | SMAPE | RMSE | ME | MAE | MPE | CORR | MINMAX | ACF1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LinearRegression | default | 365 | 135.3122 | 43,579 | -17,255 | 35,357 | -2 | -0.1236 | 1.2735 | 0.6457 |
| 1 | LinearRegression | scaled | 33,841,890 | 199.9984 | 4,378,715,364 | -3,640,663,624 | 3,640,663,624 | -338,419 | 0.5725 | 1.0000 | 0.0784 |

The 'scaled' row blows up because, as noted in the code comment above, a model fit on scaled features was evaluated on the unscaled Xtest.
plt.figure(figsize=(12,8))
sns.heatmap(df_Xtrain.corr())
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)
model = RidgeCV(cv=tscv)
model.fit(Xtrain, ytrain)
plot_model_results(model,
Xtrain=Xtrain,
Xtest=Xtest,
ytest=ytest,
plot_intervals=True, plot_anomalies=True)
plot_coefficients(model,df_Xtrain.columns)
ypreds = model.predict(Xtest)
model_name = 'RidgeCV'
desc = 'ts_split=3'
df_eval = timeseries_evaluation(model_name, desc, ytest, ypreds,
df_eval=df_eval,show=True)
| | Model | Description | MAPE | SMAPE | RMSE | ME | MAE | MPE | CORR | MINMAX | ACF1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RidgeCV | ts_split=3 | 261 | 118.8720 | 31,289 | -15,694 | 25,228 | -2 | -0.0255 | 0.8816 | 0.6251 |
| 1 | LinearRegression | default | 365 | 135.3122 | 43,579 | -17,255 | 35,357 | -2 | -0.1236 | 1.2735 | 0.6457 |
| 2 | LinearRegression | scaled | 33,841,890 | 199.9984 | 4,378,715,364 | -3,640,663,624 | 3,640,663,624 | -338,419 | 0.5725 | 1.0000 | 0.0784 |
from sklearn.linear_model import LassoCV
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)
model = LassoCV(cv=tscv)
model.fit(Xtrain, ytrain)
plot_model_results(model,
Xtrain=Xtrain,
Xtest=Xtest,
ytest=ytest,
plot_intervals=True, plot_anomalies=True)
plot_coefficients(model,df_Xtrain.columns)
ypreds = model.predict(Xtest)
model_name = 'LassoCV'
desc = 'ts_split=3'
df_eval = timeseries_evaluation(model_name, desc, ytest, ypreds,
df_eval=df_eval,show=True)
| | Model | Description | MAPE | SMAPE | RMSE | ME | MAE | MPE | CORR | MINMAX | ACF1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LassoCV | ts_split=3 | 266 | 110.8415 | 25,829 | -25,336 | 25,537 | -3 | 0.5769 | 0.7062 | -0.4231 |
| 1 | RidgeCV | ts_split=3 | 261 | 118.8720 | 31,289 | -15,694 | 25,228 | -2 | -0.0255 | 0.8816 | 0.6251 |
| 2 | LinearRegression | default | 365 | 135.3122 | 43,579 | -17,255 | 35,357 | -2 | -0.1236 | 1.2735 | 0.6457 |
| 3 | LinearRegression | scaled | 33,841,890 | 199.9984 | 4,378,715,364 | -3,640,663,624 | 3,640,663,624 | -338,419 | 0.5725 | 1.0000 | 0.0784 |
from xgboost import XGBRegressor
# XGBRegressor?
model = XGBRegressor(random_state=SEED,n_jobs=-1,objective='reg:squarederror')
model.fit(Xtrain, ytrain)
plot_model_results(model,
Xtrain=Xtrain,
Xtest=Xtest,
ytest=ytest,
plot_intervals=True, plot_anomalies=True)
ypreds = model.predict(Xtest)
model_name = 'XGBRegressor'
desc = 'default'
df_eval = timeseries_evaluation(model_name, desc, ytest, ypreds,
df_eval=df_eval,show=True)
| | Model | Description | MAPE | SMAPE | RMSE | ME | MAE | MPE | CORR | MINMAX | ACF1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | XGBRegressor | default | 18 | 18.2580 | 4,513 | 687 | 2,331 | 0 | 0.6643 | 0.1565 | 0.1207 |
| 1 | LassoCV | ts_split=3 | 266 | 110.8415 | 25,829 | -25,336 | 25,537 | -3 | 0.5769 | 0.7062 | -0.4231 |
| 2 | RidgeCV | ts_split=3 | 261 | 118.8720 | 31,289 | -15,694 | 25,228 | -2 | -0.0255 | 0.8816 | 0.6251 |
| 3 | LinearRegression | default | 365 | 135.3122 | 43,579 | -17,255 | 35,357 | -2 | -0.1236 | 1.2735 | 0.6457 |
| 4 | LinearRegression | scaled | 33,841,890 | 199.9984 | 4,378,715,364 | -3,640,663,624 | 3,640,663,624 | -338,419 | 0.5725 | 1.0000 | 0.0784 |

Out of the box, XGBRegressor beats every linear baseline by a wide margin (SMAPE ≈ 18 versus 111+).
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 17 secs