import time

time_start_notebook = time.time()


mkdir -p ~/.kaggle


# !echo '' > ~/.kaggle/kaggle.json


!chmod 600 ~/.kaggle/kaggle.json

# after we have ~/.kaggle/kaggle.json file in colab, we can install kaggle module.


!head -c 20 ~/.kaggle/kaggle.json

{"username":"bhishan


%%capture
# capture will not print in notebook

import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## install modules
    !pip install watermark
    !pip install tsfresh

    ## create project like folders
    !mkdir -p ../data ../outputs ../images ../reports ../html ../models

    !pip install kaggle


# !kaggle competitions files -c web-traffic-time-series-forecasting


!kaggle competitions download -c web-traffic-time-series-forecasting -f train_1.csv.zip -p ../data/

train_1.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


!ls ../data

most_visited_2016.csv train_1.csv.zip       train_1_01?raw=true   train_1_02?raw=true   train_1_03?raw=true


import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('fivethirtyeight') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8

import os
import time
import joblib

# random state
SEED=100
np.random.seed(SEED)

# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)

import gc

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, r2_score

import xgboost
from xgboost import XGBRegressor
import scipy
from scipy import stats

import watermark

# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

Bhishan Poudel 2020-10-14 

CPython 3.7.7
IPython 7.18.1

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

watermark  2.0.2
joblib     0.17.0
matplotlib 3.2.1
pandas     1.1.0
numpy      1.18.4
json       2.0.9
scipy      1.4.1
sklearn    0.23.1
seaborn    0.11.0
autopep8   1.5.2
xgboost    1.2.0


%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;


def show_method_attributes(method, ncols=7):
    """ Show all the attributes of a given method.
    Example:
    ========
    show_method_attributes(list)
     """
    x = [i for i in dir(method) if i[0].islower()]
    return pd.DataFrame(np.array_split(x,ncols)).T.fillna('')


def safe_median(s):
    return np.median([x for x in s if ~np.isnan(x)])


def get_mape(y_true, y_pred):
    "Mean Absolute Percentage Error"
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def get_smape(y_true, y_pred):
    "Symmetric Mean Absolute Percentage Error"
    denominator = (np.abs(y_true) + np.abs(y_pred))
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return 200 * np.mean(diff)


def get_mape(y_true, y_pred):
    "Mean Absolute Percentage Error"
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def get_smape(y_true, y_pred):
    "Symmetric Mean Absolute Percentage Error"
    denominator = (np.abs(y_true) + np.abs(y_pred))
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return 200 * np.mean(diff)

def timeseries_evaluation(model_name, desc, ytest, ypreds, df_eval=None,
                          show=False,sort_col='SMAPE'):
    from statsmodels.tsa.stattools import acf

    if df_eval is None:
        df_eval = pd.DataFrame({'Model': [],
                                'Description':[],
                                'MAPE': [],
                                'SMAPE': [],
                                'RMSE': [],
                                'ME': [],
                                'MAE': [],
                                'MPE': [],
                                'CORR': [],
                                'MINMAX': [],
                                'ACF1': [],
                                })

    mape = get_mape(ytest,ypreds)   # MAPE
    smape = get_smape(ytest,ypreds) # SMAPE
    rmse = np.mean((ytest - ypreds)**2)**.5  # RMSE

    me = np.mean(ytest - ypreds)             # ME
    mae = np.mean(np.abs(ytest - ypreds))    # MAE
    mpe = np.mean((ytest - ypreds)/ytest)    # MPE

    corr = np.corrcoef(ytest, ypreds)[0, 1]  # corr

    mins = np.amin(np.hstack([ytest[:, None],
                              ypreds[:, None]]), axis=1)
    maxs = np.amax(np.hstack([ytest[:, None],
                              ypreds[:, None]]), axis=1)
    minmax = 1 - np.mean(mins/maxs)             # minmax

    acf1 = acf(ytest-ypreds, fft=False)[1]    # ACF1 (autocorrelation function)

    row = [model_name, desc, mape, smape,rmse, me,mae,mpe,corr,minmax,acf1]

    df_eval.loc[len(df_eval)] = row
    df_eval = df_eval.drop_duplicates(['Model','Description'])
    
    asc, cmap = True, 'Greens_r'
    
    if sort_col == 'RMSE':
        asc = False
        cmap = 'Greens'

    if sort_col == 'SMAPE':
        asc = True
        cmap = 'Greens_r'

    df_eval = df_eval.sort_values(sort_col,ascending=asc)
    df_eval = df_eval.reset_index(drop=True)

    if show:
        df_eval_style = (df_eval.style
                         .format({'MAPE': "{:,.0f}",
                                  'SMAPE': "{:,.4f}",
                                  'RMSE': "{:,.0f}",
                                  'ME': "{:,.0f}",
                                  'MAE': "{:,.0f}",
                                  'MPE': "{:,.0f}",
                                  'CORR': "{:,.4f}",
                                  'MINMAX': "{:,.4f}",
                                  'ACF1': "{:,.4f}"
                                 })
                         .background_gradient(subset=[sort_col],cmap=cmap)
                        )
        display(df_eval_style)
    return df_eval

df_eval = None


def plot_model_results(model,Xtrain, ytrain,Xtest,ytest,ypreds,
                       n_splits=5,
                       plot_intervals=False,
                       plot_anomalies=False):
    """Plot model performance for timeseries data.

    """
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import TimeSeriesSplit

    # plot
    plt.figure(figsize=(14, 8))
    plt.plot(ytest, "green", label="actual", linewidth=2.0)
    plt.plot(ypreds, "blue", label="ypreds", linewidth=2.0)

    tscv = TimeSeriesSplit(n_splits=n_splits)
    if plot_intervals:
        score_cv = cross_val_score(model,Xtrain,ytrain,
                                    cv=tscv,
                                    scoring="neg_mean_absolute_error")
        mae = score_cv.mean() * (-1)
        deviation = score_cv.std()

        scale = 1.96
        lower = ypreds - (mae + scale * deviation)
        upper = ypreds + (mae + scale * deviation)

        plt.plot(lower, "r--", label="upper bond / lower bond", alpha=0.5)
        plt.plot(upper, "r--", alpha=0.5)

        if plot_anomalies:
            anomalies = np.array([np.NaN]*len(ytest))
            anomalies[ytest<lower] = ytest[ytest<lower]
            anomalies[ytest>upper] = ytest[ytest>upper]
            plt.plot(anomalies, "o", markersize=10, label = "Anomalies")

    mape_error = get_mape(ytest, ypreds)
    smape_error = get_smape(ytest, ypreds)

    title = "MAPE : {:.2f} \nSMAPE: {:.2f}".format(mape_error, smape_error)
    plt.title(title)
    plt.legend(loc="best")
    plt.tight_layout()
    plt.grid(True);


df = pd.read_csv('../data/train_1.csv.zip',compression='zip')

print(df.shape)
df.head()

(145063, 551)


cond = df['Page'] == "Prince_(musician)_en.wikipedia.org_all-access_all-agents"

df = df.loc[cond]
df.head()


df = df.filter(regex="Page|2016")
df.iloc[:5, np.r_[0, 1,2,-2,-1]]


df = df.melt(id_vars=['Page'],var_name='date',value_name='visits').drop('Page',axis=1)

print(df.shape)
df.head()

(366, 2)


df['visits'].isna().sum()

0


df['visits'].plot.line()

<matplotlib.axes._subplots.AxesSubplot at 0x7fce8545bbd0>


df['visits'].plot.line()
plt.ylim(0,1e5)

(0.0, 100000.0)


for lag in range(1,8):
    df['lag'+str(lag)] = df['visits'].shift(lag)


# adding lags will create nans
df.isna().sum().sum()

28


df = df.dropna(how='any')
df.head()


df['bias'] = 1


df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month # 1 to 12
df['day'] = df['date'].dt.day # 1 to 31

df['quarter'] = df['date'].dt.quarter # 1 to 4
df['dayofweek'] = df['date'].dt.dayofweek # 0 to 6

df['weekend'] = ((df['date'].dt.dayofweek) // 5 == 1).astype(np.int8)

# df = df.drop('date',axis=1)

print('nans = ',df.isna().sum().sum())
df.head()

nans =  0


Xtrain, Xtest, ytrain, ytest = train_test_split(df.drop('date',1),
                                                df['visits'],
                                                test_size=0.2,shuffle=False)

print(Xtrain.shape, Xtest.shape)

(287, 14) (72, 14)


target = 'visits'
features = df.columns.drop(['date', target])
df_Xtrain = pd.DataFrame(Xtrain, columns=features)
df_Xtest = pd.DataFrame(Xtest, columns=features)
df_Xtrain.head(2)


from xgboost import XGBRegressor


# XGBRegressor?


model = XGBRegressor(random_state=SEED,n_jobs=-1,objective='reg:squarederror')
model.fit(Xtrain, ytrain)
ypreds = model.predict(Xtest)

plot_model_results(model, 
                 Xtrain=Xtrain,ytrain=ytrain,
                 Xtest=Xtest,ytest=ytest,ypreds=ypreds,
                 plot_intervals=True, plot_anomalies=True)

model_name = 'xgb'
desc = 'default'
df_eval = timeseries_evaluation(model_name, desc, ytest, ypreds,
                    df_eval=df_eval,show=True)

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/ipykernel_launcher.py:40: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/ipykernel_launcher.py:42: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/statsmodels/tsa/stattools.py:657: FutureWarning: The default number of lags is changing from 40 tomin(int(10 * np.log10(nobs)), nobs - 1) after 0.12is released. Set the number of lags to an integer to  silence this warning.
  FutureWarning,


from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters


%%time
df = pd.read_csv('../data/train_1.csv.zip',compression='zip')

cond = df['Page'] == "Prince_(musician)_en.wikipedia.org_all-access_all-agents"
df = df.loc[cond]

df = df.filter(regex="Page|2016")

df = df.melt(id_vars=['Page'],var_name='date',value_name='visits').drop('Page',axis=1)
df = df.dropna(how='any')

print(df.shape)

df['id'] = df.index

df.iloc[:2,:5]

(366, 2)
CPU times: user 8.51 s, sys: 945 ms, total: 9.45 s
Wall time: 11.7 s


%%time

from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters

# we impute = remove all NaN features automatically
extraction_settings = ComprehensiveFCParameters()
df_extr = extract_features(df, column_id='id', column_sort='date',
                     default_fc_parameters=extraction_settings,
                     impute_function=impute)

Feature Extraction: 100%|██████████| 10/10 [00:04<00:00,  2.13it/s]

CPU times: user 750 ms, sys: 110 ms, total: 860 ms
Wall time: 5.41 s


df_extr.shape

(366, 779)


%%time
df_extr_sel = select_features(df_extr, df['visits'])
df_extr_sel.shape

CPU times: user 65.8 ms, sys: 23.1 ms, total: 89 ms
Wall time: 235 ms

(366, 21)


df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df_extr_sel, df['visits'],test_size=0.2,shuffle=False)

Xtrain = df_Xtrain.to_numpy()
Xtest = df_Xtest.to_numpy()
ytrain = ser_ytrain.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()

print(df_Xtrain.shape, df_Xtest.shape)
# df_Xtrain.iloc[:10,:5]

(292, 21) (74, 21)


# make sure the data is not shuffled
# df_extr_sel.iloc[:10,:5]


# df_extr_sel.iloc[-5:,:5]


# df_Xtest.iloc[-5:,:5]


model = XGBRegressor(random_state=SEED,n_jobs=-1,objective='reg:squarederror')
model.fit(Xtrain, ytrain)
ypreds = model.predict(Xtest)

plot_model_results(model, 
                 Xtrain=Xtrain,ytrain=ytrain,
                 Xtest=Xtest,ytest=ytest,ypreds=ypreds,
                 plot_intervals=True, plot_anomalies=True)

model_name = 'xgb'
desc = 'tsfresh'
df_eval = timeseries_evaluation(model_name, desc, ytest, ypreds,
                    df_eval=df_eval,show=True)

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/statsmodels/tsa/stattools.py:657: FutureWarning: The default number of lags is changing from 40 tomin(int(10 * np.log10(nobs)), nobs - 1) after 0.12is released. Set the number of lags to an integer to  silence this warning.
  FutureWarning,


from sklearn.pipeline import Pipeline
from tsfresh.transformers import RelevantFeatureAugmenter


df_Xtrain.iloc[:2,:5]


df_extr_sel.iloc[:2,:5]


# we need dataframe with columns id and date to use pipeline


df.iloc[:2,:5]


df_train, df_test, ser_ytrain, ser_ytest = train_test_split(df,df['visits'],
                                                shuffle=False,test_size=0.2)
print(df.shape, df_train.shape)
df_train.iloc[:2,:5]

(366, 3) (292, 3)


# create empty dataframe

df_empty = pd.DataFrame(index=df.index)

df_Xtrain_empty, df_Xtest_empty = train_test_split(df_empty,shuffle=False,test_size=0.2)

df_empty.head()


%%capture

pipe = Pipeline([
        ('augmenter', RelevantFeatureAugmenter(column_id='id', column_sort='date')),
        ('regressor', XGBRegressor(random_state=SEED,n_jobs=-1,objective='reg:squarederror'))
      ])

pipe.set_params(augmenter__timeseries_container=df_train);
pipe.fit(df_Xtrain_empty, ser_ytrain);

path_pipe = '../models/pipeline.joblib'
joblib.dump(pipe, path_pipe)


# load the fitted pipeline and again assingn new augmenter
pipe = joblib.load(path_pipe)

pipe.set_params(augmenter__timeseries_container=df_test);
ypreds = pipe.predict(df_Xtest_empty)

Feature Extraction: 100%|██████████| 10/10 [00:00<00:00, 84.29it/s]


ypreds[:5]

array([ 7939.966,  7939.966, 10576.055,  9504.858, 10793.434],
      dtype=float32)


get_smape(ytest,ypreds)

0.6356307432712617


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer


model = XGBRegressor(random_state=SEED,n_jobs=-1,objective='reg:squarederror')
tscv = TimeSeriesSplit(n_splits=5)

scores = cross_val_score(model,Xtrain,ytrain,cv=tscv,n_jobs=-1,
                         scoring=make_scorer(get_smape))

scores

array([ 1.177, 56.18 ,  1.225,  0.377,  0.279])


df_train.head(2)


print(df_Xtrain.shape)
df_Xtrain.head(2)

(292, 21)


tscv = TimeSeriesSplit(n_splits=5)
smapes = []
for i, (idx_tr, idx_vd) in enumerate(tscv.split(df_Xtrain)):

    Xtr = df_Xtrain.iloc[idx_tr]
    Xvd = df_Xtrain.iloc[idx_vd]
    ytr = ser_ytrain.iloc[idx_tr].to_numpy().ravel()
    yvd = ser_ytrain.iloc[idx_vd].to_numpy().ravel()

    model = XGBRegressor(random_state=SEED,n_jobs=-1,objective='reg:squarederror')
    model.fit(Xtr,ytr)
    vd_preds = model.predict(Xvd)

    smape = get_smape(yvd, vd_preds)
    smapes.append(smape)

    # info
    print()
    print('Fold                  : Fold {}'.format(i+1))
    print('training   data shape : {}'.format(idx_tr.shape))
    print('validation data shape : {}'.format(idx_vd.shape))
    print('SMAPE                 : {:.3f}'.format(smape))

smapes

Fold                  : Fold 1
training   data shape : (52,)
validation data shape : (48,)
SMAPE                 : 1.177

Fold                  : Fold 2
training   data shape : (100,)
validation data shape : (48,)
SMAPE                 : 56.180

Fold                  : Fold 3
training   data shape : (148,)
validation data shape : (48,)
SMAPE                 : 1.225

Fold                  : Fold 4
training   data shape : (196,)
validation data shape : (48,)
SMAPE                 : 0.377

Fold                  : Fold 5
training   data shape : (244,)
validation data shape : (48,)
SMAPE                 : 0.279

[1.1773017114187323,
 56.18010584713199,
 1.225068082605304,
 0.37743419598934896,
 0.2785122686770567]


time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

Time taken to run whole notebook: 0 hr 4 min 18 secs

	Page	2015-07-01	2015-07-02	2015-07-03	2015-07-04	2015-07-05	2015-07-06	2015-07-07	2015-07-08	2015-07-09	2015-07-10	2015-07-11	2015-07-12	2015-07-13	2015-07-14	2015-07-15	2015-07-16	2015-07-17	2015-07-18	2015-07-19	2015-07-20	2015-07-21	2015-07-22	2015-07-23	2015-07-24	2015-07-25	2015-07-26	2015-07-27	2015-07-28	2015-07-29	2015-07-30	2015-07-31	2015-08-01	2015-08-02	2015-08-03	2015-08-04	2015-08-05	2015-08-06	2015-08-07	2015-08-08	2015-08-09	2015-08-10	2015-08-11	2015-08-12	2015-08-13	2015-08-14	2015-08-15	2015-08-16	2015-08-17	2015-08-18	...	2016-11-12	2016-11-13	2016-11-14	2016-11-15	2016-11-16	2016-11-17	2016-11-18	2016-11-19	2016-11-20	2016-11-21	2016-11-22	2016-11-23	2016-11-24	2016-11-25	2016-11-26	2016-11-27	2016-11-28	2016-11-29	2016-11-30	2016-12-01	2016-12-02	2016-12-03	2016-12-04	2016-12-05	2016-12-06	2016-12-07	2016-12-08	2016-12-09	2016-12-10	2016-12-11	2016-12-12	2016-12-13	2016-12-14	2016-12-15	2016-12-16	2016-12-17	2016-12-18	2016-12-19	2016-12-20	2016-12-21	2016-12-22	2016-12-23	2016-12-24	2016-12-25	2016-12-26	2016-12-27	2016-12-28	2016-12-29	2016-12-30	2016-12-31
0	2NE1_zh.wikipedia.org_all-access_spider	18.0	11.0	5.0	13.0	14.0	9.0	9.0	22.0	26.0	24.0	19.0	10.0	14.0	15.0	8.0	16.0	8.0	8.0	16.0	7.0	11.0	10.0	20.0	18.0	15.0	14.0	49.0	10.0	16.0	18.0	8.0	5.0	9.0	7.0	13.0	9.0	7.0	4.0	11.0	10.0	5.0	9.0	9.0	9.0	9.0	13.0	4.0	15.0	25.0	...	13.0	8.0	15.0	14.0	12.0	6.0	11.0	10.0	42.0	21.0	24.0	14.0	11.0	204.0	14.0	45.0	33.0	28.0	18.0	14.0	47.0	15.0	14.0	18.0	20.0	14.0	16.0	14.0	20.0	60.0	22.0	15.0	17.0	19.0	18.0	21.0	21.0	47.0	65.0	17.0	32.0	63.0	15.0	26.0	14.0	20.0	22.0	19.0	18.0	20.0
1	2PM_zh.wikipedia.org_all-access_spider	11.0	14.0	15.0	18.0	11.0	13.0	22.0	11.0	10.0	4.0	41.0	65.0	57.0	38.0	20.0	62.0	44.0	15.0	10.0	47.0	24.0	17.0	22.0	9.0	39.0	13.0	11.0	12.0	21.0	19.0	9.0	15.0	33.0	8.0	8.0	7.0	13.0	2.0	23.0	12.0	27.0	27.0	36.0	23.0	58.0	80.0	60.0	69.0	42.0	...	12.0	11.0	14.0	28.0	23.0	20.0	9.0	12.0	11.0	14.0	14.0	15.0	15.0	11.0	20.0	13.0	19.0	621.0	57.0	17.0	23.0	19.0	21.0	47.0	28.0	22.0	22.0	65.0	27.0	17.0	17.0	13.0	9.0	18.0	22.0	17.0	15.0	22.0	23.0	19.0	17.0	42.0	28.0	15.0	9.0	30.0	52.0	45.0	26.0	20.0
2	3C_zh.wikipedia.org_all-access_spider	1.0	0.0	1.0	1.0	0.0	4.0	0.0	3.0	4.0	4.0	1.0	1.0	1.0	6.0	8.0	6.0	4.0	5.0	1.0	2.0	3.0	8.0	8.0	6.0	6.0	2.0	2.0	3.0	2.0	4.0	3.0	3.0	5.0	3.0	5.0	4.0	2.0	5.0	1.0	4.0	5.0	0.0	0.0	7.0	3.0	5.0	1.0	6.0	2.0	...	6.0	4.0	2.0	4.0	6.0	5.0	4.0	4.0	3.0	3.0	9.0	3.0	5.0	4.0	0.0	1.0	4.0	5.0	8.0	8.0	1.0	1.0	2.0	5.0	3.0	3.0	3.0	7.0	3.0	9.0	8.0	3.0	210.0	5.0	4.0	6.0	2.0	2.0	4.0	3.0	3.0	1.0	1.0	7.0	4.0	4.0	6.0	3.0	4.0	17.0
3	4minute_zh.wikipedia.org_all-access_spider	35.0	13.0	10.0	94.0	4.0	26.0	14.0	9.0	11.0	16.0	16.0	11.0	23.0	145.0	14.0	17.0	85.0	4.0	30.0	22.0	9.0	10.0	11.0	7.0	7.0	11.0	9.0	11.0	44.0	8.0	14.0	19.0	10.0	17.0	17.0	10.0	7.0	10.0	1.0	8.0	27.0	19.0	16.0	2.0	84.0	22.0	14.0	47.0	25.0	...	38.0	13.0	14.0	17.0	26.0	14.0	10.0	9.0	23.0	15.0	7.0	10.0	7.0	10.0	14.0	17.0	11.0	9.0	11.0	5.0	10.0	8.0	17.0	13.0	23.0	40.0	16.0	17.0	41.0	17.0	8.0	9.0	18.0	12.0	12.0	18.0	13.0	18.0	23.0	10.0	32.0	10.0	26.0	27.0	16.0	11.0	17.0	19.0	10.0	11.0
4	52_Hz_I_Love_You_zh.wikipedia.org_all-access_spider	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	13.0	11.0	8.0	6.0	10.0	14.0	6.0	9.0	6.0	16.0	14.0	13.0	15.0	14.0	16.0	9.0	178.0	64.0	12.0	10.0	11.0	6.0	8.0	7.0	9.0	8.0	5.0	11.0	8.0	4.0	15.0	5.0	8.0	8.0	6.0	7.0	15.0	4.0	11.0	7.0	48.0	9.0	25.0	13.0	3.0	11.0	27.0	13.0	36.0	10.0

	date	visits
0	2016-01-01	20947.0
1	2016-01-02	19466.0
2	2016-01-03	8587.0
3	2016-01-04	7386.0
4	2016-01-05	7719.0

	date	visits	lag1	lag2	lag3	lag4	lag5	lag6	lag7
7	2016-01-08	8658.0	7713.0	7156.0	7719.0	7386.0	8587.0	19466.0	20947.0
8	2016-01-09	9137.0	8658.0	7713.0	7156.0	7719.0	7386.0	8587.0	19466.0
9	2016-01-10	8855.0	9137.0	8658.0	7713.0	7156.0	7719.0	7386.0	8587.0
10	2016-01-11	9034.0	8855.0	9137.0	8658.0	7713.0	7156.0	7719.0	7386.0
11	2016-01-12	8435.0	9034.0	8855.0	9137.0	8658.0	7713.0	7156.0	7719.0

	date	visits	lag1	lag2	lag3	lag4	lag5	lag6	lag7	bias	month	day	quarter	dayofweek	weekend
7	2016-01-08	8658.0	7713.0	7156.0	7719.0	7386.0	8587.0	19466.0	20947.0	1	1	8	1	4	0
8	2016-01-09	9137.0	8658.0	7713.0	7156.0	7719.0	7386.0	8587.0	19466.0	1	1	9	1	5	1
9	2016-01-10	8855.0	9137.0	8658.0	7713.0	7156.0	7719.0	7386.0	8587.0	1	1	10	1	6	1
10	2016-01-11	9034.0	8855.0	9137.0	8658.0	7713.0	7156.0	7719.0	7386.0	1	1	11	1	0	0
11	2016-01-12	8435.0	9034.0	8855.0	9137.0	8658.0	7713.0	7156.0	7719.0	1	1	12	1	1	0

	lag1	lag2	lag3	lag4	lag5	lag6	lag7	bias	month	day	quarter	dayofweek	weekend
7	7713.0	7156.0	7719.0	7386.0	8587.0	19466.0	20947.0	1	1	8	1	4	0
8	8658.0	7713.0	7156.0	7719.0	7386.0	8587.0	19466.0	1	1	9	1	5	1

Table of Contents

Data Description¶

Colab¶

Load the Libraries¶

Useful Scripts¶

Load the data¶

Choose Prince Musician data as timeseries¶

Data Preprocessing¶

Add lag columns¶

Add bias term¶

Add timeseries features¶

Modelling¶

Train Test split¶

Modelling: Xgboost¶

Adding timeseries features using tsfresh¶

Using Pipeline for tsfresh relevant feature augmenter¶

Cross validation for timeseries¶

Time Taken¶

	Model	Description	MAPE	SMAPE	RMSE	ME	MAE	MPE	CORR	MINMAX	ACF1
0	xgb	tsfresh	1	0.6356	337	43	115	0	0.9991	0.0063	-0.2886
1	xgb	default	1	1.4308	453	9	224	0	0.9978	0.0141	-0.3971

	visits__sum_values	visits__cwt_coefficients__coeff_0__w_20__widths_(2, 5, 10, 20)	visits__cwt_coefficients__coeff_0__w_10__widths_(2, 5, 10, 20)	visits__cwt_coefficients__coeff_0__w_5__widths_(2, 5, 10, 20)	visits__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)
0	20947.0	4062.456606	5745.181229	8124.913212	12846.615771
1	19466.0	3775.231789	5338.983998	7550.463579	11938.331150