Reference: https://www.kaggle.com/c/web-traffic-time-series-forecasting/data
Original data: train_1.csv
-----------------------------
rows = 145,063
columns = 551
first column = Page
date columns = 2015-07-01, 2015-07-02, ..., 2016-12-31 (550 columns)
file size: 284.6 MB
Data for modelling: Prince (musician)
-------------------------------------------------------
timeseries  : 2016 page visits for the Prince Wikipedia page
lag columns : lag1 to lag7
bias        : constant bias column
For ARIMA   : we use a single timeseries (one column).
For sklearn : linear regressors and ensemble learners can use many feature columns; a minimal sketch of this reshaping follows.
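The reshaping is easiest to see on a toy example. Below is a minimal sketch with hypothetical values (the real lag features are built later in this notebook):

import pandas as pd

# toy daily series (hypothetical numbers, for illustration only)
s = pd.Series([10, 12, 11, 13, 15, 14, 16],
              index=pd.date_range('2016-01-01', periods=7))

# ARIMA consumes the single column `s` directly; for sklearn we turn it
# into a supervised table where each row predicts visits from its lags
df_toy = pd.DataFrame({'visits': s})
for lag in (1, 2):
    df_toy['lag' + str(lag)] = df_toy['visits'].shift(lag)
df_toy['bias'] = 1
print(df_toy.dropna())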
import time
time_start_notebook = time.time()
!mkdir -p ~/.kaggle
# !echo '' > ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
# after we have ~/.kaggle/kaggle.json file in colab, we can install kaggle module.
!head -c 20 ~/.kaggle/kaggle.json
{"username":"bhishan
%%capture
# capture will not print in notebook
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install watermark
    !pip install tsfresh

    ## create project-like folders
    !mkdir -p ../data ../outputs ../images ../reports ../html ../models
    !pip install kaggle
# !kaggle competitions files -c web-traffic-time-series-forecasting
# !kaggle competitions download -c web-traffic-time-series-forecasting -f train_1.csv.zip -p ../data/
!ls ../data
most_visited_2016.csv train_1.csv.zip train_1_01?raw=true train_1_02?raw=true train_1_03?raw=true
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('fivethirtyeight') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8
import os
import time
# random state
SEED=100
np.random.seed(SEED)
# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)
import gc
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost
from xgboost import XGBRegressor
import watermark
# versions
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark
Bhishan Poudel 2020-10-14

CPython 3.7.7
IPython 7.18.1

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

watermark  2.0.2
matplotlib 3.2.1
numpy      1.18.4
autopep8   1.5.2
xgboost    1.2.0
seaborn    0.11.0
pandas     1.1.0
sklearn    0.23.1
json       2.0.9
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
def show_method_attributes(method, ncols=7):
""" Show all the attributes of a given method.
Example:
========
show_method_attributes(list)
"""
x = [i for i in dir(method) if i[0].islower()]
return pd.DataFrame(np.array_split(x,ncols)).T.fillna('')
def safe_median(s):
    """Median of a sequence, ignoring NaN values."""
    return np.median([x for x in s if not np.isnan(x)])
MAPE - Mean Absolute Percentage Error: $$ MAPE = \frac{100}{n} \sum_{i=1}^{n} \frac{\left|y_{i}-\hat{y}_{i}\right|}{y_{i}} $$
SMAPE - Symmetric Mean Absolute Percentage Error:
$$ SMAPE = \frac{100\%}{n} \sum_{i=1}^{n} \frac{\left|y_{i} - \hat{y}_{i}\right|}{\left(\left|y_{i}\right| + \left|\hat{y}_{i}\right|\right) / 2} = \frac{200\%}{n} \sum_{i=1}^{n} \frac{\left|y_{i} - \hat{y}_{i}\right|}{\left|y_{i}\right| + \left|\hat{y}_{i}\right|} $$

def timeseries_train_test_split(df, target, train_size=0.8):
"""Train test split for time series dataframe.
Parameters
-----------
df -- dataframe
target -- name of target
train_size -- (float) for train size
Returns
-------
    Xtrain, Xtest, ytrain, ytest : numpy arrays (features cast to int32)
    """
    frac = int(len(df) * train_size)
    Xtrain = df.drop(columns=[target]).iloc[:frac, :].astype(np.int32).to_numpy()
    Xtest = df.drop(columns=[target]).iloc[frac:, :].astype(np.int32).to_numpy()
ytrain = df[target].iloc[:frac].to_numpy()
ytest = df[target].iloc[frac:].to_numpy()
return Xtrain, Xtest, ytrain, ytest
def get_mape(y_true, y_pred):
"Mean Absolute Percentage Error"
return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
def get_smape(y_true, y_pred):
"Symmetric Mean Absolute Percentage Error"
denominator = (np.abs(y_true) + np.abs(y_pred))
diff = np.abs(y_true - y_pred) / denominator
diff[denominator == 0] = 0.0
return 200 * np.mean(diff)
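A quick sanity check of the two metrics on hypothetical numbers, chosen so the answers are easy to verify by hand:

y_true = np.array([100.0, 200.0])
y_pred = np.array([110.0, 180.0])

# MAPE = mean(10/100, 20/200) * 100 = 10.0
print(get_mape(y_true, y_pred))   # 10.0

# SMAPE = 200 * mean(10/210, 20/380) ≈ 10.03
print(get_smape(y_true, y_pred))  # ~10.03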
def timeseries_evaluation(model_name, desc, ytest, ypreds, df_eval=None,
show=False,sort_col='SMAPE'):
from statsmodels.tsa.stattools import acf
if df_eval is None:
df_eval = pd.DataFrame({'Model': [],
'Description':[],
'MAPE': [],
'SMAPE': [],
'RMSE': [],
'ME': [],
'MAE': [],
'MPE': [],
'CORR': [],
'MINMAX': [],
'ACF1': [],
})
mape = get_mape(ytest,ypreds) # MAPE
smape = get_smape(ytest,ypreds) # SMAPE
rmse = np.mean((ytest - ypreds)**2)**.5 # RMSE
me = np.mean(ytest - ypreds) # ME
mae = np.mean(np.abs(ytest - ypreds)) # MAE
mpe = np.mean((ytest - ypreds)/ytest) # MPE
corr = np.corrcoef(ytest, ypreds)[0, 1] # corr
mins = np.amin(np.hstack([ytest[:, None],
ypreds[:, None]]), axis=1)
maxs = np.amax(np.hstack([ytest[:, None],
ypreds[:, None]]), axis=1)
minmax = 1 - np.mean(mins/maxs) # minmax
acf1 = acf(ytest-ypreds, fft=False)[1] # ACF1 (autocorrelation function)
row = [model_name, desc, mape, smape,rmse, me,mae,mpe,corr,minmax,acf1]
df_eval.loc[len(df_eval)] = row
df_eval = df_eval.drop_duplicates(['Model','Description'])
asc, cmap = True, 'Greens_r'
if sort_col == 'RMSE':
asc = False
cmap = 'Greens'
if sort_col == 'SMAPE':
asc = True
cmap = 'Greens_r'
df_eval = df_eval.sort_values(sort_col,ascending=asc)
df_eval = df_eval.reset_index(drop=True)
if show:
df_eval_style = (df_eval.style
.format({'MAPE': "{:,.0f}",
'SMAPE': "{:,.4f}",
'RMSE': "{:,.0f}",
'ME': "{:,.0f}",
'MAE': "{:,.0f}",
'MPE': "{:,.0f}",
'CORR': "{:,.4f}",
'MINMAX': "{:,.4f}",
'ACF1': "{:,.4f}"
})
.background_gradient(subset=[sort_col],cmap=cmap)
)
display(df_eval_style)
return df_eval
df_eval = None
def plot_model_results(model, Xtrain, Xtest, ytest,
plot_intervals=False, plot_anomalies=False):
"""Plot model performance for timeseries data.
"""
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
    # get_mape and get_smape are reused from the module level (defined above)
tscv = TimeSeriesSplit(n_splits=3)
ypreds = model.predict(Xtest)
plt.figure(figsize=(15, 7))
plt.plot(ypreds, "g", label="ypreds", linewidth=2.0)
plt.plot(ytest, label="actual", linewidth=2.0)
    if plot_intervals:
        # NOTE: ytrain is read from the enclosing (global) scope; it is not a parameter
        cv = cross_val_score(model, Xtrain, ytrain,
                             cv=tscv,
                             scoring="neg_mean_absolute_error")
mae = cv.mean() * (-1)
deviation = cv.std()
scale = 1.96
lower = ypreds - (mae + scale * deviation)
upper = ypreds + (mae + scale * deviation)
        plt.plot(lower, "r--", label="upper bound / lower bound", alpha=0.5)
plt.plot(upper, "r--", alpha=0.5)
    if plot_anomalies:
        # requires plot_intervals=True, since `lower` and `upper` are defined there
        anomalies = np.array([np.NaN] * len(ytest))
anomalies[ytest<lower] = ytest[ytest<lower]
anomalies[ytest>upper] = ytest[ytest>upper]
plt.plot(anomalies, "o", markersize=10, label = "Anomalies")
mape_error = get_mape(ytest, ypreds)
smape_error = get_smape(ytest, ypreds)
mape_error = round(mape_error,2)
smape_error = round(smape_error,2)
plt.title("MAPE: "+str(mape_error)+"\n"+"SMAPE: "+str(smape_error))
plt.legend(loc="best")
plt.tight_layout()
plt.grid(True);
def plot_coefficients(model, Xtrain_columns):
"""Plot of coefficients
"""
coefs = pd.DataFrame(model.coef_, Xtrain_columns)
coefs.columns = ["coef"]
coefs["abs"] = coefs.coef.apply(np.abs)
coefs = coefs.sort_values(by="abs", ascending=False).drop(["abs"], axis=1)
plt.figure(figsize=(15, 7))
coefs.coef.plot(kind='bar')
plt.grid(True, axis='y')
plt.hlines(y=0, xmin=0, xmax=len(coefs), linestyles='dashed');
df = pd.read_csv('../data/train_1.csv.zip',compression='zip')
print(df.shape)
df.head()
(145063, 551)
| | Page | 2015-07-01 | 2015-07-02 | ... | 2016-12-30 | 2016-12-31 |
|---|---|---|---|---|---|---|
| 0 | 2NE1_zh.wikipedia.org_all-access_spider | 18.0 | 11.0 | ... | 18.0 | 20.0 |
| 1 | 2PM_zh.wikipedia.org_all-access_spider | 11.0 | 14.0 | ... | 26.0 | 20.0 |
| 2 | 3C_zh.wikipedia.org_all-access_spider | 1.0 | 0.0 | ... | 4.0 | 17.0 |
| 3 | 4minute_zh.wikipedia.org_all-access_spider | 35.0 | 13.0 | ... | 10.0 | 11.0 |
| 4 | 52_Hz_I_Love_You_zh.wikipedia.org_all-access_spider | NaN | NaN | ... | 36.0 | 10.0 |

5 rows × 551 columns
cond = df['Page'] == "Prince_(musician)_en.wikipedia.org_all-access_all-agents"
df = df.loc[cond]
df.head()
| | Page | 2015-07-01 | 2015-07-02 | ... | 2016-12-30 | 2016-12-31 |
|---|---|---|---|---|---|---|
| 40563 | Prince_(musician)_en.wikipedia.org_all-access_all-agents | 9529.0 | 13627.0 | ... | 19956.0 | 31446.0 |

1 rows × 551 columns
df = df.filter(regex="Page|2016")
df.iloc[:5, np.r_[0, 1,2,-2,-1]]
| | Page | 2016-01-01 | 2016-01-02 | 2016-12-30 | 2016-12-31 |
|---|---|---|---|---|---|
| 40563 | Prince_(musician)_en.wikipedia.org_all-access_all-agents | 20947.0 | 19466.0 | 19956.0 | 31446.0 |
df = df.melt(id_vars=['Page'],var_name='date',value_name='visits').drop('Page',axis=1)
print(df.shape)
df.head()
(366, 2)
| | date | visits |
|---|---|---|
| 0 | 2016-01-01 | 20947.0 |
| 1 | 2016-01-02 | 19466.0 |
| 2 | 2016-01-03 | 8587.0 |
| 3 | 2016-01-04 | 7386.0 |
| 4 | 2016-01-05 | 7719.0 |
df['visits'].isna().sum()
0
df['visits'].plot.line()
df['visits'].plot.line()
plt.ylim(0,1e5)
for lag in range(1, 8):
    df['lag' + str(lag)] = df['visits'].shift(lag)

# the seven shifts introduce 1+2+...+7 = 28 NaN cells at the head
df.isna().sum().sum()
28
df = df.dropna(how='any')
df.head()
| | date | visits | lag1 | lag2 | lag3 | lag4 | lag5 | lag6 | lag7 |
|---|---|---|---|---|---|---|---|---|---|
| 7 | 2016-01-08 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 8587.0 | 19466.0 | 20947.0 |
| 8 | 2016-01-09 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 8587.0 | 19466.0 |
| 9 | 2016-01-10 | 8855.0 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 8587.0 |
| 10 | 2016-01-11 | 9034.0 | 8855.0 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 |
| 11 | 2016-01-12 | 8435.0 | 9034.0 | 8855.0 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 |
df['bias'] = 1
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month # 1 to 12
df['day'] = df['date'].dt.day # 1 to 31
df['quarter'] = df['date'].dt.quarter # 1 to 4
df['dayofweek'] = df['date'].dt.dayofweek # 0 to 6
df['weekend'] = ((df['date'].dt.dayofweek) // 5 == 1).astype(np.int8)
df = df.drop('date',axis=1)
print('nans = ',df.isna().sum().sum())
df.head()
nans = 0
| | visits | lag1 | lag2 | lag3 | lag4 | lag5 | lag6 | lag7 | bias | month | day | quarter | dayofweek | weekend |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 8587.0 | 19466.0 | 20947.0 | 1 | 1 | 8 | 1 | 4 | 0 |
| 8 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 8587.0 | 19466.0 | 1 | 1 | 9 | 1 | 5 | 1 |
| 9 | 8855.0 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 8587.0 | 1 | 1 | 10 | 1 | 6 | 1 |
| 10 | 9034.0 | 8855.0 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 7386.0 | 1 | 1 | 11 | 1 | 0 | 0 |
| 11 | 8435.0 | 9034.0 | 8855.0 | 9137.0 | 8658.0 | 7713.0 | 7156.0 | 7719.0 | 1 | 1 | 12 | 1 | 1 | 0 |
Xtrain, Xtest, ytrain, ytest = timeseries_train_test_split(df, 'visits', 0.7)
print(Xtrain.shape, Xtest.shape)
(251, 13) (108, 13)
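The split is chronological: the first 70% of days form the train set and the last 30% the test set, so the model never trains on the future. `TimeSeriesSplit` (used below for the cross-validated intervals) follows the same principle; a small sketch on a hypothetical 10-point index:

from sklearn.model_selection import TimeSeriesSplit

# each fold trains on a prefix and validates on the block right after it
for tr_idx, te_idx in TimeSeriesSplit(n_splits=3).split(np.arange(10)):
    print('train:', tr_idx, 'test:', te_idx)
# train: [0 1 2 3] test: [4 5]
# train: [0 1 2 3 4 5] test: [6 7]
# train: [0 1 2 3 4 5 6 7] test: [8 9]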
target = 'visits'
df_Xtrain = pd.DataFrame(Xtrain, columns=df.columns.drop(target))
df_Xtest = pd.DataFrame(Xtest, columns=df.columns.drop(target))
df_Xtrain.head(2)
| | lag1 | lag2 | lag3 | lag4 | lag5 | lag6 | lag7 | bias | month | day | quarter | dayofweek | weekend |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7713 | 7156 | 7719 | 7386 | 8587 | 19466 | 20947 | 1 | 1 | 8 | 1 | 4 | 0 |
| 1 | 8658 | 7713 | 7156 | 7719 | 7386 | 8587 | 19466 | 1 | 1 | 9 | 1 | 5 | 1 |
ts = df['visits']
ts.plot()
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)
Xtest_scaled = scaler.transform(Xtest)
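A safer pattern than scaling by hand is to wrap the scaler and the estimator in a single object, so the same transform is applied at fit and predict time. A minimal sketch (not used in the runs below):

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# the pipeline fits the scaler on Xtrain only and re-applies it
# automatically inside predict, so Xtest is passed unscaled
pipe = make_pipeline(StandardScaler(), LinearRegression())
pipe.fit(Xtrain, ytrain)
ypreds_pipe = pipe.predict(Xtest)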
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
print(Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape)
(251, 13) (251,) (108, 13) (108,)
# Linear Regression
model = LinearRegression()
model.fit(Xtrain, ytrain)
plot_model_results(model, Xtrain,Xtest,ytest,plot_intervals=True)
plot_coefficients(model, df.columns.drop('visits'))
ypreds = model.predict(Xtest)
model_name = 'LinearRegression'
desc = 'default'
df_eval = timeseries_evaluation(model_name, desc, ytest, ypreds,
df_eval=df_eval,show=True)
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/statsmodels/tsa/stattools.py:657: FutureWarning: The default number of lags is changing from 40 to min(int(10 * np.log10(nobs)), nobs - 1) after 0.12 is released. Set the number of lags to an integer to silence this warning.
| | Model | Description | MAPE | SMAPE | RMSE | ME | MAE | MPE | CORR | MINMAX | ACF1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LinearRegression | default | 365 | 135.3122 | 43,579 | -17,255 | 35,357 | -2 | -0.1236 | 1.2735 | 0.6457 |
# Linear Regression with scaled data
model = LinearRegression()
model.fit(Xtrain_scaled, ytrain)
plot_model_results(model, Xtrain_scaled,Xtest_scaled,ytest, plot_intervals=False)
plot_coefficients(model, df.columns.drop('visits'))
ypreds = model.predict(Xtest)  # NOTE: the model was fit on scaled data but predicts on unscaled Xtest here; Xtest_scaled would be the consistent choice
model_name = 'LinearRegression'
desc = 'scaled'
df_eval = timeseries_evaluation(model_name, desc, ytest, ypreds,
df_eval=df_eval,show=True)
| | Model | Description | MAPE | SMAPE | RMSE | ME | MAE | MPE | CORR | MINMAX | ACF1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LinearRegression | default | 365 | 135.3122 | 43,579 | -17,255 | 35,357 | -2 | -0.1236 | 1.2735 | 0.6457 |
| 1 | LinearRegression | scaled | 33,841,890 | 199.9984 | 4,378,715,364 | -3,640,663,624 | 3,640,663,624 | -338,419 | 0.5725 | 1.0000 | 0.0784 |

The 'scaled' row blows up because, as noted in the code comment above, a model fit on scaled features was evaluated on the unscaled Xtest.
plt.figure(figsize=(12,8))
sns.heatmap(df_Xtrain.corr())
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)
model = RidgeCV(cv=tscv)
model.fit(Xtrain, ytrain)
plot_model_results(model,
Xtrain=Xtrain,
Xtest=Xtest,
ytest=ytest,
plot_intervals=True, plot_anomalies=True)
plot_coefficients(model,df_Xtrain.columns)
ypreds = model.predict(Xtest)
model_name = 'RidgeCV'
desc = 'ts_split=3'
df_eval = timeseries_evaluation(model_name, desc, ytest, ypreds,
df_eval=df_eval,show=True)
| | Model | Description | MAPE | SMAPE | RMSE | ME | MAE | MPE | CORR | MINMAX | ACF1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RidgeCV | ts_split=3 | 261 | 118.8720 | 31,289 | -15,694 | 25,228 | -2 | -0.0255 | 0.8816 | 0.6251 |
| 1 | LinearRegression | default | 365 | 135.3122 | 43,579 | -17,255 | 35,357 | -2 | -0.1236 | 1.2735 | 0.6457 |
| 2 | LinearRegression | scaled | 33,841,890 | 199.9984 | 4,378,715,364 | -3,640,663,624 | 3,640,663,624 | -338,419 | 0.5725 | 1.0000 | 0.0784 |
from sklearn.linear_model import LassoCV
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)
model = LassoCV(cv=tscv)
model.fit(Xtrain, ytrain)
plot_model_results(model,
Xtrain=Xtrain,
Xtest=Xtest,
ytest=ytest,
plot_intervals=True, plot_anomalies=True)
plot_coefficients(model,df_Xtrain.columns)
ypreds = model.predict(Xtest)
model_name = 'LassoCV'
desc = 'ts_split=3'
df_eval = timeseries_evaluation(model_name, desc, ytest, ypreds,
df_eval=df_eval,show=True)
| | Model | Description | MAPE | SMAPE | RMSE | ME | MAE | MPE | CORR | MINMAX | ACF1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LassoCV | ts_split=3 | 266 | 110.8415 | 25,829 | -25,336 | 25,537 | -3 | 0.5769 | 0.7062 | -0.4231 |
| 1 | RidgeCV | ts_split=3 | 261 | 118.8720 | 31,289 | -15,694 | 25,228 | -2 | -0.0255 | 0.8816 | 0.6251 |
| 2 | LinearRegression | default | 365 | 135.3122 | 43,579 | -17,255 | 35,357 | -2 | -0.1236 | 1.2735 | 0.6457 |
| 3 | LinearRegression | scaled | 33,841,890 | 199.9984 | 4,378,715,364 | -3,640,663,624 | 3,640,663,624 | -338,419 | 0.5725 | 1.0000 | 0.0784 |
from xgboost import XGBRegressor
# XGBRegressor?
model = XGBRegressor(random_state=SEED,n_jobs=-1,objective='reg:squarederror')
model.fit(Xtrain, ytrain)
plot_model_results(model,
Xtrain=Xtrain,
Xtest=Xtest,
ytest=ytest,
plot_intervals=True, plot_anomalies=True)
ypreds = model.predict(Xtest)
model_name = 'XGBRegressor'
desc = 'default'
df_eval = timeseries_evaluation(model_name, desc, ytest, ypreds,
df_eval=df_eval,show=True)
| | Model | Description | MAPE | SMAPE | RMSE | ME | MAE | MPE | CORR | MINMAX | ACF1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | XGBRegressor | default | 18 | 18.2580 | 4,513 | 687 | 2,331 | 0 | 0.6643 | 0.1565 | 0.1207 |
| 1 | LassoCV | ts_split=3 | 266 | 110.8415 | 25,829 | -25,336 | 25,537 | -3 | 0.5769 | 0.7062 | -0.4231 |
| 2 | RidgeCV | ts_split=3 | 261 | 118.8720 | 31,289 | -15,694 | 25,228 | -2 | -0.0255 | 0.8816 | 0.6251 |
| 3 | LinearRegression | default | 365 | 135.3122 | 43,579 | -17,255 | 35,357 | -2 | -0.1236 | 1.2735 | 0.6457 |
| 4 | LinearRegression | scaled | 33,841,890 | 199.9984 | 4,378,715,364 | -3,640,663,624 | 3,640,663,624 | -338,419 | 0.5725 | 1.0000 | 0.0784 |

Out of the box, XGBRegressor beats every linear baseline by a wide margin (SMAPE ≈ 18 versus 111+).
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 17 secs