Reference: https://www.kaggle.com/c/web-traffic-time-series-forecasting/data
Original data: train_1.csv
-----------------------------
rows = 145,063
columns = 551
first column = Page
date columns = 2015-07-01, 2015-07-02, ..., 2016-12-31 (550 columns)
file size: 284.6 MB
Data for time series
----------------------------------------------
selected year: 2016 (leap year 366 days)
selected time series: the most visited page
df['Page'] == 'Special:Search_en.wikipedia.org_desktop_all-agents'
Notes
-----
pmdarima requires a compatible statsmodels version, so create a new conda env for it.
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('fivethirtyeight') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8
import os
import time
# random state
random_state=100
np.random.seed(random_state)
# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)
import IPython
from IPython.display import display, HTML, Image, Markdown
import gc
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import statsmodels.robust as smrb # smrb.mad() etc
import patsy # y,X1 = patsy.dmatrices(formula, df, return_type='dataframe')
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-10-14

CPython 3.7.9
IPython 7.18.1

compiler   : Clang 10.0.0
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

autopep8        1.5.4
statsmodels.api 0.11.1
statsmodels     0.11.1
matplotlib      3.1.3
pandas          1.1.1
patsy           0.5.1
watermark       2.0.2
seaborn         0.11.0
json            2.0.9
numpy           1.19.1
IPython         7.18.1
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
def show_methods(obj, ncols=4):
    """List the public attributes/methods of an object in a dataframe."""
    lst = [i for i in dir(obj) if i[0] != '_']
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
MAPE - Mean Absolute Percentage Error:

$$ MAPE = \frac{100}{n} \sum_{i=1}^{n} \frac{\left|y_{i}-\hat{y}_{i}\right|}{\left|y_{i}\right|} $$

SMAPE - Symmetric Mean Absolute Percentage Error:

$$ SMAPE = \frac{100\%}{n} \sum_{i=1}^{n} \frac{\left|y_{i}-\hat{y}_{i}\right|}{\left(\left|y_{i}\right|+\left|\hat{y}_{i}\right|\right) / 2} $$

def get_mape(y_true, y_pred):
    """Mean absolute percentage error, in percent."""
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def get_smape(y_true, y_pred):
    """Symmetric MAPE, in percent; bounded between 0 and 200."""
    denominator = np.abs(y_true) + np.abs(y_pred)
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return 200 * np.mean(diff)
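A quick sanity check of these metric implementations on a toy example (the numbers are arbitrary, chosen only for illustration):

# toy data to sanity-check the metrics
y_true = np.array([100, 200, 300, 400])
y_pred = np.array([110, 190, 330, 360])
print(get_mape(y_true, y_pred))   # 8.75
print(get_smape(y_true, y_pred))  # approx 8.68; always in [0, 200]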
df = pd.read_csv('../data/train_1.csv.zip',compression='zip')
cond = df['Page'] == """Special:Search_en.wikipedia.org_desktop_all-agents"""
df = df.loc[cond]
df = df.filter(regex="Page|2016")
df = df.melt(id_vars=['Page'],var_name='date',value_name='visits').drop('Page',axis=1)
df = df.dropna(how='any')
print(df.shape)
df.iloc[:2,:5]
(366, 2)
| date | visits |
---|---|---|
0 | 2016-01-01 | 1401667.0 |
1 | 2016-01-02 | 1395136.0 |
df['date'] = pd.to_datetime(df['date'])
df['visits'] = df['visits'].astype(np.int32)
df = df[['visits','date']].set_index('date')
ts = df['visits']
ts.head()
date
2016-01-01    1401667
2016-01-02    1395136
2016-01-03    1455522
2016-01-04    1750373
2016-01-05    1787494
Name: visits, dtype: int32
# make sure this is series, not dataframe
type(ts)
pandas.core.series.Series
ts.plot()
# almost all values are around 0.2e7 (2 million), except for 3 peak regions.
# these peaks can harm the time series modelling later.
ts.describe().T
count    3.660000e+02
mean     1.845918e+06
std      1.095724e+06
min      1.030746e+06
25%      1.515952e+06
50%      1.700576e+06
75%      1.821304e+06
max      1.659208e+07
Name: visits, dtype: float64
t = ts[ts > 0.3e7]
t
date
2016-07-05     6038766
2016-07-06    16592075
2016-07-07     5025785
2016-07-11     5181746
2016-08-30     5700934
2016-08-31     6281005
2016-09-01     6878515
2016-09-02     7599524
2016-09-03     6894531
2016-09-04     6457072
2016-09-05     4705306
2016-09-06     3688959
Name: visits, dtype: int32
# the wikipedia page visits seem to have a weekly periodicity,
# so we can try replacing these possible outliers with the value
# from one week prior.
#
# attempt 2:
# we can also fill these values with the mean for the given weekday.
idx = ts[ts > 0.3e7].index
idx
DatetimeIndex(['2016-07-05', '2016-07-06', '2016-07-07', '2016-07-11', '2016-08-30', '2016-08-31', '2016-09-01', '2016-09-02', '2016-09-03', '2016-09-04', '2016-09-05', '2016-09-06'], dtype='datetime64[ns]', name='date', freq=None)
idx - pd.Timedelta(days=7)
DatetimeIndex(['2016-06-28', '2016-06-29', '2016-06-30', '2016-07-04', '2016-08-23', '2016-08-24', '2016-08-25', '2016-08-26', '2016-08-27', '2016-08-28', '2016-08-29', '2016-08-30'], dtype='datetime64[ns]', name='date', freq=None)
ts[idx - pd.Timedelta(days=7)]
date
2016-06-28    1715507
2016-06-29    2010241
2016-06-30    2012923
2016-07-04    2457009
2016-08-23    1820624
2016-08-24    1825362
2016-08-25    1710625
2016-08-26    1676577
2016-08-27    1730368
2016-08-28    1788768
2016-08-29    2420356
2016-08-30    5700934
Name: visits, dtype: int32
ts_no_outliers = ts.copy()
ts_no_outliers[idx] = ts_no_outliers[idx - pd.Timedelta(days=7)]
ts_no_outliers.plot()
# the plot still shows one peak: 2016-09-06 was filled from 2016-08-30,
# which was itself an outlier. shift once more to remove it.
idx = ts[ts > 0.3e7].index
idx
DatetimeIndex(['2016-07-05', '2016-07-06', '2016-07-07', '2016-07-11', '2016-08-30', '2016-08-31', '2016-09-01', '2016-09-02', '2016-09-03', '2016-09-04', '2016-09-05', '2016-09-06'], dtype='datetime64[ns]', name='date', freq=None)
ts_no_outliers[idx] = ts_no_outliers[idx - pd.Timedelta(days=7)]
ts_no_outliers.plot()
ts_no_outliers.head().append(ts_no_outliers.tail())
date
2016-01-01    1401667.0
2016-01-02    1395136.0
2016-01-03    1455522.0
2016-01-04    1750373.0
2016-01-05    1787494.0
2016-12-27    1481319.0
2016-12-28    1399599.0
2016-12-29    1455447.0
2016-12-30    1397331.0
2016-12-31    1175657.0
Name: visits, dtype: float64
ts_no_outliers.shape
(366,)
# now try imputing the outliers with the corresponding weekday mean.
ts[ts>0.3e7]
date
2016-07-05     6038766
2016-07-06    16592075
2016-07-07     5025785
2016-07-11     5181746
2016-08-30     5700934
2016-08-31     6281005
2016-09-01     6878515
2016-09-02     7599524
2016-09-03     6894531
2016-09-04     6457072
2016-09-05     4705306
2016-09-06     3688959
Name: visits, dtype: int32
ts_good = ts[ts < 0.3e7]
ts_good_weekday_means = ts_good.groupby(ts_good.index.weekday).agg('mean')
ts_good_weekday_means
date
0    1.804507e+06
1    1.833039e+06
2    1.811539e+06
3    1.781031e+06
4    1.650711e+06
5    1.426800e+06
6    1.467813e+06
Name: visits, dtype: float64
ts_bad_idx_weekday = ts[ts>0.3e7].index.weekday
ts_bad_idx_weekday
Int64Index([1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 0, 1], dtype='int64', name='date')
imputes = ts[ts>0.3e7].index.weekday.map(ts_good_weekday_means).to_numpy()
imputes
array([1833038.959, 1811539.16 , 1781030.9 , 1804507.3 , 1833038.959, 1811539.16 , 1781030.9 , 1650711.058, 1426799.75 , 1467813.02 , 1804507.3 , 1833038.959])
ts_no_outliers2 = ts.copy()
ts_no_outliers2[ts_no_outliers2>0.3e7] = imputes
ts_no_outliers2.plot()
ARIMA(p, d, q)
AR - autoregressive (p: number of lagged observations)
I - integrated (d: order of differencing)
MA - moving average (q: number of lagged forecast-error terms)

stationary = mean and variance do not change over time

To make a time series stationary we can use differencing:

first order : ydiff(t) = y(t) - y(t-1)
second order: ydiff2(t) = ydiff(t) - ydiff(t-1)
            = [y(t) - y(t-1)] - [y(t-1) - y(t-2)]
            = y(t) - 2y(t-1) + y(t-2)
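A minimal sketch of this in pandas, using the ts series defined above: .diff() implements first-order differencing, and applying it twice reproduces the expanded second-order formula.

# first-order differencing: y(t) - y(t-1)
d1 = ts.diff()
# second-order differencing: difference of the first differences
d2 = ts.diff().diff()
# verify the expanded form y(t) - 2*y(t-1) + y(t-2)
d2_manual = ts - 2 * ts.shift(1) + ts.shift(2)
assert d2.dropna().equals(d2_manual.dropna())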
Models:
T = trend
C = cycle
S = seasonality
R = residual
Additive model: Y = T + C + S + R
Multiplicative model: Y = T * C * S * R
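statsmodels can estimate such a decomposition directly. A minimal sketch on our daily series, assuming a weekly period of 7 days:

from statsmodels.tsa.seasonal import seasonal_decompose
# additive decomposition with an assumed weekly period
decomp = seasonal_decompose(ts, model='additive', period=7)
decomp.plot()
plt.show()
# for the additive model, decomp.trend + decomp.seasonal + decomp.resid
# reconstructs ts (up to NaNs at the edges of the trend window)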
We can use the Partial Autocorrelation (PACF) plot to estimate the autoregressive term p.
def lag_plot_partial_autocorrelation():
    from statsmodels.graphics.tsaplots import plot_pacf
    for p in range(5):
        tsx = ts
        for _ in range(p):  # difference p times
            tsx = tsx.diff()
        fig = plot_pacf(tsx.dropna())
        plt.title(f'Partial Autocorrelation after {p} differencing(s)')
        plt.xticks(range(25))

# lag_plot_partial_autocorrelation()
# for 1st differencing (p = 1) most of the lines fall almost inside
# the significance limit, so we can choose p = 1 to start with.
def lag_plot_autocorrelation():
    from statsmodels.graphics.tsaplots import plot_acf
    for d in range(10):
        tsx = ts
        for _ in range(d):  # difference d times
            tsx = tsx.diff()
        fig = plot_acf(tsx.dropna())
        plt.title(f'Autocorrelation after {d} differencing(s)')
        plt.xticks(range(25))

# lag_plot_autocorrelation()
# a lag of d = 7 gives similar lines above and below the baseline,
# i.e. weekly differencing is a good starting point.
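A minimal sketch of that weekly differencing, i.e. a lag-7 diff, for visual confirmation:

from statsmodels.graphics.tsaplots import plot_acf
# seasonal (weekly) differencing: y(t) - y(t-7)
ts_weekly = ts.diff(7).dropna()
plot_acf(ts_weekly)
plt.title('Autocorrelation after weekly (lag-7) differencing')
plt.show()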
# https://www.alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.ndiffs.html
from pmdarima.arima.utils import ndiffs
# Augmented Dickey-Fuller test
adf = ndiffs(ts, test='adf')
# KPSS test
kpss = ndiffs(ts, test='kpss')
# Phillips-Perron test
pp = ndiffs(ts, test='pp')
adf, kpss, pp
(0, 0, 0)
# all three tests suggest the series is already stationary without differencing,
# so d = 0 is a good start.
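As a cross-check (a sketch using statsmodels directly rather than pmdarima), we can inspect the ADF statistic and p-value instead of only the implied number of differences:

from statsmodels.tsa.stattools import adfuller
# ADF null hypothesis: the series has a unit root (is non-stationary)
adf_stat, pvalue, usedlag, nobs, crit, icbest = adfuller(ts)
print(f'ADF statistic = {adf_stat:.3f}, p-value = {pvalue:.4f}')
# a p-value < 0.05 rejects the unit-root null, consistent with d = 0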
from statsmodels.graphics.tsaplots import plot_acf
ts.diff().plot()
plot_acf(ts.diff().dropna());
# only a few lags are above the significance region, so choose q = 2.
# when in doubt, choose the simpler model.
from statsmodels.tsa.arima_model import ARIMA
# 1,0,2 ARIMA Model
model = ARIMA(ts, order=(1,0,2))
model_fit = model.fit(disp=0)
model_fit.summary()
/Users/poudel/opt/miniconda3/envs/gam/lib/python3.7/site-packages/statsmodels/tsa/base/tsa_model.py:162: ValueWarning: No frequency information was provided, so inferred frequency D will be used. % freq, ValueWarning)
Dep. Variable: | visits | No. Observations: | 366 |
---|---|---|---|
Model: | ARMA(1, 2) | Log Likelihood | -5515.184 |
Method: | css-mle | S.D. of innovations | 846735.155 |
Date: | Wed, 14 Oct 2020 | AIC | 11040.367 |
Time: | 12:07:55 | BIC | 11059.880 |
Sample: | 01-01-2016 | HQIC | 11048.121 |
- 12-31-2016 |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 1.846e+06 | 1.24e+05 | 14.854 | 0.000 | 1.6e+06 | 2.09e+06 |
ar.L1.visits | 0.7185 | 0.115 | 6.267 | 0.000 | 0.494 | 0.943 |
ma.L1.visits | -0.0505 | 0.131 | -0.385 | 0.700 | -0.308 | 0.207 |
ma.L2.visits | -0.1547 | 0.111 | -1.393 | 0.164 | -0.372 | 0.063 |
Real | Imaginary | Modulus | Frequency | |
---|---|---|---|---|
AR.1 | 1.3917 | +0.0000j | 1.3917 | 0.0000 |
MA.1 | 2.3843 | +0.0000j | 2.3843 | 0.0000 |
MA.2 | -2.7104 | +0.0000j | 2.7104 | 0.5000 |
# here all coefficients are well away from zero.
# if some coefficient were very close to zero, we could try dropping that lag.
#
# note that the p-values for ma.L1 and ma.L2 are greater than 0.05,
# so they are not significant.
#
# try removing ma.L2.
# 1,0,1 ARIMA Model
model = ARIMA(ts, order=(1,0,1))
model_fit = model.fit(disp=0)
model_fit.summary()
/Users/poudel/opt/miniconda3/envs/gam/lib/python3.7/site-packages/statsmodels/tsa/base/tsa_model.py:162: ValueWarning: No frequency information was provided, so inferred frequency D will be used. % freq, ValueWarning)
Dep. Variable: | visits | No. Observations: | 366 |
---|---|---|---|
Model: | ARMA(1, 1) | Log Likelihood | -5515.882 |
Method: | css-mle | S.D. of innovations | 848375.073 |
Date: | Wed, 14 Oct 2020 | AIC | 11039.764 |
Time: | 12:07:55 | BIC | 11055.375 |
Sample: | 01-01-2016 | HQIC | 11045.967 |
- 12-31-2016 |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 1.846e+06 | 1.12e+05 | 16.541 | 0.000 | 1.63e+06 | 2.06e+06 |
ar.L1.visits | 0.5585 | 0.071 | 7.858 | 0.000 | 0.419 | 0.698 |
ma.L1.visits | 0.1153 | 0.087 | 1.318 | 0.187 | -0.056 | 0.287 |
Real | Imaginary | Modulus | Frequency | |
---|---|---|---|---|
AR.1 | 1.7906 | +0.0000j | 1.7906 | 0.0000 |
MA.1 | -8.6752 | +0.0000j | 8.6752 | 0.5000 |
# again the ma.L1 p-value is > 0.05, so drop the MA term as well.
# 1,0,0 AR Model
model = ARIMA(ts, order=(1,0,0))
results_arima100 = model.fit(disp=0)
results_arima100.summary()
/Users/poudel/opt/miniconda3/envs/gam/lib/python3.7/site-packages/statsmodels/tsa/base/tsa_model.py:162: ValueWarning: No frequency information was provided, so inferred frequency D will be used. % freq, ValueWarning)
Dep. Variable: | visits | No. Observations: | 366 |
---|---|---|---|
Model: | ARMA(1, 0) | Log Likelihood | -5516.733 |
Method: | css-mle | S.D. of innovations | 850362.652 |
Date: | Wed, 14 Oct 2020 | AIC | 11039.467 |
Time: | 12:07:56 | BIC | 11051.175 |
Sample: | 01-01-2016 | HQIC | 11044.119 |
- 12-31-2016 |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 1.846e+06 | 1.19e+05 | 15.516 | 0.000 | 1.61e+06 | 2.08e+06 |
ar.L1.visits | 0.6281 | 0.041 | 15.499 | 0.000 | 0.549 | 0.708 |
Real | Imaginary | Modulus | Frequency | |
---|---|---|---|---|
AR.1 | 1.5921 | +0.0000j | 1.5921 | 0.0000 |
# this is effectively just an AR model, with no I or MA part,
# but statsmodels still labels it ARMA.
show_methods(results_arima100)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | aic | fittedvalues | maroots | resid |
1 | arfreq | forecast | mle_retvals | save |
2 | arparams | hqic | mle_settings | scale |
3 | arroots | initialize | model | sigma2 |
4 | bic | k_ar | n_totobs | summary |
5 | bse | k_constant | nobs | summary2 |
6 | conf_int | k_exog | normalized_cov_params | t_test |
7 | cov_params | k_ma | params | t_test_pairwise |
8 | cov_params_default | k_trend | plot_predict | tvalues |
9 | data | llf | predict | use_t |
10 | df_model | load | pvalues | wald_test |
11 | df_resid | mafreq | remove_data | wald_test_terms |
12 | f_test | maparams |
# Plot residual errors
residuals = pd.DataFrame(results_arima100.resid)
fig, ax = plt.subplots(1,2,figsize=(12,8))
residuals.plot(title="Residuals", ax=ax[0])
residuals.plot(kind='kde', title='Density', ax=ax[1])
plt.show()
# Actual vs Fitted
results_arima100.plot_predict(dynamic=False)
plt.show()
# Accuracy metrics
def timeseries_evaluation(ytest, ypreds, df_eval=None, show=False,
                          round_=2, index_name=None):
    from statsmodels.tsa.stattools import acf

    # convert inputs to numpy arrays (this also avoids the pandas
    # multi-dimensional-indexing FutureWarning from obj[:, None])
    ytest = np.asarray(ytest)
    ypreds = np.asarray(ypreds)

    if not isinstance(df_eval, pd.DataFrame):
        df_eval = pd.DataFrame({'MAPE': [],
                                'ME': [],
                                'MAE': [],
                                'MPE': [],
                                'RMSE': [],
                                'CORR': [],
                                'MINMAX': [],
                                })

    mape = np.mean(np.abs(ypreds - ytest) / np.abs(ytest))  # mean absolute percentage error
    me = np.mean(ypreds - ytest)                            # mean error
    mae = np.mean(np.abs(ypreds - ytest))                   # mean absolute error
    mpe = np.mean((ypreds - ytest) / ytest)                 # mean percentage error
    rmse = np.mean((ypreds - ytest)**2)**.5                 # root mean squared error
    corr = np.corrcoef(ypreds, ytest)[0, 1]                 # correlation
    mins = np.amin(np.hstack([ypreds[:, None],
                              ytest[:, None]]), axis=1)
    maxs = np.amax(np.hstack([ypreds[:, None],
                              ytest[:, None]]), axis=1)
    minmax = 1 - np.mean(mins / maxs)                       # min-max error
    acf1 = acf(ypreds - ytest, fft=False)[1]                # lag-1 autocorrelation of errors
    df_eval['ACF1'] = df_eval.get('ACF1', pd.Series(dtype=float))

    row = [mape, me, mae, mpe, rmse, corr, minmax, acf1]
    if index_name:
        df_eval.loc[index_name] = row
    else:
        df_eval.loc[len(df_eval)] = row
    df_eval = df_eval.round(round_)

    if show:
        display(df_eval)
    return df_eval
# 1,0,0 AR Model
train = ts.iloc[:300]
model = ARIMA(train, order=(1,0,0))
results_arima100 = model.fit(disp=0)
/Users/poudel/opt/miniconda3/envs/gam/lib/python3.7/site-packages/statsmodels/tsa/base/tsa_model.py:162: ValueWarning: No frequency information was provided, so inferred frequency D will be used. % freq, ValueWarning)
# Forecast
test = ts.iloc[300:]
fc, se, conf = results_arima100.forecast(len(test), alpha=0.05) # 95% conf
timeseries_evaluation(test, fc)
MAPE | ME | MAE | MPE | RMSE | CORR | MINMAX | ACF1 | |
---|---|---|---|---|---|---|---|---|
0 | 0.26 | 372279.08 | 372279.08 | 0.26 | 420198.17 | -0.09 | 0.19 | 0.53 |
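For context, it helps to compare against a naive baseline. This sketch (not part of the original run) uses a seasonal-naive forecast that simply repeats the last observed week of the training data:

# seasonal-naive baseline: tile the last 7 training values over the test span
last_week = train.iloc[-7:].to_numpy()
n_reps = int(np.ceil(len(test) / 7))
naive_fc = np.tile(last_week, n_reps)[:len(test)]
timeseries_evaluation(test, naive_fc, index_name='seasonal_naive')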
# https://github.com/statsmodels/statsmodels/blob/master/examples/notebooks/tsa_arma_0.ipynb
arma10 = sm.tsa.ARMA(train, (1,0)).fit(disp=False)
resid = arma10.resid
print(train.shape, resid.shape)
r,q,p = sm.tsa.acf(resid, fft=True, qstat=True)
print(r.shape, q.shape, p.shape)
data = np.c_[range(1,len(r)), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
table.set_index('lag').head()
(300,) (300,)
(41,) (40,) (40,)
/Users/poudel/opt/miniconda3/envs/gam/lib/python3.7/site-packages/statsmodels/tsa/base/tsa_model.py:162: ValueWarning: No frequency information was provided, so inferred frequency D will be used. % freq, ValueWarning)
AC | Q | Prob(>Q) | |
---|---|---|---|
lag | |||
1.0 | 0.042116 | 0.537471 | 0.463483 |
2.0 | -0.065968 | 1.860518 | 0.394451 |
3.0 | -0.028170 | 2.102594 | 0.551388 |
4.0 | -0.059243 | 3.176863 | 0.528676 |
5.0 | 0.206869 | 16.319908 | 0.005988 |
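The same Ljung-Box statistics are also available directly from statsmodels; a sketch (return_df assumes statsmodels >= 0.11):

from statsmodels.stats.diagnostic import acorr_ljungbox
# Ljung-Box test on the ARMA(1,0) residuals for the first 5 lags
lb = acorr_ljungbox(resid, lags=5, return_df=True)
print(lb)  # columns: lb_stat, lb_pvalue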
import pmdarima as pm
model = pm.auto_arima(ts, start_p=1, start_q=1,
test='adf', # use adftest to find optimal 'd'
max_p=3, max_q=3, # maximum p and q
m=1, # frequency of series
d=None, # let model determine 'd'
seasonal=False, # No Seasonality
start_P=0,
D=0,
trace=True,
error_action='ignore',
suppress_warnings=True,
stepwise=True)
model.summary()
Performing stepwise search to minimize aic
ARIMA(1,0,1)(0,0,0)[0]           : AIC=11092.335, Time=0.07 sec
ARIMA(0,0,0)(0,0,0)[0]           : AIC=11712.531, Time=0.01 sec
ARIMA(1,0,0)(0,0,0)[0]           : AIC=11095.687, Time=0.02 sec
ARIMA(0,0,1)(0,0,0)[0]           : AIC=11475.698, Time=0.04 sec
ARIMA(2,0,1)(0,0,0)[0]           : AIC=11090.849, Time=0.36 sec
ARIMA(2,0,0)(0,0,0)[0]           : AIC=11094.755, Time=0.02 sec
ARIMA(3,0,1)(0,0,0)[0]           : AIC=inf, Time=0.28 sec
ARIMA(2,0,2)(0,0,0)[0]           : AIC=inf, Time=0.42 sec
ARIMA(1,0,2)(0,0,0)[0]           : AIC=11064.631, Time=0.07 sec
ARIMA(0,0,2)(0,0,0)[0]           : AIC=11432.449, Time=0.03 sec
ARIMA(1,0,3)(0,0,0)[0]           : AIC=11061.297, Time=0.16 sec
ARIMA(0,0,3)(0,0,0)[0]           : AIC=11397.376, Time=0.05 sec
ARIMA(2,0,3)(0,0,0)[0]           : AIC=11062.418, Time=0.17 sec
ARIMA(1,0,3)(0,0,0)[0] intercept : AIC=11042.894, Time=0.08 sec
ARIMA(0,0,3)(0,0,0)[0] intercept : AIC=11050.512, Time=0.05 sec
ARIMA(1,0,2)(0,0,0)[0] intercept : AIC=11040.693, Time=0.07 sec
ARIMA(0,0,2)(0,0,0)[0] intercept : AIC=11066.585, Time=0.08 sec
ARIMA(1,0,1)(0,0,0)[0] intercept : AIC=11040.158, Time=0.03 sec
ARIMA(0,0,1)(0,0,0)[0] intercept : AIC=11084.832, Time=0.03 sec
ARIMA(1,0,0)(0,0,0)[0] intercept : AIC=11039.466, Time=0.03 sec
ARIMA(0,0,0)(0,0,0)[0] intercept : AIC=11221.532, Time=0.01 sec
ARIMA(2,0,0)(0,0,0)[0] intercept : AIC=11040.000, Time=0.05 sec
ARIMA(2,0,1)(0,0,0)[0] intercept : AIC=11042.104, Time=0.13 sec

Best model: ARIMA(1,0,0)(0,0,0)[0] intercept
Total fit time: 2.281 seconds
Dep. Variable: | y | No. Observations: | 366 |
---|---|---|---|
Model: | SARIMAX(1, 0, 0) | Log Likelihood | -5516.733 |
Date: | Wed, 14 Oct 2020 | AIC | 11039.466 |
Time: | 12:09:15 | BIC | 11051.174 |
Sample: | 0 | HQIC | 11044.118 |
- 366 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
intercept | 6.839e+05 | 8.28e+04 | 8.256 | 0.000 | 5.22e+05 | 8.46e+05 |
ar.L1 | 0.6284 | 0.010 | 60.466 | 0.000 | 0.608 | 0.649 |
sigma2 | 7.248e+11 | 0.690 | 1.05e+12 | 0.000 | 7.25e+11 | 7.25e+11 |
Ljung-Box (Q): | 27.43 | Jarque-Bera (JB): | 222704.07 |
---|---|---|---|
Prob(Q): | 0.93 | Prob(JB): | 0.00 |
Heteroskedasticity (H): | 3.28 | Skew: | 7.63 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 122.88 |
model.plot_diagnostics(figsize=(12,8))
plt.show()
Plot diagnostics:

Fig 1 (top left): The residuals over time don't display any obvious seasonality and appear to be white noise. There are two peaks, but they might be outliers.
Fig 2 (top right): The red KDE line does NOT follow the normal N(0,1) line closely, so the residuals are NOT normally distributed. Not good.
Fig 3 (bottom left): The qq-plot shows that the ordered distribution of residuals (blue dots) does NOT follow the linear trend of samples drawn from a standard normal distribution N(0,1). Again, a strong indication that the residuals are NOT normally distributed.
Fig 4 (bottom right): The correlogram of the residuals against their lagged versions shows little correlation; almost all of the correlations lie within the significance region. There is not much remaining seasonality in the data.

Meaning: the fit is not very good.
Further work: we may want to normalize the data and deal with the outliers.
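One quick follow-up along those lines (a sketch, assuming the weekday-mean imputation from earlier is acceptable): refit auto_arima on the outlier-treated series and compare the diagnostics.

# refit on the series with weekday-mean-imputed outliers
model2 = pm.auto_arima(ts_no_outliers2, start_p=1, start_q=1,
                       test='adf', max_p=3, max_q=3,
                       d=None, seasonal=False,
                       error_action='ignore', suppress_warnings=True,
                       stepwise=True)
model2.plot_diagnostics(figsize=(12, 8))
plt.show()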
ts.head(2).append(ts.tail())
date
2016-01-01    1401667
2016-01-02    1395136
2016-12-27    1481319
2016-12-28    1399599
2016-12-29    1455447
2016-12-30    1397331
2016-12-31    1175657
Name: visits, dtype: int32
results_arima100.plot_predict(dynamic=False)
plt.show()
# the model here was trained on the first 300 days (through late October);
# the plot compares fitted values with the observed series.
# dynamic=False ensures one-step-ahead forecasts, meaning the forecast
# at each point is generated using the full history up to that point.
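To make the one-step-ahead vs dynamic distinction concrete, here is a sketch using the train-fitted model (the start index 250 is arbitrary):

# one-step-ahead: each prediction uses the observed history up to that point
pred_static = results_arima100.predict(start=250, end=299, dynamic=False)
# dynamic: predictions after the start point feed on earlier predictions
pred_dynamic = results_arima100.predict(start=250, end=299, dynamic=True)
pd.DataFrame({'one-step ahead': pred_static, 'dynamic': pred_dynamic}).plot()
plt.show()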