Reference: https://www.kaggle.com/c/web-traffic-time-series-forecasting/data
Original data: train_1.csv
-----------------------------
rows = 145,063
columns = 551
first column = Page
date columns = 2015-07-01, 2015-07-02, ..., 2016-12-31 (550 columns)
file size: 284.6 MB
Data for time series
----------------------------------------------
selected year: 2016 (leap year 366 days)
selected time series:
The most visited page.
df['Page'] == """Special:Search_en.wikipedia.org_desktop_all-agents"""
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('fivethirtyeight') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8
import os
import time
# random state
random_state=100
np.random.seed(random_state)
# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)
import IPython
from IPython.display import display, HTML, Image, Markdown
import gc
import scipy
from scipy import stats
from scipy.stats import shapiro # Shapiro-Wilk normality
from statsmodels.tsa.stattools import adfuller # Augmented Dickey-Fuller stationarity
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import statsmodels.robust as smrb # smrb.mad() etc
import patsy # y,X1 = patsy.dmatrices(formula, df, return_type='dataframe')
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose # sm.tsa.seasonal_decompose
from pandas.plotting import autocorrelation_plot
from pandas.plotting import lag_plot
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-10-14 CPython 3.7.7 IPython 7.18.1 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit scipy 1.4.1 autopep8 1.5.2 statsmodels.api 0.12.0 pandas 1.1.0 numpy 1.18.4 json 2.0.9 matplotlib 3.2.1 watermark 2.0.2 seaborn 0.11.0 IPython 7.18.1 patsy 0.5.1
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
def show_methods(obj, ncols=4):
lst = [i for i in dir(obj) if i[0]!='_' ]
df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
return df
df = pd.read_csv('../data/train_1.csv.zip',compression='zip',encoding='latin-1')
cond = df['Page'] == """Special:Search_en.wikipedia.org_desktop_all-agents"""
df = df.loc[cond]
df = df.filter(regex="Page|2016")
df = df.melt(id_vars=['Page'],var_name='date',value_name='visits').drop('Page',axis=1)
df = df.dropna(how='any')
print(df.shape)
df.iloc[:2,:5]
(366, 2)
date | visits | |
---|---|---|
0 | 2016-01-01 | 1401667.0 |
1 | 2016-01-02 | 1395136.0 |
df['date'] = pd.to_datetime(df['date'])
ts = df[['visits','date']].set_index('date')
ts.head()
visits | |
---|---|
date | |
2016-01-01 | 1401667.0 |
2016-01-02 | 1395136.0 |
2016-01-03 | 1455522.0 |
2016-01-04 | 1750373.0 |
2016-01-05 | 1787494.0 |
ts.plot()
# ts is periodic
# ts has some very large peaks
# ts in not going upward, it does not have trend (it may have if I have more years)
<matplotlib.axes._subplots.AxesSubplot at 0x7fec219a4750>
# Monthly subplots
#ts.groupby(ts.index.month).plot()
ts.asfreq('MS')
visits | |
---|---|
date | |
2016-01-01 | 1401667.0 |
2016-02-01 | 2058777.0 |
2016-03-01 | 1920594.0 |
2016-04-01 | 1700498.0 |
2016-05-01 | 1402524.0 |
2016-06-01 | 1678428.0 |
2016-07-01 | 1954525.0 |
2016-08-01 | 1864873.0 |
2016-09-01 | 6878515.0 |
2016-10-01 | 1299740.0 |
2016-11-01 | 1806303.0 |
2016-12-01 | 1594861.0 |
Reference: https://machinelearningmastery.com/a-gentle-introduction-to-normality-tests-in-python/
import statsmodels.api as sm
from scipy import stats
x = stats.norm.rvs(loc=5, scale=3, size=1000) # mean=5, std=3
# qq-plot
sm.qqplot(x, loc = 5, scale = 3, line='s')
# shapiro-wilk
res = stats.shapiro(x)
# anderson-darling
res = stats.anderson(x, dist='norm')
from scipy.stats import shapiro
def shapiro_normality_test(ts, sig=0.05):
"""
Shapiro-Wilk test of normality
H0: data IS normally distributed
H1: data is NOT normally distributed.
if p < 0.05, we reject the null hypothesis and say data is NOT normal.
Example:
from scipy import stats
x_1000 = stats.norm.rvs(loc=5, scale=3, size=1000)
shapiro_normality_test(x_1000)
Example:
ts = pandas series with datetime index and some values
ts = ts.to_numpy().ravel() # make it numpy array
shapiro_normality_test(ts.to_numpy().ravel())
NOTE:
In linear regression, the residuals must follow normal distribution.
"""
p_value = shapiro(ts)[1]
mu = np.round(np.mean(ts),3)
sigma = np.round(np.std(ts),3)
if p_value < sig:
print("Reject H0: that data follows normal distribution.")
print("**Data is NOT normally distributed.**")
print("Shapiro-Wilk p_value={}".format(np.round(p_value,3)))
else:
print("Accept H0: that data follows normal distribution.")
print("**Data IS normally distributed.**")
print("Shapiro-Wilk p_value={}".format(np.round(p_value,3)))
shapiro_normality_test(ts.to_numpy().ravel())
Reject H0: that data follows normal distribution. **Data is NOT normally distributed.** Shapiro-Wilk p_value=0.0
Ref: https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/
Stationarity Tests
Unit root indicates that the statistical properties of a given series are not constant with time, which is the condition for stationary time series.
Suppose we have a time series :
yt = a*yt-1 + ε t
where yt is the value at the time instant t and ε t is the error term. In order to calculate yt we need the value of yt-1, which is :
yt-1 = a*yt-2 + ε t-1
If we do that for all observations, the value of yt will come out to be:
yt = anyt-n + Σεt-iai
If the value of a is 1 (unit) in the above equation, then the predictions will be equal to the yt-n and sum of all errors from t-n to t, which means that the variance will increase with time.
This is knows as unit root in a time series. We know that for a stationary time series, the variance must not be a function of time.
The unit root tests check the presence of unit root in the series by checking if value of a=1.
The two most popular test of unit root are:
Types of Stationarity
Let us understand the different types of stationarities and how to interpret the results of the above tests.
It’s always better to apply both the tests, so that we are sure that the series is truly stationary. Let us look at the possible outcomes of applying these stationary tests.
Making a Time Series Stationary
from statsmodels.tsa.stattools import adfuller
def stationarity_test_adfuller(ts,sig=0.05):
"""
Augmented Dickey Fuller Test
H0: There is a unit root in the time series sample. (no stationary)
H1: Data IS stationary.
The augmented Dickey–Fuller (ADF) statistic,
used in the test, is a negative number.
The more negative it is, the stronger the rejection of the hypothesis
that there is a unit root at some level of confidence.
If p < 0.05, we reject the null hypothesis and say data is stationary.
Usage:
ts = ts.to_numpy().ravel()
stationarity_test_adfuller(ts)
"""
statistic = adfuller(ts)[0]
p_val=adfuller(ts)[1]
if p_val < sig:
print("Reject H0: that the time series have a unit root and is not stationary.")
print("**Data IS stationary.**")
print("Augmented Dickey-Fuller p_value={}".format(np.round(p_val,3)))
print("Augmented Dickey-Fuller statistic{}".format(np.round(statistic,3)))
else:
print("Accept H0: that time series have a unit root and is not stationary.")
print("**Data is NOT stationary.**")
print("Augmented Dickey-Fuller p_value={}".format(np.round(p_val,3)))
print("Augmented Dickey-Fuller statistic={}".format(np.round(statistic,3)))
stationarity_test_adfuller(ts.to_numpy().ravel())
Reject H0: that the time series have a unit root and is not stationary. **Data IS stationary.** Augmented Dickey-Fuller p_value=0.0 Augmented Dickey-Fuller statistic-6.02
def test_stationarity(ts,window=7):
# rolling statistics
rolmean = ts.rolling(window=window, center=False).mean()
rolstd = ts.rolling(window=window, center=False).std()
x = ts.to_numpy().ravel()
# Plot rolling statistics
plt.figure(figsize=(12,8))
orig = plt.plot(x, color='blue', label='Original')
mean = plt.plot(rolmean.values, color='red', label='Rolling Mean')
std = plt.plot(rolstd.values, color='black', label='Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)
# Augmented Dickey-Fuller test
result = adfuller(x)
print(F"ADF Statistic: {result[0]:f}")
print(F"p-value: {result[1]:f}")
pvalue=result[1]
for key, value in result[4].items():
if result[0] > value:
print("**Timeseries is NOT stationary**")
break
else:
print('**Timeseries IS stationary**')
break
test_stationarity(ts)
ADF Statistic: -6.020090 p-value: 0.000000 **Timeseries IS stationary**
import statsmodels.graphics.tsaplots
show_methods(statsmodels.graphics.tsaplots,8)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
---|---|---|---|---|---|---|---|---|
0 | acf | month_plot | pacf | plot_acf | plot_predict | quarter_plot | seasonal_plot | utils |
1 | deprecate_kwarg | np | pd | plot_pacf |
def simple_autocorrelation_plot(ts):
fig, axes = plt.subplots(1,2, squeeze=False);
fig.set_size_inches(14,4);
axes[0,0].scatter(x=ts[1:], y=ts.shift(1)[1:]);
axes[0,1].scatter(x=ts[7:], y=ts.shift(7)[7:]);
axes[0,0].set_title('Correlation of y_t and y_t-1');
axes[0,1].set_title('Correlation of y_t and y_t-7');
simple_autocorrelation_plot(ts)
def plot_correlations(ts, title="Time Series plot",
xlabel='',ylabel='',lags=None):
"""Correlation plots
Row1: lineplot of time series
Row2: correlation plot, partial-corr plot (mulitivariate), density plot
References: https://github.com/statsmodels/statsmodels/blob/master/examples/notebooks/regression_plots.ipynb
"""
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# first row plot:
# time series plot
plt.figure(figsize=(16,4))
plt.plot(ts)
plt.title(title)
plt.ylabel(ylabel)
plt.xlabel(xlabel)
# second row plots:
fig, axes = plt.subplots(1,3,squeeze=False)
fig.set_size_inches(16,4)
# acf auto correlation function
plot_acf(ts, ax=axes[0,0], lags=lags);
# pacf (partial auto correlation function for multi-variate regression)
plot_pacf(ts, ax=axes[0,1], lags=lags);
# density plot
sns.distplot(ts, ax=axes[0,2])
axes[0,2].set_xlabel('')
axes[0,2].set_title("Probability Distribution")
plot_correlations(ts)
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
def plot_acf_pacf(ts):
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf
# data
data = ts.to_numpy().ravel()
# fig setup
fig = plt.figure(1,figsize=(14,8))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
# differencing or lagging
data_diff = [data[i] - data[i-1] for i in range(1,len(data))]
# acf and pacf
autocorr = acf(data_diff,fft=False)
pac = pacf(data_diff)
# array for plotting
x = [x for x in range(len(pac))]
# acf
ax1.plot(x[1:],autocorr[1:],label='Autocorrelation')
ax1.set_xlabel('Lag')
ax1.set_ylabel('Autocorrelation')
# pacf
ax2.plot(x[1:],pac[1:],label='Partial Autocorrelation')
ax2.set_xlabel('Lag')
ax2.set_ylabel('Partial Autocorrelation')
# show
ax1.set_xticks(range(0,50,7))
ax2.set_xticks(range(0,50,7))
ax1.legend()
ax2.legend()
plt.show()
plot_acf_pacf(ts)
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/statsmodels/tsa/stattools.py:657: FutureWarning: The default number of lags is changing from 40 tomin(int(10 * np.log10(nobs)), nobs - 1) after 0.12is released. Set the number of lags to an integer to silence this warning. FutureWarning, /Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/statsmodels/tsa/stattools.py:1021: FutureWarning: The default number of lags is changing from 40 tomin(int(10 * np.log10(nobs)), nobs // 2 - 1) after 0.12is released. Set the number of lags to an integer to silence this warning. FutureWarning,
# we can see strong correlations and anti-correlations weekly
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import statsmodels.robust as smrb # smrb.mad() etc
import patsy # y,X1 = patsy.dmatrices(formula, df, return_type='dataframe')
res = sm.tsa.seasonal_decompose(ts, model='additive')
fig = res.plot()
fig.set_figheight(8)
fig.set_figwidth(15)
ts.rolling(7).mean().plot(figsize=(20,10), linewidth=5, fontsize=20)
# we do not see trend, it is not inreasing or decreasing.
<matplotlib.axes._subplots.AxesSubplot at 0x7fec21c50890>
for i in range(1,8):
ts.diff(i).plot(figsize=(12,4))
plt.title('lag period = '+ str(i))
plt.show()
X-axis is number of lag h.
Y-axis is Autocorrlation of timeseries to lag of itself. $$ C_{n}=\sum_{t=1}^{n-h}(y(t)-\hat{y})(y(t+n)-\hat{y}) / n $$
$$ C_{0}=\sum_{t=1}^{n}(y(t)-\hat{y})^{2} / n $$plt.figure(figsize=(12,8))
pd.plotting.autocorrelation_plot(ts)
plt.ylim(-0.25,0.75)
plt.grid(True)
plt.xticks(range(0,370,10));
plt.figure(figsize=(12,8))
pd.plotting.lag_plot(ts,c='blue')
<matplotlib.axes._subplots.AxesSubplot at 0x7fec23f364d0>
ts.head().append(ts.tail())
visits | |
---|---|
date | |
2016-01-01 | 1401667.0 |
2016-01-02 | 1395136.0 |
2016-01-03 | 1455522.0 |
2016-01-04 | 1750373.0 |
2016-01-05 | 1787494.0 |
2016-12-27 | 1481319.0 |
2016-12-28 | 1399599.0 |
2016-12-29 | 1455447.0 |
2016-12-30 | 1397331.0 |
2016-12-31 | 1175657.0 |