import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('fivethirtyeight') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8

import os
import time

# random state
random_state=100
np.random.seed(random_state)

# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)

import IPython
from IPython.display import display, HTML, Image, Markdown


import gc

import scipy
from scipy import stats
from scipy.stats import shapiro # Shapiro-Wilk normality
from statsmodels.tsa.stattools import adfuller # Augmented Dickey-Fuller stationarity

import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import statsmodels.robust as smrb # smrb.mad() etc
import patsy # y,X1 = patsy.dmatrices(formula, df, return_type='dataframe')

from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose # sm.tsa.seasonal_decompose

from pandas.plotting import autocorrelation_plot
from pandas.plotting import lag_plot

# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

Bhishan Poudel 2020-10-14 

CPython 3.7.7
IPython 7.18.1

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

scipy           1.4.1
autopep8        1.5.2
statsmodels.api 0.12.0
pandas          1.1.0
numpy           1.18.4
json            2.0.9
matplotlib      3.2.1
watermark       2.0.2
seaborn         0.11.0
IPython         7.18.1
patsy           0.5.1


%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;


def show_methods(obj, ncols=4):
    lst = [i for i in dir(obj) if i[0]!='_' ]
    df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
    return df


df = pd.read_csv('../data/train_1.csv.zip',compression='zip',encoding='latin-1')

cond = df['Page'] == """Special:Search_en.wikipedia.org_desktop_all-agents"""
df = df.loc[cond]
df = df.filter(regex="Page|2016")
df = df.melt(id_vars=['Page'],var_name='date',value_name='visits').drop('Page',axis=1)
df = df.dropna(how='any')

print(df.shape)
df.iloc[:2,:5]

(366, 2)


df['date'] = pd.to_datetime(df['date'])
ts = df[['visits','date']].set_index('date')

ts.head()


ts.plot()

# ts is periodic
# ts has some very large peaks
# ts in not going upward, it does not have trend (it may have if I have more years)

<matplotlib.axes._subplots.AxesSubplot at 0x7fec219a4750>


# Monthly subplots


#ts.groupby(ts.index.month).plot()


ts.asfreq('MS')


from scipy.stats import shapiro
def shapiro_normality_test(ts, sig=0.05):
    """
    Shapiro-Wilk test of normality
    
    
    H0: data IS normally distributed
    H1: data is NOT normally distributed.
    
    if p < 0.05, we reject the null hypothesis and say data is NOT normal.
    
    Example:
    from scipy import stats
    x_1000 = stats.norm.rvs(loc=5, scale=3, size=1000)
    shapiro_normality_test(x_1000)
    
    Example:
    ts = pandas series with datetime index and some values
    ts = ts.to_numpy().ravel() # make it numpy array
    shapiro_normality_test(ts.to_numpy().ravel())
    
    NOTE:
    In linear regression, the residuals must follow normal distribution.
    """
    
    p_value = shapiro(ts)[1]
    mu = np.round(np.mean(ts),3)
    sigma = np.round(np.std(ts),3)
    
    if p_value < sig:
        print("Reject H0: that data follows normal distribution.")
        print("**Data is NOT normally distributed.**")
        print("Shapiro-Wilk p_value={}".format(np.round(p_value,3)))
    else:
        print("Accept H0: that data follows normal distribution.")
        print("**Data IS normally distributed.**")
        print("Shapiro-Wilk p_value={}".format(np.round(p_value,3)))

shapiro_normality_test(ts.to_numpy().ravel())

Reject H0: that data follows normal distribution.
**Data is NOT normally distributed.**
Shapiro-Wilk p_value=0.0


from statsmodels.tsa.stattools import adfuller

def stationarity_test_adfuller(ts,sig=0.05):
    """
    Augmented Dickey Fuller Test
    
    H0: There is a unit root in the time series sample. (no stationary)
    H1: Data IS stationary.
    
    The augmented Dickey–Fuller (ADF) statistic,
    used in the test, is a negative number.
    
    The more negative it is, the stronger the rejection of the hypothesis
    that there is a unit root at some level of confidence.
    
    If p < 0.05, we reject the null hypothesis and say data is stationary.
    
    Usage:
    ts = ts.to_numpy().ravel()
    stationarity_test_adfuller(ts)
    
    """
    
    statistic = adfuller(ts)[0]
    p_val=adfuller(ts)[1]

    if p_val < sig:
        print("Reject H0: that the time series have a unit root and is not stationary.")
        print("**Data IS stationary.**")
        print("Augmented Dickey-Fuller p_value={}".format(np.round(p_val,3)))
        print("Augmented Dickey-Fuller statistic{}".format(np.round(statistic,3)))
    else:
        print("Accept H0: that time series have a unit root and is not stationary.")
        print("**Data is NOT stationary.**")
        print("Augmented Dickey-Fuller p_value={}".format(np.round(p_val,3)))
        print("Augmented Dickey-Fuller statistic={}".format(np.round(statistic,3)))
        
stationarity_test_adfuller(ts.to_numpy().ravel())

Reject H0: that the time series have a unit root and is not stationary.
**Data IS stationary.**
Augmented Dickey-Fuller p_value=0.0
Augmented Dickey-Fuller statistic-6.02


def test_stationarity(ts,window=7):
    # rolling statistics
    rolmean = ts.rolling(window=window, center=False).mean()
    rolstd = ts.rolling(window=window, center=False).std()
    
    x = ts.to_numpy().ravel()
    
    # Plot rolling statistics
    plt.figure(figsize=(12,8))
    orig = plt.plot(x, color='blue', label='Original')
    mean = plt.plot(rolmean.values, color='red', label='Rolling Mean') 
    std = plt.plot(rolstd.values, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    
    # Augmented Dickey-Fuller test
    result = adfuller(x) 
    print(F"ADF Statistic: {result[0]:f}")
    print(F"p-value: {result[1]:f}")
    pvalue=result[1]
    for key, value in result[4].items():
        if result[0] > value:
            print("**Timeseries is NOT stationary**")
            break
        else:
            print('**Timeseries IS stationary**')
            break


test_stationarity(ts)

ADF Statistic: -6.020090
p-value: 0.000000
**Timeseries IS stationary**


import statsmodels.graphics.tsaplots
show_methods(statsmodels.graphics.tsaplots,8)


def simple_autocorrelation_plot(ts):
    fig, axes = plt.subplots(1,2, squeeze=False);
    fig.set_size_inches(14,4);
    axes[0,0].scatter(x=ts[1:], y=ts.shift(1)[1:]);
    axes[0,1].scatter(x=ts[7:], y=ts.shift(7)[7:]);
    axes[0,0].set_title('Correlation of y_t and y_t-1');
    axes[0,1].set_title('Correlation of y_t and y_t-7');
    
simple_autocorrelation_plot(ts)


def plot_correlations(ts, title="Time Series plot",
                      xlabel='',ylabel='',lags=None):
    """Correlation plots
    Row1: lineplot of time series
    Row2: correlation plot, partial-corr plot (mulitivariate), density plot
    
    References: https://github.com/statsmodels/statsmodels/blob/master/examples/notebooks/regression_plots.ipynb
    """
    from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

    # first row plot:
    # time series plot
    plt.figure(figsize=(16,4))
    plt.plot(ts)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    
    # second row plots:
    fig, axes = plt.subplots(1,3,squeeze=False)
    fig.set_size_inches(16,4)

    # acf auto correlation function
    plot_acf(ts, ax=axes[0,0], lags=lags);
    
    # pacf (partial auto correlation function for multi-variate regression)
    plot_pacf(ts, ax=axes[0,1], lags=lags);
    
    # density plot
    sns.distplot(ts, ax=axes[0,2])
    axes[0,2].set_xlabel('')
    axes[0,2].set_title("Probability Distribution")
    
plot_correlations(ts)

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)


def plot_acf_pacf(ts):
    from statsmodels.tsa.stattools import acf
    from statsmodels.tsa.stattools import pacf
    
    # data
    data = ts.to_numpy().ravel()

    # fig setup
    fig = plt.figure(1,figsize=(14,8))
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)
    
    # differencing or lagging
    data_diff = [data[i] - data[i-1] for i in range(1,len(data))]
    
    # acf and pacf
    autocorr = acf(data_diff,fft=False)
    pac = pacf(data_diff)

    # array for plotting
    x = [x for x in range(len(pac))]
    
    # acf
    ax1.plot(x[1:],autocorr[1:],label='Autocorrelation')
    ax1.set_xlabel('Lag')
    ax1.set_ylabel('Autocorrelation')
    
    # pacf
    ax2.plot(x[1:],pac[1:],label='Partial Autocorrelation')
    ax2.set_xlabel('Lag')
    ax2.set_ylabel('Partial Autocorrelation')
    
    # show
    ax1.set_xticks(range(0,50,7))
    ax2.set_xticks(range(0,50,7))
    ax1.legend()
    ax2.legend()
    plt.show()
    
plot_acf_pacf(ts)

/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/statsmodels/tsa/stattools.py:657: FutureWarning: The default number of lags is changing from 40 tomin(int(10 * np.log10(nobs)), nobs - 1) after 0.12is released. Set the number of lags to an integer to  silence this warning.
  FutureWarning,
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/statsmodels/tsa/stattools.py:1021: FutureWarning: The default number of lags is changing from 40 tomin(int(10 * np.log10(nobs)), nobs // 2 - 1) after 0.12is released. Set the number of lags to an integer to  silence this warning.
  FutureWarning,


# we can see strong correlations and anti-correlations weekly


import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import statsmodels.robust as smrb # smrb.mad() etc
import patsy # y,X1 = patsy.dmatrices(formula, df, return_type='dataframe')


res = sm.tsa.seasonal_decompose(ts, model='additive')
fig = res.plot()
fig.set_figheight(8)
fig.set_figwidth(15)


ts.rolling(7).mean().plot(figsize=(20,10), linewidth=5, fontsize=20)
# we do not see trend, it is not inreasing or decreasing.

<matplotlib.axes._subplots.AxesSubplot at 0x7fec21c50890>


for i in range(1,8):
    ts.diff(i).plot(figsize=(12,4))
    plt.title('lag period = '+ str(i))
    plt.show()


plt.figure(figsize=(12,8))
pd.plotting.autocorrelation_plot(ts)
plt.ylim(-0.25,0.75)
plt.grid(True)
plt.xticks(range(0,370,10));


plt.figure(figsize=(12,8))
pd.plotting.lag_plot(ts,c='blue')

<matplotlib.axes._subplots.AxesSubplot at 0x7fec23f364d0>


ts.head().append(ts.tail())

	date	visits
0	2016-01-01	1401667.0
1	2016-01-02	1395136.0

	visits
date
2016-01-01	1401667.0
2016-01-02	1395136.0
2016-01-03	1455522.0
2016-01-04	1750373.0
2016-01-05	1787494.0

	visits
date
2016-01-01	1401667.0
2016-02-01	2058777.0
2016-03-01	1920594.0
2016-04-01	1700498.0
2016-05-01	1402524.0
2016-06-01	1678428.0
2016-07-01	1954525.0
2016-08-01	1864873.0
2016-09-01	6878515.0
2016-10-01	1299740.0
2016-11-01	1806303.0
2016-12-01	1594861.0

	visits
date
2016-01-01	1401667.0
2016-01-02	1395136.0
2016-01-03	1455522.0
2016-01-04	1750373.0
2016-01-05	1787494.0
2016-12-27	1481319.0
2016-12-28	1399599.0
2016-12-29	1455447.0
2016-12-30	1397331.0
2016-12-31	1175657.0

Table of Contents

Data Description¶

Load the libraries¶

Useful Functions¶

Load the data¶

Timeseries Statistics¶

Normality Test¶

Stationarity Test (Augmented Dickey Fuller Test)¶

------------------------------------------------------------------------------¶

------------------------------------------------------------------------------¶

------------------------------------------------------------------------------¶

Auto correlation plots¶

Seasonal Decomposition¶

Trend Analysis¶

Seasonability Analysis¶

Periodicity and Autocorrelation¶

	0	1	2	3	4	5	6	7
0	acf	month_plot	pacf	plot_acf	plot_predict	quarter_plot	seasonal_plot	utils
1	deprecate_kwarg	np	pd	plot_pacf