Reference: https://www.kaggle.com/c/web-traffic-time-series-forecasting/data
I have cleaned the kaggle wikipedia traffic data and selected only data of 2016 with fraction of 0.1.
The data was melted and additional columns were created.
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('fivethirtyeight') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8
import os
import time
# random state
SEED=100
np.random.seed(SEED)
import functools
import gc
import IPython
from IPython.display import display, Markdown, Image
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark Bhishan Poudel 2020-10-14 CPython 3.7.7 IPython 7.18.1 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit pandas 1.1.0 matplotlib 3.2.1 numpy 1.18.4 IPython 7.18.1 json 2.0.9 watermark 2.0.2 seaborn 0.11.0 autopep8 1.5.2
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
df = pd.read_csv('../data/train_1.csv.zip',compression='zip',encoding='latin-1')
print(df.shape)
display(df.head())
(145063, 551)
Page | 2015-07-01 | 2015-07-02 | 2015-07-03 | 2015-07-04 | 2015-07-05 | 2015-07-06 | 2015-07-07 | 2015-07-08 | 2015-07-09 | ... | 2016-12-22 | 2016-12-23 | 2016-12-24 | 2016-12-25 | 2016-12-26 | 2016-12-27 | 2016-12-28 | 2016-12-29 | 2016-12-30 | 2016-12-31 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2NE1_zh.wikipedia.org_all-access_spider | 18.0 | 11.0 | 5.0 | 13.0 | 14.0 | 9.0 | 9.0 | 22.0 | 26.0 | ... | 32.0 | 63.0 | 15.0 | 26.0 | 14.0 | 20.0 | 22.0 | 19.0 | 18.0 | 20.0 |
1 | 2PM_zh.wikipedia.org_all-access_spider | 11.0 | 14.0 | 15.0 | 18.0 | 11.0 | 13.0 | 22.0 | 11.0 | 10.0 | ... | 17.0 | 42.0 | 28.0 | 15.0 | 9.0 | 30.0 | 52.0 | 45.0 | 26.0 | 20.0 |
2 | 3C_zh.wikipedia.org_all-access_spider | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 4.0 | 0.0 | 3.0 | 4.0 | ... | 3.0 | 1.0 | 1.0 | 7.0 | 4.0 | 4.0 | 6.0 | 3.0 | 4.0 | 17.0 |
3 | 4minute_zh.wikipedia.org_all-access_spider | 35.0 | 13.0 | 10.0 | 94.0 | 4.0 | 26.0 | 14.0 | 9.0 | 11.0 | ... | 32.0 | 10.0 | 26.0 | 27.0 | 16.0 | 11.0 | 17.0 | 19.0 | 10.0 | 11.0 |
4 | 52_Hz_I_Love_You_zh.wikipedia.org_all-access_s... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 48.0 | 9.0 | 25.0 | 13.0 | 3.0 | 11.0 | 27.0 | 13.0 | 36.0 | 10.0 |
5 rows × 551 columns
import functools
@functools.lru_cache(maxsize=None)
def clean_df():
# load the data
df = pd.read_csv('../data/train_1.csv.zip',compression='zip',encoding='latin-1')
# small data
df = df.sample(frac=0.01,random_state=SEED)
# description said zeros and nans are same
df = df.fillna(0)
# reduce memory
df.iloc[:,1:] = df.iloc[:,1:].astype(np.int32)
# data of year 2016 only
t1 = pd.Timestamp('2015-07-01')
t2 = pd.Timestamp('2016-01-01')
diff = (t2-t1).days
df = df.iloc[:, np.r_[0,diff+1:diff+1+366]]
# make long data
df = df.melt(id_vars=['Page'],var_name='date',value_name='visits')
# time features
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year # yyyy
df['month'] = df['date'].dt.month # 1 to 12
df['day'] = df['date'].dt.day # 1 to 31
df['quarter'] = df['date'].dt.quarter # 1 to 4
df['dayofweek'] = df['date'].dt.dayofweek # 0 to 6
df['dayofyear'] = df['date'].dt.dayofyear # 1 to 366 (leap year)
df['day_name'] = df['date'].dt.day_name() # Monday
df['month_name'] = df['date'].dt.month_name() # January
df['weekend'] = ((df['date'].dt.dayofweek) // 5 == 1)
df['weekday'] = ((df['date'].dt.dayofweek) // 5 != 1)
# agg features
df['mean'] = df.groupby('Page')['visits'].transform('mean')
df['median'] = df.groupby('Page')['visits'].transform('median')
# new features
df['agent'] = df['Page'].str.split('_').str[-1]
df['access'] = df['Page'].str.split('_').str[-2]
df['project'] = df['Page'].str.split('_').str[-3]
df['name'] = df['Page'].str.split('_').str[:-3].str.join('_')
# language
lang_map ={'en':'English','ja':'Japanese','de':'German',
'www':'Media','fr':'French','zh':'Chinese',
'ru':'Russian','es':'Spanish','commons': 'Media'
}
df['lang'] = df['project'].str.split('.').str[0]
df['language'] = df['lang'].map(lang_map)
# all the year is 2016,drop it.
df.drop('year',axis=1,inplace=True)
# memory reduction
cols_int = ['visits']
cols_cat = ['month','day','quarter','day_name','month_name',
'project','access','agent','language']
cols_float = ['mean','median']
for c in cols_int: df[c] = df[c].astype(np.int32)
for c in cols_float: df[c] = df[c].astype(np.float32)
for c in cols_cat: df[c] = df[c].astype(pd.api.types.CategoricalDtype())
# make day_name and month_name ordered categorical
cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df['day_name'] = pd.Categorical(df['day_name'], ordered=True, categories=cats)
cats = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
'August', 'September', 'October', 'November', 'December']
df['month_name'] = pd.Categorical(df['month_name'], ordered=True, categories=cats)
return df
df = clean_df()
df.head()
Page | date | visits | month | day | quarter | dayofweek | dayofyear | day_name | month_name | weekend | weekday | mean | median | agent | access | project | name | lang | language | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Sean_Connery_en.wikipedia.org_desktop_all-agents | 2016-01-01 | 4872 | 1 | 1 | 1 | 4 | 1 | Friday | January | False | True | 3405.661133 | 2624.0 | all-agents | desktop | en.wikipedia.org | Sean_Connery | en | English |
1 | Tableau_des_médailles_des_Jeux_olympiques_d'ét... | 2016-01-01 | 6 | 1 | 1 | 1 | 4 | 1 | Friday | January | False | True | 170.841537 | 18.0 | all-agents | desktop | fr.wikipedia.org | Tableau_des_médailles_des_Jeux_olympiques_d'ét... | fr | French |
2 | The_Undertaker_fr.wikipedia.org_mobile-web_all... | 2016-01-01 | 469 | 1 | 1 | 1 | 4 | 1 | Friday | January | False | True | 400.336060 | 345.5 | all-agents | mobile-web | fr.wikipedia.org | The_Undertaker | fr | French |
3 | Category:Outdoor_sex_commons.wikimedia.org_all... | 2016-01-01 | 142 | 1 | 1 | 1 | 4 | 1 | Friday | January | False | True | 205.174866 | 193.0 | all-agents | all-access | commons.wikimedia.org | Category:Outdoor_sex | commons | Media |
4 | Камызяк_ru.wikipedia.org_all-access_all-agents | 2016-01-01 | 6692 | 1 | 1 | 1 | 4 | 1 | Friday | January | False | True | 912.516418 | 559.0 | all-agents | all-access | ru.wikipedia.org | Камызяк | ru | Russian |
cols_cat = ['month','day','quarter','day_name','month_name',
'project','access','agent','language']
for c in cols_cat:
print(c)
print(df[c].value_counts().sort_index())
print()
month 1 44981 2 42079 3 44981 4 43530 5 44981 6 43530 7 44981 8 44981 9 43530 10 44981 11 43530 12 44981 Name: month, dtype: int64 day 1 17412 2 17412 3 17412 4 17412 5 17412 6 17412 7 17412 8 17412 9 17412 10 17412 11 17412 12 17412 13 17412 14 17412 15 17412 16 17412 17 17412 18 17412 19 17412 20 17412 21 17412 22 17412 23 17412 24 17412 25 17412 26 17412 27 17412 28 17412 29 17412 30 15961 31 10157 Name: day, dtype: int64 quarter 1 132041 2 132041 3 133492 4 133492 Name: quarter, dtype: int64 day_name Monday 75452 Tuesday 75452 Wednesday 75452 Thursday 75452 Friday 76903 Saturday 76903 Sunday 75452 Name: day_name, dtype: int64 month_name January 44981 February 42079 March 44981 April 43530 May 44981 June 43530 July 44981 August 44981 September 43530 October 44981 November 43530 December 44981 Name: month_name, dtype: int64 project commons.wikimedia.org 39528 de.wikipedia.org 70638 en.wikipedia.org 80520 es.wikipedia.org 46482 fr.wikipedia.org 68808 ja.wikipedia.org 73566 ru.wikipedia.org 53436 www.mediawiki.org 31110 zh.wikipedia.org 66978 Name: project, dtype: int64 access all-access 262056 desktop 128100 mobile-web 140910 Name: access, dtype: int64 agent all-agents 410286 spider 120780 Name: agent, dtype: int64 language Chinese 66978 English 80520 French 68808 German 70638 Japanese 73566 Media 70638 Russian 53436 Spanish 46482 Name: language, dtype: int64
print(df.shape)
df.head()
(531066, 20)
Page | date | visits | month | day | quarter | dayofweek | dayofyear | day_name | month_name | weekend | weekday | mean | median | agent | access | project | name | lang | language | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Sean_Connery_en.wikipedia.org_desktop_all-agents | 2016-01-01 | 4872 | 1 | 1 | 1 | 4 | 1 | Friday | January | False | True | 3405.661133 | 2624.0 | all-agents | desktop | en.wikipedia.org | Sean_Connery | en | English |
1 | Tableau_des_médailles_des_Jeux_olympiques_d'ét... | 2016-01-01 | 6 | 1 | 1 | 1 | 4 | 1 | Friday | January | False | True | 170.841537 | 18.0 | all-agents | desktop | fr.wikipedia.org | Tableau_des_médailles_des_Jeux_olympiques_d'ét... | fr | French |
2 | The_Undertaker_fr.wikipedia.org_mobile-web_all... | 2016-01-01 | 469 | 1 | 1 | 1 | 4 | 1 | Friday | January | False | True | 400.336060 | 345.5 | all-agents | mobile-web | fr.wikipedia.org | The_Undertaker | fr | French |
3 | Category:Outdoor_sex_commons.wikimedia.org_all... | 2016-01-01 | 142 | 1 | 1 | 1 | 4 | 1 | Friday | January | False | True | 205.174866 | 193.0 | all-agents | all-access | commons.wikimedia.org | Category:Outdoor_sex | commons | Media |
4 | Камызяк_ru.wikipedia.org_all-access_all-agents | 2016-01-01 | 6692 | 1 | 1 | 1 | 4 | 1 | Friday | January | False | True | 912.516418 | 559.0 | all-agents | all-access | ru.wikipedia.org | Камызяк | ru | Russian |
df['Page'].nunique() # there are 14.5k unique pages visited in 2016
1451
df.groupby('Page')['visits'].sum()
Page 10月31日_zh.wikipedia.org_all-access_spider 3728 11.22.63_ru.wikipedia.org_desktop_all-agents 198142 12月23日_ja.wikipedia.org_desktop_all-agents 50885 14._September_de.wikipedia.org_desktop_all-agents 40060 1896_Summer_Olympics_en.wikipedia.org_all-access_all-agents 1390141 ... 麻美ゆま_ja.wikipedia.org_mobile-web_all-agents 435128 黃仲裕_zh.wikipedia.org_mobile-web_all-agents 27658 黃毓民_zh.wikipedia.org_all-access_all-agents 212466 黒田孝高_ja.wikipedia.org_all-access_all-agents 804791 龍八夷_zh.wikipedia.org_desktop_all-agents 192697 Name: visits, Length: 1451, dtype: int32
df.groupby('Page')['visits'].sum().sort_values(ascending=False)
Page Special:RecentChangesLinked_en.wikipedia.org_desktop_all-agents 25548886 Game_of_Thrones_en.wikipedia.org_mobile-web_all-agents 7152769 Orange_Is_the_New_Black_en.wikipedia.org_all-access_all-agents 6241503 User:GoogleAnalitycsRoman10/google-api_en.wikipedia.org_desktop_all-agents 5356326 The_Jungle_Book_(2016_film)_en.wikipedia.org_mobile-web_all-agents 4970762 ... January_2017_European_cold_wave_en.wikipedia.org_desktop_all-agents 0 Resident_Evil_7:_Biohazard_de.wikipedia.org_all-access_spider 0 漫漫回家路_(2016年電影)_zh.wikipedia.org_mobile-web_all-agents 0 File:Interstate_275_near_I-96_and_M-14_(wide_crop).jpg_commons.wikimedia.org_mobile-web_all-agents 0 極品絕配_zh.wikipedia.org_desktop_all-agents 0 Name: visits, Length: 1451, dtype: int32
df.groupby('Page')['visits'].sum().nlargest(5)
Page Special:RecentChangesLinked_en.wikipedia.org_desktop_all-agents 25548886 Game_of_Thrones_en.wikipedia.org_mobile-web_all-agents 7152769 Orange_Is_the_New_Black_en.wikipedia.org_all-access_all-agents 6241503 User:GoogleAnalitycsRoman10/google-api_en.wikipedia.org_desktop_all-agents 5356326 The_Jungle_Book_(2016_film)_en.wikipedia.org_mobile-web_all-agents 4970762 Name: visits, dtype: int32
df.query(""" Page == 'Special:Search_en.wikipedia.org_desktop_all-agents' """).head()
Page | date | visits | month | day | quarter | dayofweek | dayofyear | day_name | month_name | weekend | weekday | mean | median | agent | access | project | name | lang | language |
---|
# df.groupby('language')['visits'].nlargest(5)
# df.groupby('language')['visits'].apply(lambda x: x.nlargest(5))
# df.groupby('language')['visits'].apply(lambda x: x.nlargest(5).index)
df.groupby('language')['visits'].apply(lambda x: df.loc[x.nlargest(5).index])
Page | date | visits | month | day | quarter | dayofweek | dayofyear | day_name | month_name | weekend | weekday | mean | median | agent | access | project | name | lang | language | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
language | |||||||||||||||||||||
Chinese | 359618 | 屈原_zh.wikipedia.org_desktop_all-agents | 2016-09-04 | 79490 | 9 | 4 | 3 | 6 | 248 | Sunday | September | True | False | 580.912598 | 235.5 | all-agents | desktop | zh.wikipedia.org | 屈原 | zh | Chinese |
516455 | 冬至_zh.wikipedia.org_desktop_all-agents | 2016-12-21 | 52874 | 12 | 21 | 4 | 2 | 356 | Wednesday | December | False | True | 378.784149 | 102.0 | all-agents | desktop | zh.wikipedia.org | 冬至 | zh | Chinese | |
38373 | 孟庭麗_zh.wikipedia.org_desktop_all-agents | 2016-01-27 | 40898 | 1 | 27 | 1 | 2 | 27 | Wednesday | January | False | True | 493.920776 | 60.0 | all-agents | desktop | zh.wikipedia.org | 孟庭麗 | zh | Chinese | |
121852 | 乙武洋匡_zh.wikipedia.org_all-access_all-agents | 2016-03-24 | 35694 | 3 | 24 | 1 | 3 | 84 | Thursday | March | False | True | 373.920776 | 98.0 | all-agents | all-access | zh.wikipedia.org | 乙武洋匡 | zh | Chinese | |
225587 | 六四事件_zh.wikipedia.org_desktop_all-agents | 2016-06-04 | 34491 | 6 | 4 | 2 | 5 | 156 | Saturday | June | True | False | 2462.322510 | 2110.5 | all-agents | desktop | zh.wikipedia.org | 六四事件 | zh | Chinese | |
English | 150393 | Special:RecentChangesLinked_en.wikipedia.org_d... | 2016-04-13 | 853784 | 4 | 13 | 2 | 2 | 104 | Wednesday | April | False | True | 69805.703125 | 34649.5 | all-agents | desktop | en.wikipedia.org | Special:RecentChangesLinked | en | English |
139512 | 1896_Summer_Olympics_en.wikipedia.org_all-acce... | 2016-04-06 | 628696 | 4 | 6 | 2 | 2 | 97 | Wednesday | April | False | True | 3798.199463 | 962.0 | all-agents | all-access | en.wikipedia.org | 1896_Summer_Olympics | en | English | |
456554 | Special:RecentChangesLinked_en.wikipedia.org_d... | 2016-11-10 | 554336 | 11 | 10 | 4 | 3 | 315 | Thursday | November | False | True | 69805.703125 | 34649.5 | all-agents | desktop | en.wikipedia.org | Special:RecentChangesLinked | en | English | |
458005 | Special:RecentChangesLinked_en.wikipedia.org_d... | 2016-11-11 | 537826 | 11 | 11 | 4 | 4 | 316 | Friday | November | False | True | 69805.703125 | 34649.5 | all-agents | desktop | en.wikipedia.org | Special:RecentChangesLinked | en | English | |
227427 | Issus_(genus)_en.wikipedia.org_all-access_all-... | 2016-06-05 | 526014 | 6 | 5 | 2 | 6 | 157 | Sunday | June | True | False | 1492.535522 | 20.0 | all-agents | all-access | en.wikipedia.org | Issus_(genus) | en | English | |
French | 28471 | Guy_Georges_fr.wikipedia.org_all-access_all-ag... | 2016-01-20 | 120647 | 1 | 20 | 1 | 2 | 20 | Wednesday | January | False | True | 1541.696777 | 474.0 | all-agents | all-access | fr.wikipedia.org | Guy_Georges | fr | French |
283278 | Fête_nationale_française_fr.wikipedia.org_desk... | 2016-07-14 | 88702 | 7 | 14 | 3 | 3 | 196 | Thursday | July | False | True | 724.303284 | 332.5 | all-agents | desktop | fr.wikipedia.org | Fête_nationale_française | fr | French | |
97415 | Journée_internationale_des_femmes_fr.wikipedia... | 2016-03-08 | 64350 | 3 | 8 | 1 | 1 | 68 | Tuesday | March | False | True | 346.448090 | 48.0 | all-agents | mobile-web | fr.wikipedia.org | Journée_internationale_des_femmes | fr | French | |
44381 | Michel_Bouquet_fr.wikipedia.org_mobile-web_all... | 2016-01-31 | 57214 | 1 | 31 | 1 | 6 | 31 | Sunday | January | True | False | 401.893433 | 105.0 | all-agents | mobile-web | fr.wikipedia.org | Michel_Bouquet | fr | French | |
58634 | Zoo_York_fr.wikipedia.org_desktop_all-agents | 2016-02-10 | 50164 | 2 | 10 | 1 | 2 | 41 | Wednesday | February | False | True | 142.336060 | 5.0 | all-agents | desktop | fr.wikipedia.org | Zoo_York | fr | French | |
German | 444962 | Gerätestecker_de.wikipedia.org_desktop_all-agents | 2016-11-02 | 558381 | 11 | 2 | 4 | 2 | 307 | Wednesday | November | False | True | 1870.743164 | 398.0 | all-agents | desktop | de.wikipedia.org | Gerätestecker | de | German |
458449 | San_Marino_de.wikipedia.org_all-access_all-agents | 2016-11-11 | 381926 | 11 | 11 | 4 | 4 | 316 | Friday | November | False | True | 2644.194092 | 997.0 | all-agents | all-access | de.wikipedia.org | San_Marino | de | German | |
181165 | Bosseborn_de.wikipedia.org_desktop_all-agents | 2016-05-04 | 363750 | 5 | 4 | 2 | 2 | 125 | Wednesday | May | False | True | 1012.385254 | 5.0 | all-agents | desktop | de.wikipedia.org | Bosseborn | de | German | |
458137 | Leonard_Cohen_de.wikipedia.org_desktop_all-agents | 2016-11-11 | 146315 | 11 | 11 | 4 | 4 | 316 | Friday | November | False | True | 1144.467163 | 289.0 | all-agents | desktop | de.wikipedia.org | Leonard_Cohen | de | German | |
530290 | Dinner_for_One_de.wikipedia.org_all-access_all... | 2016-12-31 | 133437 | 12 | 31 | 4 | 5 | 366 | Saturday | December | True | False | 974.609314 | 338.0 | all-agents | all-access | de.wikipedia.org | Dinner_for_One | de | German | |
Japanese | 187547 | ポラロイド_ja.wikipedia.org_desktop_all-agents | 2016-05-09 | 1000441 | 5 | 9 | 2 | 0 | 130 | Monday | May | False | True | 2780.065674 | 44.0 | all-agents | desktop | ja.wikipedia.org | ポラロイド | ja | Japanese |
118036 | MediaWiki:EnhancedCollapsibleElements.js_ja.wi... | 2016-03-22 | 611845 | 3 | 22 | 1 | 1 | 82 | Tuesday | March | False | True | 1675.147583 | 2.0 | all-agents | all-access | ja.wikipedia.org | MediaWiki:EnhancedCollapsibleElements.js | ja | Japanese | |
399206 | O・J・シンプソン事件_ja.wikipedia.org_mobile-web_all-ag... | 2016-10-02 | 277713 | 10 | 2 | 4 | 6 | 276 | Sunday | October | True | False | 1231.292358 | 174.5 | all-agents | mobile-web | ja.wikipedia.org | O・J・シンプソン事件 | ja | Japanese | |
16774 | 田京恵_ja.wikipedia.org_all-access_all-agents | 2016-01-12 | 246433 | 1 | 12 | 1 | 1 | 12 | Tuesday | January | False | True | 916.576477 | 95.5 | all-agents | all-access | ja.wikipedia.org | 田京恵 | ja | Japanese | |
286452 | 尾崎豊_ja.wikipedia.org_all-access_all-agents | 2016-07-16 | 167035 | 7 | 16 | 3 | 5 | 198 | Saturday | July | True | False | 3884.458984 | 2264.0 | all-agents | all-access | ja.wikipedia.org | 尾崎豊 | ja | Japanese | |
Media | 126253 | MediaWiki:Sitenotice-translation_commons.wikim... | 2016-03-28 | 84197 | 3 | 28 | 1 | 0 | 88 | Monday | March | False | True | 1044.579224 | 1.0 | all-agents | desktop | commons.wikimedia.org | MediaWiki:Sitenotice-translation | commons | Media |
127704 | MediaWiki:Sitenotice-translation_commons.wikim... | 2016-03-29 | 80325 | 3 | 29 | 1 | 1 | 89 | Tuesday | March | False | True | 1044.579224 | 1.0 | all-agents | desktop | commons.wikimedia.org | MediaWiki:Sitenotice-translation | commons | Media | |
129155 | MediaWiki:Sitenotice-translation_commons.wikim... | 2016-03-30 | 80210 | 3 | 30 | 1 | 2 | 90 | Wednesday | March | False | True | 1044.579224 | 1.0 | all-agents | desktop | commons.wikimedia.org | MediaWiki:Sitenotice-translation | commons | Media | |
214764 | MediaWiki:Sitenotice-translation_commons.wikim... | 2016-05-28 | 67617 | 5 | 28 | 2 | 5 | 149 | Saturday | May | True | False | 1044.579224 | 1.0 | all-agents | desktop | commons.wikimedia.org | MediaWiki:Sitenotice-translation | commons | Media | |
130606 | MediaWiki:Sitenotice-translation_commons.wikim... | 2016-03-31 | 41197 | 3 | 31 | 1 | 3 | 91 | Thursday | March | False | True | 1044.579224 | 1.0 | all-agents | desktop | commons.wikimedia.org | MediaWiki:Sitenotice-translation | commons | Media | |
Russian | 226282 | Али,_Мохаммед_ru.wikipedia.org_all-access_all-... | 2016-06-04 | 562514 | 6 | 4 | 2 | 5 | 156 | Saturday | June | True | False | 5483.262207 | 2232.0 | all-agents | all-access | ru.wikipedia.org | Али,_Мохаммед | ru | Russian |
227733 | Али,_Мохаммед_ru.wikipedia.org_all-access_all-... | 2016-06-05 | 163648 | 6 | 5 | 2 | 6 | 157 | Sunday | June | True | False | 5483.262207 | 2232.0 | all-agents | all-access | ru.wikipedia.org | Али,_Мохаммед | ru | Russian | |
229184 | Али,_Мохаммед_ru.wikipedia.org_all-access_all-... | 2016-06-06 | 87524 | 6 | 6 | 2 | 0 | 158 | Monday | June | False | True | 5483.262207 | 2232.0 | all-agents | all-access | ru.wikipedia.org | Али,_Мохаммед | ru | Russian | |
338886 | Служебная:Все_страницы_ru.wikipedia.org_all-ac... | 2016-08-21 | 82452 | 8 | 21 | 3 | 6 | 234 | Sunday | August | True | False | 6155.705078 | 4508.5 | all-agents | all-access | ru.wikipedia.org | Служебная:Все_страницы | ru | Russian | |
406519 | Путин,_Владимир_Владимирович_ru.wikipedia.org_... | 2016-10-07 | 81917 | 10 | 7 | 4 | 4 | 281 | Friday | October | False | True | 10498.401367 | 9335.0 | all-agents | all-access | ru.wikipedia.org | Путин,_Владимир_Владимирович | ru | Russian | |
Spanish | 455139 | Colegio_Electoral_de_los_Estados_Unidos_es.wik... | 2016-11-09 | 224685 | 11 | 9 | 4 | 2 | 314 | Wednesday | November | False | True | 1188.204956 | 132.5 | all-agents | mobile-web | es.wikipedia.org | Colegio_Electoral_de_los_Estados_Unidos | es | Spanish |
454614 | Melania_Trump_es.wikipedia.org_all-access_all-... | 2016-11-09 | 215610 | 11 | 9 | 4 | 2 | 314 | Wednesday | November | False | True | 3180.704834 | 888.5 | all-agents | all-access | es.wikipedia.org | Melania_Trump | es | Spanish | |
450000 | Aneuploidía_es.wikipedia.org_all-access_all-ag... | 2016-11-06 | 186042 | 11 | 6 | 4 | 6 | 311 | Sunday | November | True | False | 1083.666626 | 476.0 | all-agents | all-access | es.wikipedia.org | Aneuploidía | es | Spanish | |
473461 | Ramón_Serrano_Suñer_es.wikipedia.org_all-acces... | 2016-11-22 | 163644 | 11 | 22 | 4 | 1 | 327 | Tuesday | November | False | True | 2269.912598 | 210.0 | all-agents | all-access | es.wikipedia.org | Ramón_Serrano_Suñer | es | Spanish | |
456065 | Melania_Trump_es.wikipedia.org_all-access_all-... | 2016-11-10 | 118104 | 11 | 10 | 4 | 3 | 315 | Thursday | November | False | True | 3180.704834 | 888.5 | all-agents | all-access | es.wikipedia.org | Melania_Trump | es | Spanish |
idx = df.groupby('Page')['visits'].sum().idxmax()
df.query(""" Page == @idx """).head()
Page | date | visits | month | day | quarter | dayofweek | dayofyear | day_name | month_name | weekend | weekday | mean | median | agent | access | project | name | lang | language | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
940 | Special:RecentChangesLinked_en.wikipedia.org_d... | 2016-01-01 | 28809 | 1 | 1 | 1 | 4 | 1 | Friday | January | False | True | 69805.703125 | 34649.5 | all-agents | desktop | en.wikipedia.org | Special:RecentChangesLinked | en | English |
2391 | Special:RecentChangesLinked_en.wikipedia.org_d... | 2016-01-02 | 23581 | 1 | 2 | 1 | 5 | 2 | Saturday | January | True | False | 69805.703125 | 34649.5 | all-agents | desktop | en.wikipedia.org | Special:RecentChangesLinked | en | English |
3842 | Special:RecentChangesLinked_en.wikipedia.org_d... | 2016-01-03 | 42066 | 1 | 3 | 1 | 6 | 3 | Sunday | January | True | False | 69805.703125 | 34649.5 | all-agents | desktop | en.wikipedia.org | Special:RecentChangesLinked | en | English |
5293 | Special:RecentChangesLinked_en.wikipedia.org_d... | 2016-01-04 | 16696 | 1 | 4 | 1 | 0 | 4 | Monday | January | False | True | 69805.703125 | 34649.5 | all-agents | desktop | en.wikipedia.org | Special:RecentChangesLinked | en | English |
6744 | Special:RecentChangesLinked_en.wikipedia.org_d... | 2016-01-05 | 24761 | 1 | 5 | 1 | 1 | 5 | Tuesday | January | False | True | 69805.703125 | 34649.5 | all-agents | desktop | en.wikipedia.org | Special:RecentChangesLinked | en | English |
ts = df.query(""" Page == @idx """)[['date','visits']].set_index('date')
print(ts.shape)
ts.head()
(366, 1)
visits | |
---|---|
date | |
2016-01-01 | 28809 |
2016-01-02 | 23581 |
2016-01-03 | 42066 |
2016-01-04 | 16696 |
2016-01-05 | 24761 |
ts.plot()
# ts is periodic
# ts has some very large peaks
# ts in not going upward, it does not have trend (it may have if I have more years)
<matplotlib.axes._subplots.AxesSubplot at 0x7fbd04e4f950>
ts.groupby(ts.index.month).plot();
df.head(2)
Page | date | visits | month | day | quarter | dayofweek | dayofyear | day_name | month_name | weekend | weekday | mean | median | agent | access | project | name | lang | language | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Sean_Connery_en.wikipedia.org_desktop_all-agents | 2016-01-01 | 4872 | 1 | 1 | 1 | 4 | 1 | Friday | January | False | True | 3405.661133 | 2624.0 | all-agents | desktop | en.wikipedia.org | Sean_Connery | en | English |
1 | Tableau_des_médailles_des_Jeux_olympiques_d'ét... | 2016-01-01 | 6 | 1 | 1 | 1 | 4 | 1 | Friday | January | False | True | 170.841537 | 18.0 | all-agents | desktop | fr.wikipedia.org | Tableau_des_médailles_des_Jeux_olympiques_d'ét... | fr | French |
fname_lang_monthly_mean = '../reports/figures/2016_sample001_monthly_visits.png'
if not os.path.isfile(fname_lang_monthly_mean):
plt.figure(figsize=(12,12))
sns.pointplot(x="month_name", y="visits", hue='language', data=df,estimator='mean')
plt.savefig(fname_lang_monthly_mean, dpi=300)
Image(fname_lang_monthly_mean)
# df.groupby(['month_name', 'language'])['visits'].sum().unstack().reset_index()
# TypeError: cannot insert an item into a CategoricalIndex that is not already an existing category
df.groupby(['month_name', 'language'])['visits'].mean().unstack()
language | Chinese | English | French | German | Japanese | Media | Russian | Spanish |
---|---|---|---|---|---|---|---|---|
month_name | ||||||||
January | 342.459193 | 2621.086510 | 573.951784 | 512.001337 | 867.843043 | 86.918603 | 891.316836 | 879.596647 |
February | 329.817599 | 2338.958621 | 549.187087 | 530.061461 | 674.786584 | 97.250313 | 821.931507 | 1185.318219 |
March | 301.278336 | 2168.100733 | 516.006177 | 541.103961 | 663.204943 | 144.932475 | 821.942996 | 1180.546355 |
April | 313.879964 | 3600.553636 | 470.413298 | 456.632297 | 665.828192 | 116.374957 | 789.744064 | 1166.903412 |
May | 374.369822 | 2445.316129 | 461.631263 | 600.784891 | 915.704381 | 112.023901 | 726.005303 | 1289.404369 |
June | 475.481603 | 2828.781364 | 383.480142 | 572.915371 | 708.529519 | 96.194128 | 910.601826 | 1247.700000 |
July | 378.291380 | 2493.361437 | 361.601407 | 440.325088 | 738.719628 | 82.590172 | 620.536677 | 1085.744729 |
August | 330.799048 | 2101.702933 | 403.728895 | 382.253050 | 713.152143 | 104.909243 | 683.399912 | 1305.298959 |
September | 325.865756 | 2338.375455 | 397.074823 | 457.672193 | 675.521559 | 113.057513 | 668.401598 | 1370.632808 |
October | 350.985722 | 2059.808211 | 410.550961 | 467.847902 | 737.381159 | 112.381915 | 728.887539 | 1194.172212 |
November | 394.443716 | 3549.760455 | 457.678723 | 757.351813 | 736.341294 | 104.715371 | 744.500228 | 1586.094751 |
December | 402.447735 | 2727.078152 | 467.288092 | 509.881665 | 748.898732 | 110.517466 | 864.160407 | 1003.130302 |
ax = df.groupby(['month_name', 'language'])['visits'].mean().unstack()\
.reset_index(drop=True).plot(figsize=(12,12), logy=False )
plt.xticks(range(12), rotation=90)
ax.set_xticklabels(df['month_name'].unique());
# exclude english and compare others
# plt.figure(figsize=(12,12))
# df.query("""language != 'English'""").pipe((sns.pointplot,'data'), x='month_name',
# y='visits',hue='language')
# # ValueError: 'c' argument has 12 elements, which is not acceptable for use with 'x' with size 0, 'y' with size 0.
ax = df.query("""language != 'English'""")\
.groupby(['month_name', 'language'])['visits'].mean().unstack()\
.reset_index(drop=True).plot(figsize=(12,12), logy=False )
plt.xticks(range(12), rotation=90)
ax.set_xticklabels(df['month_name'].unique());
df.groupby(['dayofyear', 'language'])['visits'].mean().unstack().plot(figsize=(12,12))
plt.savefig('../reports/figures/daily_visits.png',dpi=300)
%%time
# plt.figure(figsize=(12,12))
# sns.lineplot(x='dayofyear',y='visits',hue='language',data=df)
# Wall time: 2min 27s
CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs Wall time: 4.05 µs
df['day_name'].unique().categories
Index(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], dtype='object')
%%time
sns.barplot(x='day_name',y='visits',data=df,
order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
'Sunday'])
plt.savefig('../reports/figures/visits_by_weekday.png')
CPU times: user 2.56 s, sys: 22.2 ms, total: 2.58 s Wall time: 2.68 s
%%time
df.groupby('day_name')['visits'].mean().sort_index().plot.bar()
CPU times: user 28.7 ms, sys: 2.51 ms, total: 31.2 ms Wall time: 30.2 ms
<matplotlib.axes._subplots.AxesSubplot at 0x7fbd0739f7d0>
%%time
ax = df.groupby('day_name')['visits'].mean().sort_index(ascending=False)\
.plot.barh(use_index=True, color=sns.color_palette('husl',7))
plt.tick_params(axis='y', which='both', labelright='on')
CPU times: user 30.2 ms, sys: 1.75 ms, total: 31.9 ms Wall time: 35.4 ms
df.groupby(['day_name','month_name'])['visits'].mean().sort_index(ascending=False).unstack().plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7fbd06d86850>
%%time
sns.catplot(data=df, kind='bar',ci=None,
x='day_name',y='visits',hue='month_name')
plt.xticks(rotation=90)
plt.savefig('../reports/figures/visits_per_month_per_weekday.png')
CPU times: user 3.11 s, sys: 126 ms, total: 3.23 s Wall time: 2.93 s
df.groupby(['day_name','month_name'])['visits'].mean().sort_index(ascending=False).unstack(0).plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7fbd076afe50>
df1 = df.groupby(["day_name", "day"])['visits'].mean().reset_index()\
.pivot('day','day_name','visits').dropna()
fig, ax = plt.subplots(figsize=(50, 30))
sns.heatmap(data=df1, annot=False, ax=ax, fmt="d", linewidths=2).invert_yaxis()
plt.title('Web Traffic per Days of Week',fontsize=28)
plt.xlabel('Week Day Name', fontsize=28)
plt.ylabel('Day of Month', fontsize=28)
plt.xticks(fontsize=28)
plt.yticks(fontsize=28)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=28)
plt.savefig('../reports/figures/visits_per_dayofmonth_per_dayofweek.png')
plt.show()
df1 = df.groupby(["month_name", "day"])['visits'].mean().reset_index()\
.pivot('day','month_name','visits').dropna()
fig, ax = plt.subplots(figsize=(50, 30))
# sns.set(font_scale=3)
sns.heatmap(data=df1, annot=False, ax=ax, fmt="d", linewidths=2).invert_yaxis()
plt.title('Web Traffic for Months per days of month',fontsize=28)
plt.xlabel('Month Name', fontsize=28)
plt.ylabel('Day of Month', fontsize=28)
plt.xticks(fontsize=28)
plt.yticks(fontsize=28)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=28)
plt.show()
Resources:
Here we can that the plots seems periodic in nature in time domain. We can work in the frequency domain using FFT transformation of the time series. Peaks in the FFT show us the strongest frequencies in the periodic signal.
The Fourier transform is an alternative representation of a signal as a superposition of periodic components. It is an important mathematical result that any well-behaved function can be represented under this form. Whereas a time-varying signal is most naturally considered as a function of time, the Fourier transform represents it as a function of the frequency. A magnitude and a phase, which are both encoded in a single complex number, are associated to each frequency.
The Discrete Fourier Transform
Let's consider a digital signal x represented by a vector $(x0,...,xN−1)$. We assume that this signal is regularly sampled. The Discrete Fourier Transform (DFT) of x is $X=(X0,...,XN−1)$ defined as:
The DFT can be computed efficiently with the Fast Fourier Transform (FFT), an algorithm that exploits symmetries and redundancies in this definition to considerably speed up the computation. The complexity of the FFT is $O(NlogN)$ instead of $O(N^2)$ for the naive DFT. The FFT is one of the most important algorithms of the digital universe.
days = df['dayofyear'].unique()
df_daily = df.groupby(['dayofyear', 'language'])['visits'].mean().unstack()
df_daily.head()
language | Chinese | English | French | German | Japanese | Media | Russian | Spanish |
---|---|---|---|---|---|---|---|---|
dayofyear | ||||||||
1 | 306.234973 | 2600.668182 | 681.744681 | 761.170984 | 961.223881 | 69.383420 | 1060.171233 | 545.700787 |
2 | 381.191257 | 3011.259091 | 635.835106 | 589.082902 | 1078.119403 | 73.911917 | 884.945205 | 657.291339 |
3 | 407.907104 | 3176.172727 | 639.170213 | 645.492228 | 1071.691542 | 78.678756 | 796.493151 | 730.070866 |
4 | 327.142077 | 2959.886364 | 655.047872 | 499.595855 | 984.273632 | 85.523316 | 789.609589 | 753.275591 |
5 | 318.710383 | 3342.718182 | 519.462766 | 455.051813 | 860.353234 | 87.911917 | 771.342466 | 686.732283 |
df_daily.columns
CategoricalIndex(['Chinese', 'English', 'French', 'German', 'Japanese', 'Media', 'Russian', 'Spanish'], categories=['Chinese', 'English', 'French', 'German', 'Japanese', 'Media', 'Russian', 'Spanish'], ordered=False, name='language', dtype='category')
def plot_with_fft(df_daily, col):
from scipy.fftpack import fft
fig = plt.figure(1,figsize=[15,5])
plt.ylabel('Views per Page')
plt.xlabel('Day')
plt.title(col)
plt.xticks(range(0,370,10))
plt.plot(days, df_daily[col].to_numpy(),label = col )
fig = plt.figure(2,figsize=[15,5])
fft_complex = fft(df_daily[col].to_numpy())
fft_mag = [np.sqrt(np.real(x)*np.real(x)+np.imag(x)*np.imag(x))
for x in fft_complex]
fft_xvals = [day / days[-1] for day in days]
npts = len(fft_xvals) // 2 + 1
fft_mag = fft_mag[:npts]
fft_xvals = fft_xvals[:npts]
plt.ylabel('FFT Magnitude')
plt.xlabel(r"Frequency [days]$^{-1}$")
plt.title('Fourier Transform')
plt.plot(fft_xvals[1:],fft_mag[1:],label = col )
# Draw lines at 1, 1/2, and 1/3 week periods
plt.axvline(x=1./7,color='red',alpha=0.3)
plt.axvline(x=2./7,color='red',alpha=0.3)
plt.axvline(x=3./7,color='red',alpha=0.3)
plt.show()
for col in df_daily.columns:
plot_with_fft(df_daily, col)