import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import os
import time

# Seed and shared RandomState instance for reproducible random operations.
SEED = 0
RNG = np.random.RandomState(SEED)

# pandas display limits for the notebook, given as (option, value) pairs.
# Use None for 'display.max_rows' to show all the rows.
pd.set_option(
    'display.max_columns', 200,
    'display.max_rows', 100,
    'display.max_colwidth', 50,
)

# Report the name and version of each core library in use.
print([(lib.__name__, lib.__version__) for lib in [np, pd, sns, matplotlib]])

[('numpy', '1.16.4'), ('pandas', '0.25.0'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]


import scipy
from scipy import stats


import IPython
from IPython.display import display


from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler


def show_method_attributes(method, ncols=7):
    """Tabulate the attribute names of an object.

    Names are taken from ``dir(method)``. Only names whose first character
    is a lowercase letter are kept, and a fixed set of module-like names
    (os, np, pd, sys, time, psycopg2) is left out.

    Parameters
    ----------
    method : object
        Object whose attributes are listed.
    ncols : int, default 7
        Number of columns the names are spread over.

    Returns
    -------
    pandas.DataFrame
        The names laid out column by column; unused cells hold ''.

    Example:
    ========
    show_method_attributes(list)
    """
    skipped = 'os np pd sys time psycopg2'.split()
    names = [
        name for name in dir(method)
        if name[0].islower() and name not in skipped
    ]

    # One chunk per output column; chunk lengths may differ by one.
    chunks = np.array_split(names, ncols)
    table = pd.DataFrame(chunks).T
    return table.fillna('')


def json_dump_tofile(myjson, ofile, sort_keys=False):
    """Write a JSON-serializable object to a UTF-8 text file.

    The output is pretty-printed with a 4-space indent and non-ASCII
    characters are written as-is rather than escaped.

    Parameters
    ----------
    myjson : dict or any other JSON-serializable object
        Data to write.
    ofile : str or path-like
        Destination file; it is created or overwritten.
    sort_keys : bool, default False
        Whether dictionary keys are written in sorted order.

    Raises
    ------
    TypeError
        If ``myjson`` contains an object that json cannot serialize. The
        destination file is left untouched in that case.

    Usage:
    myjson = {'num': 5, 'my_list': [1, 2, 'apple']}
    json_dump_tofile(myjson, ofile)

    """
    import json

    # Serialize before opening the file: opening in 'w' mode truncates it,
    # so a serialization error must not happen after that point.
    json_str = json.dumps(myjson,
                          indent=4,
                          sort_keys=sort_keys,
                          separators=(',', ': '),
                          ensure_ascii=False)

    with open(ofile, 'w', encoding='utf8') as fo:
        fo.write(json_str)


# Load the raw housing data CSV into a DataFrame and preview its first rows.
df = pd.read_csv('../data/raw/kc_house_data.csv')
df.head()


df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')


df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')


# Names of all columns whose label contains 'sqft'.
features_sqft = df.filter(like='sqft').columns
features_sqft

Index(['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')


def plot_statistics(df,features,statistic,color='b'):
    """Bar-plot one summary statistic for each of the given features.

    The bars are ordered from the smallest to the largest value. The figure
    is saved to ``../reports/statistics/{statistic}.png`` and then shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the feature columns.
    features : list-like of str
        Column names to summarise.
    statistic : str
        Name of an aggregation accepted by ``DataFrame.agg`` (for example
        'mean', 'std' or 'skew'). It is also used for the y-axis label,
        the title and the output file name.
    color : str, default 'b'
        Bar colour.
    """
    # Sort once and take both the labels and the bar heights from the same
    # Series, so that every bar is labelled with the feature it belongs to.
    values = df[features].agg(statistic).sort_values()

    plt.figure(figsize=(12,4), dpi=80)
    sns.barplot(x=values.index.tolist(), y=values.values, color=color)
    plt.xlabel('Features')
    plt.ylabel(statistic.title())
    plt.title(statistic.title()+ ' for all features')
    # Rotate the tick labels before saving so the saved image has them too.
    plt.xticks(rotation=90)
    plt.savefig(f'../reports/statistics/{statistic}.png',dpi=300)
    plt.show()
    plt.close()


plot_statistics(df,features_sqft,'mean',color='b')
# The means are not zero, so we need to normalize.


plot_statistics(df,features_sqft,'std',color='tomato')
# The values are widely dispersed, so we need to normalize them.


plot_statistics(df,features_sqft,'skew',color='seagreen')
# The features are skewed, so we need to apply a Box-Cox transformation
# and also take a look at the outliers.


# Feature inspected in detail in the cells below, and its sample skewness.
feat = 'sqft_lot15'
df[feat].skew()

9.506743246764398


# Histogram (counts only, no KDE) of the selected feature over its full range.
plt.figure(figsize=(12, 4), dpi=80)
sns.distplot(a=df[feat], kde=False, bins=300)
plt.gca().set(ylabel='Count',
              title=f'Distribution plot of feature: {feat}');

# Almost all of the data is concentrated near 0.


# Same histogram with much finer bins, restricted to x between 0 and 20,000.
plt.figure(figsize=(12, 4), dpi=80)
sns.distplot(a=df[feat], kde=False, bins=9000)
plt.gca().set(
    ylabel='Count',
    title=f'Distribution plot of feature: {feat} with selected x limit',
    xlim=(0, 20_000),
)
plt.show()

# This does not look Gaussian.


# Box plot of the selected feature, saved next to the other statistics plots.
plt.figure(figsize=(12,4), dpi=80)
# Pass the values as x= explicitly: the meaning of the first positional
# argument of sns.boxplot differs between seaborn versions, whereas the
# keyword always gives a horizontal box of this one variable.
sns.boxplot(x=df[feat])
plt.title(f'Box plot of feature: {feat}')
plt.savefig(f'../reports/statistics/{feat}_boxplot.png',dpi=300)

# There are a great many outliers.
# The box plot is hard to read, so we can plot the kurtosis instead.


# Kurtosis of every sqft feature.
plot_statistics(df,features_sqft,'kurtosis',color='darkorange')


# Kurtosis of the single feature selected in `feat`.
df[feat].kurtosis()

150.76311004626973


# Median of every sqft feature.
plot_statistics(df,features_sqft,'median',color='b')

# The medians are not zero.


# Interquartile range (75th minus 25th percentile) of each sqft feature,
# ordered from the smallest to the largest.
iqr = (df[features_sqft].quantile(0.75)
       - df[features_sqft].quantile(0.25)).sort_values()

plt.figure(figsize=(12,4), dpi=80)
# Labels and bar heights both come from the sorted Series, so each bar is
# labelled with the feature it belongs to.
sns.barplot(x=iqr.index.tolist(), y=iqr.values, color='darkred')
plt.xlabel('Column')
plt.ylabel('IQR')
plt.title('Sqft features IQR')
plt.savefig('../reports/statistics/iqr.png',dpi=300)


# Standard deviation again, for comparison with the IQR plot above.
# NOTE(review): plot_statistics saves to '{statistic}.png', so this call
# overwrites the 'std.png' written by the earlier 'std' plot.
plot_statistics(df,features_sqft,'std',color='darkred')
# The IQR and std plots do not look similar, so we may have outliers.


# Horizontal box plots of all sqft features on one axis, with the x-axis
# limited to (-5, 50000); the figure is saved to the reports folder.
plt.figure(figsize=(12, 8))
ax = sns.boxplot(data=df[features_sqft], orient='h', palette='winter')

ax.set_xlim(-5, 50_000)
ax.set_facecolor('#fafafa')
ax.set(title='Box plots of Sqft features', ylabel='Variables')
plt.savefig('../reports/statistics/boxplot_sqft_features.png', dpi=300)


from sklearn.feature_selection import mutual_info_classif


df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')


# Candidate predictors: every column except 'id', 'date' and 'price'.
all_features = df.columns.difference(['id','date','price'])
all_features

Index(['bathrooms', 'bedrooms', 'condition', 'floors', 'grade', 'lat', 'long',
       'sqft_above', 'sqft_basement', 'sqft_living', 'sqft_living15',
       'sqft_lot', 'sqft_lot15', 'view', 'waterfront', 'yr_built',
       'yr_renovated', 'zipcode'],
      dtype='object')


%%time
target = 'price'

mutual_infos = pd.Series(data=mutual_info_classif(df[all_features], df[target],
                                                  discrete_features=False,
                                                  random_state=SEED),
                         index=all_features)

CPU times: user 21.3 s, sys: 2.75 s, total: 24.1 s
Wall time: 24.2 s


# Show the predictors ranked from highest to lowest mutual information.
mutual_infos.sort_values(ascending=False).to_frame()

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
0	7129300520	20141013T000000	221900.0	3	1.00	1180	5650	1.0	3	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
1	6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	3	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
2	5631500400	20150225T000000	180000.0	2	1.00	770	10000	1.0	3	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
3	2487200875	20141209T000000	604000.0	4	3.00	1960	5000	1.0	5	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
4	1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	3	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503

Table of Contents

Data Description

Imports

Useful Scripts

Load the data

Descriptive statistics

Mean

Standard deviation

Skewness

Kurtosis

Robust statistics: median

Outliers

Mutual Information (MI) between Target and the Predictors

	0
floors	0.569917
grade	0.509087
lat	0.333617
sqft_living	0.324773
bedrooms	0.289373
bathrooms	0.284329
sqft_living15	0.248312
zipcode	0.233668
sqft_above	0.213384
condition	0.201436
long	0.076660
sqft_lot15	0.075715
yr_built	0.070296
sqft_basement	0.065090
sqft_lot	0.061365
view	0.045477
yr_renovated	0.003725
waterfront	0.000000