import time
# Wall-clock start of the run; the last cell subtracts this from time.time()
# to report how long the whole notebook took.
time_start_notebook = time.time()


%%capture
# NOTE: the cell magic above has to stay on the first line of this cell.
# It captures the cell's output, so the pip logs and the print below are
# not shown.
import os
import sys
# True when the google.colab module is already loaded in this interpreter.
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## install modules
    # Shell escapes: only executed when running on Colab.
    !pip install scikit-plot
    !pip install lrcurve
    !pip install watermark
    # -U upgrades scikit-learn to the newest release available to pip.
    !pip install -U scikit-learn

    ## print
    print('Environment: Google Colaboratory.')


# usual imports
import numpy as np
import pandas as pd

import os
import time
import collections
import itertools
import six
import pickle
import joblib

# random state
# SEED is passed as random_state to the train/test split, to every random
# forest and to the randomized search further down.
SEED = 0
# Seeded legacy NumPy generator; it is not referenced again in the visible
# part of this notebook.
RNG = np.random.RandomState(SEED)

# sklearn
import sklearn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics

# versions
import watermark
# Load the watermark IPython extension, then print the author (-a), the
# date (-d), the Python/IPython versions (-v) and machine details (-m).
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
# -iv lists the versions of the modules imported so far.
%watermark -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Bhishan Poudel 2020-11-04 

CPython 3.7.7
IPython 7.18.1

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

sklearn   0.23.1
pandas    1.1.0
watermark 2.0.2
six       1.15.0
numpy     1.18.4
joblib    0.17.0


def show_methods(obj, ncols=7,start=None, inside=None):
    """Tabulate the public attribute names of an object.

    Names that begin with an underscore, and a handful of common module
    aliases (os, np, pd, sys, time, psycopg2), are left out.

    Parameters
    ----------
    obj : object
        Anything accepted by ``dir``.
    ncols : int, default 7
        Number of columns the names are spread over.
    start : str, list or tuple of str, optional
        Keep only the names that start with this prefix (or with any of
        the given prefixes).
    inside : str, list or tuple of str, optional
        Keep only the names that contain this substring (or any of the
        given substrings).

    Returns
    -------
    pandas.DataFrame
        The names split into ``ncols`` consecutive chunks, one chunk per
        column, with short columns padded by empty strings.

    Example
    -------
    show_methods(list)
    """
    skipped = {'os', 'np', 'pd', 'sys', 'time', 'psycopg2'}
    lst = [elem for elem in dir(obj)
           if not elem.startswith('_') and elem not in skipped]

    if isinstance(start, str):
        lst = [elem for elem in lst if elem.startswith(start)]
    elif isinstance(start, (tuple, list)):
        # str.startswith takes a tuple of prefixes, so every name is tested
        # once and kept at most once even when several prefixes match it.
        prefixes = tuple(start)
        lst = [elem for elem in lst if elem.startswith(prefixes)]

    if isinstance(inside, str):
        lst = [elem for elem in lst if inside in elem]
    elif isinstance(inside, (tuple, list)):
        # any() keeps a name once, however many substrings it contains.
        lst = [elem for elem in lst
               if any(part in elem for part in inside)]

    return pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')

def adjustedR2(rsquared,nrows,kcols):
    """Return R-squared penalised for the number of columns.

    Evaluates ``rsquared - (kcols - 1) / (nrows - kcols) * (1 - rsquared)``.

    rsquared : plain R-squared score
    nrows    : number of observations
    kcols    : column count used for the penalty
    """
    unexplained = 1 - rsquared
    penalty = (kcols - 1) / (nrows - kcols)
    return rsquared - penalty * unexplained


def print_reg_metrics(yt,yp,ncols):
    """Print RMSE, R-squared and adjusted R-squared for a set of predictions.

    yt    : true target values
    yp    : predicted target values
    ncols : column count forwarded to ``adjustedR2`` as ``kcols``
    """
    mse = sklearn.metrics.mean_squared_error(yt, yp)
    rmse = np.sqrt(mse)
    r2 = sklearn.metrics.r2_score(yt, yp)
    ar2 = adjustedR2(r2, len(yt), ncols)

    print(f"""
    RMSE     : {rmse:,.2f}
    R-squared: {r2:,.6f}
    Adj R2   : {ar2:,.6f}
    """)


# Where the data lives: a local folder by default, the raw GitHub copy of
# the project when running on Colab.
if not ENV_COLAB:
    data_path_parent = '../data/'

else:
    path_raw = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    proj = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_raw + proj

# Column to predict.
target = 'price'

# Columns removed from the frame before modelling.
cols_drop = ['id', 'date', 'zipcode_top10']

# Columns that also get a squared copy.
cols_sq = [
    'bedrooms', 'bathrooms', 'floors', 'waterfront', 'view',
    'age', 'age_after_renovation',
    'log1p_sqft_living', 'log1p_sqft_lot',
    'log1p_sqft_above', 'log1p_sqft_basement',
    'log1p_sqft_living15', 'log1p_sqft_lot15',
]

# Fraction of the rows used for training.
train_size = 0.8


# Name of the column to predict.
target = 'price'


# Cleaned and encoded data set, read from the local data folder or, on
# Colab, from the raw GitHub URL held in data_path_parent.
data_path_clean = data_path_parent + 'processed/data_cleaned_encoded.csv'
df = pd.read_csv(data_path_clean)

print(f"df shape : {df.shape}")
# Preview the first two and last two rows.  pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
display(pd.concat([df.head(2), df.tail(2)]))

df shape : (21613, 91)


# List every column label of the loaded frame.
print(df.columns)

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age',
       'yr_renovated2', 'age_after_renovation', 'zipcode_top10',
       'zipcode_houses', 'basement_bool', 'renovation_bool', 'age_cat',
       'age_after_renovation_cat', 'waterfront_0', 'waterfront_1', 'view_0',
       'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2',
       'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10',
       'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5',
       'grade_6', 'grade_7', 'grade_8', 'grade_9', 'zipcode_top10_98004',
       'zipcode_top10_98006', 'zipcode_top10_98033', 'zipcode_top10_98039',
       'zipcode_top10_98040', 'zipcode_top10_98102', 'zipcode_top10_98105',
       'zipcode_top10_98155', 'zipcode_top10_98177', 'zipcode_top10_others',
       'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4',
       'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9',
       'age_after_renovation_cat_0', 'age_after_renovation_cat_1',
       'age_after_renovation_cat_2', 'age_after_renovation_cat_3',
       'age_after_renovation_cat_4', 'age_after_renovation_cat_5',
       'age_after_renovation_cat_6', 'age_after_renovation_cat_7',
       'age_after_renovation_cat_8', 'age_after_renovation_cat_9',
       'log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above',
       'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15'],
      dtype='object')


# Sanity check: select every column whose label contains "price".
df.filter(regex='price').columns
# Only the target column should match; any other match would be a
# price-derived feature that could leak the target into the predictors.

Index(['price'], dtype='object')


# Columns whose label contains "log".
df.filter(regex='log').columns

Index(['log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above',
       'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15'],
      dtype='object')


# Remove the columns listed in cols_drop.
df = df.drop(columns=cols_drop)


# Add a squared copy of every column named in cols_sq, suffixed with "_sq".
for col in cols_sq:
    df[f'{col}_sq'] = df[col].pow(2)


# Split predictors and target into train/test parts; SEED makes the
# shuffle reproducible.
X_all = df.drop(columns=[target])
y_all = df[target]
split = model_selection.train_test_split(
    X_all, y_all, train_size=train_size, random_state=SEED)
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = split

# Independent 1-D NumPy copies of the two target series.
ytrain = np.array(ser_ytrain).reshape(-1)
ytest = np.array(ser_ytest).reshape(-1)


# Fit the scaler on the training rows only, then apply that same
# transformation to both splits.
scaler = preprocessing.StandardScaler().fit(df_Xtrain)
Xtrain, Xtest = (scaler.transform(part) for part in (df_Xtrain, df_Xtest))


# Labels of the predictor columns (everything except the target).
features = df.drop(columns=[target]).columns


# Baseline forest with default hyper-parameters, trained on all cores.
# The class is reached through the imported ``ensemble`` module because the
# bare name RandomForestRegressor is never imported in this notebook and
# would raise a NameError.
model = ensemble.RandomForestRegressor(random_state=SEED,n_jobs=-1)
model.fit(Xtrain,ytrain)

# Score the baseline on the held-out test split.
ypreds = model.predict(Xtest)
print_reg_metrics(ytest,ypreds,Xtest.shape[-1])

    RMSE     : 122,552.77
    R-squared: 0.888556
    Adj R2   : 0.885944


%%time
model = RandomForestRegressor(n_estimators= 50,random_state=SEED)

model.fit(Xtrain,ytrain)

ypreds = model.predict(Xtest)
print_reg_metrics(ytest,ypreds,Xtest.shape[-1])

    RMSE     : 121,626.72
    R-squared: 0.890234
    Adj R2   : 0.887661


from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Candidate values for each hyper-parameter of the randomized search.

# number of trees: 5 evenly spaced values from 20 to 200
n_estimators = np.linspace(20, 200, num=5).astype(int).tolist()

# number of features considered at each split
# NOTE(review): 'auto' is only accepted by older scikit-learn releases.
max_features = ['auto', 'sqrt']

# maximum depth of each tree: 3 evenly spaced values from 1 to 45
max_depth = np.linspace(1, 45, num=3).astype(int).tolist()

# minimum number of samples needed to split a node
min_samples_split = [5, 10]

# search space handed to the randomized search
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
)


pprint(random_grid)

{'max_depth': [1, 23, 45],
 'max_features': ['auto', 'sqrt'],
 'min_samples_split': [5, 10],
 'n_estimators': [20, 65, 110, 155, 200]}


%%time
model = RandomForestRegressor(random_state=SEED)
rf_random = RandomizedSearchCV(model,random_grid,
                               n_iter = 100,
                               cv = 5,
                               verbose=2,
                               random_state=SEED,
                               n_jobs = -1,
                               scoring='neg_mean_squared_error')
# Fit the random search model
# rf_random.fit(Xtrain, ytrain) # comment this


# rf_random.best_params_

# Best parameters found by the search, kept as a bare string literal for
# reference only (it is not assigned to any name).
"""
{'n_estimators': 110,
 'min_samples_split': 5,
 'max_features': 'auto',
 'max_depth': 45}
"""

{'n_estimators': 110,
 'min_samples_split': 5,
 'max_features': 'auto',
 'max_depth': 45}


# Hyper-parameters hard-coded from the randomized search result.
# NOTE(review): max_features='auto' was removed in scikit-learn 1.3; on
# newer releases 1.0 is the documented equivalent for regressors.
params_rf_best = {'n_estimators': 110,
 'min_samples_split': 5,
 'max_features': 'auto',
 'max_depth': 45}

# The class is reached through the imported ``ensemble`` module because the
# bare name RandomForestRegressor is never imported in this notebook and
# would raise a NameError.
model = ensemble.RandomForestRegressor(random_state=SEED,**params_rf_best)
model

RandomForestRegressor(max_depth=45, min_samples_split=5, n_estimators=110,
                      random_state=100)


%%time
model.fit(Xtrain,ytrain)

ypreds = model.predict(Xtest)
print_reg_metrics(ytest,ypreds,Xtest.shape[-1])

    RMSE     : 124,313.37
    R-squared: 0.885331
    Adj R2   : 0.882643


# Importance score of every input column, as reported by the fitted forest.
importances = model.feature_importances_
# Peek at the first five scores.
importances[:5]

array([0.00116118, 0.00444788, 0.08525432, 0.00405505, 0.00063537])


# Pair each feature label with its importance score.
df_imp = pd.DataFrame(dict(feature=features, importance=importances))

# Most important features first, with the score column shaded by value.
(
    df_imp
    .sort_values('importance', ascending=False)
    .style
    .background_gradient(subset=['importance'])
)


# Elapsed wall-clock seconds since the first cell, broken down into
# hours, minutes and seconds for the report.
time_taken = time.time() - time_start_notebook
h, m = divmod(time_taken, 3600)
mins, secs = divmod(m, 60)
print(f'Time taken to run whole notebook: {h:.0f} hr {mins:.0f} min {secs:.0f} secs')

Time taken to run whole notebook: 1 hr 16 min 37 secs

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	...	log1p_sqft_living	log1p_sqft_lot	log1p_sqft_above	log1p_sqft_basement	log1p_sqft_living15	log1p_sqft_lot15
0	7129300520	2014-10-13	221900.0	3	1.00	1180	5650	1.0	...	7.074117	8.639588	7.074117	0.000000	7.201171	8.639588
1	6414100192	2014-12-09	538000.0	3	2.25	2570	7242	2.0	...	7.852050	8.887791	7.682943	5.993961	7.433075	8.941153
21611	291310100	2015-01-16	400000.0	3	2.50	1600	2388	2.0	...	7.378384	7.778630	7.378384	0.000000	7.252054	7.160846
21612	1523300157	2014-10-15	325000.0	2	0.75	1020	1076	2.0	...	6.928538	6.981935	6.928538	0.000000	6.928538	7.213768

Table of Contents

NOTES¶

Imports¶

Useful Scripts¶

Parameters¶

Load the data¶

Data Processing¶

Sanity Check¶

Drop unwanted columns¶

Create squared columns¶

Train test split¶

Scaling¶

Modelling: Random Forest¶

Grid Search¶

Randomized search¶

Feature Importance¶

Time Taken¶

	feature	importance
8	grade	0.324809
14	lat	0.150440
81	log1p_sqft_living	0.088471
2	sqft_living	0.085254
94	log1p_sqft_living_sq	0.071862
15	long	0.063767
98	log1p_sqft_living15_sq	0.009697
22	zipcode_houses	0.009553
13	zipcode	0.009523
16	sqft_living15	0.009467
5	waterfront	0.009218
85	log1p_sqft_living15	0.009132
90	waterfront_sq	0.008649
92	age_sq	0.008617
27	waterfront_0	0.008563
11	yr_built	0.008179
9	sqft_above	0.007636
28	waterfront_1	0.007627
96	log1p_sqft_above_sq	0.007255
83	log1p_sqft_above	0.006967
19	age	0.006722
60	zipcode_top10_others	0.005920
1	bathrooms	0.004448
6	view	0.004327
86	log1p_sqft_lot15	0.004198
99	log1p_sqft_lot15_sq	0.004081
3	sqft_lot	0.004055
17	sqft_lot15	0.004011
91	view_sq	0.003781
50	grade_9	0.003768
95	log1p_sqft_lot_sq	0.003692
82	log1p_sqft_lot	0.003639
88	bathrooms_sq	0.002928
51	zipcode_top10_98004	0.002926
20	yr_renovated2	0.002530
21	age_after_renovation	0.002196
93	age_after_renovation_sq	0.002133
33	view_4	0.001748
97	log1p_sqft_basement_sq	0.001471
18	yr_sales	0.001460
10	sqft_basement	0.001316
84	log1p_sqft_basement	0.001283
87	bedrooms_sq	0.001278
49	grade_8	0.001233
43	grade_13	0.001201
7	condition	0.001168
0	bedrooms	0.001161
55	zipcode_top10_98040	0.001022
29	view_0	0.001012
25	age_cat	0.000972
32	view_3	0.000738
41	grade_11	0.000738
73	age_after_renovation_cat_2	0.000723
42	grade_12	0.000693
12	yr_renovated	0.000684
40	grade_10	0.000672
4	floors	0.000635
26	age_after_renovation_cat	0.000629
89	floors_sq	0.000594
48	grade_7	0.000532
31	view_2	0.000523
36	condition_3	0.000482
38	condition_5	0.000474
37	condition_4	0.000439
54	zipcode_top10_98039	0.000396
30	view_1	0.000327
75	age_after_renovation_cat_4	0.000285
65	age_cat_4	0.000276
72	age_after_renovation_cat_1	0.000269
24	renovation_bool	0.000266
63	age_cat_2	0.000263
52	zipcode_top10_98006	0.000254
62	age_cat_1	0.000213
67	age_cat_6	0.000196
47	grade_6	0.000192
66	age_cat_5	0.000183
69	age_cat_8	0.000160
79	age_after_renovation_cat_8	0.000155
77	age_after_renovation_cat_6	0.000154
76	age_after_renovation_cat_5	0.000154
78	age_after_renovation_cat_7	0.000154
74	age_after_renovation_cat_3	0.000149
53	zipcode_top10_98033	0.000142
64	age_cat_3	0.000137
68	age_cat_7	0.000131
23	basement_bool	0.000121
35	condition_2	0.000100
71	age_after_renovation_cat_0	0.000068
57	zipcode_top10_98105	0.000055
61	age_cat_0	0.000048
58	zipcode_top10_98155	0.000048
46	grade_5	0.000046
34	condition_1	0.000041
56	zipcode_top10_98102	0.000026
80	age_after_renovation_cat_9	0.000023
59	zipcode_top10_98177	0.000022
70	age_cat_9	0.000019
45	grade_4	0.000001
39	grade_1	0.000000
44	grade_3	0.000000