import time
# Wall-clock start time; subtracted from time.time() in the last cell to
# report how long the whole notebook took to run.
time_start_notebook = time.time()
%%capture
import os
import sys
# True when the 'google.colab' module has been loaded; used throughout the
# notebook as the "running on Google Colab" switch.
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    # These packages are installed only on Colab; the last line upgrades
    # scikit-learn to the newest available release.
    !pip install scikit-plot
    !pip install lrcurve
    !pip install watermark
    !pip install -U scikit-learn
    ## print
    print('Environment: Google Colaboratory.')
# usual imports
import numpy as np
import pandas as pd
import os
import time
import collections
import itertools
import six
import pickle
import joblib
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# sklearn
import sklearn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark Bhishan Poudel 2020-11-04 CPython 3.7.7 IPython 7.18.1 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit sklearn 0.23.1 pandas 1.1.0 watermark 2.0.2 six 1.15.0 numpy 1.18.4 joblib 0.17.0
def show_methods(obj, ncols=7,start=None, inside=None):
    """Tabulate the public attribute names of ``obj``.

    Names that begin with an underscore, and a handful of common module
    aliases (os, np, pd, sys, time, psycopg2), are left out.

    Parameters
    ----------
    obj : object
        Anything accepted by ``dir()``.
    ncols : int
        Number of columns in the returned table.
    start : str, list or tuple, optional
        Keep only names that start with this prefix (or with any of the
        given prefixes).
    inside : str, list or tuple, optional
        Keep only names that contain this substring (or any of the given
        substrings).

    Returns
    -------
    pandas.DataFrame
        The surviving names, in ``dir()`` order, laid out column by column
        in ``ncols`` columns; unused cells hold ''.

    Example
    -------
    show_methods(list)
    """
    excluded = {'os', 'np', 'pd', 'sys', 'time', 'psycopg2'}
    names = [name for name in dir(obj)
             if not name.startswith('_') and name not in excluded]

    # Normalise a single pattern and a list/tuple of patterns to one tuple so
    # that each name is tested once and can never be emitted more than once,
    # even when it matches several patterns.
    if isinstance(start, str):
        prefixes = (start,)
    elif isinstance(start, (tuple, list)):
        prefixes = tuple(start)
    else:
        prefixes = None
    if prefixes is not None:
        names = [name for name in names if name.startswith(prefixes)]

    if isinstance(inside, str):
        needles = (inside,)
    elif isinstance(inside, (tuple, list)):
        needles = tuple(inside)
    else:
        needles = None
    if needles is not None:
        names = [name for name in names
                 if any(needle in name for needle in needles)]

    # array_split yields ncols chunks of near-equal length; each chunk becomes
    # a row, so the transpose turns chunks into columns.
    return pd.DataFrame(np.array_split(names, ncols)).T.fillna('')
def adjustedR2(rsquared,nrows,kcols):
    """Return R-squared penalised for model size.

    Computes ``rsquared - (kcols-1)/(nrows-kcols) * (1-rsquared)``, which is
    algebraically the same as ``1 - (1-rsquared) * (nrows-1)/(nrows-kcols)``.

    rsquared : plain coefficient of determination
    nrows    : number of observations
    kcols    : column count used in the penalty (must differ from nrows,
               otherwise the division fails)
    """
    penalty = (kcols - 1) / (nrows - kcols)
    unexplained = 1 - rsquared
    return rsquared - penalty * unexplained
def print_reg_metrics(yt,yp,ncols):
    """Print RMSE, R-squared and adjusted R-squared for a set of predictions.

    yt    : true target values
    yp    : predicted target values
    ncols : column count forwarded to adjustedR2 as its ``kcols`` argument
    """
    mse = sklearn.metrics.mean_squared_error(yt, yp)
    rmse = np.sqrt(mse)
    r2 = sklearn.metrics.r2_score(yt, yp)
    ar2 = adjustedR2(r2, len(yt), ncols)
    # The report text starts at column 0 on purpose: it is part of the output.
    print(f"""
RMSE : {rmse:,.2f}
R-squared: {r2:,.6f}
Adj R2 : {ar2:,.6f}
""")
# Pick the prefix under which the project data is found.
if not ENV_COLAB:
    # Local run: data sits in a sibling directory of the notebook folder.
    data_path_parent = '../data/'
else:
    # Colab run: read from the raw-content URL of the GitHub repository.
    path_raw = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    proj = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_raw + proj
# --- modelling parameters -------------------------------------------------
# Column to predict.
target = 'price'
# Columns removed before modelling.
cols_drop = ['id', 'date', 'zipcode_top10']
# Columns that additionally get a squared copy named '<col>_sq'.
cols_sq = ['bedrooms','bathrooms','floors','waterfront','view',
           'age','age_after_renovation','log1p_sqft_living','log1p_sqft_lot',
           'log1p_sqft_above','log1p_sqft_basement',
           'log1p_sqft_living15','log1p_sqft_lot15']
# Fraction of rows used for training.
train_size = 0.8

# --- load the cleaned, encoded dataset ------------------------------------
data_path_clean = data_path_parent + 'processed/data_cleaned_encoded.csv'
df = pd.read_csv(data_path_clean)
print(f"df shape : {df.shape}")
# Preview the first and last two rows. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
display(pd.concat([df.head(2), df.tail(2)]))
df shape : (21613, 91)
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | age_after_renovation_cat_6 | age_after_renovation_cat_7 | age_after_renovation_cat_8 | age_after_renovation_cat_9 | log1p_sqft_living | log1p_sqft_lot | log1p_sqft_above | log1p_sqft_basement | log1p_sqft_living15 | log1p_sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7129300520 | 2014-10-13 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 7.074117 | 8.639588 | 7.074117 | 0.000000 | 7.201171 | 8.639588 |
1 | 6414100192 | 2014-12-09 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 7.852050 | 8.887791 | 7.682943 | 5.993961 | 7.433075 | 8.941153 |
21611 | 291310100 | 2015-01-16 | 400000.0 | 3 | 2.50 | 1600 | 2388 | 2.0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 7.378384 | 7.778630 | 7.378384 | 0.000000 | 7.252054 | 7.160846 |
21612 | 1523300157 | 2014-10-15 | 325000.0 | 2 | 0.75 | 1020 | 1076 | 2.0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 6.928538 | 6.981935 | 6.928538 | 0.000000 | 6.928538 | 7.213768 |
4 rows × 91 columns
If the model is trained on log(target), do not forget to apply exp(ypreds)
while doing model evaluation.
print(df.columns)
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age', 'yr_renovated2', 'age_after_renovation', 'zipcode_top10', 'zipcode_houses', 'basement_bool', 'renovation_bool', 'age_cat', 'age_after_renovation_cat', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9', 'zipcode_top10_98004', 'zipcode_top10_98006', 'zipcode_top10_98033', 'zipcode_top10_98039', 'zipcode_top10_98040', 'zipcode_top10_98102', 'zipcode_top10_98105', 'zipcode_top10_98155', 'zipcode_top10_98177', 'zipcode_top10_others', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_4', 'age_after_renovation_cat_5', 'age_after_renovation_cat_6', 'age_after_renovation_cat_7', 'age_after_renovation_cat_8', 'age_after_renovation_cat_9', 'log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15'], dtype='object')
# Leakage check: list every column whose name contains "price". Only the
# target itself should show up, i.e. there is only one target column and no
# price-derived feature.
df.filter(regex='price').columns
Index(['price'], dtype='object')
# List the columns whose names contain "log" (the log1p_* features).
df.filter(regex='log').columns
Index(['log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15'], dtype='object')
# Remove the columns listed in cols_drop.
df = df.drop(columns=cols_drop)
# Append a squared copy of every column in cols_sq, named '<col>_sq', in the
# same order as cols_sq.
df = df.assign(**{col + '_sq': df[col] ** 2 for col in cols_sq})
# Split features/target into train and test parts; SEED makes the shuffle
# reproducible and train_size sets the training fraction.
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = model_selection.train_test_split(
    df.drop(columns=[target]),
    df[target],
    train_size=train_size,
    random_state=SEED,
)

# Feature names, in the column order of the design matrix.
features = df_Xtrain.columns

# Targets as flat 1-D numpy arrays.
ytrain = ser_ytrain.to_numpy().flatten()
ytest = ser_ytest.to_numpy().flatten()

# Standardise the features; the scaler is fitted on the training split only
# and then applied to both splits.
scaler = preprocessing.StandardScaler().fit(df_Xtrain)
Xtrain = scaler.transform(df_Xtrain)
Xtest = scaler.transform(df_Xtest)
# Baseline: random forest with default hyper-parameters, using all CPU cores.
# The class is reached through the imported `ensemble` module because the bare
# name RandomForestRegressor is not imported in the visible part of this
# notebook and would raise NameError on a fresh kernel.
model = ensemble.RandomForestRegressor(random_state=SEED, n_jobs=-1)
model.fit(Xtrain, ytrain)
ypreds = model.predict(Xtest)
# The last argument is the number of feature columns.
print_reg_metrics(ytest, ypreds, Xtest.shape[-1])
RMSE : 122,552.77 R-squared: 0.888556 Adj R2 : 0.885944
Most important hyperparameters of Random Forest:
%%time
model = RandomForestRegressor(n_estimators= 50,random_state=SEED)
model.fit(Xtrain,ytrain)
ypreds = model.predict(Xtest)
print_reg_metrics(ytest,ypreds,Xtest.shape[-1])
RMSE : 121,626.72 R-squared: 0.890234 Adj R2 : 0.887661
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
# Number of trees in the forest
n_estimators = [int(x) for x in np.linspace(start=20, stop=200, num=5)]
# Number of features considered at each split. 1.0 (= all features) replaces
# the legacy 'auto' alias: for regressors it meant the same thing, but it was
# removed in scikit-learn 1.3, which the Colab branch of this notebook may
# install via `pip install -U scikit-learn`.
max_features = [1.0, 'sqrt']
# Maximum depth of each tree
max_depth = [int(x) for x in np.linspace(1, 45, num=3)]
# Minimum number of samples required to split a node
min_samples_split = [5, 10]
# Candidate values handed to the randomized search
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
}
pprint(random_grid)
{'max_depth': [1, 23, 45], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [5, 10], 'n_estimators': [20, 65, 110, 155, 200]}
%%time
model = RandomForestRegressor(random_state=SEED)
rf_random = RandomizedSearchCV(model,random_grid,
n_iter = 100,
cv = 5,
verbose=2,
random_state=SEED,
n_jobs = -1,
scoring='neg_mean_squared_error')
# Fit the random search model
# rf_random.fit(Xtrain, ytrain) # comment this
# rf_random.best_params_
"""
{'n_estimators': 110,
'min_samples_split': 5,
'max_features': 'auto',
'max_depth': 45}
"""
{'n_estimators': 110, 'min_samples_split': 5, 'max_features': 'auto', 'max_depth': 45}
# Hyper-parameters recorded from the randomized search, hard-coded here
# (presumably so the search does not have to be re-run).
# max_features is written as 1.0 (= all features): for regressors this is what
# the legacy 'auto' alias meant, and 'auto' was removed in scikit-learn 1.3.
params_rf_best = {'n_estimators': 110,
                  'min_samples_split': 5,
                  'max_features': 1.0,
                  'max_depth': 45}
# `ensemble.RandomForestRegressor` is used because the bare class name is not
# imported in the visible part of this notebook (NameError on a fresh kernel).
model = ensemble.RandomForestRegressor(random_state=SEED, **params_rf_best)
model
RandomForestRegressor(max_depth=45, min_samples_split=5, n_estimators=110, random_state=100)
%%time
model.fit(Xtrain,ytrain)
ypreds = model.predict(Xtest)
print_reg_metrics(ytest,ypreds,Xtest.shape[-1])
RMSE : 124,313.37 R-squared: 0.885331 Adj R2 : 0.882643
# Importance scores read from the fitted model, one value per feature column.
importances = model.feature_importances_
# Peek at the first five values.
importances[:5]
array([0.00116118, 0.00444788, 0.08525432, 0.00405505, 0.00063537])
# Pair every feature name with its importance score.
df_imp = pd.DataFrame({'feature': features, 'importance': importances})
# Display the table ranked from most to least important, with the score
# column shaded by value.
(
    df_imp
    .sort_values('importance', ascending=False)
    .style
    .background_gradient(subset=['importance'])
)
feature | importance | |
---|---|---|
8 | grade | 0.324809 |
14 | lat | 0.150440 |
81 | log1p_sqft_living | 0.088471 |
2 | sqft_living | 0.085254 |
94 | log1p_sqft_living_sq | 0.071862 |
15 | long | 0.063767 |
98 | log1p_sqft_living15_sq | 0.009697 |
22 | zipcode_houses | 0.009553 |
13 | zipcode | 0.009523 |
16 | sqft_living15 | 0.009467 |
5 | waterfront | 0.009218 |
85 | log1p_sqft_living15 | 0.009132 |
90 | waterfront_sq | 0.008649 |
92 | age_sq | 0.008617 |
27 | waterfront_0 | 0.008563 |
11 | yr_built | 0.008179 |
9 | sqft_above | 0.007636 |
28 | waterfront_1 | 0.007627 |
96 | log1p_sqft_above_sq | 0.007255 |
83 | log1p_sqft_above | 0.006967 |
19 | age | 0.006722 |
60 | zipcode_top10_others | 0.005920 |
1 | bathrooms | 0.004448 |
6 | view | 0.004327 |
86 | log1p_sqft_lot15 | 0.004198 |
99 | log1p_sqft_lot15_sq | 0.004081 |
3 | sqft_lot | 0.004055 |
17 | sqft_lot15 | 0.004011 |
91 | view_sq | 0.003781 |
50 | grade_9 | 0.003768 |
95 | log1p_sqft_lot_sq | 0.003692 |
82 | log1p_sqft_lot | 0.003639 |
88 | bathrooms_sq | 0.002928 |
51 | zipcode_top10_98004 | 0.002926 |
20 | yr_renovated2 | 0.002530 |
21 | age_after_renovation | 0.002196 |
93 | age_after_renovation_sq | 0.002133 |
33 | view_4 | 0.001748 |
97 | log1p_sqft_basement_sq | 0.001471 |
18 | yr_sales | 0.001460 |
10 | sqft_basement | 0.001316 |
84 | log1p_sqft_basement | 0.001283 |
87 | bedrooms_sq | 0.001278 |
49 | grade_8 | 0.001233 |
43 | grade_13 | 0.001201 |
7 | condition | 0.001168 |
0 | bedrooms | 0.001161 |
55 | zipcode_top10_98040 | 0.001022 |
29 | view_0 | 0.001012 |
25 | age_cat | 0.000972 |
32 | view_3 | 0.000738 |
41 | grade_11 | 0.000738 |
73 | age_after_renovation_cat_2 | 0.000723 |
42 | grade_12 | 0.000693 |
12 | yr_renovated | 0.000684 |
40 | grade_10 | 0.000672 |
4 | floors | 0.000635 |
26 | age_after_renovation_cat | 0.000629 |
89 | floors_sq | 0.000594 |
48 | grade_7 | 0.000532 |
31 | view_2 | 0.000523 |
36 | condition_3 | 0.000482 |
38 | condition_5 | 0.000474 |
37 | condition_4 | 0.000439 |
54 | zipcode_top10_98039 | 0.000396 |
30 | view_1 | 0.000327 |
75 | age_after_renovation_cat_4 | 0.000285 |
65 | age_cat_4 | 0.000276 |
72 | age_after_renovation_cat_1 | 0.000269 |
24 | renovation_bool | 0.000266 |
63 | age_cat_2 | 0.000263 |
52 | zipcode_top10_98006 | 0.000254 |
62 | age_cat_1 | 0.000213 |
67 | age_cat_6 | 0.000196 |
47 | grade_6 | 0.000192 |
66 | age_cat_5 | 0.000183 |
69 | age_cat_8 | 0.000160 |
79 | age_after_renovation_cat_8 | 0.000155 |
77 | age_after_renovation_cat_6 | 0.000154 |
76 | age_after_renovation_cat_5 | 0.000154 |
78 | age_after_renovation_cat_7 | 0.000154 |
74 | age_after_renovation_cat_3 | 0.000149 |
53 | zipcode_top10_98033 | 0.000142 |
64 | age_cat_3 | 0.000137 |
68 | age_cat_7 | 0.000131 |
23 | basement_bool | 0.000121 |
35 | condition_2 | 0.000100 |
71 | age_after_renovation_cat_0 | 0.000068 |
57 | zipcode_top10_98105 | 0.000055 |
61 | age_cat_0 | 0.000048 |
58 | zipcode_top10_98155 | 0.000048 |
46 | grade_5 | 0.000046 |
34 | condition_1 | 0.000041 |
56 | zipcode_top10_98102 | 0.000026 |
80 | age_after_renovation_cat_9 | 0.000023 |
59 | zipcode_top10_98177 | 0.000022 |
70 | age_cat_9 | 0.000019 |
45 | grade_4 | 0.000001 |
39 | grade_1 | 0.000000 |
44 | grade_3 | 0.000000 |
# Report the wall-clock time elapsed since time_start_notebook was set.
time_taken = time.time() - time_start_notebook
# h = whole hours, m = the seconds left over after removing those hours.
h, m = divmod(time_taken, 60 * 60)
mins, secs = divmod(m, 60)
print(f'Time taken to run whole notebook: {h:.0f} hr {mins:.0f} min {secs:.0f} secs')
Time taken to run whole notebook: 1 hr 16 min 37 secs