This dataset contains house sale prices for King County, which includes Seattle. It covers homes sold between May 2014 and May 2015.
Task: estimate the sale price from the given features.
%%capture
import os
import sys

ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install watermark
    # updating an existing module requires restarting the Colab runtime
    !pip install -U scikit-learn
    ## print
    print('Environment: Google Colaboratory.')

TREE_METHOD = 'gpu_hist' if ENV_COLAB else 'auto'
import numpy as np
import pandas as pd
# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# mixed
import os
import time
from pprint import pprint
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# settings
pd.set_option('display.max_columns', 200)
# sklearn
import sklearn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-11-04
CPython 3.6.9
IPython 5.5.0

compiler   : GCC 8.4.0
system     : Linux
release    : 4.19.112+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit

seaborn    0.11.0
watermark  2.0.2
pandas     1.1.4
matplotlib 3.2.2
sklearn    0.23.2
numpy      1.18.5
def show_methods(obj, ncols=7, start=None, inside=None):
    """Show all public attributes of a given object.

    Example:
    ========
    show_methods(list)
    """
    print(f'Object Type: {type(obj)}\n')
    lst = [elem for elem in dir(obj) if elem[0] != '_']
    lst = [elem for elem in lst
           if elem not in 'os np pd sys time psycopg2'.split()]
    if isinstance(start, str):
        lst = [elem for elem in lst if elem.startswith(start)]
    if isinstance(start, (tuple, list)):
        lst = [elem for elem in lst for start_elem in start
               if elem.startswith(start_elem)]
    if isinstance(inside, str):
        lst = [elem for elem in lst if inside in elem]
    if isinstance(inside, (tuple, list)):
        lst = [elem for elem in lst for inside_elem in inside
               if inside_elem in elem]
    return pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
def adjustedR2(rsquared, nrows, ncols):
    """Adjusted R-squared given nrows samples and ncols feature columns."""
    return rsquared - (ncols - 1) / (nrows - ncols) * (1 - rsquared)
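As a quick sanity check of this helper (a sketch with made-up numbers, not from this dataset): with R² = 0.9, 1000 rows and 50 feature columns, the penalty term is (50-1)/(1000-50) * 0.1 ≈ 0.0052.
# sanity check with invented numbers (not from the dataset)
print(round(adjustedR2(0.9, 1000, 50), 4))  # 0.8948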
if ENV_COLAB:
    path_raw = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    proj = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_raw + proj
    data_path_train = data_path_parent + 'raw/train.csv'
    data_path_test = data_path_parent + 'raw/test.csv'
else:
    data_path_parent = '../data/'
    data_path_train = data_path_parent + 'raw/train.csv'
    data_path_test = data_path_parent + 'raw/test.csv'

target = 'price'
train_size = 0.8

print(data_path_train)
https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/King_County_Seattle_House_Price_Kaggle/raw/train.csv
df_train_raw = pd.read_csv(data_path_train)
df_test_raw = pd.read_csv(data_path_test)
print(df_train_raw.shape)
print(df_train_raw.columns)
display(df_train_raw.head(2).append(df_train_raw.tail(2)))
(17290, 21)
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')
 | id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 2561340020 | 20140804T000000 | 325000.0 | 3 | 1.75 | 1780 | 11096 | 1.0 | 0 | 0 | 3 | 7 | 1210 | 570 | 1979 | 0 | 98074 | 47.6170 | -122.051 | 1780 | 10640 |
1 | 8598200070 | 20141208T000000 | 278000.0 | 2 | 2.50 | 1420 | 2229 | 2.0 | 0 | 0 | 3 | 7 | 1420 | 0 | 2004 | 0 | 98059 | 47.4871 | -122.165 | 1500 | 2230 |
17288 | 7174800760 | 20140725T000000 | 667000.0 | 5 | 2.00 | 1900 | 5470 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 720 | 1930 | 1965 | 98105 | 47.6666 | -122.303 | 1300 | 3250 |
17289 | 9521100280 | 20140612T000000 | 480000.0 | 3 | 2.50 | 1250 | 1103 | 3.0 | 0 | 2 | 3 | 8 | 1250 | 0 | 2005 | 0 | 98103 | 47.6619 | -122.352 | 1250 | 1188 |
def clean_data(df):
    df = df.copy()

    # Date-time features
    df['date'] = pd.to_datetime(df['date'])
    df['yr_sales'] = df['date'].dt.year
    df['age'] = df['yr_sales'] - df['yr_built']
    # treat yr_renovated == 0 (never renovated) as renovated in the build year
    df['yr_renovated2'] = np.where(df['yr_renovated'].eq(0),
                                   df['yr_built'], df['yr_renovated'])
    df['age_after_renovation'] = df['yr_sales'] - df['yr_renovated2']

    # Categorical features
    cols_str = ['waterfront', 'view', 'condition', 'grade', 'zipcode']
    for c in cols_str:
        df[c] = df[c].astype(str)

    # Boolean features
    df['basement_bool'] = df['sqft_basement'].apply(lambda x: 1 if x > 0 else 0)
    df['renovation_bool'] = df['yr_renovated'].apply(lambda x: 1 if x > 0 else 0)

    # Numerical features binning
    df['age_cat'] = pd.cut(df['age'], 10, labels=range(10)).astype(str)
    df['age_after_renovation_cat'] = pd.cut(df['age_after_renovation'],
                                            10, labels=range(10))

    # Create dummy variables from object and category columns
    cols_obj_cat = df.select_dtypes(include=['object', 'category']).columns
    cols_dummy = ['waterfront', 'view', 'condition', 'grade',
                  'age_cat', 'age_after_renovation_cat']
    df_dummy = pd.get_dummies(df[cols_dummy], drop_first=False)
    df = pd.concat([df, df_dummy], axis=1)

    # after creating dummies, make the original columns numeric again
    # (zipcode needs a wide integer type: int8 would silently overflow)
    for c in cols_obj_cat:
        df[c] = df[c].astype(np.int32 if c == 'zipcode' else np.int8)

    # Log transformation of large numerical values
    cols_log = ['sqft_living', 'sqft_lot', 'sqft_above',
                'sqft_basement', 'sqft_living15', 'sqft_lot15']
    for col in cols_log:
        df['log1p_' + col] = np.log1p(df[col])

    # Squared columns
    cols_sq = [
        # cats
        'bedrooms', 'bathrooms', 'floors', 'waterfront', 'view',
        # nums
        'age', 'age_after_renovation',
        # log nums
        'log1p_sqft_living', 'log1p_sqft_lot',
        'log1p_sqft_above', 'log1p_sqft_basement',
        'log1p_sqft_living15', 'log1p_sqft_lot15',
    ]
    for col in cols_sq:
        df[col + '_sq'] = df[col] ** 2

    # Drop unwanted columns
    cols_drop = ['id', 'date']
    df = df.drop(cols_drop, axis=1)
    return df
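To see what the renovation feature does before running the full pipeline, here is a minimal sketch on a toy frame (column names follow the dataset; the values are invented):
# toy frame with invented values, same column names as the dataset
toy = pd.DataFrame({'yr_built': [1960, 1980, 2010],
                    'yr_renovated': [0, 1999, 0]})
# never-renovated houses (yr_renovated == 0) fall back to the build year
toy['yr_renovated2'] = np.where(toy['yr_renovated'].eq(0),
                                toy['yr_built'], toy['yr_renovated'])
print(toy['yr_renovated2'].tolist())   # [1960, 1999, 2010]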
df_train = clean_data(df_train_raw)
df_test = clean_data(df_test_raw)
print(df_train.shape)
print(df_train.columns)
(17290, 90)
Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'yr_sales', 'age', 'yr_renovated2',
       'age_after_renovation', 'basement_bool', 'renovation_bool', 'age_cat',
       'age_after_renovation_cat', 'waterfront_0', 'waterfront_1', 'view_0',
       'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2',
       'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10',
       'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5',
       'grade_6', 'grade_7', 'grade_8', 'grade_9', 'age_cat_0', 'age_cat_1',
       'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6',
       'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0',
       'age_after_renovation_cat_1', 'age_after_renovation_cat_2',
       'age_after_renovation_cat_3', 'age_after_renovation_cat_4',
       'age_after_renovation_cat_5', 'age_after_renovation_cat_6',
       'age_after_renovation_cat_7', 'age_after_renovation_cat_8',
       'age_after_renovation_cat_9', 'log1p_sqft_living', 'log1p_sqft_lot',
       'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15',
       'log1p_sqft_lot15', 'bedrooms_sq', 'bathrooms_sq', 'floors_sq',
       'waterfront_sq', 'view_sq', 'age_sq', 'age_after_renovation_sq',
       'log1p_sqft_living_sq', 'log1p_sqft_lot_sq', 'log1p_sqft_above_sq',
       'log1p_sqft_basement_sq', 'log1p_sqft_living15_sq',
       'log1p_sqft_lot15_sq'],
      dtype='object')
# make sure no data leakage
df_train.filter(regex='price').columns
Index(['price'], dtype='object')
# make sure no nans
df_train.isna().sum().sum(), df_test.isna().sum().sum()
(0, 0)
# choose features to train on; we can change this later
features = list(sorted(df_train.columns.drop(target)))
# print(np.array(features))

# note: the test data lack some dummy features that appear in the train data;
# for example, df_test[features].head(2) raises
#     KeyError: "['grade_3', 'grade_1'] not in index"
# so keep only the features common to both splits.
features = [i for i in features if i in df_test.columns and i in df_train.columns]
# print(np.array(sorted(features)))
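To see exactly which dummy levels differ between the two splits, a quick diagnostic (consistent with the KeyError noted above):
# dummy levels present in train but absent in test (e.g. grade_1, grade_3)
print(set(df_train.columns) - set(df_test.columns))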
df_Xtrain = df_train[features]
ser_ytrain = df_train[target]
df_Xtest = df_test[features]
ser_ytest = df_test[target]
ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()
Parameters
----------
early_stopping : 'auto' or bool (default='auto')
    If 'auto', early stopping is enabled if the sample size is larger than
    10000. If True, early stopping is enabled; otherwise it is disabled.
scoring : str or callable or None, optional (default='loss')
    Scoring parameter to use for early stopping. It can be a single string
    (see "The scoring parameter: defining model evaluation rules") or a
    callable (see "Defining your scoring strategy from metric functions").
    If None, the estimator's default scorer is used. If scoring='loss',
    early stopping is checked w.r.t. the loss value. Only used if early
    stopping is performed.
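For instance, an estimator configured as below holds out an internal validation split and stops once the monitored loss fails to improve (a minimal sketch; the name es_reg and the particular values are illustrative, not the tuned model used later):
es_reg = ensemble.HistGradientBoostingRegressor(
    early_stopping=True,      # always use early stopping
    scoring='loss',           # monitor the loss value itself
    validation_fraction=0.1,  # hold out 10% of rows internally
    n_iter_no_change=10,      # stop after 10 non-improving iterations
    random_state=SEED,
)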
ensemble.HistGradientBoostingRegressor()
HistGradientBoostingRegressor()
%%time
pipe = Pipeline([
    # scaling
    # ('Scaler', preprocessing.StandardScaler()),
    # scaling did not help the r2 value, so I commented it out.
    # model
    ('hgbr', ensemble.HistGradientBoostingRegressor(random_state=SEED)),
])
pipe.fit(df_Xtrain, ytrain)

# model evaluation
ypreds = pipe.predict(df_Xtest)
rmse = np.sqrt(sklearn.metrics.mean_squared_error(ytest, ypreds))
r2 = sklearn.metrics.r2_score(ytest, ypreds)
ar2 = adjustedR2(r2, df_Xtest.shape[0], df_Xtest.shape[1])

print(f'Test RMSE : {rmse:,.2f}')
print(f'r_squared : {r2:.6f}')
print(f'adjustedr2 : {ar2:.6f}')
Test RMSE : 128,414.87
r_squared : 0.877640
adjustedr2 : 0.875156
CPU times: user 3.73 s, sys: 294 ms, total: 4.03 s
Wall time: 2.13 s
show_methods(pipe['hgbr'])
Object Type: <class 'sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingRegressor'>
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
---|---|---|---|---|---|---|---
0 | bin_mapper_ | l2_regularization | max_depth | n_features_ | predict | scoring | validation_fraction |
1 | do_early_stopping_ | learning_rate | max_iter | n_features_in_ | random_state | set_params | validation_score_ |
2 | early_stopping | loss | max_leaf_nodes | n_iter_ | score | tol | verbose |
3 | fit | loss_ | min_samples_leaf | n_iter_no_change | scorer_ | train_score_ | warm_start |
4 | get_params | max_bins | monotonic_cst | n_trees_per_iteration_ |
reg = ensemble.HistGradientBoostingRegressor(
    l2_regularization=0.0,
    learning_rate=0.01,       # default 0.1
    loss='least_squares',
    max_bins=255,
    max_depth=None,
    max_iter=5000,            # default 100
    max_leaf_nodes=31,
    min_samples_leaf=20,
    n_iter_no_change=10,
    random_state=SEED,
    early_stopping=True,
    scoring=None,
    tol=1e-07,
    validation_fraction=0.1,
    verbose=0,
    warm_start=False,
)
fitted_reg = reg.fit(df_Xtrain,ytrain)
ypreds_tr = fitted_reg.predict(df_Xtrain)
rmse = np.sqrt(sklearn.metrics.mean_squared_error(ytrain,ypreds_tr))
r2 = sklearn.metrics.r2_score(ytrain, ypreds_tr)
ar2 = adjustedR2(r2, df_Xtrain.shape[0], df_Xtrain.shape[1])
print(f'Train RMSE : {rmse:,.2f}')
print(f'r_squared : {r2:.6f} ')
print(f'adjustedr2 : {ar2:.6f}')
Train RMSE : 96,568.93
r_squared : 0.930807
adjustedr2 : 0.930461
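Since early stopping is enabled with max_iter=5000, it is worth checking how many boosting iterations actually ran (a small inspection; both attributes appear in the fitted-estimator listing above):
# number of boosting iterations run before early stopping kicked in
print('n_iter_ :', fitted_reg.n_iter_)
# score recorded at the last iteration
print('last train_score_ :', fitted_reg.train_score_[-1])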
ypreds = fitted_reg.predict(df_Xtest)
rmse = np.sqrt(sklearn.metrics.mean_squared_error(ytest,ypreds))
r2 = sklearn.metrics.r2_score(ytest, ypreds)
ar2 = adjustedR2(r2, df_Xtest.shape[0], df_Xtest.shape[1])
print(f'Test RMSE : {rmse:,.2f}')
print(f'r_squared : {r2:.6f} ')
print(f'adjustedr2 : {ar2:.6f}')
Test RMSE : 126,760.36
r_squared : 0.880772
adjustedr2 : 0.878352
df_cv = pd.DataFrame({'Model': [],
                      '3-Fold Cross Validation Mean': [],
                      '3-Fold Cross Validation Std': [],
                      })
%%time
# kfold = model_selection.KFold(n_splits=5, shuffle=False)
cv_results = model_selection.cross_val_score(reg, df_Xtrain, ytrain,
                                             cv=3,
                                             scoring='neg_mean_squared_error'
                                             ) * -1
df_cv.loc[0] = ['HGBR', cv_results.mean(), cv_results.std()]
display(df_cv)
# after the sign flip these are MSEs: smallest MSE is best
# (equivalently, largest neg_mean_squared_error is best).
 | Model | 3-Fold Cross Validation Mean | 3-Fold Cross Validation Std |
---|---|---|---
0 | HGBR | 1.653866e+10 | 7.250196e+08 |
CPU times: user 13min 49s, sys: 9.38 s, total: 13min 58s
Wall time: 7min 3s
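The cross-validation scores above are MSEs, so take the square root to put them on the same scale as the held-out RMSE (a quick conversion; the value follows from the mean shown in the table):
# convert the CV mean MSE into an RMSE, comparable with the test RMSE above
cv_rmse = np.sqrt(cv_results.mean())
print(f'CV RMSE : {cv_rmse:,.2f}')   # sqrt(1.653866e+10) ≈ 128,603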