This dataset contains house sale prices for King County, Washington, which includes Seattle. The homes were sold between May 2014 and May 2015.
Task: estimate the sale price from the given features.
import time
time_start_notebook = time.time()
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install watermark
    !pip install catboost
    !pip install shap eli5
    # if we update an existing module, we need to restart the colab runtime
    !pip install -U scikit-learn
    ## print
    print('Environment: Google Colaboratory.')
TREE_METHOD = 'gpu_hist' if ENV_COLAB else 'auto'
import numpy as np
import pandas as pd
# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# mixed
import os
import time
from pprint import pprint
import joblib
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# settings
pd.set_option('display.max_columns', 200)
# sklearn
import sklearn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
# boosting
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost
import lightgbm
import catboost
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-11-23 CPython 3.7.7 IPython 7.19.0 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit xgboost 1.2.0 json 2.0.9 seaborn 0.11.0 catboost 0.23.2 lightgbm 2.3.1 matplotlib 3.2.1 joblib 0.17.0 pandas 1.1.0 sklearn 0.23.1 numpy 1.18.4 watermark 2.0.2
def show_methods(obj, ncols=7, start=None, inside=None):
    """Show all public attributes and methods of a given object.
    Example:
    ========
    show_methods(list)
    """
    print(f'Object Type: {type(obj)}\n')
    lst = [elem for elem in dir(obj) if elem[0] != '_']
    lst = [elem for elem in lst
           if elem not in 'os np pd sys time psycopg2'.split()]
    if isinstance(start, str):
        lst = [elem for elem in lst if elem.startswith(start)]
    if isinstance(start, (tuple, list)):
        lst = [elem for elem in lst for start_elem in start
               if elem.startswith(start_elem)]
    if isinstance(inside, str):
        lst = [elem for elem in lst if inside in elem]
    if isinstance(inside, (tuple, list)):
        lst = [elem for elem in lst for inside_elem in inside
               if inside_elem in elem]
    return pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
def adjustedR2(rsquared, nrows, ncols):
    return rsquared - (ncols - 1) / (nrows - ncols) * (1 - rsquared)
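For reference, this is the textbook adjusted R² rearranged, assuming `ncols` counts the intercept along with the predictors (writing `nrows` as $n$ and `ncols` as $k$):

$$\bar{R}^2 = 1 - (1 - R^2)\,\frac{n-1}{n-k} = R^2 - (1 - R^2)\,\frac{k-1}{n-k}$$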
def print_regr_eval(ytest, ypreds, ncols):
    rmse = np.sqrt(metrics.mean_squared_error(ytest, ypreds))
    r2 = metrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, len(ytest), ncols)
    evs = metrics.explained_variance_score(ytest, ypreds)
    print(f"""
    RMSE              : {rmse:,.2f}
    Explained Variance: {evs:.6f}
    R-Squared         : {r2:,.6f}
    Adjusted R-squared: {ar2:,.6f}
    """)
def plot_xgb_cv_res(df_cv_results):
    fig, ax = plt.subplots()
    plt.plot(df_cv_results['train-rmse-mean'], color='b', label='train-rmse')
    plt.plot(df_cv_results['test-rmse-mean'], color='r', label='test-rmse')
    plt.title('Cross validation score mean plot', fontsize=14)
    plt.legend()
    plt.show()
if ENV_COLAB:
    path_raw = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    proj = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_raw + proj
    data_path_train = data_path_parent + 'raw/train.csv'
    data_path_test = data_path_parent + 'raw/test.csv'
else:
    data_path_parent = '../data/'
    data_path_train = data_path_parent + 'raw/train.csv'
    data_path_test = data_path_parent + 'raw/test.csv'
target = 'price'
train_size = 0.8
print(data_path_train)
../data/raw/train.csv
df_train = pd.read_csv(data_path_train)
df_test = pd.read_csv(data_path_test)
print(df_train.shape)
print(df_train.columns)
display(df_train.head(2).append(df_train.tail(2)))
(17290, 21) Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'], dtype='object')
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2561340020 | 20140804T000000 | 325000.0 | 3 | 1.75 | 1780 | 11096 | 1.0 | 0 | 0 | 3 | 7 | 1210 | 570 | 1979 | 0 | 98074 | 47.6170 | -122.051 | 1780 | 10640 |
1 | 8598200070 | 20141208T000000 | 278000.0 | 2 | 2.50 | 1420 | 2229 | 2.0 | 0 | 0 | 3 | 7 | 1420 | 0 | 2004 | 0 | 98059 | 47.4871 | -122.165 | 1500 | 2230 |
17288 | 7174800760 | 20140725T000000 | 667000.0 | 5 | 2.00 | 1900 | 5470 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 720 | 1930 | 1965 | 98105 | 47.6666 | -122.303 | 1300 | 3250 |
17289 | 9521100280 | 20140612T000000 | 480000.0 | 3 | 2.50 | 1250 | 1103 | 3.0 | 0 | 2 | 3 | 8 | 1250 | 0 | 2005 | 0 | 98103 | 47.6619 | -122.352 | 1250 | 1188 |
def clean_data(df, log=True, sq=True, logsq=True, dummy=True, dummy_cat=False):
    # log-squared features need both the log and squared features
    if logsq:
        log = True
        sq = True

    df = df.copy()

    # Date time features
    df['date'] = pd.to_datetime(df['date'])
    df['yr_sales'] = df['date'].dt.year
    df['age'] = df['yr_sales'] - df['yr_built']
    df['yr_renovated2'] = np.where(df['yr_renovated'].eq(0), df['yr_built'], df['yr_renovated'])
    df['age_after_renovation'] = df['yr_sales'] - df['yr_renovated2']

    # Boolean data types
    f = lambda x: 1 if x > 0 else 0
    df['basement_bool'] = df['sqft_basement'].apply(f)
    df['renovation_bool'] = df['yr_renovated'].apply(f)

    # Numerical features binning
    cols_bin = ['age', 'age_after_renovation']
    df['age_cat'] = pd.cut(df['age'], 10, labels=range(10)).astype(str)
    df['age_after_renovation_cat'] = pd.cut(df['age_after_renovation'],
                                            10, labels=range(10))

    # Log transformation of large numerical values
    cols_log = ['sqft_living', 'sqft_lot', 'sqft_above',
                'sqft_basement', 'sqft_living15', 'sqft_lot15']
    if log:
        for col in cols_log:
            df['log1p_' + col] = np.log1p(df[col])

    # squared columns
    cols_sq = [
        # cats
        'bedrooms', 'bathrooms', 'floors', 'waterfront', 'view',
        # created nums
        'age', 'age_after_renovation']
    if sq:
        for col in cols_sq:
            df[col + '_sq'] = df[col]**2

    cols_log_sq = [
        # log nums
        'log1p_sqft_living', 'log1p_sqft_lot',
        'log1p_sqft_above', 'log1p_sqft_basement',
        'log1p_sqft_living15', 'log1p_sqft_lot15'
    ]
    if logsq:
        for col in cols_log_sq:
            df[col + '_sq'] = df[col]**2

    # Categorical Features
    cols_dummy = ['waterfront', 'view', 'condition', 'grade']
    cols_dummy_cat = ['age_cat', 'age_after_renovation_cat']
    for c in cols_dummy:
        df[c] = df[c].astype(str)

    # Create dummy variables
    if dummy:
        df_dummy = pd.get_dummies(df[cols_dummy], drop_first=False)
        df = pd.concat([df, df_dummy], axis=1)

    # dummy variables for the cats newly created from numerical features
    if dummy_cat:
        df_dummy = pd.get_dummies(df[cols_dummy_cat], drop_first=False)
        df = pd.concat([df, df_dummy], axis=1)

    # after creating the dummies, make the original columns numeric again
    for c in cols_dummy + cols_dummy_cat:
        df[c] = df[c].astype(np.int32)

    # Drop unwanted columns
    cols_drop = ['id', 'date']
    df = df.drop(cols_drop, axis=1)
    return df
params_data = dict(log=True,sq=True,logsq=True,
dummy=True,dummy_cat=False)
df_train = clean_data(df_train,**params_data)
df_test = clean_data(df_test,**params_data)
print(df_train.shape)
print(df_train.columns)
(17290, 70) Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age', 'yr_renovated2', 'age_after_renovation', 'basement_bool', 'renovation_bool', 'age_cat', 'age_after_renovation_cat', 'log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15', 'bedrooms_sq', 'bathrooms_sq', 'floors_sq', 'waterfront_sq', 'view_sq', 'age_sq', 'age_after_renovation_sq', 'log1p_sqft_living_sq', 'log1p_sqft_lot_sq', 'log1p_sqft_above_sq', 'log1p_sqft_basement_sq', 'log1p_sqft_living15_sq', 'log1p_sqft_lot15_sq', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9'], dtype='object')
# df_train.dtypes.to_numpy()
# make sure no data leakage
df_train.filter(regex='price').columns
Index(['price'], dtype='object')
# make sure no nans
df_train.isna().sum().sum(), df_test.isna().sum().sum()
(0, 0)
# choose features to train on; we can change this later
features = list(sorted(df_train.columns.drop(target)))
# print(np.array(features))
# keep only the features present in both train and test
features = [c for c in features if c in df_train.columns and c in df_test.columns]
# print(np.array(sorted(features)))
df_Xtrain = df_train[features]
ser_ytrain = df_train[target]
df_Xtest = df_test[features]
ser_ytest = df_test[target]
ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()
scaling = 'standard'
if scaling == 'standard':
    scaler = preprocessing.StandardScaler()
    scaler.fit(df_Xtrain)
    df_Xtrain = pd.DataFrame(scaler.transform(df_Xtrain), columns=features)
    df_Xtest = pd.DataFrame(scaler.transform(df_Xtest), columns=features)
elif scaling == 'minmax':
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(df_Xtrain)
    df_Xtrain = pd.DataFrame(scaler.transform(df_Xtrain), columns=features)
    df_Xtest = pd.DataFrame(scaler.transform(df_Xtest), columns=features)
df_Xtrain.head(2)
age | age_after_renovation | age_after_renovation_cat | age_after_renovation_sq | age_cat | age_sq | basement_bool | bathrooms | bathrooms_sq | bedrooms | bedrooms_sq | condition | condition_1 | condition_2 | condition_3 | condition_4 | condition_5 | floors | floors_sq | grade | grade_10 | grade_11 | grade_12 | grade_13 | grade_4 | grade_5 | grade_6 | grade_7 | grade_8 | grade_9 | lat | log1p_sqft_above | log1p_sqft_above_sq | log1p_sqft_basement | log1p_sqft_basement_sq | log1p_sqft_living | log1p_sqft_living15 | log1p_sqft_living15_sq | log1p_sqft_living_sq | log1p_sqft_lot | log1p_sqft_lot15 | log1p_sqft_lot15_sq | log1p_sqft_lot_sq | long | renovation_bool | sqft_above | sqft_basement | sqft_living | sqft_living15 | sqft_lot | sqft_lot15 | view | view_0 | view_1 | view_2 | view_3 | view_4 | view_sq | waterfront | waterfront_0 | waterfront_1 | waterfront_sq | yr_built | yr_renovated | yr_renovated2 | yr_sales | zipcode | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.288109 | -0.212303 | -0.062185 | -0.438016 | -0.139825 | -0.494698 | 1.247166 | -0.468811 | -0.537610 | -0.39033 | -0.302220 | -0.630613 | -0.035694 | -0.08937 | 0.735526 | -0.595921 | -0.294513 | -0.916249 | -0.837904 | -0.554878 | -0.238288 | -0.135782 | -0.066005 | -0.026354 | -0.036497 | -0.108453 | -0.324043 | 1.186907 | -0.624934 | -0.367371 | 0.410048 | -0.688967 | -0.698830 | 1.208375 | 1.137983 | -0.149505 | -0.169074 | -0.189252 | -0.177052 | 0.361630 | 0.383984 | 0.328910 | 0.301512 | 1.151178 | -0.207998 | -0.698239 | 0.636923 | -0.322100 | -0.302502 | -0.095727 | -0.078695 | -0.305512 | 0.329787 | -0.123077 | -0.217065 | -0.1533 | -0.124282 | -0.261712 | -0.089698 | 0.089698 | -0.089698 | -0.089698 | 0.277141 | -0.207992 | 0.201159 | -0.693043 | -0.071763 |
1 | -1.135161 | -1.074946 | -1.265291 | -0.814627 | -1.320662 | -0.856409 | -0.801818 | 0.506258 | 0.326221 | -1.46038 | -0.775165 | -0.630613 | -0.035694 | -0.08937 | 0.735526 | -0.595921 | -0.294513 | 0.933474 | 0.806845 | -0.554878 | -0.238288 | -0.135782 | -0.066005 | -0.026354 | -0.036497 | -0.108453 | -0.324043 | 1.186907 | -0.624934 | -0.367371 | -0.527440 | -0.314663 | -0.338123 | -0.795545 | -0.779839 | -0.681826 | -0.692075 | -0.700087 | -0.697163 | -1.411647 | -1.527957 | -1.398248 | -1.291600 | 0.344386 | -0.207998 | -0.442941 | -0.658262 | -0.716449 | -0.712318 | -0.302804 | -0.378759 | -0.305512 | 0.329787 | -0.123077 | -0.217065 | -0.1533 | -0.124282 | -0.261712 | -0.089698 | 0.089698 | -0.089698 | -0.089698 | 1.124268 | -0.207992 | 1.064027 | -0.693043 | -0.353180 |
s = f"""
df_Xtrain = {df_Xtrain.shape}
ytrain = {ytrain.shape}
df_Xtest = {df_Xtest.shape}
ytest = {ytest.shape}
"""
print(s)
df_Xtrain = (17290, 67) ytrain = (17290,) df_Xtest = (4323, 67) ytest = (4323,)
# persist data
df_Xtrain.to_csv('../data/processed/Xtrain.csv.zip',compression='zip',index=False)
df_Xtest.to_csv('../data/processed/Xtest.csv.zip',compression='zip',index=False)
np.savetxt('../data/processed/ytrain.csv',ytrain)
np.savetxt('../data/processed/ytest.csv',ytest)
https://xgboost.readthedocs.io/en/latest/parameter.html#general-parameters
Parameters:
-------------
max_depth=3
learning_rate=0.1
n_estimators=100 # number of trees you want to build.
verbosity=1 **NOTE: prints to the IPython terminal, not the browser
silent=None **deprecated, use verbosity
objective='binary:logistic' **for binary classification; this notebook uses 'reg:squarederror' for regression
booster='gbtree' **use the default tree booster, not linear, even for regression (dart may also be used instead of gbtree, but it needs tuning)
n_jobs=1 **set this to -1
nthread=None **deprecated, use n_jobs
gamma=0 # A higher value leads to fewer splits.
min_child_weight=1
max_delta_step=0
subsample=1 # percentage of samples used per tree. Low value can lead to underfitting.
colsample_bytree=1 # percentage of features used per tree. High value can lead to overfitting.
colsample_bylevel=1
colsample_bynode=1
reg_alpha=0 # A large value leads to more regularization.
reg_lambda=1 # L2 regularization on leaf weights and is smoother than L1 regularization.
scale_pos_weight=1
base_score=0.5
random_state=0 **use your own random state
seed=None **deprecated, use random_state
missing=None
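Below is a minimal sketch (values are illustrative, not tuned) of how these knobs map onto the two XGBoost APIs used in this notebook: the scikit-learn wrapper spells them reg_alpha/reg_lambda/n_estimators, while the native xgboost.train/xgboost.cv API spells them alpha/lambda and takes the number of trees as num_boost_round instead of a parameter.
import xgboost
# scikit-learn wrapper (illustrative values only)
skl_model = xgboost.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,       # number of trees
    max_depth=3,
    learning_rate=0.1,
    reg_alpha=0,            # L1 regularization on leaf weights
    reg_lambda=1,           # L2 regularization on leaf weights
    n_jobs=-1,
    random_state=SEED,      # SEED is defined near the top of the notebook
)
# the same settings for the native API
native_params = {
    'objective': 'reg:squarederror',
    'max_depth': 3,
    'learning_rate': 0.1,
    'alpha': 0,
    'lambda': 1,
    'seed': SEED,
}
# dtrain = xgboost.DMatrix(df_Xtrain, ytrain)
# bst = xgboost.train(native_params, dtrain, num_boost_round=100)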
If you have a validation set, you can use early stopping to find the optimal number of boosting rounds. Early stopping requires at least one set in evals. If there’s more than one, it will use the last.
train(..., evals=evals, early_stopping_rounds=10)
The model will train until the validation score stops improving. Validation error needs to decrease at least every early_stopping_rounds to continue training.
If early stopping occurs, the model will have three additional fields: bst.best_score, bst.best_iteration and bst.best_ntree_limit. Note that xgboost.train() will return a model from the last iteration, not the best one.
This works with both metrics to minimize (RMSE, log loss, etc.) and to maximize (MAP, NDCG, AUC). Note that if you specify more than one evaluation metric the last one in param['eval_metric'] is used for early stopping.
If early stopping is enabled during training, you can get predictions from the best iteration with bst.best_ntree_limit:
ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
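A minimal sketch of this pattern, assuming we carve an 80/20 validation split out of the training data (the names Xtr/Xvd/dtr/dvd are only illustrative):
import xgboost
from sklearn import model_selection
Xtr, Xvd, ytr, yvd = model_selection.train_test_split(
    df_Xtrain, ytrain, train_size=0.8, random_state=SEED)
dtr = xgboost.DMatrix(Xtr, ytr, feature_names=features)
dvd = xgboost.DMatrix(Xvd, yvd, feature_names=features)
evals = [(dtr, 'train'), (dvd, 'valid')]   # the last entry is used for early stopping
bst_es = xgboost.train(
    {'objective': 'reg:squarederror', 'eval_metric': 'rmse'},
    dtr,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=False,
)
print(bst_es.best_score, bst_es.best_iteration, bst_es.best_ntree_limit)
ypred_valid = bst_es.predict(dvd, ntree_limit=bst_es.best_ntree_limit)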
Flexible tree growing policies: the existing tree grower in XGBoost grows a tree depth-wise, executing splits in the first level before splits in the second, and so forth. The new grower lets you control the way new nodes are added to the tree:
grow_policy=depthwise (default): split at nodes closest to the root, i.e. grow depth-wise. grow_policy=lossguide: split at nodes with the highest loss change. This behavior mimics that of LightGBM. It has been reported that the lossguide policy often results in faster convergence in loss, though there is also a risk of over-fitting (see the preliminary results).
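As a sketch of the loss-guided policy (parameter values are illustrative; lossguide requires a hist-based tree method):
model_lossguide = xgboost.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',        # 'gpu_hist' on a GPU runtime; lossguide needs a hist-based grower
    grow_policy='lossguide',   # split at the node with the highest loss change, LightGBM-style
    max_leaves=63,             # caps leaves per tree, similar in spirit to LightGBM's num_leaves
    n_estimators=100,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=SEED,
)
# model_lossguide.fit(df_Xtrain, ytrain)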
model = xgboost.XGBRegressor(n_jobs=-1, random_state=SEED,
objective='reg:squarederror')
model.fit(df_Xtrain, ytrain)
ypreds = model.predict(df_Xtest)
print_regr_eval(ytest,ypreds,df_Xtest.shape[1])
RMSE : 124,475.57 Explained Variance: 0.885097 R-Squared: 0.885032 Adjusted R-squared: 0.883249
%%time
scoring = "neg_mean_squared_error"
kf = model_selection.KFold(5,shuffle=True,random_state=SEED)
cvs = model_selection.cross_val_score(model, df_Xtrain, ytrain,cv=kf,
scoring = scoring)
score = cvs.mean()
score_std = cvs.std()
print(f"{scoring}: {score:,.2f}\n std : {score_std:,.2f}")
neg_mean_squared_error: -16,122,317,314.99 std : 2,821,145,115.10 CPU times: user 23.1 s, sys: 86.4 ms, total: 23.2 s Wall time: 23.5 s
plt.plot(cvs*-1)
plt.xticks(range(len(cvs)))
plt.show()
%%time
dtrain = xgboost.DMatrix(df_Xtrain,ytrain,
feature_names=features)
params = {"objective":"reg:squarederror",
'colsample_bytree': 0.3,
'learning_rate': 0.1,
'max_depth': 5,
'alpha': 10}
num_boost_round=500
kf=model_selection.KFold(n_splits=5,shuffle=True,random_state=SEED)
# xgboost.cv is part of the native API, so it needs an xgboost DMatrix
df_cv_results = xgboost.cv(params,dtrain, num_boost_round,
nfold=5,
early_stopping_rounds=50,
metrics="rmse",
folds=kf,
verbose_eval=50, # show progress at Nth iteration
seed=SEED)
display(df_cv_results.head())
[0] train-rmse:595399.87500+3805.19258 test-rmse:595702.05000+15407.01215 [50] train-rmse:105573.52031+2156.19075 test-rmse:137950.91562+11666.75600 [100] train-rmse:87688.98750+1116.10493 test-rmse:128487.33281+11520.59900 [150] train-rmse:78634.67969+1029.28058 test-rmse:125073.05156+11155.93922 [200] train-rmse:72366.42188+670.32210 test-rmse:123669.81094+11340.60912 [250] train-rmse:67851.61406+268.31472 test-rmse:122734.95312+11381.78452 [300] train-rmse:63911.67812+313.72536 test-rmse:121927.51719+11399.70860 [350] train-rmse:60906.43203+203.21987 test-rmse:121528.95781+11560.37613 [400] train-rmse:58461.84766+410.89633 test-rmse:121319.82500+11537.08273 [450] train-rmse:55989.08672+345.70102 test-rmse:120972.04531+11696.68500 [499] train-rmse:53793.44531+314.52973 test-rmse:120762.97500+11641.49620
train-rmse-mean | train-rmse-std | test-rmse-mean | test-rmse-std | |
---|---|---|---|---|
0 | 595399.87500 | 3805.192577 | 595702.0500 | 15407.012146 |
1 | 544175.62500 | 3747.841473 | 545224.7125 | 15089.800899 |
2 | 497403.95000 | 4098.070245 | 499268.9125 | 14047.165952 |
3 | 455813.43125 | 4262.515081 | 458664.3125 | 13041.137876 |
4 | 418719.71875 | 4614.028706 | 422088.4500 | 12737.654828 |
CPU times: user 39 s, sys: 182 ms, total: 39.2 s Wall time: 40.2 s
plot_xgb_cv_res(df_cv_results)
%%time
params_xgb = dict(n_jobs=-1, random_state=SEED,
objective='reg:squarederror',
n_estimators=1200,
max_depth=3,
reg_alpha=1,
reg_lambda=5,
subsample=1,
gamma=0,
min_child_weight=1,
colsample_bytree=1,
learning_rate=0.1
)
model = xgboost.XGBRegressor(**params_xgb)
model.fit(df_Xtrain,ytrain)
ypreds = model.predict(df_Xtest)
print_regr_eval(ytest,ypreds,df_Xtest.shape[1])
RMSE : 114,726.79 Explained Variance: 0.902373 R-Squared: 0.902335 Adjusted R-squared: 0.900820 CPU times: user 33.4 s, sys: 83.9 ms, total: 33.5 s Wall time: 34.4 s
%%time
ytrain_log1p = np.log1p(ytrain)
model = xgboost.XGBRegressor(**params_xgb)
model.fit(df_Xtrain, ytrain_log1p)
# persist the model
path_model_xgb = '../models/model_xgb_logtarget.dump'
model.save_model(path_model_xgb)
# persist using joblib
path_model_xgb_joblib = '../models/model_xgb_logtarget.joblib'
joblib.dump(model,path_model_xgb_joblib)
model = xgboost.XGBRegressor()
model.load_model(fname='../models/model_xgb_logtarget.dump')
ypreds_log1p = model.predict(df_Xtest)
ypreds = np.expm1(ypreds_log1p)
print('ytest:', ytest[:3])
print('ypreds: ', ypreds[:3])
print_regr_eval(ytest,ypreds,df_Xtest.shape[1])
ytest: [285000. 239950. 460000.] ypreds: [343218.4 204292.33 508420.8 ] RMSE : 110,471.76 Explained Variance: 0.910365 R-Squared: 0.909445 Adjusted R-squared: 0.908041 CPU times: user 35.4 s, sys: 211 ms, total: 35.6 s Wall time: 38.1 s
# feature importance
df_imp = pd.DataFrame({'Feature': features,
'Importance_gain': model.feature_importances_
})
df_imp.nlargest(10,'Importance_gain').style.background_gradient()
Feature | Importance_gain | |
---|---|---|
19 | grade | 0.308453 |
35 | log1p_sqft_living | 0.177563 |
51 | view | 0.132501 |
58 | waterfront | 0.076463 |
30 | lat | 0.061198 |
11 | condition | 0.028256 |
36 | log1p_sqft_living15 | 0.026587 |
65 | yr_sales | 0.023181 |
0 | age | 0.020266 |
63 | yr_renovated | 0.014971 |
(df_imp
.set_index('Feature')
.nlargest(10,'Importance_gain')
.plot
.barh(figsize=(12,8))
.invert_yaxis()
)
show_methods(model,5)
Object Type: <class 'xgboost.sklearn.XGBRegressor'>
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | apply | fit | intercept_ | n_estimators | save_model |
1 | base_score | gamma | kwargs | n_features_in_ | scale_pos_weight |
2 | booster | get_booster | learning_rate | n_jobs | score |
3 | coef_ | get_num_boosting_rounds | load_model | num_parallel_tree | set_params |
4 | colsample_bylevel | get_params | max_delta_step | objective | subsample |
5 | colsample_bynode | get_xgb_params | max_depth | predict | tree_method |
6 | colsample_bytree | gpu_id | min_child_weight | random_state | validate_parameters |
7 | evals_result | importance_type | missing | reg_alpha | verbosity |
8 | feature_importances_ | interaction_constraints | monotone_constraints | reg_lambda |
bst = model.get_booster()
bst
<xgboost.core.Booster at 0x7fe077d74510>
fig,ax = plt.subplots(figsize=(12,8))
xgboost.plot_tree(bst,ax=ax,num_trees=4)
<matplotlib.axes._subplots.AxesSubplot at 0x7fe05da65cd0>
# help(xgboost.plot_importance)
fig,ax = plt.subplots(figsize=(12,8))
xgboost.plot_importance(bst,ax=ax,importance_type='weight',max_num_features=20)
plt.show()
from sklearn.inspection import permutation_importance
# permutation_importance?
Xtr,Xvd,ytr,yvd = model_selection.train_test_split(df_Xtrain,ytrain,
train_size=0.8,random_state=SEED)
%%time
model = xgboost.XGBRegressor(**params_xgb)
model.fit(Xtr,ytr)
perm_imp = permutation_importance(model, Xvd, yvd,
n_repeats=20,
n_jobs=-1,
random_state=SEED)
CPU times: user 28.9 s, sys: 219 ms, total: 29.1 s Wall time: 1min 22s
df_perm_imp = pd.DataFrame({
'importances_mean': abs(perm_imp.importances_mean),
'importance_std': perm_imp.importances_std
},index=features)
df_perm_imp = df_perm_imp.sort_values('importances_mean',ascending=False)
df_perm_imp.head(10)
importances_mean | importance_std | |
---|---|---|
lat | 0.335251 | 0.013772 |
log1p_sqft_living | 0.169468 | 0.003991 |
grade | 0.168113 | 0.006245 |
long | 0.120965 | 0.011823 |
log1p_sqft_living15 | 0.034103 | 0.001742 |
log1p_sqft_lot | 0.029592 | 0.001546 |
zipcode | 0.027605 | 0.003611 |
log1p_sqft_above | 0.021245 | 0.001288 |
waterfront | 0.021151 | 0.001104 |
view | 0.015423 | 0.001054 |
df_perm_imp.tail()
importances_mean | importance_std | |
---|---|---|
log1p_sqft_lot15_sq | 0.0 | 0.0 |
log1p_sqft_lot_sq | 0.0 | 0.0 |
sqft_above | 0.0 | 0.0 |
sqft_basement | 0.0 | 0.0 |
age_cat | 0.0 | 0.0 |
features_sel = df_perm_imp.query("importances_mean > 0.00").index.to_numpy()
print(features_sel)
features_sel = list(features_sel)
['lat' 'log1p_sqft_living' 'grade' 'long' 'log1p_sqft_living15' 'log1p_sqft_lot' 'zipcode' 'log1p_sqft_above' 'waterfront' 'view' 'bathrooms' 'condition' 'log1p_sqft_basement' 'yr_built' 'yr_sales' 'age_after_renovation' 'floors' 'log1p_sqft_lot15' 'age' 'bedrooms' 'yr_renovated' 'yr_renovated2' 'grade_10' 'condition_4' 'view_3' 'grade_9' 'grade_11' 'renovation_bool' 'grade_12' 'basement_bool' 'grade_7' 'condition_3' 'grade_8' 'view_1' 'grade_6' 'condition_2' 'view_2' 'grade_4']
%%time
model = xgboost.XGBRegressor(**params_xgb)
model.fit(df_Xtrain[features_sel],ytrain)
ypreds = model.predict(df_Xtest[features_sel])
print_regr_eval(ytest,ypreds,len(features_sel))
RMSE : 117,402.34 Explained Variance: 0.897784 R-Squared: 0.897726 Adjusted R-squared: 0.896843 CPU times: user 20.9 s, sys: 142 ms, total: 21.1 s Wall time: 23.7 s
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 4 min 18 secs