import time
time_start_notebook = time.time()
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !yes | pip install -q watermark
    !yes | pip install -q dtreeviz
    # updating an already-imported module requires restarting the colab runtime
    !yes | pip install -q -U scikit-learn
    !pip uninstall -y -q xgboost
    !yes | pip install -q xgboost==1.2.0  # dtreeviz and joblib need the same xgboost version as local
import numpy as np
import pandas as pd
import xgboost
import xgboost as xgb
import sklearn
from sklearn import metrics as skmetrics
import joblib

# model eval / visualization
import graphviz
import dtreeviz
from dtreeviz.models.shadow_decision_tree import ShadowDecTree
from dtreeviz.models.xgb_decision_tree import ShadowXGBDTree

# params / random state
SEED = 100
RNG = np.random.RandomState(SEED)

# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-11-24

CPython 3.6.9
IPython 5.5.0

compiler   : GCC 8.4.0
system     : Linux
release    : 4.19.112+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit

sklearn   0.23.2
watermark 2.0.2
graphviz  0.10.1
numpy     1.18.5
xgboost   1.2.0
pandas    1.1.4
joblib    0.17.0
def adjustedR2(rsquared, nrows, ncols):
    """Adjusted R-squared for nrows observations and ncols predictors."""
    return 1 - (1 - rsquared) * (nrows - 1) / (nrows - ncols - 1)
def print_regr_eval(ytest, ypreds, ncols):
    rmse = np.sqrt(skmetrics.mean_squared_error(ytest, ypreds))
    r2 = skmetrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, len(ytest), ncols)
    evs = skmetrics.explained_variance_score(ytest, ypreds)
    print(f"""
    RMSE              : {rmse:,.2f}
    Explained Variance: {evs:.6f}
    R-Squared         : {r2:.6f}
    Adjusted R-squared: {ar2:.6f}
    """)
def show_methods(obj, ncols=4):
    """Show the public attributes/methods of an object in an ncols-wide grid."""
    lst = [i for i in dir(obj) if i[0] != '_']
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
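For example, applied to a built-in type it lays the public methods out in a small grid:

# quick demo on a familiar object (dict); used on the xgboost model below
show_methods(dict, ncols=4)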
if ENV_COLAB:
    path_git = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    project = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_git + project
else:
    data_path_parent = '../data/'

data_path_Xtrain = data_path_parent + 'processed/Xtrain.csv.zip'
data_path_ytrain = data_path_parent + 'processed/ytrain.csv'
data_path_Xtest  = data_path_parent + 'processed/Xtest.csv.zip'
data_path_ytest  = data_path_parent + 'processed/ytest.csv'
target = 'price'
train_size = 0.8
print(data_path_Xtest)
https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/King_County_Seattle_House_Price_Kaggle/processed/Xtest.csv.zip
df_Xtrain = pd.read_csv(data_path_Xtrain,compression='zip')
ser_ytrain = pd.read_csv(data_path_ytrain,header=None)
ytrain = np.array(ser_ytrain).flatten()
ytrain_log1p = np.log1p(ytrain)
df_Xtest = pd.read_csv(data_path_Xtest,compression='zip')
ser_ytest = pd.read_csv(data_path_ytest,header=None)
ytest = np.array(ser_ytest).flatten()
features = list(df_Xtest.columns)
s = f"""
df_Xtest = {df_Xtest.shape}
ytest = {ytest.shape}
"""
print(s)
display(df_Xtest.head(2))
display(ser_ytest.head(2))
assert df_Xtest.shape[0] == ytest.shape[0]
df_Xtest = (4323, 67)
ytest    = (4323,)
[output: df_Xtest.head(2) — 2 rows × 67 standardized (z-scored) feature columns, including age, bathrooms, bedrooms, condition, floors, grade, lat, long, sqft_living, view, waterfront, yr_built, zipcode, and their squared/log1p/dummy variants]
|   | 0        |
|---|----------|
| 0 | 285000.0 |
| 1 | 239950.0 |
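The target is modeled on the log1p scale because house prices are heavily right-skewed; `np.expm1` is the exact inverse, so predictions map back to prices losslessly. A minimal check:

# log1p/expm1 are exact inverses for non-negative prices,
# so the round-trip recovers the original target
assert np.allclose(np.expm1(np.log1p(ytrain)), ytrain)
print(ytrain[:3])            # raw prices
print(np.log1p(ytrain[:3]))  # scale the model is trained on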
# saved models
if ENV_COLAB:
    !mkdir -p ../models
    !wget https://github.com/bhishanpdl/Datasets/blob/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump?raw=true
    !mv model_xgb_logtarget.dump?raw=true ../models/model_xgb_logtarget.dump
    !ls ../models
    !du -sh ../models/model_xgb_logtarget.dump
path_model_xgb = '../models/model_xgb_logtarget.dump'
model = xgboost.XGBRegressor()
model.load_model(path_model_xgb)
ypreds_log1p = model.predict(df_Xtest)
ypreds = np.expm1(ypreds_log1p)
print('ytest:', ytest[:3])
print('ypreds: ', ypreds[:3])
--2020-11-24 15:48:54--  https://github.com/bhishanpdl/Datasets/blob/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump?raw=true
Resolving github.com (github.com)... 13.114.40.48
Connecting to github.com (github.com)|13.114.40.48|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/bhishanpdl/Datasets/raw/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump [following]
--2020-11-24 15:48:55--  https://github.com/bhishanpdl/Datasets/raw/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump [following]
--2020-11-24 15:48:55--  https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 821441 (802K) [application/octet-stream]
Saving to: ‘model_xgb_logtarget.dump?raw=true’

model_xgb_logtarget 100%[===================>] 802.19K  --.-KB/s  in 0.1s

2020-11-24 15:48:56 (8.14 MB/s) - ‘model_xgb_logtarget.dump?raw=true’ saved [821441/821441]

model_xgb_logtarget.dump
804K    ../models/model_xgb_logtarget.dump
ytest: [285000. 239950. 460000.]
ypreds:  [343218.4 204292.31 508420.8 ]
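For reference, a hedged sketch of the save side: the .dump file is assumed to have been written in the training notebook with xgboost's native `save_model`, which is what `XGBRegressor.load_model` above expects (a joblib pickle would not load this way):

# assumed provenance of the .dump file (native xgboost format)
model.save_model('../models/model_xgb_logtarget.dump')

# round-trip check: a fresh regressor loads the same booster
model2 = xgboost.XGBRegressor()
model2.load_model('../models/model_xgb_logtarget.dump')
assert np.allclose(model.predict(df_Xtest), model2.predict(df_Xtest))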
# saved model (joblib alternative)
use_joblib = 0
if use_joblib:
    !mkdir -p ../models
    !wget https://github.com/bhishanpdl/Datasets/blob/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.joblib?raw=true
    !mv model_xgb_logtarget.joblib?raw=true ../models/model_xgb_logtarget.joblib
    !ls ../models
    !du -sh ../models/model_xgb_logtarget.joblib

    path_model_xgb = '../models/model_xgb_logtarget.joblib'
    # a .joblib file is a pickle of the whole estimator, so it must be loaded
    # with joblib.load; XGBRegressor.load_model on it gives empty ypreds
    model = joblib.load(path_model_xgb)
    ypreds_log1p = model.predict(df_Xtest)
    ypreds = np.expm1(ypreds_log1p)
    print('ytest:', ytest[:3])
    print('ypreds: ', ypreds[:3])
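For completeness, a minimal joblib round-trip sketch. Because joblib pickles the whole sklearn-API wrapper, it is sensitive to the installed xgboost version, which is why the install cell pins xgboost==1.2.0:

# joblib round-trip (pickle of the whole sklearn-API wrapper)
joblib.dump(model, '../models/model_xgb_logtarget.joblib')
model3 = joblib.load('../models/model_xgb_logtarget.joblib')
assert np.allclose(model.predict(df_Xtest), model3.predict(df_Xtest))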
%%time
train_model = 0
if train_model:
    params_xgb = dict(
        n_jobs=-1,
        random_state=SEED,
        objective='reg:squarederror',
        n_estimators=1200,
        max_depth=3,          # default 6
        reg_alpha=1,          # default 0 (sklearn alias of alpha)
        reg_lambda=5,         # default 1 (sklearn alias of lambda)
        subsample=1,          # default 1
        gamma=0,              # default 0, alias of min_split_loss
        min_child_weight=1,   # default 1
        colsample_bytree=1,   # default 1
        learning_rate=0.1,    # default eta = 0.3
        tree_method='auto',   # default 'auto'; use 'gpu_hist' on GPU
    )
    model = xgboost.XGBRegressor(**params_xgb)
    model.fit(df_Xtrain, ytrain_log1p)

    ypreds_log1p = model.predict(df_Xtest)
    ypreds = np.expm1(ypreds_log1p)
    print('ytest:', ytest[:3])
    print('ypreds: ', ypreds[:3])
    print_regr_eval(ytest, ypreds, df_Xtest.shape[1])
CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 9.54 µs
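The cell reports only microseconds because `train_model = 0` skipped the body. As a hedged variant (not part of the original run), `n_estimators=1200` could instead be chosen by early stopping on a held-out split, using the xgboost 1.2 sklearn API:

from sklearn.model_selection import train_test_split

# assumes params_xgb from the cell above is defined
Xtr, Xval, ytr, yval = train_test_split(
    df_Xtrain, ytrain_log1p, train_size=0.8, random_state=SEED)

model_es = xgboost.XGBRegressor(**params_xgb)
model_es.fit(Xtr, ytr,
             eval_set=[(Xval, yval)],
             eval_metric='rmse',
             early_stopping_rounds=50,
             verbose=False)
print('best_iteration:', model_es.best_iteration)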
show_methods(model)
|    | 0                     | 1                        | 2                    | 3                   |
|----|-----------------------|--------------------------|----------------------|---------------------|
| 0  | apply                 | get_booster              | max_delta_step       | random_state        |
| 1  | base_score            | get_num_boosting_rounds  | max_depth            | reg_alpha           |
| 2  | booster               | get_params               | min_child_weight     | reg_lambda          |
| 3  | coef_                 | get_xgb_params           | missing              | save_model          |
| 4  | colsample_bylevel     | gpu_id                   | monotone_constraints | scale_pos_weight    |
| 5  | colsample_bynode      | importance_type          | n_estimators         | score               |
| 6  | colsample_bytree      | interaction_constraints  | n_features_in_       | set_params          |
| 7  | evals_result          | intercept_               | n_jobs               | subsample           |
| 8  | feature_importances_  | kwargs                   | num_parallel_tree    | tree_method         |
| 9  | fit                   | learning_rate            | objective            | validate_parameters |
| 10 | gamma                 | load_model               | predict              | verbosity           |
bst = model.get_booster()
bst.trees_to_dataframe().head(2)
|   | Tree | Node | ID  | Feature | Split     | Yes | No  | Missing | Gain   | Cover   |
|---|------|------|-----|---------|-----------|-----|-----|---------|--------|---------|
| 0 | 0    | 0    | 0-0 | grade   | 0.716166  | 0-1 | 0-2 | 0-1     | 689.75 | 17290.0 |
| 1 | 0    | 1    | 0-1 | lat     | -0.194376 | 0-3 | 0-4 | 0-3     | 105.75 | 13905.0 |
bst.trees_to_dataframe()['Tree'].nunique()
1200
bst.trees_to_dataframe().query("Tree == 0")
|   | Tree | Node | ID  | Feature | Split     | Yes | No  | Missing | Gain       | Cover   |
|---|------|------|-----|---------|-----------|-----|-----|---------|------------|---------|
| 0 | 0    | 0    | 0-0 | grade   | 0.716166  | 0-1 | 0-2 | 0-1     | 689.750000 | 17290.0 |
| 1 | 0    | 1    | 0-1 | lat     | -0.194376 | 0-3 | 0-4 | 0-3     | 105.750000 | 13905.0 |
| 2 | 0    | 2    | 0-2 | Leaf    | NaN       | NaN | NaN | NaN     | 1.313649   | 3385.0  |
| 3 | 0    | 3    | 0-3 | Leaf    | NaN       | NaN | NaN | NaN     | 1.208724   | 5749.0  |
| 4 | 0    | 4    | 0-4 | Leaf    | NaN       | NaN | NaN | NaN     | 1.260315   | 8156.0  |
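The same structure can be cross-checked against the raw text representation; `Booster.get_dump()` returns one string per tree:

# text dump of tree 0; node ids, split features, and leaf values
# should match the dataframe above
print(bst.get_dump()[0])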
xgb.plot_tree(model)
<matplotlib.axes._subplots.AxesSubplot at 0x7f8ca18b1a20>
ShadowXGBDTree(self,
booster:xgboost.core.Booster,
tree_index:int,
x_data,
y_data,
feature_names:List[str]=None,
target_name:str=None,
class_names:(typing.List[str],typing.Mapping[int, str])=None)
import graphviz
from dtreeviz import trees as dtrees
from dtreeviz.models.shadow_decision_tree import ShadowDecTree
from dtreeviz.models.xgb_decision_tree import ShadowXGBDTree
show_methods(dtrees)
|    | 0                     | 1                        | 2                 | 3                      |
|----|-----------------------|--------------------------|-------------------|------------------------|
| 0  | Color                 | adjust_colors            | graphviz          | rgb2hex                |
| 1  | DTreeViz              | class_leaf_viz           | inline_svg_images | rtreeviz_bivar_3D      |
| 2  | List                  | class_split_viz          | myround           | rtreeviz_bivar_heatmap |
| 3  | Mapping               | ctreeviz_bivar           | np                | rtreeviz_univar        |
| 4  | NUM_BINS              | ctreeviz_leaf_samples    | os                | run                    |
| 5  | Number                | ctreeviz_univar          | patches           | scale_SVG              |
| 6  | PLATFORM              | describe_node_sample     | pd                | tempfile               |
| 7  | Path                  | draw_legend              | plt               | tree                   |
| 8  | ShadowDecTree         | draw_piechart            | prediction_path   | view                   |
| 9  | ShadowDecTreeNode     | dtreeviz                 | prop_size         | viz_leaf_criterion     |
| 10 | Tuple                 | explain_prediction_path  | regr_leaf_viz     | viz_leaf_samples       |
| 11 | add_classifier_legend | get_num_bins             | regr_split_viz    | viz_leaf_target        |
# help(ShadowXGBDTree)
bst_shadow = ShadowXGBDTree(bst, tree_index=1,
x_data=df_Xtrain, y_data=ytrain_log1p,
feature_names=features, target_name=target)
dtreeviz(tree_model,
x_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
y_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
feature_names:List[str]=None,
target_name:str=None,
class_names:(typing.Mapping[numbers.Number, str], typing.List[str])=None,
tree_index:int=None,
precision:int=2,
orientation:('TD', 'LR')='TD',
instance_orientation:('TD', 'LR')='LR',
show_root_edge_labels:bool=True,
show_node_labels:bool=False,
show_just_path:bool=False,
fancy:bool=True,
histtype:('bar', 'barstacked', 'strip')='barstacked',
highlight_path:List[int]=[],
X:numpy.ndarray=None,
max_X_features_LR:int=10,
max_X_features_TD:int=20,
label_fontsize:int=12,
ticks_fontsize:int=8,
fontname:str='Arial',
colors:dict=None,
scale=1.0) -> dtreeviz.trees.DTreeViz
# help(dtrees.dtreeviz)
dtrees.dtreeviz(bst_shadow)
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
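The call above renders the SVG inline. To keep the figure, the returned `DTreeViz` object can be saved to disk (the filename here is illustrative):

viz = dtrees.dtreeviz(bst_shadow)
viz.save('xgb_tree1.svg')  # writes the rendered SVG to disk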
viz_leaf_samples(tree_model,
x_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
feature_names:List[str]=None,
tree_index:int=None,
figsize:tuple=(10, 5),
display_type:str='plot',
colors:dict=None,
fontsize:int=14,
fontname:str='Arial',
grid:bool=False,
bins:int=10,
min_samples:int=0,
max_samples:int=None)
dtrees.viz_leaf_samples(bst, df_Xtrain,feature_names=features,tree_index=1)
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
describe_node_sample(tree_model,
node_id:int,
x_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
feature_names:List[str]=None,
tree_index:int=None)
dtrees.describe_node_sample(bst, node_id=1,
x_data=df_Xtrain,
feature_names=features,
tree_index=1).iloc[:,:5]
|       | age          | age_after_renovation | age_after_renovation_cat | age_after_renovation_sq | age_cat      |
|-------|--------------|----------------------|--------------------------|-------------------------|--------------|
| count | 11685.000000 | 11685.000000         | 11685.000000             | 11685.000000            | 11685.000000 |
| mean  | 0.181673     | 0.198875             | 0.195734                 | 0.167634                | 0.178939     |
| std   | 0.984658     | 0.995964             | 0.998632                 | 1.036658                | 0.987304     |
| min   | -1.507863    | -1.454509            | -1.265291                | -0.848104               | -1.320662    |
| 25%   | -0.593048    | -0.591866            | -0.463220                | -0.655279               | -0.533437    |
| 50%   | 0.152358     | 0.201766             | 0.338850                 | -0.108606               | 0.253788     |
| 75%   | 0.796117     | 0.857374             | 0.739886                 | 0.610136                | 0.647401     |
| max   | 2.422456     | 2.548154             | 2.344027                 | 3.579173                | 2.221851     |
explain_prediction_path(tree_model,
x:numpy.ndarray,
x_data=None,
y_data=None,
explanation_type:('plain_english', 'sklearn_default')='plain_english',
feature_names:List[str]=None,
target_name:str=None,
class_names:(typing.Mapping[numbers.Number, str],
typing.List[str])=None,
tree_index:int=None)
row = df_Xtrain.iloc[10]
row.head()
age                        -0.694694
age_after_renovation       -0.626372
age_after_renovation_cat   -0.463220
age_after_renovation_sq    -0.671013
age_cat                    -0.533437
Name: 10, dtype: float64
s = dtrees.explain_prediction_path(bst_shadow, row,
explanation_type="plain_english",tree_index=9)
print(s)
lat < -0.21
log1p_sqft_living < 0.48
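The path explanation can be tied back to actual predictions: `Booster.predict` in xgboost 1.2 accepts `ntree_limit`, so the contribution of the first few trees is easy to inspect (a sketch, on the log1p scale):

# one-row DMatrix for the same instance explained above
dm_row = xgb.DMatrix(row.to_frame().T)

full_pred = bst.predict(dm_row)                  # all 1200 trees
first_10  = bst.predict(dm_row, ntree_limit=10)  # trees 0..9 only
print(full_pred, first_10)                       # apply np.expm1 for prices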
viz_leaf_target(tree_model,
x_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
y_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
feature_names:List[str]=None,
target_name:str=None,
tree_index:int=None,
show_leaf_labels:bool=True,
colors:dict=None,
markersize:int=50,
label_fontsize:int=14,
fontname:str='Arial',
precision:int=1,
figsize:tuple=None,
grid:bool=False,
prediction_line_width:int=2)
dtrees.viz_leaf_target(bst, df_Xtrain, ytrain_log1p,
feature_names=features,
target_name=target,
tree_index=1)
dtrees.viz_leaf_target(bst_shadow)
# features_reg_univar = ["age"]
# target_reg_univar = "price"
# dtrain_reg_univar = xgb.DMatrix(df_Xtrain[features_reg_univar], ytrain_log1p)
# params_reg_univar = {"max_depth":3,
# "eta":0.05,
# "objective":"reg:squarederror",
# "subsample":1}
# xgb_model_reg_univar = xgb.train(params=params_reg_univar,
# dtrain=dtrain_reg_univar,
# num_boost_round=8)
# xgb_shadow_reg_univar = ShadowXGBDTree(xgb_model_reg_univar, 1,
# df_Xtrain[features_reg_univar], ytrain_log1p,
# features_reg_univar, target_reg_univar)
# dtrees.rtreeviz_univar(xgb_shadow_reg_univar,
# df_Xtrain[features_reg_univar],ytrain_log1p,
# features_reg_univar, target_reg_univar)
err = """
VisualisationNotYetSupportedError: get_min_samples_leaf() is not implemented yet for XGBoost
"""
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 52 secs