import time
time_start_notebook = time.time()
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !yes | pip install -q watermark
    !yes | pip install -q dtreeviz
    # updating an already-imported module requires restarting the colab runtime
    !yes | pip install -q -U scikit-learn
    !pip uninstall -y -q xgboost
    !yes | pip install -q xgboost==1.2.0  # dtreeviz and joblib need the same xgboost version as local
import numpy as np
import pandas as pd
import xgboost
import xgboost as xgb
import sklearn
from sklearn import metrics as skmetrics
import joblib

# model eval / visualization
import graphviz
import dtreeviz
from dtreeviz.models.shadow_decision_tree import ShadowDecTree
from dtreeviz.models.xgb_decision_tree import ShadowXGBDTree

# params / random state
SEED = 100
RNG = np.random.RandomState(SEED)

# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-11-24

CPython 3.6.9
IPython 5.5.0

compiler   : GCC 8.4.0
system     : Linux
release    : 4.19.112+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit

sklearn   0.23.2
watermark 2.0.2
graphviz  0.10.1
numpy     1.18.5
xgboost   1.2.0
pandas    1.1.4
joblib    0.17.0
def adjustedR2(rsquared, nrows, ncols):
    """Adjusted R-squared for nrows observations and ncols predictors."""
    return 1 - (1 - rsquared) * (nrows - 1) / (nrows - ncols - 1)
def print_regr_eval(ytest, ypreds, ncols):
    rmse = np.sqrt(skmetrics.mean_squared_error(ytest, ypreds))
    r2 = skmetrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, len(ytest), ncols)
    evs = skmetrics.explained_variance_score(ytest, ypreds)
    print(f"""
    RMSE              : {rmse:,.2f}
    Explained Variance: {evs:.6f}
    R-Squared         : {r2:.6f}
    Adjusted R-squared: {ar2:.6f}
    """)
def show_methods(obj, ncols=4):
    """Show the public attributes/methods of an object in an ncols-wide grid."""
    lst = [i for i in dir(obj) if i[0] != '_']
    df = pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
    return df
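For example, applied to a built-in type it lays the public methods out in a small grid:

# quick demo on a familiar object (dict); used on the xgboost model below
show_methods(dict, ncols=4)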
if ENV_COLAB:
    path_git = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    project = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_git + project
else:
    data_path_parent = '../data/'

data_path_Xtrain = data_path_parent + 'processed/Xtrain.csv.zip'
data_path_ytrain = data_path_parent + 'processed/ytrain.csv'
data_path_Xtest  = data_path_parent + 'processed/Xtest.csv.zip'
data_path_ytest  = data_path_parent + 'processed/ytest.csv'
target = 'price'
train_size = 0.8
print(data_path_Xtest)
https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/King_County_Seattle_House_Price_Kaggle/processed/Xtest.csv.zip
df_Xtrain = pd.read_csv(data_path_Xtrain,compression='zip')
ser_ytrain = pd.read_csv(data_path_ytrain,header=None)
ytrain = np.array(ser_ytrain).flatten()
ytrain_log1p = np.log1p(ytrain)
df_Xtest = pd.read_csv(data_path_Xtest,compression='zip')
ser_ytest = pd.read_csv(data_path_ytest,header=None)
ytest = np.array(ser_ytest).flatten()
features = list(df_Xtest.columns)
s = f"""
df_Xtest = {df_Xtest.shape}
ytest = {ytest.shape}
"""
print(s)
display(df_Xtest.head(2))
display(ser_ytest.head(2))
assert df_Xtest.shape[0] == ytest.shape[0]
df_Xtest = (4323, 67)
ytest    = (4323,)
[output: df_Xtest.head(2) — 2 rows × 67 standardized (z-scored) feature columns, including age, bathrooms, bedrooms, condition, floors, grade, lat, long, sqft_living, view, waterfront, yr_built, zipcode, and their squared/log1p/dummy variants]
|   | 0        |
|---|----------|
| 0 | 285000.0 |
| 1 | 239950.0 |
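The target is modeled on the log1p scale because house prices are heavily right-skewed; `np.expm1` is the exact inverse, so predictions map back to prices losslessly. A minimal check:

# log1p/expm1 are exact inverses for non-negative prices,
# so the round-trip recovers the original target
assert np.allclose(np.expm1(np.log1p(ytrain)), ytrain)
print(ytrain[:3])            # raw prices
print(np.log1p(ytrain[:3]))  # scale the model is trained on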
# saved models
if ENV_COLAB:
    !mkdir -p ../models
    !wget https://github.com/bhishanpdl/Datasets/blob/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump?raw=true
    !mv model_xgb_logtarget.dump?raw=true ../models/model_xgb_logtarget.dump
    !ls ../models
    !du -sh ../models/model_xgb_logtarget.dump
path_model_xgb = '../models/model_xgb_logtarget.dump'
model = xgboost.XGBRegressor()
model.load_model(path_model_xgb)
ypreds_log1p = model.predict(df_Xtest)
ypreds = np.expm1(ypreds_log1p)
print('ytest:', ytest[:3])
print('ypreds: ', ypreds[:3])
--2020-11-24 15:48:54--  https://github.com/bhishanpdl/Datasets/blob/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump?raw=true
Resolving github.com (github.com)... 13.114.40.48
Connecting to github.com (github.com)|13.114.40.48|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/bhishanpdl/Datasets/raw/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump [following]
--2020-11-24 15:48:55--  https://github.com/bhishanpdl/Datasets/raw/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump [following]
--2020-11-24 15:48:55--  https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.dump
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 821441 (802K) [application/octet-stream]
Saving to: ‘model_xgb_logtarget.dump?raw=true’

model_xgb_logtarget 100%[===================>] 802.19K  --.-KB/s  in 0.1s

2020-11-24 15:48:56 (8.14 MB/s) - ‘model_xgb_logtarget.dump?raw=true’ saved [821441/821441]

model_xgb_logtarget.dump
804K    ../models/model_xgb_logtarget.dump
ytest: [285000. 239950. 460000.]
ypreds:  [343218.4 204292.31 508420.8 ]
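For reference, a hedged sketch of the save side: the .dump file is assumed to have been written in the training notebook with xgboost's native `save_model`, which is what `XGBRegressor.load_model` above expects (a joblib pickle would not load this way):

# assumed provenance of the .dump file (native xgboost format)
model.save_model('../models/model_xgb_logtarget.dump')

# round-trip check: a fresh regressor loads the same booster
model2 = xgboost.XGBRegressor()
model2.load_model('../models/model_xgb_logtarget.dump')
assert np.allclose(model.predict(df_Xtest), model2.predict(df_Xtest))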
# saved model (joblib alternative)
use_joblib = 0
if use_joblib:
    !mkdir -p ../models
    !wget https://github.com/bhishanpdl/Datasets/blob/master/Projects/King_County_Seattle_House_Price_Kaggle/models/model_xgb_logtarget.joblib?raw=true
    !mv model_xgb_logtarget.joblib?raw=true ../models/model_xgb_logtarget.joblib
    !ls ../models
    !du -sh ../models/model_xgb_logtarget.joblib

    path_model_xgb = '../models/model_xgb_logtarget.joblib'
    # a .joblib file is a pickle of the whole estimator, so it must be loaded
    # with joblib.load; XGBRegressor.load_model on it gives empty ypreds
    model = joblib.load(path_model_xgb)
    ypreds_log1p = model.predict(df_Xtest)
    ypreds = np.expm1(ypreds_log1p)
    print('ytest:', ytest[:3])
    print('ypreds: ', ypreds[:3])
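For completeness, a minimal joblib round-trip sketch. Because joblib pickles the whole sklearn-API wrapper, it is sensitive to the installed xgboost version, which is why the install cell pins xgboost==1.2.0:

# joblib round-trip (pickle of the whole sklearn-API wrapper)
joblib.dump(model, '../models/model_xgb_logtarget.joblib')
model3 = joblib.load('../models/model_xgb_logtarget.joblib')
assert np.allclose(model.predict(df_Xtest), model3.predict(df_Xtest))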
%%time
train_model = 0
if train_model:
    params_xgb = dict(
        n_jobs=-1,
        random_state=SEED,
        objective='reg:squarederror',
        n_estimators=1200,
        max_depth=3,          # default 6
        reg_alpha=1,          # default 0 (sklearn alias of alpha)
        reg_lambda=5,         # default 1 (sklearn alias of lambda)
        subsample=1,          # default 1
        gamma=0,              # default 0, alias of min_split_loss
        min_child_weight=1,   # default 1
        colsample_bytree=1,   # default 1
        learning_rate=0.1,    # default eta = 0.3
        tree_method='auto',   # default 'auto'; use 'gpu_hist' on GPU
    )
    model = xgboost.XGBRegressor(**params_xgb)
    model.fit(df_Xtrain, ytrain_log1p)

    ypreds_log1p = model.predict(df_Xtest)
    ypreds = np.expm1(ypreds_log1p)
    print('ytest:', ytest[:3])
    print('ypreds: ', ypreds[:3])
    print_regr_eval(ytest, ypreds, df_Xtest.shape[1])
CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 9.54 µs
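The cell reports only microseconds because `train_model = 0` skipped the body. As a hedged variant (not part of the original run), `n_estimators=1200` could instead be chosen by early stopping on a held-out split, using the xgboost 1.2 sklearn API:

from sklearn.model_selection import train_test_split

# assumes params_xgb from the cell above is defined
Xtr, Xval, ytr, yval = train_test_split(
    df_Xtrain, ytrain_log1p, train_size=0.8, random_state=SEED)

model_es = xgboost.XGBRegressor(**params_xgb)
model_es.fit(Xtr, ytr,
             eval_set=[(Xval, yval)],
             eval_metric='rmse',
             early_stopping_rounds=50,
             verbose=False)
print('best_iteration:', model_es.best_iteration)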
show_methods(model)
|    | 0                     | 1                        | 2                    | 3                   |
|----|-----------------------|--------------------------|----------------------|---------------------|
| 0  | apply                 | get_booster              | max_delta_step       | random_state        |
| 1  | base_score            | get_num_boosting_rounds  | max_depth            | reg_alpha           |
| 2  | booster               | get_params               | min_child_weight     | reg_lambda          |
| 3  | coef_                 | get_xgb_params           | missing              | save_model          |
| 4  | colsample_bylevel     | gpu_id                   | monotone_constraints | scale_pos_weight    |
| 5  | colsample_bynode      | importance_type          | n_estimators         | score               |
| 6  | colsample_bytree      | interaction_constraints  | n_features_in_       | set_params          |
| 7  | evals_result          | intercept_               | n_jobs               | subsample           |
| 8  | feature_importances_  | kwargs                   | num_parallel_tree    | tree_method         |
| 9  | fit                   | learning_rate            | objective            | validate_parameters |
| 10 | gamma                 | load_model               | predict              | verbosity           |
bst = model.get_booster()
bst.trees_to_dataframe().head(2)
|   | Tree | Node | ID  | Feature | Split     | Yes | No  | Missing | Gain   | Cover   |
|---|------|------|-----|---------|-----------|-----|-----|---------|--------|---------|
| 0 | 0    | 0    | 0-0 | grade   | 0.716166  | 0-1 | 0-2 | 0-1     | 689.75 | 17290.0 |
| 1 | 0    | 1    | 0-1 | lat     | -0.194376 | 0-3 | 0-4 | 0-3     | 105.75 | 13905.0 |
bst.trees_to_dataframe()['Tree'].nunique()
1200
bst.trees_to_dataframe().query("Tree == 0")
|   | Tree | Node | ID  | Feature | Split     | Yes | No  | Missing | Gain       | Cover   |
|---|------|------|-----|---------|-----------|-----|-----|---------|------------|---------|
| 0 | 0    | 0    | 0-0 | grade   | 0.716166  | 0-1 | 0-2 | 0-1     | 689.750000 | 17290.0 |
| 1 | 0    | 1    | 0-1 | lat     | -0.194376 | 0-3 | 0-4 | 0-3     | 105.750000 | 13905.0 |
| 2 | 0    | 2    | 0-2 | Leaf    | NaN       | NaN | NaN | NaN     | 1.313649   | 3385.0  |
| 3 | 0    | 3    | 0-3 | Leaf    | NaN       | NaN | NaN | NaN     | 1.208724   | 5749.0  |
| 4 | 0    | 4    | 0-4 | Leaf    | NaN       | NaN | NaN | NaN     | 1.260315   | 8156.0  |
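The same structure can be cross-checked against the raw text representation; `Booster.get_dump()` returns one string per tree:

# text dump of tree 0; node ids, split features, and leaf values
# should match the dataframe above
print(bst.get_dump()[0])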
xgb.plot_tree(model)
<matplotlib.axes._subplots.AxesSubplot at 0x7f8ca18b1a20>
ShadowXGBDTree(self,
booster:xgboost.core.Booster,
tree_index:int,
x_data,
y_data,
feature_names:List[str]=None,
target_name:str=None,
class_names:(typing.List[str],typing.Mapping[int, str])=None)
import graphviz
from dtreeviz import trees as dtrees
from dtreeviz.models.shadow_decision_tree import ShadowDecTree
from dtreeviz.models.xgb_decision_tree import ShadowXGBDTree
show_methods(dtrees)
|    | 0                     | 1                        | 2                 | 3                      |
|----|-----------------------|--------------------------|-------------------|------------------------|
| 0  | Color                 | adjust_colors            | graphviz          | rgb2hex                |
| 1  | DTreeViz              | class_leaf_viz           | inline_svg_images | rtreeviz_bivar_3D      |
| 2  | List                  | class_split_viz          | myround           | rtreeviz_bivar_heatmap |
| 3  | Mapping               | ctreeviz_bivar           | np                | rtreeviz_univar        |
| 4  | NUM_BINS              | ctreeviz_leaf_samples    | os                | run                    |
| 5  | Number                | ctreeviz_univar          | patches           | scale_SVG              |
| 6  | PLATFORM              | describe_node_sample     | pd                | tempfile               |
| 7  | Path                  | draw_legend              | plt               | tree                   |
| 8  | ShadowDecTree         | draw_piechart            | prediction_path   | view                   |
| 9  | ShadowDecTreeNode     | dtreeviz                 | prop_size         | viz_leaf_criterion     |
| 10 | Tuple                 | explain_prediction_path  | regr_leaf_viz     | viz_leaf_samples       |
| 11 | add_classifier_legend | get_num_bins             | regr_split_viz    | viz_leaf_target        |
# help(ShadowXGBDTree)
bst_shadow = ShadowXGBDTree(bst, tree_index=1,
x_data=df_Xtrain, y_data=ytrain_log1p,
feature_names=features, target_name=target)
dtreeviz(tree_model,
x_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
y_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
feature_names:List[str]=None,
target_name:str=None,
class_names:(typing.Mapping[numbers.Number, str], typing.List[str])=None,
tree_index:int=None,
precision:int=2,
orientation:('TD', 'LR')='TD',
instance_orientation:('TD', 'LR')='LR',
show_root_edge_labels:bool=True,
show_node_labels:bool=False,
show_just_path:bool=False,
fancy:bool=True,
histtype:('bar', 'barstacked', 'strip')='barstacked',
highlight_path:List[int]=[],
X:numpy.ndarray=None,
max_X_features_LR:int=10,
max_X_features_TD:int=20,
label_fontsize:int=12,
ticks_fontsize:int=8,
fontname:str='Arial',
colors:dict=None,
scale=1.0) -> dtreeviz.trees.DTreeViz
# help(dtrees.dtreeviz)
dtrees.dtreeviz(bst_shadow)
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
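The call above renders the SVG inline. To keep the figure, the returned `DTreeViz` object can be saved to disk (the filename here is illustrative):

viz = dtrees.dtreeviz(bst_shadow)
viz.save('xgb_tree1.svg')  # writes the rendered SVG to disk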
viz_leaf_samples(tree_model,
x_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
feature_names:List[str]=None,
tree_index:int=None,
figsize:tuple=(10, 5),
display_type:str='plot',
colors:dict=None,
fontsize:int=14,
fontname:str='Arial',
grid:bool=False,
bins:int=10,
min_samples:int=0,
max_samples:int=None)
dtrees.viz_leaf_samples(bst, df_Xtrain,feature_names=features,tree_index=1)
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
describe_node_sample(tree_model,
node_id:int,
x_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
feature_names:List[str]=None,
tree_index:int=None)
dtrees.describe_node_sample(bst, node_id=1,
x_data=df_Xtrain,
feature_names=features,
tree_index=1).iloc[:,:5]
|       | age          | age_after_renovation | age_after_renovation_cat | age_after_renovation_sq | age_cat      |
|-------|--------------|----------------------|--------------------------|-------------------------|--------------|
| count | 11685.000000 | 11685.000000         | 11685.000000             | 11685.000000            | 11685.000000 |
| mean  | 0.181673     | 0.198875             | 0.195734                 | 0.167634                | 0.178939     |
| std   | 0.984658     | 0.995964             | 0.998632                 | 1.036658                | 0.987304     |
| min   | -1.507863    | -1.454509            | -1.265291                | -0.848104               | -1.320662    |
| 25%   | -0.593048    | -0.591866            | -0.463220                | -0.655279               | -0.533437    |
| 50%   | 0.152358     | 0.201766             | 0.338850                 | -0.108606               | 0.253788     |
| 75%   | 0.796117     | 0.857374             | 0.739886                 | 0.610136                | 0.647401     |
| max   | 2.422456     | 2.548154             | 2.344027                 | 3.579173                | 2.221851     |
explain_prediction_path(tree_model,
x:numpy.ndarray,
x_data=None,
y_data=None,
explanation_type:('plain_english', 'sklearn_default')='plain_english',
feature_names:List[str]=None,
target_name:str=None,
class_names:(typing.Mapping[numbers.Number, str],
typing.List[str])=None,
tree_index:int=None)
row = df_Xtrain.iloc[10]
row.head()
age                        -0.694694
age_after_renovation       -0.626372
age_after_renovation_cat   -0.463220
age_after_renovation_sq    -0.671013
age_cat                    -0.533437
Name: 10, dtype: float64
s = dtrees.explain_prediction_path(bst_shadow, row,
explanation_type="plain_english",tree_index=9)
print(s)
lat < -0.21
log1p_sqft_living < 0.48
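The path explanation can be tied back to actual predictions: `Booster.predict` in xgboost 1.2 accepts `ntree_limit`, so the contribution of the first few trees is easy to inspect (a sketch, on the log1p scale):

# one-row DMatrix for the same instance explained above
dm_row = xgb.DMatrix(row.to_frame().T)

full_pred = bst.predict(dm_row)                  # all 1200 trees
first_10  = bst.predict(dm_row, ntree_limit=10)  # trees 0..9 only
print(full_pred, first_10)                       # apply np.expm1 for prices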
viz_leaf_target(tree_model,
x_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
y_data:(<class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>)=None,
feature_names:List[str]=None,
target_name:str=None,
tree_index:int=None,
show_leaf_labels:bool=True,
colors:dict=None,
markersize:int=50,
label_fontsize:int=14,
fontname:str='Arial',
precision:int=1,
figsize:tuple=None,
grid:bool=False,
prediction_line_width:int=2)
dtrees.viz_leaf_target(bst, df_Xtrain, ytrain_log1p,
feature_names=features,
target_name=target,
tree_index=1)
dtrees.viz_leaf_target(bst_shadow)
# features_reg_univar = ["age"]
# target_reg_univar = "price"
# dtrain_reg_univar = xgb.DMatrix(df_Xtrain[features_reg_univar], ytrain_log1p)
# params_reg_univar = {"max_depth":3,
# "eta":0.05,
# "objective":"reg:squarederror",
# "subsample":1}
# xgb_model_reg_univar = xgb.train(params=params_reg_univar,
# dtrain=dtrain_reg_univar,
# num_boost_round=8)
# xgb_shadow_reg_univar = ShadowXGBDTree(xgb_model_reg_univar, 1,
# df_Xtrain[features_reg_univar], ytrain_log1p,
# features_reg_univar, target_reg_univar)
# dtrees.rtreeviz_univar(xgb_shadow_reg_univar,
# df_Xtrain[features_reg_univar],ytrain_log1p,
# features_reg_univar, target_reg_univar)
err = """
VisualisationNotYetSupportedError: get_min_samples_leaf() is not implemented yet for XGBoost
"""
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 52 secs