import time
# Wall-clock start of the run; the final cell subtracts this from time.time()
# to report how long the whole notebook took.
time_start_notebook = time.time()


%%capture
# Environment setup. NOTE: %%capture hides everything this cell prints,
# including the pip logs and the 'Environment' message below.
import os
import sys
# True when the google.colab module has already been loaded (running on Colab)
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## install modules
    !pip install watermark
    !pip install catboost
    !pip install shap eli5

    # if we update existing module, we need to restart colab
    !pip install -U scikit-learn

    ## print
    print('Environment: Google Colaboratory.')
# NOTE(review): TREE_METHOD is not referenced again in the visible part of the notebook
TREE_METHOD = 'gpu_hist' if ENV_COLAB else 'auto'


import numpy as np
import pandas as pd

# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# mixed
import os
import time
from pprint import pprint

# random state
SEED = 0
RNG = np.random.RandomState(SEED)

# settings
pd.set_option('display.max_columns', 200)

# sklearn
import sklearn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance

# boosting
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost
import lightgbm
import catboost
import catboost as catb

# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

Bhishan Poudel 2020-11-05 

CPython 3.6.9
IPython 5.5.0

compiler   : GCC 8.4.0
system     : Linux
release    : 4.19.112+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit

pandas     1.1.4
sklearn    0.23.2
xgboost    0.90
catboost   0.24.2
seaborn    0.11.0
lightgbm   2.2.3
watermark  2.0.2
numpy      1.18.5
matplotlib 3.2.2


def show_methods(obj, ncols=7, start=None, inside=None):
    """Tabulate the public attribute names of an object.

    Prints the type of ``obj`` and returns its ``dir()`` entries that do not
    start with an underscore, laid out in ``ncols`` columns.

    Parameters
    ----------
    obj : object
        Anything ``dir()`` accepts (module, class, instance, ...).
    ncols : int
        Number of columns of the returned table.
    start : str, list or tuple, optional
        Keep only names that start with this prefix (or with any of the
        given prefixes).
    inside : str, list or tuple, optional
        Keep only names that contain this substring (or any of the given
        substrings).

    Returns
    -------
    pandas.DataFrame
        The selected names, each listed once, split column-wise; cells left
        over in the shorter columns are filled with ''.

    Example
    -------
    show_methods(list)
    """

    print(f'Object Type: {type(obj)}\n')

    # names that are always left out of the listing
    skipped = {'os', 'np', 'pd', 'sys', 'time', 'psycopg2'}
    names = [name for name in dir(obj)
             if not name.startswith('_') and name not in skipped]

    if isinstance(start, str):
        start = (start,)
    if isinstance(start, (list, tuple)):
        # str.startswith takes a tuple of prefixes, so a name matching
        # several prefixes is still listed only once
        prefixes = tuple(start)
        names = [name for name in names if name.startswith(prefixes)]

    if isinstance(inside, str):
        inside = (inside,)
    if isinstance(inside, (list, tuple)):
        # any() keeps a name once even when several substrings match it
        names = [name for name in names
                 if any(part in name for part in inside)]

    return pd.DataFrame(np.array_split(names, ncols)).T.fillna('')


def adjustedR2(rsquared,nrows,ncols):
    """Return R-squared penalised for the number of columns.

    Computes ``rsquared - (ncols-1)/(nrows-ncols) * (1-rsquared)``.

    Parameters
    ----------
    rsquared : float
        Plain R-squared of the fit.
    nrows : int
        Number of observations.
    ncols : int
        Column count used in the penalty term.
        NOTE(review): this form equals the textbook adjusted R-squared when
        ncols counts the intercept as well as the predictors - confirm what
        callers pass.
    """
    penalty_ratio = (ncols - 1) / (nrows - ncols)
    unexplained = 1 - rsquared
    return rsquared - penalty_ratio * unexplained


def print_regr_eval(ytest,ypreds,ncols,log_back=False):
    """Print regression metrics for predictions against the true values.

    Parameters
    ----------
    ytest : array-like
        True target values.
    ypreds : array-like
        Predicted values; with ``log_back=True`` they are passed through
        ``np.expm1`` first (ytest is left untouched).
    ncols : int
        Column count forwarded to ``adjustedR2``.
    log_back : bool
        Set when the model was trained on ``log1p(target)``.

    Prints the first three true and predicted values, then explained
    variance, R-squared, RMSE and adjusted R-squared. Returns None.
    """
    # predictions made on a log1p-transformed target are mapped back with expm1
    preds = np.expm1(ypreds) if log_back else ypreds

    mse = metrics.mean_squared_error(ytest, preds)
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(ytest, preds)
    ar2 = adjustedR2(r2, len(ytest), ncols)
    evs = metrics.explained_variance_score(ytest, preds)

    print('ytest :', ytest[:3])
    print('ypreds:', preds[:3])

    print(f"""           
Explained Variance: {evs:.6f}
         R-Squared: {r2:,.6f}

             RMSE : {rmse:,.2f}
Adjusted R-squared: {ar2:,.6f}

""")


def plot_feature_imp_catboost(model_cat,n=10):
    """Draw a horizontal bar chart of a model's largest feature importances.

    Parameters
    ----------
    model_cat: fitted catboost model
        Read through its ``feature_names_`` and ``feature_importances_``.
    n: int
        How many of the highest-importance features to plot.

    Shows the figure with ``plt.show()``; returns None.
    """
    importances = pd.DataFrame({'Feature': model_cat.feature_names_,
                                'Importance': model_cat.feature_importances_})
    top = importances.nlargest(n, 'Importance').set_index('Feature')

    ax = top.plot.barh(figsize=(12,8))
    plt.grid(True)
    plt.title('Feature Importance',fontsize=14)
    ax.get_legend().remove()

    # write each bar's value at the end of the bar
    for bar in ax.patches:
        width = bar.get_width()
        ax.text(width, bar.get_y(), f'{width:.2f}', fontsize=15, color='indigo')

    # flip the axis so the first (largest) row is drawn at the top
    ax.invert_yaxis()
    plt.show()


# Data location: GitHub raw files on Colab, the local ../data/ folder otherwise.
if ENV_COLAB:
    path_raw = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    proj = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_raw + proj
else:
    data_path_parent = '../data/'

# both environments use the same layout below the parent folder
data_path_train = data_path_parent + 'raw/train.csv'
data_path_test = data_path_parent + 'raw/test.csv'

# modelling parameters
target = 'price'
train_size = 0.8

print(data_path_train)

https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/King_County_Seattle_House_Price_Kaggle/raw/train.csv


# Read the raw train and test tables from the configured paths.
df_train_raw = pd.read_csv(data_path_train)
df_test_raw = pd.read_csv(data_path_test)
print(df_train_raw.shape)
print(df_train_raw.columns)

# Preview the first and last two rows. DataFrame.append was removed in
# pandas 2.0; pd.concat builds the same frame on old and new versions.
display(pd.concat([df_train_raw.head(2), df_train_raw.tail(2)]))

(17290, 21)
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')


def clean_data(df,log=True,sq=True,dummy=True):
    """Return a feature-engineered copy of a raw house-sales frame.

    The input frame is copied first and is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read below: id, date, yr_built,
        yr_renovated, waterfront, view, condition, grade, zipcode,
        bedrooms, bathrooms, floors, sqft_living, sqft_lot, sqft_above,
        sqft_basement, sqft_living15 and sqft_lot15.
    log : bool
        Add ``log1p_<col>`` columns for the sqft_* columns.
    sq : bool
        Add ``<col>_sq`` columns for the columns listed in ``cols_sq``.
    dummy : bool
        Append one-hot columns for the label columns in ``cols_dummy``.

    Returns
    -------
    pandas.DataFrame
        The engineered frame without the ``id`` and ``date`` columns.
    """
    df = df.copy()

    # Date time features
    df['date'] = pd.to_datetime(df['date'])
    df['yr_sales'] = df['date'].dt.year
    df['age'] = df['yr_sales'] - df['yr_built']
    # rows with yr_renovated == 0 use the build year instead
    df['yr_renovated2'] = np.where(df['yr_renovated'].eq(0),
                                   df['yr_built'], df['yr_renovated'])
    df['age_after_renovation'] = df['yr_sales'] - df['yr_renovated2']

    # Categorical Features (as strings so get_dummies treats them as labels)
    cols_str = ['waterfront', 'view', 'condition', 'grade', 'zipcode']
    for c in cols_str:
        df[c] = df[c].astype(str)

    # Boolean data types (vectorized instead of a row-wise apply)
    df['basement_bool'] = (df['sqft_basement'] > 0).astype(np.int64)
    df['renovation_bool'] = (df['yr_renovated'] > 0).astype(np.int64)

    # Numerical features binning
    # NOTE(review): with an integer bin count pd.cut derives the bin edges
    # from the frame it is given, so separate calls for train and test do
    # not share edges - confirm this is intended.
    df['age_cat'] = pd.cut(df['age'], 10, labels=range(10)).astype(str)
    df['age_after_renovation_cat'] = pd.cut(df['age_after_renovation'],
                                            10, labels=range(10))

    # Create dummy variables from the label columns
    cols_dummy = ['waterfront', 'view', 'condition', 'grade',
                  'age_cat', 'age_after_renovation_cat']
    if dummy:
        # dtype pinned to uint8 (the default before pandas 2.0) so the
        # indicator columns are numeric on every pandas version
        df_dummy = pd.get_dummies(df[cols_dummy], drop_first=False,
                                  dtype=np.uint8)
        df = pd.concat([df, df_dummy], axis=1)

    # after creating dummy, make the label columns numbers again.
    # int32 instead of int8: five-digit zipcodes do not fit in int8.
    cols_obj_cat = cols_str + ['age_cat', 'age_after_renovation_cat']
    for c in cols_obj_cat:
        df[c] = df[c].astype(np.int32)

    # Log transformation of large numerical values
    cols_log = ['sqft_living', 'sqft_lot', 'sqft_above',
                'sqft_basement', 'sqft_living15', 'sqft_lot15']
    if log:
        for col in cols_log:
            df['log1p_' + col] = np.log1p(df[col])

    # squared columns
    cols_sq = [
        # cats
        'bedrooms', 'bathrooms', 'floors', 'waterfront', 'view',

        # created nums
        'age', 'age_after_renovation']

    if sq:
        for col in cols_sq:
            df[col + '_sq'] = df[col]**2

    # Drop unwanted columns
    cols_drop = ['id', 'date']
    df = df.drop(cols_drop, axis=1)

    return df


df_train = clean_data(df_train_raw)
df_test = clean_data(df_test_raw)

print(df_train.shape)
print(df_train.columns)

(17290, 84)
Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'yr_sales', 'age', 'yr_renovated2',
       'age_after_renovation', 'basement_bool', 'renovation_bool', 'age_cat',
       'age_after_renovation_cat', 'waterfront_0', 'waterfront_1', 'view_0',
       'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2',
       'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10',
       'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5',
       'grade_6', 'grade_7', 'grade_8', 'grade_9', 'age_cat_0', 'age_cat_1',
       'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6',
       'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0',
       'age_after_renovation_cat_1', 'age_after_renovation_cat_2',
       'age_after_renovation_cat_3', 'age_after_renovation_cat_4',
       'age_after_renovation_cat_5', 'age_after_renovation_cat_6',
       'age_after_renovation_cat_7', 'age_after_renovation_cat_8',
       'age_after_renovation_cat_9', 'log1p_sqft_living', 'log1p_sqft_lot',
       'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15',
       'log1p_sqft_lot15', 'bedrooms_sq', 'bathrooms_sq', 'floors_sq',
       'waterfront_sq', 'view_sq', 'age_sq', 'age_after_renovation_sq'],
      dtype='object')


# df_train.dtypes.to_numpy()


# make sure no data leakage
df_train.filter(regex='price').columns

Index(['price'], dtype='object')


# make sure no nans
df_train.isna().sum().sum(), df_test.isna().sum().sum()

(0, 0)


# Train on log1p(price); predictions are mapped back with expm1 at evaluation.
df_train[target] = np.log1p(df_train[target])


# choose features to train, we can change it later
features = sorted(df_train.columns.drop(target))
# print(np.array(features))


# keep only the columns that the test frame also has (every name already
# comes from df_train, so that side needs no check)
features = [i for i in features if i in df_test.columns]
# print(np.array(sorted(features)))


# feature frames and target series, before any validation split
df_Xtrain_orig, ser_ytrain_orig = df_train[features], df_train[target]
df_Xtest, ser_ytest = df_test[features], df_test[target]

# flat numpy copies of the targets
ytrain_orig = np.asarray(ser_ytrain_orig).flatten()
ytest = np.asarray(ser_ytest).flatten()


# Hold out part of the training rows for validation. The fraction is the
# `train_size` parameter defined with the other settings, not a repeated literal.
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = model_selection.train_test_split(
    df_Xtrain_orig, ser_ytrain_orig,
    train_size=train_size, random_state=SEED)

# flat numpy targets for the model fit calls
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()


# Report the shape of every frame/series produced by the splits.
print(f"df_train   : {df_train.shape}\n")

print(f"df_Xtrain  : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")

print(f"df_Xvalid  : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")

print(f"df_test    : {df_test.shape}")
# ser_ytest is built from df_test[target] and used for evaluation, so report
# its real shape instead of claiming it does not exist
print(f"ser_ytest  : {ser_ytest.shape}")

df_Xtrain.head(2)

df_train   : (17290, 84)

df_Xtrain  : (13832, 81)
ser_ytrain : (13832,)

df_Xvalid  : (3458, 81)
ser_yvalid : (3458,)

df_test    : (4323, 82)
ser_ytest  : This does not exist.


# keep only the feature names present in the train split, the validation
# split and the test frame
features = [i for i in features
            if all(i in frame.columns
                   for frame in (df_Xtrain, df_Xvalid, df_test))]
print(np.array(sorted(features)))

['age' 'age_after_renovation' 'age_after_renovation_cat'
 'age_after_renovation_cat_0' 'age_after_renovation_cat_1'
 'age_after_renovation_cat_2' 'age_after_renovation_cat_3'
 'age_after_renovation_cat_4' 'age_after_renovation_cat_5'
 'age_after_renovation_cat_6' 'age_after_renovation_cat_7'
 'age_after_renovation_cat_8' 'age_after_renovation_cat_9'
 'age_after_renovation_sq' 'age_cat' 'age_cat_0' 'age_cat_1' 'age_cat_2'
 'age_cat_3' 'age_cat_4' 'age_cat_5' 'age_cat_6' 'age_cat_7' 'age_cat_8'
 'age_cat_9' 'age_sq' 'basement_bool' 'bathrooms' 'bathrooms_sq'
 'bedrooms' 'bedrooms_sq' 'condition' 'condition_1' 'condition_2'
 'condition_3' 'condition_4' 'condition_5' 'floors' 'floors_sq' 'grade'
 'grade_10' 'grade_11' 'grade_12' 'grade_13' 'grade_4' 'grade_5' 'grade_6'
 'grade_7' 'grade_8' 'grade_9' 'lat' 'log1p_sqft_above'
 'log1p_sqft_basement' 'log1p_sqft_living' 'log1p_sqft_living15'
 'log1p_sqft_lot' 'log1p_sqft_lot15' 'long' 'renovation_bool' 'sqft_above'
 'sqft_basement' 'sqft_living' 'sqft_living15' 'sqft_lot' 'sqft_lot15'
 'view' 'view_0' 'view_1' 'view_2' 'view_3' 'view_4' 'view_sq'
 'waterfront' 'waterfront_0' 'waterfront_1' 'waterfront_sq' 'yr_built'
 'yr_renovated' 'yr_renovated2' 'yr_sales' 'zipcode']


# Optional feature scaling: None (off), 'standard' or 'minmax'.
scaling = None
_SCALERS = {'standard': preprocessing.StandardScaler,
            'minmax': preprocessing.MinMaxScaler}

if scaling in _SCALERS:
    scaler = _SCALERS[scaling]()
    # fit on the training split only, then apply the same transform to all
    # three frames; the validation frame was previously left unscaled
    scaler.fit(df_Xtrain)
    # keep column names and row index so the frames stay aligned with their
    # target series
    df_Xvalid = pd.DataFrame(scaler.transform(df_Xvalid),
                             columns=df_Xvalid.columns, index=df_Xvalid.index)
    df_Xtest = pd.DataFrame(scaler.transform(df_Xtest),
                            columns=df_Xtest.columns, index=df_Xtest.index)
    df_Xtrain = pd.DataFrame(scaler.transform(df_Xtrain),
                             columns=df_Xtrain.columns, index=df_Xtrain.index)

df_Xtrain.head(2)


show_methods(catboost,4)

Object Type: <class 'module'>


%%time

# Baseline: CatBoostRegressor with default settings (seed 0), fit on every
# training row (df_Xtrain_orig) and scored on the test frame.
model = catb.CatBoostRegressor(verbose=1000,random_state=0)

model.fit(df_Xtrain_orig, ytrain_orig)
ypreds = model.predict(df_Xtest)

# the target was log1p-transformed before fitting, so predictions are mapped back
print_regr_eval(ytest,ypreds,df_Xtest.shape[1],log_back=True)

Learning rate set to 0.064823
0:	learn: 0.5028378	total: 59.7ms	remaining: 59.6s
999:	learn: 0.1240938	total: 9.7s	remaining: 0us
ytest : [285000. 239950. 460000.]
ypreds: [320203.43610525 221111.94861914 522547.92538233]
           
Explained Variance: 0.914482
         R-Squared: 0.913675

             RMSE : 107,861.00
Adjusted R-squared: 0.912047


CPU times: user 17.8 s, sys: 1.16 s, total: 19 s
Wall time: 9.94 s


%%time
# Same regressor with up to 10,000 iterations, fit on the training split only;
# the validation split is passed as eval_set for early stopping.
model = catb.CatBoostRegressor(random_state=0, # seed = 0 gives better
                               iterations = 10_000
)

model.fit(df_Xtrain, ytrain,
          eval_set=(df_Xvalid, yvalid),
          early_stopping_rounds=50,
          use_best_model=True,
          cat_features=None,
          verbose=2000,
          plot=False
          )

# score on the test frame, mapping the log1p-scale predictions back
ypreds = model.predict(df_Xtest)
print_regr_eval(ytest,ypreds,df_Xtest.shape[1],log_back=True)

Learning rate set to 0.018035
0:	learn: 0.5194880	test: 0.5195496	best: 0.5195496 (0)	total: 11.7ms	remaining: 1m 57s
2000:	learn: 0.1364108	test: 0.1565696	best: 0.1565696 (2000)	total: 18s	remaining: 1m 12s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1539303674
bestIteration = 3632

Shrink model to first 3633 iterations.
ytest : [285000. 239950. 460000.]
ypreds: [317424.47677942 224472.26820779 530673.1782456 ]
           
Explained Variance: 0.909876
         R-Squared: 0.909021

             RMSE : 110,730.20
Adjusted R-squared: 0.907305


CPU times: user 1min, sys: 3.6 s, total: 1min 4s
Wall time: 33.6 s


# NOTE: the two runs were not trained on the same rows - the baseline was fit
# on df_Xtrain_orig (all training rows), the early-stopping run on df_Xtrain
# (the training split only) - so the comparison below mixes both effects.
note = """
WARNING: Here, using early stopping and validation set gave worse result.

"""


%%time

# getting data (note: getting data is very fast, only modelling is slow)
df_train = clean_data(df_train_raw,dummy=False)
df_test = clean_data(df_test_raw,dummy=False)

df_train[target] = np.log1p(df_train[target])
features = list(sorted(df_train.columns.drop(target)))
features = [i for i in features if i in df_test.columns if i in df_train.columns]

df_Xtrain_orig  = df_train[features]
ser_ytrain_orig = df_train[target]

df_Xtest  = df_test[features]
ser_ytest = df_test[target]
ytrain_orig = np.array(ser_ytrain_orig).flatten()
ytest  = np.array(ser_ytest).flatten()

# modelling
cat_features = ['waterfront', 'view', 'condition', 'grade',
                    # 'age_cat', 'age_after_renovation_cat'
                ]
cat_idx = [df_Xtrain_orig.columns.to_list().index(i) for i in cat_features]
model = catboost.CatBoostRegressor(cat_features=cat_idx, 
                                   one_hot_max_size=100,
                                   iterations=2000,
                                   random_state=0,
                                   )
model.fit(df_Xtrain_orig, ytrain_orig,
          verbose=1_000,
          cat_features=cat_idx
          )

ypreds = model.predict(df_Xtest)
print_regr_eval(ytest,ypreds,df_Xtest.shape[1],log_back=True)

Learning rate set to 0.038278
0:	learn: 0.5122805	total: 10ms	remaining: 20s
1000:	learn: 0.1377115	total: 8.85s	remaining: 8.83s
1999:	learn: 0.1207688	total: 17.7s	remaining: 0us
ytest : [285000. 239950. 460000.]
ypreds: [322311.16735949 221504.3821753  506996.99758012]
           
Explained Variance: 0.911634
         R-Squared: 0.910792

             RMSE : 109,647.27
Adjusted R-squared: 0.910000


CPU times: user 33.7 s, sys: 1.75 s, total: 35.4 s
Wall time: 18.5 s


cat_idx

[32, 30, 11, 14]


show_methods(model,4)

Object Type: <class 'catboost.core.CatBoostRegressor'>


show_methods(catboost,4)

Object Type: <class 'module'>


# help(model.calc_feature_statistics)


# float feature
feature_name = 'sqft_living'
dict_stats = model.calc_feature_statistics(df_Xtrain_orig, ytrain_orig,
                                           feature_name, plot=True)


# discrete numeric feature: 'bedrooms' is not listed in cat_features, so it is
# not one of the columns declared categorical to the model
feature_name = 'bedrooms'
# NOTE(review): cat_vals is not used again in the visible notebook
cat_vals = df_Xtrain_orig[feature_name].unique().tolist()

dict_stats = model.calc_feature_statistics(df_Xtrain_orig, ytrain_orig, feature_name)


# one line per statistic: its name and how many entries it holds
for key, entries in dict_stats.items():
    print(key, len(entries))

borders 11
binarized_feature 17290
mean_target 12
mean_weighted_target 0
mean_prediction 12
objects_per_bin 12
predictions_on_varying_feature 12


# feature importance
# `features` is the column order of df_Xtrain_orig, the frame the current
# model was fit on, so names and importances line up row by row
df_imp = pd.DataFrame({'Feature': features,
                       'Importance': model.feature_importances_
                       }) 

# full table, most important first, shaded by value
df_imp.sort_values('Importance',ascending=False).style.background_gradient()


# bar chart of the ten largest importances
plot_feature_imp_catboost(model,n=10)


import shap


%%time

# Refit with default settings on the same frame (no cat_features this time);
# this is the model that the SHAP values below describe.
model = catboost.CatBoostRegressor(verbose=500,random_state=0)
model.fit(df_Xtrain_orig, ytrain_orig)

explainer = shap.TreeExplainer(model)
# SHAP values for the test frame (same shape as df_Xtest, per the shape check below)
shap_values = explainer.shap_values(df_Xtest)

Learning rate set to 0.064823
0:	learn: 0.5023784	total: 10.4ms	remaining: 10.4s
500:	learn: 0.1408636	total: 4.53s	remaining: 4.51s
999:	learn: 0.1243705	total: 9.07s	remaining: 0us
CPU times: user 21.1 s, sys: 968 ms, total: 22 s
Wall time: 11.4 s


df_Xtrain_orig.shape, df_Xtest.shape, shap_values.shape

((17290, 39), (4323, 39), (4323, 39))


# load JS visualization code to notebook
shap.initjs()

# Look only first row of test data
# use matplotlib=True to avoid Javascript
shap.force_plot(explainer.expected_value,
                shap_values[0,:],
                df_Xtest.iloc[0,:],
                matplotlib=False,
                text_rotation=90)


# The prediction for the first row is 12.69 (on the log1p(price) scale), which is the combined effect of all columns.
#
# Red features push the prediction higher; blue features push it lower.
# Here, the first row has sqft_living = 2,437, which is a good value and makes the prediction higher,
# but lat = 47.35 makes the prediction lower.


shap.summary_plot(shap_values, df_Xtest)


shap.summary_plot(shap_values, df_Xtest, plot_type='bar')


shap.dependence_plot("sqft_living", shap_values, df_Xtest)


shap.dependence_plot("view", shap_values, df_Xtest)


shap.dependence_plot(ind='sqft_living', interaction_index='sqft_living15',
                     shap_values=shap_values, 
                     features=df_Xtest,  
                     display_features=df_Xtest)


# Total run time, split into hours, minutes and seconds.
time_taken = time.time() - time_start_notebook
h, m = divmod(time_taken, 60 * 60)  # m holds the seconds left over after whole hours
mins, secs = divmod(m, 60)
print(f'Time taken to run whole notebook: {h:.0f} hr {mins:.0f} min {secs:.0f} secs')

Time taken to run whole notebook: 0 hr 8 min 32 secs

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	view	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
0	2561340020	20140804T000000	325000.0	3	1.75	1780	11096	1.0	0	3	7	1210	570	1979	0	98074	47.6170	-122.051	1780	10640
1	8598200070	20141208T000000	278000.0	2	2.50	1420	2229	2.0	0	3	7	1420	0	2004	0	98059	47.4871	-122.165	1500	2230
17288	7174800760	20140725T000000	667000.0	5	2.00	1900	5470	1.0	0	3	7	1180	720	1930	1965	98105	47.6666	-122.303	1300	3250
17289	9521100280	20140612T000000	480000.0	3	2.50	1250	1103	3.0	2	3	8	1250	0	2005	0	98103	47.6619	-122.352	1250	1188

	0	1	2	3
0	CatBoost	EFstrType	Pool	to_regressor
1	CatBoostClassifier	FeaturesData	core	train
2	CatBoostError	MetricVisualizer	cv	version
3	CatBoostRegressor	MultiRegressionCustomMetric	sum_models	widget
4	CatboostError	MultiRegressionCustomObjective	to_classifier

	0	1	2	3
0	best_iteration_	get_best_iteration	get_scale_and_bias	predict
1	best_score_	get_best_score	get_test_eval	random_seed_
2	calc_feature_statistics	get_borders	get_test_evals	randomized_search
3	calc_leaf_indexes	get_cat_feature_indices	get_text_feature_indices	save_borders
4	classes_	get_embedding_feature_indices	get_tree_leaf_counts	save_model
5	compare	get_evals_result	grid_search	score
6	copy	get_feature_importance	is_fitted	set_feature_names
7	create_metric_calcer	get_leaf_values	iterate_leaf_indexes	set_leaf_values
8	drop_unused_features	get_leaf_weights	learning_rate_	set_params
9	eval_metrics	get_metadata	load_model	set_scale_and_bias
10	evals_result_	get_n_features_in	n_features_in_	shrink
11	feature_importances_	get_object_importance	plot_partial_dependence	staged_predict
12	feature_names_	get_param	plot_predictions	tree_count_
13	fit	get_params	plot_tree	virtual_ensembles_predict
14	get_all_params

	0	1	2	3
0	CatBoost	EFstrType	Pool	to_regressor
1	CatBoostClassifier	FeaturesData	core	train
2	CatBoostError	MetricVisualizer	cv	version
3	CatBoostRegressor	MultiRegressionCustomMetric	sum_models	widget
4	CatboostError	MultiRegressionCustomObjective	to_classifier

Table of Contents

Data Description¶

Imports¶

Important Scripts¶

Parameters¶

Load the data¶

Data Processing¶

Log transform Target¶

Train-target split¶

Train-Validation Split¶

Scaling¶

Modelling: catboost¶

Baseline model¶

Catboost with validation set¶

catboost built-in categorical features¶

Feature Statistics¶

Feature Importance¶

Metric Visualizer (only works in notebook, not jupyterlab or colab)¶

Model Evaluation Using shap¶

Time Taken¶

	age	age_after_renovation	age_after_renovation_cat	age_after_renovation_cat_0	age_after_renovation_cat_1	age_after_renovation_cat_2	age_after_renovation_cat_3	age_after_renovation_cat_4	age_after_renovation_cat_5	age_after_renovation_cat_6	age_after_renovation_cat_7	age_after_renovation_cat_8	age_after_renovation_cat_9	age_after_renovation_sq	age_cat	age_cat_0	age_cat_1	age_cat_2	age_cat_3	age_cat_4	age_cat_5	age_cat_6	age_cat_7	age_cat_8	age_cat_9	age_sq	basement_bool	bathrooms	bathrooms_sq	bedrooms	bedrooms_sq	condition	condition_1	condition_2	condition_3	condition_4	condition_5	floors	floors_sq	grade	grade_10	grade_11	grade_12	grade_13	grade_4	grade_5	grade_6	grade_7	grade_8	grade_9	lat	log1p_sqft_above	log1p_sqft_basement	log1p_sqft_living	log1p_sqft_living15	log1p_sqft_lot	log1p_sqft_lot15	long	renovation_bool	sqft_above	sqft_basement	sqft_living	sqft_living15	sqft_lot	sqft_lot15	view	view_0	view_1	view_2	view_3	view_4	view_sq	waterfront	waterfront_0	waterfront_1	waterfront_sq	yr_built	yr_renovated	yr_renovated2	yr_sales	zipcode
13832	30	30	2	0	0	1	0	0	0	0	0	0	0	900	2	0	0	1	0	0	0	0	0	0	0	900	0	3.0	9.00	5	25	4	0	0	0	1	0	1.5	2.25	9	0	0	0	0	0	0	0	0	0	1	47.3488	8.243019	0.000000	8.243019	7.855932	10.652944	10.485033	-122.095	0	3800	0	3800	2580	42316	35775	0	1	0	0	0	0	0	0	1	0	0	1984	0	1984	2014	-6
4184	72	19	1	0	1	0	0	0	0	0	0	0	0	361	6	0	0	0	0	0	0	1	0	0	0	5184	1	2.5	6.25	3	9	5	0	0	0	0	1	2.0	4.00	9	0	0	0	0	0	0	0	0	0	1	47.5855	7.601402	6.508769	7.890208	7.749753	8.517393	8.517393	-122.292	1	2000	670	2670	2320	5000	5000	3	0	0	0	1	0	9	0	1	0	0	1942	1995	1995	2014	96

	Feature	Importance
15	lat	44.816561
18	log1p_sqft_living	11.275696
26	sqft_living	7.955914
22	long	6.925514
14	grade	4.662216
19	log1p_sqft_living15	2.803159
38	zipcode	2.581891
27	sqft_living15	2.039709
16	log1p_sqft_above	1.849234
30	view	1.425478
31	view_sq	1.424221
11	condition	1.227043
24	sqft_above	0.966273
20	log1p_sqft_lot	0.957940
21	log1p_sqft_lot15	0.848439
28	sqft_lot	0.814951
36	yr_renovated2	0.795638
1	age_after_renovation	0.605672
29	sqft_lot15	0.564038
32	waterfront	0.541281
8	bathrooms_sq	0.534811
34	yr_built	0.515996
37	yr_sales	0.503194
7	bathrooms	0.490852
3	age_after_renovation_sq	0.444812
33	waterfront_sq	0.415367
5	age_sq	0.389578
0	age	0.297204
17	log1p_sqft_basement	0.266718
9	bedrooms	0.210771
25	sqft_basement	0.186138
35	yr_renovated	0.166124
4	age_cat	0.133029
10	bedrooms_sq	0.104462
2	age_after_renovation_cat	0.095204
12	floors	0.090801
13	floors_sq	0.062421
23	renovation_bool	0.008142
6	basement_bool	0.003509