This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
Task: Try to estimate the price based on given features.
import time
time_start_notebook = time.time()
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
## install modules
!pip install watermark
!pip install catboost
!pip install shap eli5
# if we update existing module, we need to restart colab
!pip install -U scikit-learn
## print
print('Environment: Google Colaboratory.')
TREE_METHOD = 'gpu_hist' if ENV_COLAB else 'auto'
import numpy as np
import pandas as pd
# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# mixed
import os
import time
from pprint import pprint
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# settings
pd.set_option('display.max_columns', 200)
# sklearn
import sklearn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
# boosting
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost
import lightgbm
import catboost
import catboost as catb
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-11-05 CPython 3.6.9 IPython 5.5.0 compiler : GCC 8.4.0 system : Linux release : 4.19.112+ machine : x86_64 processor : x86_64 CPU cores : 2 interpreter: 64bit pandas 1.1.4 sklearn 0.23.2 xgboost 0.90 catboost 0.24.2 seaborn 0.11.0 lightgbm 2.2.3 watermark 2.0.2 numpy 1.18.5 matplotlib 3.2.2
def show_methods(obj, ncols=7, start=None, inside=None):
    """Tabulate the public attribute names of an object.

    Names that begin with an underscore are dropped, and so are a few
    common module aliases (os, np, pd, sys, time, psycopg2).

    Parameters
    ----------
    obj: any object accepted by ``dir``
    ncols: number of columns in the returned table
    start: str, tuple or list, optional
        keep only the names starting with this prefix (or any of these)
    inside: str, tuple or list, optional
        keep only the names containing this substring (or any of these)

    Returns
    -------
    pandas.DataFrame with the names laid out column by column;
    unused cells hold ''.

    Example:
    ========
    show_methods(list)
    """
    print(f'Object Type: {type(obj)}\n')

    skipped = 'os np pd sys time psycopg2'.split()
    lst = [elem for elem in dir(obj)
           if not elem.startswith('_') and elem not in skipped]

    if isinstance(start, str):
        lst = [elem for elem in lst if elem.startswith(start)]
    elif isinstance(start, (tuple, list)):
        # str.startswith accepts a tuple of prefixes, so a name matching
        # several prefixes is kept once instead of once per prefix.
        prefixes = tuple(start)
        lst = [elem for elem in lst if elem.startswith(prefixes)]

    if isinstance(inside, str):
        lst = [elem for elem in lst if inside in elem]
    elif isinstance(inside, (tuple, list)):
        # any() keeps each name once even when several substrings match.
        lst = [elem for elem in lst
               if any(inside_elem in elem for inside_elem in inside)]

    return pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
def adjustedR2(rsquared, nrows, ncols):
    """Return R-squared reduced by a model-size penalty.

    The penalty is (ncols - 1) / (nrows - ncols) multiplied by the
    unexplained share (1 - rsquared).
    """
    penalty = (ncols - 1) / (nrows - ncols)
    return rsquared - penalty * (1 - rsquared)
def print_regr_eval(ytest, ypreds, ncols, log_back=False):
    """Print regression metrics for predictions against true values.

    Parameters
    ----------
    ytest: true target values
    ypreds: predicted target values
    ncols: column count forwarded to adjustedR2
    log_back: when True, expm1 is applied to ypreds before scoring
        (for models trained on a log1p-transformed target)
    """
    # Undo the log1p transform of the target when requested.
    ypreds = np.expm1(ypreds) if log_back else ypreds

    mse = metrics.mean_squared_error(ytest, ypreds)
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, len(ytest), ncols)
    evs = metrics.explained_variance_score(ytest, ypreds)

    # Show the first three true/predicted values as a quick visual check.
    print('ytest :', ytest[:3])
    print('ypreds:', ypreds[:3])
    print(f"""
Explained Variance: {evs:.6f}
R-Squared: {r2:,.6f}
RMSE : {rmse:,.2f}
Adjusted R-squared: {ar2:,.6f}
""")
def plot_feature_imp_catboost(model_cat, n=10):
    """Draw a horizontal bar chart of the n largest feature importances.

    Parameters
    ----------
    model_cat: fitted catboost model
        (its feature_names_ and feature_importances_ attributes are read)
    n: number of features to show
    """
    importances = pd.DataFrame({
        'Feature': model_cat.feature_names_,
        'Importance': model_cat.feature_importances_,
    })
    top = importances.nlargest(n, 'Importance').set_index('Feature')

    ax = top.plot.barh(figsize=(12, 8))
    plt.grid(True)
    plt.title('Feature Importance', fontsize=14)
    ax.get_legend().remove()

    # Write each bar's value at the end of the bar.
    for bar in ax.patches:
        width = bar.get_width()
        ax.text(width, bar.get_y(), '{:.2f}'.format(width),
                fontsize=15, color='indigo')

    # NOTE(review): assumed to run once after the loop so the largest
    # importance is drawn at the top -- confirm against the notebook.
    ax.invert_yaxis()
    plt.show()
# Pick the data root: the GitHub raw URL on Colab, the local folder otherwise.
if ENV_COLAB:
    path_raw = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    proj = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_raw + proj
else:
    data_path_parent = '../data/'

# Both environments use the same layout below the root.
data_path_train = data_path_parent + 'raw/train.csv'
data_path_test = data_path_parent + 'raw/test.csv'

target = 'price'
train_size = 0.8
print(data_path_train)
https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/King_County_Seattle_House_Price_Kaggle/raw/train.csv
# Load the raw train and test tables from the configured paths.
df_train_raw = pd.read_csv(data_path_train)
df_test_raw = pd.read_csv(data_path_test)
print(df_train_raw.shape)
print(df_train_raw.columns)
# Preview the first and last two rows. DataFrame.append was removed in
# pandas 2.0; pd.concat builds the same frame.
display(pd.concat([df_train_raw.head(2), df_train_raw.tail(2)]))
(17290, 21) Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'], dtype='object')
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2561340020 | 20140804T000000 | 325000.0 | 3 | 1.75 | 1780 | 11096 | 1.0 | 0 | 0 | 3 | 7 | 1210 | 570 | 1979 | 0 | 98074 | 47.6170 | -122.051 | 1780 | 10640 |
1 | 8598200070 | 20141208T000000 | 278000.0 | 2 | 2.50 | 1420 | 2229 | 2.0 | 0 | 0 | 3 | 7 | 1420 | 0 | 2004 | 0 | 98059 | 47.4871 | -122.165 | 1500 | 2230 |
17288 | 7174800760 | 20140725T000000 | 667000.0 | 5 | 2.00 | 1900 | 5470 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 720 | 1930 | 1965 | 98105 | 47.6666 | -122.303 | 1300 | 3250 |
17289 | 9521100280 | 20140612T000000 | 480000.0 | 3 | 2.50 | 1250 | 1103 | 3.0 | 0 | 2 | 3 | 8 | 1250 | 0 | 2005 | 0 | 98103 | 47.6619 | -122.352 | 1250 | 1188 |
def clean_data(df, log=True, sq=True, dummy=True):
    """Return an engineered copy of a raw house-sales frame.

    The input frame is not modified.

    Parameters
    ----------
    df: raw DataFrame
        must contain the columns read below (date, yr_built, yr_renovated,
        waterfront, view, condition, grade, zipcode, bedrooms, bathrooms,
        floors, id and the sqft_* columns)
    log: add a log1p_<col> column for each sqft_* column
    sq: add a <col>_sq column for selected columns
    dummy: add one-hot columns for the categorical columns

    Returns
    -------
    DataFrame with the new columns added and 'id' and 'date' dropped.
    """
    df = df.copy()

    # Date time features
    df['date'] = pd.to_datetime(df['date'])
    df['yr_sales'] = df['date'].dt.year
    df['age'] = df['yr_sales'] - df['yr_built']
    # yr_renovated == 0 means "never renovated": fall back to the build year.
    df['yr_renovated2'] = np.where(df['yr_renovated'].eq(0),
                                   df['yr_built'], df['yr_renovated'])
    df['age_after_renovation'] = df['yr_sales'] - df['yr_renovated2']

    # Categorical features: held as strings while the dummies are built.
    cols_str = ['waterfront', 'view', 'condition', 'grade', 'zipcode']
    for c in cols_str:
        df[c] = df[c].astype(str)

    # Boolean data types
    df['basement_bool'] = df['sqft_basement'].gt(0).astype(np.int64)
    df['renovation_bool'] = df['yr_renovated'].gt(0).astype(np.int64)

    # Numerical features binning.
    # NOTE: the 10 bins are derived from this frame's own value range, so
    # train and test frames can end up with different bin edges.
    df['age_cat'] = pd.cut(df['age'], 10, labels=range(10)).astype(str)
    df['age_after_renovation_cat'] = pd.cut(df['age_after_renovation'],
                                            10, labels=range(10))

    # Create dummy variables from object and categories
    cols_obj_cat = df.select_dtypes(include=[object, 'category']).columns
    cols_dummy = ['waterfront', 'view', 'condition', 'grade',
                  'age_cat', 'age_after_renovation_cat']
    if dummy:
        df_dummy = pd.get_dummies(df[cols_dummy], drop_first=False)
        df = pd.concat([df, df_dummy], axis=1)

    # After creating dummies, turn the string/category columns back into
    # numbers. int32 is required: 5-digit zipcodes do not fit in int8 and
    # used to wrap around to meaningless small values.
    for c in cols_obj_cat:
        df[c] = df[c].astype(np.int32)

    # Log transformation of large numerical values
    cols_log = ['sqft_living', 'sqft_lot', 'sqft_above',
                'sqft_basement', 'sqft_living15', 'sqft_lot15']
    if log:
        for col in cols_log:
            df['log1p_' + col] = np.log1p(df[col])

    # squared columns
    cols_sq = [
        # cats
        'bedrooms', 'bathrooms', 'floors', 'waterfront', 'view',
        # created nums
        'age', 'age_after_renovation']
    if sq:
        for col in cols_sq:
            df[col + '_sq'] = df[col]**2

    # Drop unwanted columns
    cols_drop = ['id', 'date']
    df = df.drop(cols_drop, axis=1)
    return df
# Run the same cleaning, with default options, over both raw frames.
df_train, df_test = (clean_data(raw) for raw in (df_train_raw, df_test_raw))
print(df_train.shape)
print(df_train.columns)
(17290, 84) Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age', 'yr_renovated2', 'age_after_renovation', 'basement_bool', 'renovation_bool', 'age_cat', 'age_after_renovation_cat', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_4', 'age_after_renovation_cat_5', 'age_after_renovation_cat_6', 'age_after_renovation_cat_7', 'age_after_renovation_cat_8', 'age_after_renovation_cat_9', 'log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15', 'bedrooms_sq', 'bathrooms_sq', 'floors_sq', 'waterfront_sq', 'view_sq', 'age_sq', 'age_after_renovation_sq'], dtype='object')
# df_train.dtypes.to_numpy()
# Leakage check: list every column whose name contains 'price';
# only the target itself should show up.
df_train.filter(regex='price').columns
Index(['price'], dtype='object')
# Total number of NaN cells in the cleaned train and test frames
# (both should be zero).
df_train.isna().sum().sum(), df_test.isna().sum().sum()
(0, 0)
# Train on log1p(price). The test target is left on the original scale,
# which is why evaluation is done with log_back=True.
df_train[target] = np.log1p(df_train[target])

# choose features to train, we can change it later
# (only columns that also exist in the test frame can be used)
features = [i for i in sorted(df_train.columns.drop(target))
            if i in df_test.columns]

df_Xtrain_orig = df_train[features]
ser_ytrain_orig = df_train[target]
df_Xtest = df_test[features]
ser_ytest = df_test[target]
ytrain_orig = np.array(ser_ytrain_orig).flatten()
ytest = np.array(ser_ytest).flatten()

# Hold out part of the training data for validation; use the train_size
# constant defined with the data paths instead of repeating the literal.
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = model_selection.train_test_split(
    df_Xtrain_orig, ser_ytrain_orig,
    train_size=train_size, random_state=SEED)
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()

print(f"df_train : {df_train.shape}\n")
print(f"df_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")
print(f"df_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")
print(f"df_test : {df_test.shape}")
# ser_ytest is created above, so report its shape rather than claiming
# that it does not exist.
print(f"ser_ytest : {ser_ytest.shape}")
df_Xtrain.head(2)
df_train : (17290, 84) df_Xtrain : (13832, 81) ser_ytrain : (13832,) df_Xvalid : (3458, 81) ser_yvalid : (3458,) df_test : (4323, 82) ser_ytest : This does not exist.
age | age_after_renovation | age_after_renovation_cat | age_after_renovation_cat_0 | age_after_renovation_cat_1 | age_after_renovation_cat_2 | age_after_renovation_cat_3 | age_after_renovation_cat_4 | age_after_renovation_cat_5 | age_after_renovation_cat_6 | age_after_renovation_cat_7 | age_after_renovation_cat_8 | age_after_renovation_cat_9 | age_after_renovation_sq | age_cat | age_cat_0 | age_cat_1 | age_cat_2 | age_cat_3 | age_cat_4 | age_cat_5 | age_cat_6 | age_cat_7 | age_cat_8 | age_cat_9 | age_sq | basement_bool | bathrooms | bathrooms_sq | bedrooms | bedrooms_sq | condition | condition_1 | condition_2 | condition_3 | condition_4 | condition_5 | floors | floors_sq | grade | grade_10 | grade_11 | grade_12 | grade_13 | grade_4 | grade_5 | grade_6 | grade_7 | grade_8 | grade_9 | lat | log1p_sqft_above | log1p_sqft_basement | log1p_sqft_living | log1p_sqft_living15 | log1p_sqft_lot | log1p_sqft_lot15 | long | renovation_bool | sqft_above | sqft_basement | sqft_living | sqft_living15 | sqft_lot | sqft_lot15 | view | view_0 | view_1 | view_2 | view_3 | view_4 | view_sq | waterfront | waterfront_0 | waterfront_1 | waterfront_sq | yr_built | yr_renovated | yr_renovated2 | yr_sales | zipcode | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13832 | 30 | 30 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 900 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 900 | 0 | 3.0 | 9.00 | 5 | 25 | 4 | 0 | 0 | 0 | 1 | 0 | 1.5 | 2.25 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 47.3488 | 8.243019 | 0.000000 | 8.243019 | 7.855932 | 10.652944 | 10.485033 | -122.095 | 0 | 3800 | 0 | 3800 | 2580 | 42316 | 35775 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1984 | 0 | 1984 | 2014 | -6 |
4184 | 72 | 19 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 361 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5184 | 1 | 2.5 | 6.25 | 3 | 9 | 5 | 0 | 0 | 0 | 0 | 1 | 2.0 | 4.00 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 47.5855 | 7.601402 | 6.508769 | 7.890208 | 7.749753 | 8.517393 | 8.517393 | -122.292 | 1 | 2000 | 670 | 2670 | 2320 | 5000 | 5000 | 3 | 0 | 0 | 0 | 1 | 0 | 9 | 0 | 1 | 0 | 0 | 1942 | 1995 | 1995 | 2014 | 96 |
# Keep only the features present in the train, validation and test frames.
features = [
    name for name in features
    if all(name in frame.columns for frame in (df_Xtrain, df_Xvalid, df_test))
]
print(np.array(sorted(features)))
['age' 'age_after_renovation' 'age_after_renovation_cat' 'age_after_renovation_cat_0' 'age_after_renovation_cat_1' 'age_after_renovation_cat_2' 'age_after_renovation_cat_3' 'age_after_renovation_cat_4' 'age_after_renovation_cat_5' 'age_after_renovation_cat_6' 'age_after_renovation_cat_7' 'age_after_renovation_cat_8' 'age_after_renovation_cat_9' 'age_after_renovation_sq' 'age_cat' 'age_cat_0' 'age_cat_1' 'age_cat_2' 'age_cat_3' 'age_cat_4' 'age_cat_5' 'age_cat_6' 'age_cat_7' 'age_cat_8' 'age_cat_9' 'age_sq' 'basement_bool' 'bathrooms' 'bathrooms_sq' 'bedrooms' 'bedrooms_sq' 'condition' 'condition_1' 'condition_2' 'condition_3' 'condition_4' 'condition_5' 'floors' 'floors_sq' 'grade' 'grade_10' 'grade_11' 'grade_12' 'grade_13' 'grade_4' 'grade_5' 'grade_6' 'grade_7' 'grade_8' 'grade_9' 'lat' 'log1p_sqft_above' 'log1p_sqft_basement' 'log1p_sqft_living' 'log1p_sqft_living15' 'log1p_sqft_lot' 'log1p_sqft_lot15' 'long' 'renovation_bool' 'sqft_above' 'sqft_basement' 'sqft_living' 'sqft_living15' 'sqft_lot' 'sqft_lot15' 'view' 'view_0' 'view_1' 'view_2' 'view_3' 'view_4' 'view_sq' 'waterfront' 'waterfront_0' 'waterfront_1' 'waterfront_sq' 'yr_built' 'yr_renovated' 'yr_renovated2' 'yr_sales' 'zipcode']
# Optional feature scaling: None (off), 'standard' or 'minmax'.
scaling = None
scaler_classes = {'standard': preprocessing.StandardScaler,
                  'minmax': preprocessing.MinMaxScaler}
if scaling in scaler_classes:
    scaler = scaler_classes[scaling]()
    # Fit on the training split only, then apply the same transform to
    # every split. The validation frame must be scaled too because it is
    # passed to the model as eval_set; the row index is kept so the frames
    # stay traceable to the original rows.
    scaler.fit(df_Xtrain)
    df_Xtrain = pd.DataFrame(scaler.transform(df_Xtrain),
                             columns=df_Xtrain.columns, index=df_Xtrain.index)
    df_Xvalid = pd.DataFrame(scaler.transform(df_Xvalid),
                             columns=df_Xvalid.columns, index=df_Xvalid.index)
    df_Xtest = pd.DataFrame(scaler.transform(df_Xtest),
                            columns=df_Xtest.columns, index=df_Xtest.index)
df_Xtrain.head(2)
age | age_after_renovation | age_after_renovation_cat | age_after_renovation_cat_0 | age_after_renovation_cat_1 | age_after_renovation_cat_2 | age_after_renovation_cat_3 | age_after_renovation_cat_4 | age_after_renovation_cat_5 | age_after_renovation_cat_6 | age_after_renovation_cat_7 | age_after_renovation_cat_8 | age_after_renovation_cat_9 | age_after_renovation_sq | age_cat | age_cat_0 | age_cat_1 | age_cat_2 | age_cat_3 | age_cat_4 | age_cat_5 | age_cat_6 | age_cat_7 | age_cat_8 | age_cat_9 | age_sq | basement_bool | bathrooms | bathrooms_sq | bedrooms | bedrooms_sq | condition | condition_1 | condition_2 | condition_3 | condition_4 | condition_5 | floors | floors_sq | grade | grade_10 | grade_11 | grade_12 | grade_13 | grade_4 | grade_5 | grade_6 | grade_7 | grade_8 | grade_9 | lat | log1p_sqft_above | log1p_sqft_basement | log1p_sqft_living | log1p_sqft_living15 | log1p_sqft_lot | log1p_sqft_lot15 | long | renovation_bool | sqft_above | sqft_basement | sqft_living | sqft_living15 | sqft_lot | sqft_lot15 | view | view_0 | view_1 | view_2 | view_3 | view_4 | view_sq | waterfront | waterfront_0 | waterfront_1 | waterfront_sq | yr_built | yr_renovated | yr_renovated2 | yr_sales | zipcode | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13832 | 30 | 30 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 900 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 900 | 0 | 3.0 | 9.00 | 5 | 25 | 4 | 0 | 0 | 0 | 1 | 0 | 1.5 | 2.25 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 47.3488 | 8.243019 | 0.000000 | 8.243019 | 7.855932 | 10.652944 | 10.485033 | -122.095 | 0 | 3800 | 0 | 3800 | 2580 | 42316 | 35775 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1984 | 0 | 1984 | 2014 | -6 |
4184 | 72 | 19 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 361 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5184 | 1 | 2.5 | 6.25 | 3 | 9 | 5 | 0 | 0 | 0 | 0 | 1 | 2.0 | 4.00 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 47.5855 | 7.601402 | 6.508769 | 7.890208 | 7.749753 | 8.517393 | 8.517393 | -122.292 | 1 | 2000 | 670 | 2670 | 2320 | 5000 | 5000 | 3 | 0 | 0 | 0 | 1 | 0 | 9 | 0 | 1 | 0 | 0 | 1942 | 1995 | 1995 | 2014 | 96 |
class CatBoostRegressor(
iterations=None,
learning_rate=None,
loss_function='RMSE',
use_best_model=None,
verbose=None,
silent=None,
logging_level=None,
one_hot_max_size=None,
ignored_features=None,
train_dir=None,
custom_metric=None,
eval_metric=None,
subsample=None,
max_depth=None,
n_estimators=None,
num_boost_round=None,
num_trees=None,
colsample_bylevel=None,
random_state=None,
reg_lambda=None,
objective=None,
eta=None,
max_bin=None,
early_stopping_rounds=None,
cat_features=None,
min_child_samples=None,
max_leaves=None,
num_leaves=None,
score_function=None,
)
# List the public names of the catboost module in a 4-column table.
show_methods(catboost,4)
Object Type: <class 'module'>
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | CatBoost | EFstrType | Pool | to_regressor |
1 | CatBoostClassifier | FeaturesData | core | train |
2 | CatBoostError | MetricVisualizer | cv | version |
3 | CatBoostRegressor | MultiRegressionCustomMetric | sum_models | widget |
4 | CatboostError | MultiRegressionCustomObjective | to_classifier |
%%time
# Baseline: CatBoost with default settings (fixed seed), fitted on the
# full training frame without a validation split.
model = catb.CatBoostRegressor(verbose=1000,random_state=0)
model.fit(df_Xtrain_orig, ytrain_orig)
ypreds = model.predict(df_Xtest)
# log_back=True: print_regr_eval applies expm1 to the predictions first.
print_regr_eval(ytest,ypreds,df_Xtest.shape[1],log_back=True)
Learning rate set to 0.064823 0: learn: 0.5028378 total: 59.7ms remaining: 59.6s 999: learn: 0.1240938 total: 9.7s remaining: 0us ytest : [285000. 239950. 460000.] ypreds: [320203.43610525 221111.94861914 522547.92538233] Explained Variance: 0.914482 R-Squared: 0.913675 RMSE : 107,861.00 Adjusted R-squared: 0.912047 CPU times: user 17.8 s, sys: 1.16 s, total: 19 s Wall time: 9.94 s
%%time
# Same model with a large iteration budget, fitted on the training split
# and monitored on the validation split with early stopping.
model = catb.CatBoostRegressor(random_state=0, # seed = 0 gives better
                               iterations = 10_000
                               )
model.fit(df_Xtrain, ytrain,
          eval_set=(df_Xvalid, yvalid),
          early_stopping_rounds=50,
          use_best_model=True,
          cat_features=None,
          verbose=2000,
          plot=False
          )
ypreds = model.predict(df_Xtest)
# log_back=True: print_regr_eval applies expm1 to the predictions first.
print_regr_eval(ytest,ypreds,df_Xtest.shape[1],log_back=True)
Learning rate set to 0.018035 0: learn: 0.5194880 test: 0.5195496 best: 0.5195496 (0) total: 11.7ms remaining: 1m 57s 2000: learn: 0.1364108 test: 0.1565696 best: 0.1565696 (2000) total: 18s remaining: 1m 12s Stopped by overfitting detector (50 iterations wait) bestTest = 0.1539303674 bestIteration = 3632 Shrink model to first 3633 iterations. ytest : [285000. 239950. 460000.] ypreds: [317424.47677942 224472.26820779 530673.1782456 ] Explained Variance: 0.909876 R-Squared: 0.909021 RMSE : 110,730.20 Adjusted R-squared: 0.907305 CPU times: user 1min, sys: 3.6 s, total: 1min 4s Wall time: 33.6 s
note = """
WARNING: Here, using early stopping and validation set gave worse result.
"""
%%time
# getting data (note: getting data is very fast, only modelling is slow)
# This time no dummy columns are built; the categorical columns are handed
# to CatBoost directly.
df_train = clean_data(df_train_raw, dummy=False)
df_test = clean_data(df_test_raw, dummy=False)
df_train[target] = np.log1p(df_train[target])

features = [col for col in sorted(df_train.columns.drop(target))
            if col in df_test.columns]

df_Xtrain_orig, ser_ytrain_orig = df_train[features], df_train[target]
df_Xtest, ser_ytest = df_test[features], df_test[target]
ytrain_orig = np.array(ser_ytrain_orig).flatten()
ytest = np.array(ser_ytest).flatten()

# modelling
# ('age_cat' and 'age_after_renovation_cat' are deliberately left out)
cat_features = ['waterfront', 'view', 'condition', 'grade']
column_names = df_Xtrain_orig.columns.to_list()
cat_idx = [column_names.index(name) for name in cat_features]

model = catboost.CatBoostRegressor(
    cat_features=cat_idx,
    one_hot_max_size=100,
    iterations=2000,
    random_state=0,
)
model.fit(df_Xtrain_orig, ytrain_orig, verbose=1_000, cat_features=cat_idx)

ypreds = model.predict(df_Xtest)
print_regr_eval(ytest, ypreds, df_Xtest.shape[1], log_back=True)
Learning rate set to 0.038278 0: learn: 0.5122805 total: 10ms remaining: 20s 1000: learn: 0.1377115 total: 8.85s remaining: 8.83s 1999: learn: 0.1207688 total: 17.7s remaining: 0us ytest : [285000. 239950. 460000.] ypreds: [322311.16735949 221504.3821753 506996.99758012] Explained Variance: 0.911634 R-Squared: 0.910792 RMSE : 109,647.27 Adjusted R-squared: 0.910000 CPU times: user 33.7 s, sys: 1.75 s, total: 35.4 s Wall time: 18.5 s
# Column positions of the categorical features within df_Xtrain_orig.
cat_idx
[32, 30, 11, 14]
# List the public attributes of the fitted model in a 4-column table.
show_methods(model,4)
Object Type: <class 'catboost.core.CatBoostRegressor'>
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | best_iteration_ | get_best_iteration | get_scale_and_bias | predict |
1 | best_score_ | get_best_score | get_test_eval | random_seed_ |
2 | calc_feature_statistics | get_borders | get_test_evals | randomized_search |
3 | calc_leaf_indexes | get_cat_feature_indices | get_text_feature_indices | save_borders |
4 | classes_ | get_embedding_feature_indices | get_tree_leaf_counts | save_model |
5 | compare | get_evals_result | grid_search | score |
6 | copy | get_feature_importance | is_fitted | set_feature_names |
7 | create_metric_calcer | get_leaf_values | iterate_leaf_indexes | set_leaf_values |
8 | drop_unused_features | get_leaf_weights | learning_rate_ | set_params |
9 | eval_metrics | get_metadata | load_model | set_scale_and_bias |
10 | evals_result_ | get_n_features_in | n_features_in_ | shrink |
11 | feature_importances_ | get_object_importance | plot_partial_dependence | staged_predict |
12 | feature_names_ | get_param | plot_predictions | tree_count_ |
13 | fit | get_params | plot_tree | virtual_ensembles_predict |
14 | get_all_params |
# List the public names of the catboost module in a 4-column table.
show_methods(catboost,4)
Object Type: <class 'module'>
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | CatBoost | EFstrType | Pool | to_regressor |
1 | CatBoostClassifier | FeaturesData | core | train |
2 | CatBoostError | MetricVisualizer | cv | version |
3 | CatBoostRegressor | MultiRegressionCustomMetric | sum_models | widget |
4 | CatboostError | MultiRegressionCustomObjective | to_classifier |
# help(model.calc_feature_statistics)

# float feature
feature_name = 'sqft_living'
dict_stats = model.calc_feature_statistics(
    df_Xtrain_orig, ytrain_orig, feature_name, plot=True)

# one hot feature
feature_name = 'bedrooms'
cat_vals = df_Xtrain_orig[feature_name].unique().tolist()
dict_stats = model.calc_feature_statistics(
    df_Xtrain_orig, ytrain_orig, feature_name)

# Report how many entries each returned statistic holds.
for key in dict_stats:
    print(key, len(dict_stats[key]))
borders 11 binarized_feature 17290 mean_target 12 mean_weighted_target 0 mean_prediction 12 objects_per_bin 12 predictions_on_varying_feature 12
# feature importance
# Pair each feature name with the model's importance score, then show the
# table sorted from most to least important with a colour gradient.
df_imp = pd.DataFrame(
    list(zip(features, model.feature_importances_)),
    columns=['Feature', 'Importance'],
)
df_imp.sort_values(by='Importance', ascending=False).style.background_gradient()
Feature | Importance | |
---|---|---|
15 | lat | 44.816561 |
18 | log1p_sqft_living | 11.275696 |
26 | sqft_living | 7.955914 |
22 | long | 6.925514 |
14 | grade | 4.662216 |
19 | log1p_sqft_living15 | 2.803159 |
38 | zipcode | 2.581891 |
27 | sqft_living15 | 2.039709 |
16 | log1p_sqft_above | 1.849234 |
30 | view | 1.425478 |
31 | view_sq | 1.424221 |
11 | condition | 1.227043 |
24 | sqft_above | 0.966273 |
20 | log1p_sqft_lot | 0.957940 |
21 | log1p_sqft_lot15 | 0.848439 |
28 | sqft_lot | 0.814951 |
36 | yr_renovated2 | 0.795638 |
1 | age_after_renovation | 0.605672 |
29 | sqft_lot15 | 0.564038 |
32 | waterfront | 0.541281 |
8 | bathrooms_sq | 0.534811 |
34 | yr_built | 0.515996 |
37 | yr_sales | 0.503194 |
7 | bathrooms | 0.490852 |
3 | age_after_renovation_sq | 0.444812 |
33 | waterfront_sq | 0.415367 |
5 | age_sq | 0.389578 |
0 | age | 0.297204 |
17 | log1p_sqft_basement | 0.266718 |
9 | bedrooms | 0.210771 |
25 | sqft_basement | 0.186138 |
35 | yr_renovated | 0.166124 |
4 | age_cat | 0.133029 |
10 | bedrooms_sq | 0.104462 |
2 | age_after_renovation_cat | 0.095204 |
12 | floors | 0.090801 |
13 | floors_sq | 0.062421 |
23 | renovation_bool | 0.008142 |
6 | basement_bool | 0.003509 |
# Bar chart of the 10 largest feature importances of the current model.
plot_feature_imp_catboost(model,n=10)
import catboost
from catboost import CatBoostClassifier

# part 1: fit the model
# One directory name shared by training (which writes logs there) and by
# the visualizer (which reads them).
train_dir = "crossentropy"
cat_features = [0,1,2]
train_data = [["a", "b", 1, 4, 5, 6],
              ["a", "b", 4, 5, 6, 7],
              ["c", "d", 30, 40, 50, 60]]
train_labels = [1,1,0]
model = CatBoostClassifier(iterations=20,
                           loss_function = "CrossEntropy",
                           train_dir = train_dir)
model.fit(train_data, train_labels, cat_features)
predictions = model.predict(train_data)

# part 2: visualize
# The visualizer previously pointed at '/crossentropy/', an absolute path
# that differs from the relative directory the model logs to.
# NOTE(review): presumably this mismatch is why the widget never showed
# anything -- confirm in an environment that renders widgets.
w = catboost.MetricVisualizer(train_dir)
w.start()
Part 1 works in Google Colab and writes some files to the `crossentropy` directory, but Part 2 keeps running indefinitely.
import shap
%%time
# Refit a default CatBoost regressor on the full training frame, then
# compute SHAP values for every row of the test features.
model = catboost.CatBoostRegressor(verbose=500,random_state=0)
model.fit(df_Xtrain_orig, ytrain_orig)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_Xtest)
Learning rate set to 0.064823 0: learn: 0.5023784 total: 10.4ms remaining: 10.4s 500: learn: 0.1408636 total: 4.53s remaining: 4.51s 999: learn: 0.1243705 total: 9.07s remaining: 0us CPU times: user 21.1 s, sys: 968 ms, total: 22 s Wall time: 11.4 s
# Compare shapes of the training features, test features and SHAP values.
df_Xtrain_orig.shape, df_Xtest.shape, shap_values.shape
((17290, 39), (4323, 39), (4323, 39))
# Load the JS visualization code into the notebook.
shap.initjs()
# Look only at the first row of the test data.
# Use matplotlib=True to avoid JavaScript.
shap.force_plot(explainer.expected_value,
                shap_values[0,:],
                df_Xtest.iloc[0,:],
                matplotlib=False,
                text_rotation=90)
# The prediction for the first row is 12.69 (on the log1p scale the model
# was trained on); it is the combined effect of all columns.
#
# Red features contribute positively, blue features contribute negatively.
# Here, the first row has sqft_living = 2,437, a high value that pushes the
# prediction higher,
# but lat = 47.35 pushes the prediction lower.
# Summary plot of the SHAP values for all test rows and features.
shap.summary_plot(shap_values, df_Xtest)
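This plot is made of many dots. Each dot has three characteristics: its vertical position shows which feature it depicts, its color shows whether that feature's value was high or low for that row, and its horizontal position shows whether that value pushed the prediction higher or lower.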
# Same summary as a bar chart.
shap.summary_plot(shap_values, df_Xtest, plot_type='bar')
# Dependence plots for single features.
shap.dependence_plot("sqft_living", shap_values, df_Xtest)
shap.dependence_plot("view", shap_values, df_Xtest)
# Dependence of sqft_living with sqft_living15 set as the interaction feature.
shap.dependence_plot(ind='sqft_living', interaction_index='sqft_living15',
                     shap_values=shap_values,
                     features=df_Xtest,
                     display_features=df_Xtest)
# Report the wall-clock time since the notebook's first cell.
time_taken = time.time() - time_start_notebook
# h is whole hours; m is the remaining seconds (not minutes).
h, m = divmod(time_taken, 60 * 60)
mins, secs = divmod(m, 60)
print('Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'
      .format(h, mins, secs))
Time taken to run whole notebook: 0 hr 8 min 32 secs