%%html
<marquee style='width: 30%; color: blue;'><b> Author: Bhishan Poudel</b></marquee>
This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
Task: Estimate the sale price of a house from the given features.
# Record the wall-clock time at which the notebook run starts.
import time
time_start_notebook = time.time()
%%capture
# Environment setup cell. %%capture keeps this cell's output (pip logs and the
# print below) out of the notebook.
import os
import sys
# True when the google.colab module is already loaded, i.e. running on Colab.
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules that are needed below
    !pip install watermark
    !pip install catboost
    !pip install shap eli5
    # if we update existing module, we need to restart colab
    !pip install -U scikit-learn
    ## print
    print('Environment: Google Colaboratory.')
# 'gpu_hist' on Colab, 'auto' elsewhere.
# NOTE(review): presumably the xgboost tree_method setting; it is not used in this part of the notebook.
TREE_METHOD = 'gpu_hist' if ENV_COLAB else 'auto'
import numpy as np
import pandas as pd
# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# mixed
import os
import time
from pprint import pprint
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# settings
pd.set_option('display.max_columns', 200)
# sklearn
import sklearn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
# boosting
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost
import lightgbm
import catboost
import catboost as cb
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark Bhishan Poudel 2020-11-14 CPython 3.7.7 IPython 7.18.1 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit matplotlib 3.2.1 watermark 2.0.2 seaborn 0.11.0 xgboost 1.2.0 numpy 1.18.4 sklearn 0.23.1 lightgbm 2.3.1 catboost 0.23.2 pandas 1.1.0 json 2.0.9
def show_methods(obj, ncols=7, start=None, inside=None):
    """Tabulate the public attribute names of an object.

    Names that start with an underscore and a few common module aliases
    (os, np, pd, sys, time, psycopg2) are left out. The object type is
    printed before the table is returned.

    Parameters
    ----------
    obj : object
        Object (module, class, instance, ...) whose attributes are listed.
    ncols : int
        Number of columns of the returned table.
    start : str, tuple or list, optional
        Keep only names starting with this prefix (or with any of them).
    inside : str, tuple or list, optional
        Keep only names containing this substring (or any of them).

    Returns
    -------
    pandas.DataFrame
        The names spread over ``ncols`` columns, padded with ''.

    Example:
    ========
    show_methods(list)
    """
    print(f'Object Type: {type(obj)}\n')

    excluded = set('os np pd sys time psycopg2'.split())
    names = [name for name in dir(obj)
             if not name.startswith('_') and name not in excluded]

    # A name is kept once even when several prefixes/substrings match it;
    # looping over the patterns would emit it once per match.
    if isinstance(start, str):
        names = [name for name in names if name.startswith(start)]
    elif isinstance(start, (tuple, list)):
        prefixes = tuple(start)
        names = [name for name in names if name.startswith(prefixes)]

    if isinstance(inside, str):
        names = [name for name in names if inside in name]
    elif isinstance(inside, (tuple, list)):
        names = [name for name in names
                 if any(part in name for part in inside)]

    return pd.DataFrame(np.array_split(names, ncols)).T.fillna('')
def adjustedR2(rsquared,nrows,ncols):
    """Return R-squared lowered by (ncols-1)/(nrows-ncols) * (1 - R-squared).

    rsquared is the plain R-squared score, nrows the number of rows and
    ncols the column count used in the correction. nrows must differ
    from ncols, otherwise the division fails.
    """
    penalty = (ncols - 1) / (nrows - ncols)
    return rsquared - penalty * (1 - rsquared)
def print_regr_eval(ytest,ypreds,ncols,log_target=False):
    """Print regression scores for predictions against the true target.

    Parameters
    ----------
    ytest: true target values
    ypreds: predicted values; passed through expm1 first when log_target is True
    ncols: column count handed to adjustedR2
    log_target: True when the model was trained on log1p(target)
    """
    # if we have done log1p(target), we need to log back ypreds
    if log_target:
        ypreds = np.expm1(ypreds)

    mse = metrics.mean_squared_error(ytest, ypreds)
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, len(ytest), ncols)
    evs = metrics.explained_variance_score(ytest, ypreds)

    # first three values of each, as a quick visual check
    print('ytest :', ytest[:3])
    print('ypreds:', ypreds[:3])

    print(f"""
Explained Variance: {evs:.6f}
R-Squared: {r2:,.6f}
RMSE : {rmse:,.2f}
Adjusted R-squared: {ar2:,.6f}
""")
def plot_feature_imp_catboost(model_cat,n=10):
    """Draw a horizontal bar chart of the n largest feature importances.

    Parameters
    ----------
    model_cat: fitted catboost model (its feature_names_ and
        feature_importances_ attributes are read)
    n: number of features to show
    """
    importances = pd.DataFrame({
        'Feature': model_cat.feature_names_,
        'Importance': model_cat.feature_importances_,
    })
    top = importances.nlargest(n, 'Importance').set_index('Feature')

    ax = top.plot.barh(figsize=(12, 8))
    plt.grid(True)
    plt.title('Feature Importance', fontsize=14)
    ax.get_legend().remove()

    # write each importance value at the end of its bar
    for bar in ax.patches:
        width = bar.get_width()
        ax.text(width, bar.get_y(), '{:.2f}'.format(width),
                fontsize=15, color='indigo')

    # flip the axis so the largest importance is drawn at the top
    ax.invert_yaxis()
    plt.show()
# Data location: the GitHub raw URL on Colab, the local data folder otherwise.
if ENV_COLAB:
    path_raw = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    proj = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_raw + proj
else:
    data_path_parent = '../data/'

# The file layout below the parent folder is the same in both environments.
data_path_train = f'{data_path_parent}raw/train.csv'
data_path_test = f'{data_path_parent}raw/test.csv'

# Name of the target column and fraction of rows kept for training.
target = 'price'
train_size = 0.8

print(data_path_train)
../data/raw/train.csv
# Load the raw train and test tables.
df_train_raw = pd.read_csv(data_path_train)
df_test_raw = pd.read_csv(data_path_test)
print(df_train_raw.shape)
print(df_train_raw.columns)
# Preview the first and last two rows. DataFrame.append is deprecated and was
# removed in pandas 2.0; pd.concat builds the same frame on every version.
display(pd.concat([df_train_raw.head(2), df_train_raw.tail(2)]))
(17290, 21) Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'], dtype='object')
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2561340020 | 20140804T000000 | 325000.0 | 3 | 1.75 | 1780 | 11096 | 1.0 | 0 | 0 | 3 | 7 | 1210 | 570 | 1979 | 0 | 98074 | 47.6170 | -122.051 | 1780 | 10640 |
1 | 8598200070 | 20141208T000000 | 278000.0 | 2 | 2.50 | 1420 | 2229 | 2.0 | 0 | 0 | 3 | 7 | 1420 | 0 | 2004 | 0 | 98059 | 47.4871 | -122.165 | 1500 | 2230 |
17288 | 7174800760 | 20140725T000000 | 667000.0 | 5 | 2.00 | 1900 | 5470 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 720 | 1930 | 1965 | 98105 | 47.6666 | -122.303 | 1300 | 3250 |
17289 | 9521100280 | 20140612T000000 | 480000.0 | 3 | 2.50 | 1250 | 1103 | 3.0 | 0 | 2 | 3 | 8 | 1250 | 0 | 2005 | 0 | 98103 | 47.6619 | -122.352 | 1250 | 1188 |
def clean_data(df,log=True,sq=True,logsq=True,dummy=True,dummy_cat=False):
    """Build the modelling features from a raw house-sales table.

    The input frame is copied, so the caller's data is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with the id, date, yr_built, yr_renovated, sqft_* and
        bedrooms/bathrooms/floors/waterfront/view/condition/grade columns.
    log : bool
        Add log1p_<col> for the sqft_* columns.
    sq : bool
        Add <col>_sq for the small-valued and age columns.
    logsq : bool
        Add log1p_<col>_sq for the sqft_* columns.
    dummy : bool
        Add one-hot columns for waterfront, view, condition and grade.
    dummy_cat : bool
        Add one-hot columns for the binned age features.

    Returns
    -------
    pandas.DataFrame
        Feature table without the id and date columns.
    """
    df = df.copy()

    # Date time features
    df['date'] = pd.to_datetime(df['date'])
    df['yr_sales'] = df['date'].dt.year
    df['age'] = df['yr_sales'] - df['yr_built']
    # yr_renovated == 0 is treated as "never renovated": fall back to the build year
    df['yr_renovated2'] = np.where(df['yr_renovated'].eq(0),
                                   df['yr_built'], df['yr_renovated'])
    df['age_after_renovation'] = df['yr_sales'] - df['yr_renovated2']

    # Boolean data types (1 when the raw value is positive, else 0)
    df['basement_bool'] = df['sqft_basement'].gt(0).astype(np.int64)
    df['renovation_bool'] = df['yr_renovated'].gt(0).astype(np.int64)

    # Numerical features binning: 10 equal-width bins labelled 0..9
    bin_labels = list(range(10))
    df['age_cat'] = pd.cut(df['age'], 10, labels=bin_labels).astype(str)
    df['age_after_renovation_cat'] = pd.cut(df['age_after_renovation'],
                                            10, labels=bin_labels)

    # Log transformation of large numerical values
    cols_log = ['sqft_living', 'sqft_lot', 'sqft_above',
                'sqft_basement', 'sqft_living15', 'sqft_lot15']
    if log:
        for col in cols_log:
            df['log1p_' + col] = np.log1p(df[col])

    # squared columns
    cols_sq = [
        # cats
        'bedrooms','bathrooms','floors','waterfront','view',
        # created nums
        'age','age_after_renovation']
    if sq:
        for col in cols_sq:
            df[col + '_sq'] = df[col]**2

    # squared log columns; computed from the raw columns so that
    # logsq=True does not depend on log=True having created log1p_<col>
    if logsq:
        for col in cols_log:
            df['log1p_' + col + '_sq'] = np.log1p(df[col])**2

    # Categorical Features
    cols_dummy = ['waterfront', 'view', 'condition', 'grade']
    cols_dummy_cat = ['age_cat', 'age_after_renovation_cat']
    # get_dummies only encodes string/categorical columns
    for c in cols_dummy:
        df[c] = df[c].astype(str)

    # Create dummy variables
    if dummy:
        df_dummy = pd.get_dummies(df[cols_dummy],drop_first=False)
        df = pd.concat([df,df_dummy], axis=1)

    # dummy variable for newly created cats from numerical feature
    if dummy_cat:
        df_dummy = pd.get_dummies(df[cols_dummy_cat],drop_first=False)
        # concatenate the dummy frame (not the list of column names)
        df = pd.concat([df,df_dummy], axis=1)

    # after creating dummy, make the columns number
    for c in cols_dummy + cols_dummy_cat:
        df[c] = df[c].astype(np.int32)

    # Drop unwanted columns
    cols_drop = ['id','date']
    df = df.drop(cols_drop,axis=1)

    return df
# Feature-engineering switches, applied identically to the train and test tables.
params_data = {
    'log': True,
    'sq': True,
    'logsq': False,
    'dummy': True,
    'dummy_cat': False,
}
df_train, df_test = (clean_data(raw, **params_data)
                     for raw in (df_train_raw, df_test_raw))
print(df_train.shape)
print(df_train.columns)
(17290, 64) Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age', 'yr_renovated2', 'age_after_renovation', 'basement_bool', 'renovation_bool', 'age_cat', 'age_after_renovation_cat', 'log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15', 'bedrooms_sq', 'bathrooms_sq', 'floors_sq', 'waterfront_sq', 'view_sq', 'age_sq', 'age_after_renovation_sq', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9'], dtype='object')
# make sure no data leakage: list every column whose name contains 'price';
# only the target itself should show up
df_train.filter(regex='price').columns
Index(['price'], dtype='object')
# make sure no nans: total count of missing values in the train and test frames
df_train.isna().sum().sum(), df_test.isna().sum().sum()
(0, 0)
# Train on log1p(price). Only the training target is transformed; the test
# target stays in the original scale and print_regr_eval maps predictions
# back with expm1.
log_target = True
if log_target:
    df_train[target] = np.log1p(df_train[target])

# choose features to train, we can change it later
# (keep only columns that also exist in the test frame: one-hot levels
# missing from the test data have no column there)
features = [col for col in sorted(df_train.columns.drop(target))
            if col in df_test.columns]

df_Xtrain_orig = df_train[features]
ser_ytrain_orig = df_train[target]

df_Xtest = df_test[features]
ser_ytest = df_test[target]

ytrain_orig = np.array(ser_ytrain_orig).flatten()
ytest = np.array(ser_ytest).flatten()

# Hold out a validation part of the training data; the fraction comes from
# the train_size setting instead of a second hard-coded 0.8.
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = model_selection.train_test_split(
    df_Xtrain_orig, ser_ytrain_orig,
    train_size=train_size, random_state=SEED)

ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()

print(f"df_train : {df_train.shape}\n")
print(f"df_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")
print(f"df_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")
print(f"df_test : {df_test.shape}")
# ser_ytest is built above and used for the evaluation, so report its shape
print(f"ser_ytest : {ser_ytest.shape}")
df_Xtrain.head(2)
df_train : (17290, 64) df_Xtrain : (13832, 61) ser_ytrain : (13832,) df_Xvalid : (3458, 61) ser_yvalid : (3458,) df_test : (4323, 62) ser_ytest : This does not exist.
age | age_after_renovation | age_after_renovation_cat | age_after_renovation_sq | age_cat | age_sq | basement_bool | bathrooms | bathrooms_sq | bedrooms | bedrooms_sq | condition | condition_1 | condition_2 | condition_3 | condition_4 | condition_5 | floors | floors_sq | grade | grade_10 | grade_11 | grade_12 | grade_13 | grade_4 | grade_5 | grade_6 | grade_7 | grade_8 | grade_9 | lat | log1p_sqft_above | log1p_sqft_basement | log1p_sqft_living | log1p_sqft_living15 | log1p_sqft_lot | log1p_sqft_lot15 | long | renovation_bool | sqft_above | sqft_basement | sqft_living | sqft_living15 | sqft_lot | sqft_lot15 | view | view_0 | view_1 | view_2 | view_3 | view_4 | view_sq | waterfront | waterfront_0 | waterfront_1 | waterfront_sq | yr_built | yr_renovated | yr_renovated2 | yr_sales | zipcode | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13832 | 30 | 30 | 2 | 900 | 2 | 900 | 0 | 3.0 | 9.00 | 5 | 25 | 4 | 0 | 0 | 0 | 1 | 0 | 1.5 | 2.25 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 47.3488 | 8.243019 | 0.000000 | 8.243019 | 7.855932 | 10.652944 | 10.485033 | -122.095 | 0 | 3800 | 0 | 3800 | 2580 | 42316 | 35775 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1984 | 0 | 1984 | 2014 | 98042 |
4184 | 72 | 19 | 1 | 361 | 6 | 5184 | 1 | 2.5 | 6.25 | 3 | 9 | 5 | 0 | 0 | 0 | 0 | 1 | 2.0 | 4.00 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 47.5855 | 7.601402 | 6.508769 | 7.890208 | 7.749753 | 8.517393 | 8.517393 | -122.292 | 1 | 2000 | 670 | 2670 | 2320 | 5000 | 5000 | 3 | 0 | 0 | 0 | 1 | 0 | 9 | 0 | 1 | 0 | 0 | 1942 | 1995 | 1995 | 2014 | 98144 |
# Keep only the features that exist in the train, validation and test frames.
features = [name for name in features
            if all(name in frame.columns
                   for frame in (df_Xtrain, df_Xvalid, df_test))]
print(np.array(sorted(features)))
['age' 'age_after_renovation' 'age_after_renovation_cat' 'age_after_renovation_sq' 'age_cat' 'age_sq' 'basement_bool' 'bathrooms' 'bathrooms_sq' 'bedrooms' 'bedrooms_sq' 'condition' 'condition_1' 'condition_2' 'condition_3' 'condition_4' 'condition_5' 'floors' 'floors_sq' 'grade' 'grade_10' 'grade_11' 'grade_12' 'grade_13' 'grade_4' 'grade_5' 'grade_6' 'grade_7' 'grade_8' 'grade_9' 'lat' 'log1p_sqft_above' 'log1p_sqft_basement' 'log1p_sqft_living' 'log1p_sqft_living15' 'log1p_sqft_lot' 'log1p_sqft_lot15' 'long' 'renovation_bool' 'sqft_above' 'sqft_basement' 'sqft_living' 'sqft_living15' 'sqft_lot' 'sqft_lot15' 'view' 'view_0' 'view_1' 'view_2' 'view_3' 'view_4' 'view_sq' 'waterfront' 'waterfront_0' 'waterfront_1' 'waterfront_sq' 'yr_built' 'yr_renovated' 'yr_renovated2' 'yr_sales' 'zipcode']
# Optional feature scaling: None (off), 'standard' or 'minmax'.
scaling = None
SCALERS = {
    'standard': preprocessing.StandardScaler,
    'minmax': preprocessing.MinMaxScaler,
}
if scaling is not None:
    if scaling not in SCALERS:
        raise ValueError(
            f"Unknown scaling {scaling!r}; use None, 'standard' or 'minmax'.")
    scaler = SCALERS[scaling]()
    # the scaler is fitted on the training part only
    scaler.fit(df_Xtrain)

    def _scale(frame):
        """Return frame transformed by scaler, keeping its columns and index."""
        return pd.DataFrame(scaler.transform(frame),
                            columns=frame.columns, index=frame.index)

    # Every frame handed to a model goes through the same scaler, so the
    # models never see a mix of scaled and unscaled features.
    df_Xtrain_orig = _scale(df_Xtrain_orig)
    df_Xtrain = _scale(df_Xtrain)
    df_Xvalid = _scale(df_Xvalid)
    df_Xtest = _scale(df_Xtest)

df_Xtrain.head(2)
age | age_after_renovation | age_after_renovation_cat | age_after_renovation_sq | age_cat | age_sq | basement_bool | bathrooms | bathrooms_sq | bedrooms | bedrooms_sq | condition | condition_1 | condition_2 | condition_3 | condition_4 | condition_5 | floors | floors_sq | grade | grade_10 | grade_11 | grade_12 | grade_13 | grade_4 | grade_5 | grade_6 | grade_7 | grade_8 | grade_9 | lat | log1p_sqft_above | log1p_sqft_basement | log1p_sqft_living | log1p_sqft_living15 | log1p_sqft_lot | log1p_sqft_lot15 | long | renovation_bool | sqft_above | sqft_basement | sqft_living | sqft_living15 | sqft_lot | sqft_lot15 | view | view_0 | view_1 | view_2 | view_3 | view_4 | view_sq | waterfront | waterfront_0 | waterfront_1 | waterfront_sq | yr_built | yr_renovated | yr_renovated2 | yr_sales | zipcode | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13832 | 30 | 30 | 2 | 900 | 2 | 900 | 0 | 3.0 | 9.00 | 5 | 25 | 4 | 0 | 0 | 0 | 1 | 0 | 1.5 | 2.25 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 47.3488 | 8.243019 | 0.000000 | 8.243019 | 7.855932 | 10.652944 | 10.485033 | -122.095 | 0 | 3800 | 0 | 3800 | 2580 | 42316 | 35775 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1984 | 0 | 1984 | 2014 | 98042 |
4184 | 72 | 19 | 1 | 361 | 6 | 5184 | 1 | 2.5 | 6.25 | 3 | 9 | 5 | 0 | 0 | 0 | 0 | 1 | 2.0 | 4.00 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 47.5855 | 7.601402 | 6.508769 | 7.890208 | 7.749753 | 8.517393 | 8.517393 | -122.292 | 1 | 2000 | 670 | 2670 | 2320 | 5000 | 5000 | 3 | 0 | 0 | 0 | 1 | 0 | 9 | 0 | 1 | 0 | 0 | 1942 | 1995 | 1995 | 2014 | 98144 |
class CatBoostRegressor(
iterations=None,
learning_rate=None,
loss_function='RMSE',
use_best_model=None,
verbose=None,
silent=None,
logging_level=None,
one_hot_max_size=None,
ignored_features=None,
train_dir=None,
custom_metric=None,
eval_metric=None,
subsample=None,
max_depth=None,
n_estimators=None,
num_boost_round=None,
num_trees=None,
colsample_bylevel=None,
random_state=None,
reg_lambda=None,
objective=None,
eta=None,
max_bin=None,
early_stopping_rounds=None,
cat_features=None,
min_child_samples=None,
max_leaves=None,
num_leaves=None,
score_function=None,
)
# List the public names of the catboost module in a 4-column table.
show_methods(catboost,4)
Object Type: <class 'module'>
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | CatBoost | EFstrType | Pool | to_regressor |
1 | CatBoostClassifier | FeaturesData | core | train |
2 | CatBoostError | MetricVisualizer | cv | version |
3 | CatBoostRegressor | MultiRegressionCustomMetric | sum_models | widget |
4 | CatboostError | MultiRegressionCustomObjective | to_classifier |
%%time
# Baseline: CatBoost regressor fitted on the full training frame and scored on
# the test frame. verbose=1000 writes the metrics every 1000 iterations.
model = cb.CatBoostRegressor(verbose=1000,random_state=0)
model.fit(df_Xtrain_orig, ytrain_orig)
ypreds = model.predict(df_Xtest)
# predictions are in log1p scale when log_target is True; print_regr_eval converts them back
print_regr_eval(ytest,ypreds,df_Xtest.shape[1],log_target=log_target)
Learning rate set to 0.064823 0: learn: 0.5033190 total: 13.5ms remaining: 13.5s 999: learn: 0.1245045 total: 5.9s remaining: 0us ytest : [285000. 239950. 460000.] ypreds: [317520.05086988 221473.59319332 513032.95014102] Explained Variance: 0.912510 R-Squared: 0.911658 RMSE : 109,113.67 Adjusted R-squared: 0.910414 CPU times: user 16.4 s, sys: 902 ms, total: 17.3 s Wall time: 6.12 s
# float feature: per-bin statistics for sqft_living, with the built-in plot
feature_name = 'sqft_living'
dict_stats = model.calc_feature_statistics(df_Xtrain_orig, ytrain_orig,
                                           feature_name, plot=True)
# one hot feature
# NOTE(review): bedrooms is passed as a plain numeric column here; confirm the "one hot" label
feature_name = 'bedrooms'
# NOTE(review): cat_vals is not used in this part of the notebook
cat_vals = df_Xtrain_orig[feature_name].unique().tolist()
dict_stats = model.calc_feature_statistics(df_Xtrain_orig, ytrain_orig, feature_name)
# print the length of every entry of the returned statistics dictionary
for key in dict_stats.keys():
    print(key, len(dict_stats[key]))
borders 8 binarized_feature 17290 mean_target 9 mean_weighted_target 0 mean_prediction 9 objects_per_bin 9 predictions_on_varying_feature 9
# feature importance: pair each feature name with the fitted model's importance
# (assumes `features` is in the same order as the columns the model was fitted on)
df_imp = pd.DataFrame({'Feature': features,
                       'Importance': model.feature_importances_
                       })
# show the ten most important features with a colour gradient
df_imp.sort_values('Importance',ascending=False).head(10).style.background_gradient()
Feature | Importance | |
---|---|---|
30 | lat | 44.425659 |
19 | grade | 10.737741 |
41 | sqft_living | 8.457095 |
33 | log1p_sqft_living | 6.725510 |
37 | long | 6.467298 |
60 | zipcode | 2.843294 |
42 | sqft_living15 | 1.675843 |
34 | log1p_sqft_living15 | 1.659347 |
51 | view_sq | 1.578175 |
35 | log1p_sqft_lot | 1.104773 |
# Bar chart of the ten largest feature importances of the fitted model.
plot_feature_imp_catboost(model,n=10)
# help(model.fit)
Help on method fit in module catboost.core: fit(X, y=None, cat_features=None, sample_weight=None, baseline=None, use_best_model=None, eval_set=None, verbose=None, logging_level=None, plot=False, column_description=None, verbose_eval=None, metric_period=None, silent=None, early_stopping_rounds=None, save_snapshot=None, snapshot_file=None, snapshot_interval=None, init_model=None) method of catboost.core.CatBoostRegressor instance Fit the CatBoost model. Parameters ---------- X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series If not catboost.Pool, 2 dimensional Feature matrix or string - file with dataset. y : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None) Labels, 1 dimensional array like. Use only if X is not catboost.Pool. cat_features : list or numpy.ndarray, optional (default=None) If not None, giving the list of Categ columns indices. Use only if X is not catboost.Pool. sample_weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None) Instance weights, 1 dimensional array like. baseline : list or numpy.ndarray, optional (default=None) If not None, giving 2 dimensional array like data. Use only if X is not catboost.Pool. use_best_model : bool, optional (default=None) Flag to use best model eval_set : catboost.Pool or list, optional (default=None) A list of (X, y) tuple pairs to use as a validation set for early-stopping metric_period : int Frequency of evaluating metrics. verbose : bool or int If verbose is bool, then if set to True, logging_level is set to Verbose, if set to False, logging_level is set to Silent. If verbose is int, it determines the frequency of writing metrics to output and logging_level is set to Verbose. silent : bool If silent is True, logging_level is set to Silent. If silent is False, logging_level is set to Verbose. 
logging_level : string, optional (default=None) Possible values: - 'Silent' - 'Verbose' - 'Info' - 'Debug' plot : bool, optional (default=False) If True, draw train and eval error in Jupyter notebook verbose_eval : bool or int Synonym for verbose. Only one of these parameters should be set. early_stopping_rounds : int Activates Iter overfitting detector with od_wait set to early_stopping_rounds. save_snapshot : bool, [default=None] Enable progress snapshotting for restoring progress after crashes or interruptions snapshot_file : string, [default=None] Learn progress snapshot file path, if None will use default filename snapshot_interval: int, [default=600] Interval between saving snapshots (seconds) init_model : CatBoost class or string, [default=None] Continue training starting from the existing model. If this parameter is a string, load initial model from the path specified by this string. Returns ------- model : CatBoost
# Refit with a validation set: up to 10,000 iterations, with the overfitting
# detector stopping after 100 rounds and use_best_model set on the eval_set.
model = cb.CatBoostRegressor(verbose=1000,random_state=0,iterations=10_000)
model.fit(df_Xtrain, ytrain,
          eval_set=(df_Xvalid,yvalid),
          early_stopping_rounds=100,
          use_best_model=True,
          plot=True)
Learning rate set to 0.018035 0: learn: 0.5196319 test: 0.5196904 best: 0.5196904 (0) total: 35.6ms remaining: 5m 55s 1000: learn: 0.1512908 test: 0.1616362 best: 0.1616362 (1000) total: 5.41s remaining: 48.6s 2000: learn: 0.1364410 test: 0.1565496 best: 0.1565484 (1988) total: 10.6s remaining: 42.5s 3000: learn: 0.1262268 test: 0.1544832 best: 0.1544812 (2997) total: 15.6s remaining: 36.3s 4000: learn: 0.1185301 test: 0.1535202 best: 0.1535194 (3995) total: 21.1s remaining: 31.6s Stopped by overfitting detector (100 iterations wait) bestTest = 0.153097429 bestIteration = 4763 Shrink model to first 4764 iterations.
<catboost.core.CatBoostRegressor at 0x7f99c459ba50>
# Same data wrapped in catboost Pool objects; the empty list declares no
# categorical feature indices.
categorical_features_indices = []
dtrain = cb.Pool(df_Xtrain, ytrain, cat_features=categorical_features_indices)
dvalid = cb.Pool(df_Xvalid, yvalid, cat_features=categorical_features_indices)
# Short 50-iteration run with logging switched off.
# NOTE(review): random_seed=42 here differs from the SEED/random_state=0 used elsewhere in the notebook.
model = cb.CatBoostRegressor(iterations=50,
                             random_seed=42,
                             logging_level='Silent')
model.fit(dtrain,
          eval_set=dvalid,
          plot=True)
<catboost.core.CatBoostRegressor at 0x7f99b16ab910>
# Compute RMSE on the validation pool, with the built-in plot enabled.
eval_metrics = model.eval_metrics(dvalid, ['RMSE'], plot=True)
# Save the fitted model to disk, then load it back into a fresh regressor
# and display the loaded object.
path_model = '../models/catboost_model.dump'
model.save_model(path_model)
model = cb.CatBoostRegressor()
model.load_model(path_model)
model
<catboost.core.CatBoostRegressor at 0x7f99b18d46d0>
%%bash
rm -rf catboost_model.dump catboost_info $path_model