%%html
<marquee style='width: 30%; color: blue;'><b> Author: Bhishan Poudel</b></marquee>
This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
Task: Try to estimate the price based on given features.

import time
time_start_notebook = time.time()
%%capture
# Environment bootstrap: detect Google Colab and install the extra packages
# there. %%capture silences the (noisy) pip output of this cell.
import os
import sys
# True when the google.colab module has been loaded into this interpreter.
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    # install the packages this notebook imports further down
    !pip install watermark
    !pip install catboost
    !pip install shap eli5
    # if we update existing module, we need to restart colab
    !pip install -U scikit-learn
    # this message is swallowed by %%capture as well
    print('Environment: Google Colaboratory.')
# NOTE(review): 'gpu_hist' presumably assumes the Colab runtime has a GPU
# attached - TODO confirm; TREE_METHOD is not referenced in the visible cells.
TREE_METHOD = 'gpu_hist' if ENV_COLAB else 'auto'
import numpy as np
import pandas as pd
# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# mixed
import os
import time
from pprint import pprint
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# settings
pd.set_option('display.max_columns', 200)
# sklearn
import sklearn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
# boosting
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost
import lightgbm
import catboost
import catboost as cb
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark Bhishan Poudel 2020-11-14 CPython 3.7.7 IPython 7.18.1 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit matplotlib 3.2.1 watermark 2.0.2 seaborn 0.11.0 xgboost 1.2.0 numpy 1.18.4 sklearn 0.23.1 lightgbm 2.3.1 catboost 0.23.2 pandas 1.1.0 json 2.0.9
def show_methods(obj, ncols=7,start=None, inside=None):
    """Tabulate the public attribute names of an object.

    Names come from ``dir(obj)``. Names starting with an underscore and a
    fixed set of common module aliases (os, np, pd, sys, time, psycopg2)
    are dropped before the optional filters are applied.

    Parameters
    ----------
    obj : object
        Anything accepted by ``dir``: module, class, instance, ...
    ncols : int, default 7
        Number of columns the names are spread over.
    start : str or list/tuple of str, optional
        Keep only names starting with this prefix (or with any of them).
    inside : str or list/tuple of str, optional
        Keep only names containing this substring (or any of them).

    Returns
    -------
    pandas.DataFrame
        The names laid out column by column, padded with empty strings.

    Example
    -------
    show_methods(list)
    """
    print(f'Object Type: {type(obj)}\n')
    excluded = set('os np pd sys time psycopg2'.split())
    names = [name for name in dir(obj)
             if not name.startswith('_') and name not in excluded]

    # A single string is treated as a one-element collection of prefixes.
    if isinstance(start, str):
        start = [start]
    if isinstance(start, (tuple, list)):
        # str.startswith accepts a tuple, so a name matching several prefixes
        # is kept once instead of once per matching prefix.
        prefixes = tuple(start)
        names = [name for name in names if name.startswith(prefixes)]

    if isinstance(inside, str):
        inside = [inside]
    if isinstance(inside, (tuple, list)):
        # any() keeps each name at most once, however many substrings match.
        names = [name for name in names
                 if any(part in name for part in inside)]

    return pd.DataFrame(np.array_split(names, ncols)).T.fillna('')
def adjustedR2(rsquared,nrows,ncols):
    """Return R-squared penalised for the number of fitted columns.

    Computes ``rsquared - (ncols-1)/(nrows-ncols) * (1-rsquared)``.

    Parameters
    ----------
    rsquared : float
        Plain coefficient of determination.
    nrows : int
        Number of observations.
    ncols : int
        Number of columns used in the penalty term; ``nrows == ncols``
        divides by zero.
    """
    penalty = (ncols-1)/(nrows-ncols)
    unexplained = 1-rsquared
    return rsquared - penalty * unexplained
def print_regr_eval(ytest,ypreds,ncols,log_target=False):
    """Print a short regression report for a set of predictions.

    Parameters
    ----------
    ytest : array-like
        True target values on the original scale.
    ypreds : array-like
        Model predictions; on the log1p scale when ``log_target`` is True.
    ncols : int
        Column count forwarded to ``adjustedR2``.
    log_target : bool, default False
        When True the predictions are mapped back with ``np.expm1`` before
        any metric is computed.

    Prints the first three true and predicted values, followed by explained
    variance, R-squared, RMSE and adjusted R-squared. Returns None.
    """
    # undo the log1p applied to the target at training time
    ypreds = np.expm1(ypreds) if log_target else ypreds

    evs = metrics.explained_variance_score(ytest, ypreds)
    r2 = metrics.r2_score(ytest,ypreds)
    mse = metrics.mean_squared_error(ytest,ypreds)
    rmse = np.sqrt(mse)
    ar2 = adjustedR2(r2,len(ytest),ncols)

    print('ytest :', ytest[:3])
    print('ypreds:', ypreds[:3])
    report = f"""           
Explained Variance: {evs:.6f}
         R-Squared: {r2:,.6f}
             RMSE : {rmse:,.2f}
Adjusted R-squared: {ar2:,.6f}
"""
    print(report)
def plot_feature_imp_catboost(model_cat,n=10):
    """Draw a horizontal bar chart of the most important features.

    Parameters
    ----------
    model_cat : fitted catboost model
        Must expose ``feature_names_`` and ``feature_importances_``.
    n : int, default 10
        Number of top features to plot.

    The bars are annotated with their importance (two decimals) and the
    largest importance is shown at the top. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    columns = {'Feature': model_cat.feature_names_,
               'Importance': model_cat.feature_importances_}
    top = pd.DataFrame(columns).nlargest(n,'Importance').set_index('Feature')

    ax = top.plot.barh(figsize=(12,8))
    ax.get_legend().remove()
    plt.grid(True)
    plt.title('Feature Importance',fontsize=14)

    # write each bar's value at the end of the bar
    for bar in ax.patches:
        width = bar.get_width()
        ax.text(width, bar.get_y(), f'{width:.2f}', fontsize=15, color='indigo')

    # barh draws the first row at the bottom; flip so the largest is on top
    ax.invert_yaxis()
    plt.show()
# Resolve where the raw CSV files live: the GitHub mirror when running on
# Colab, the local data folder otherwise. Only the parent location differs.
if ENV_COLAB:
    path_raw = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    proj = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_raw + proj
else:
    data_path_parent = '../data/'

data_path_train = data_path_parent + 'raw/train.csv'
data_path_test = data_path_parent + 'raw/test.csv'

# modelling constants used by the cells below
target = 'price'
train_size = 0.8

print(data_path_train)
../data/raw/train.csv
# Load the raw train and test sets from the paths resolved above.
df_train_raw = pd.read_csv(data_path_train)
df_test_raw = pd.read_csv(data_path_test)
print(df_train_raw.shape)
print(df_train_raw.columns)
# Preview the first and last two rows. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat gives the same frame on every version.
display(pd.concat([df_train_raw.head(2), df_train_raw.tail(2)]))
(17290, 21)
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2561340020 | 20140804T000000 | 325000.0 | 3 | 1.75 | 1780 | 11096 | 1.0 | 0 | 0 | 3 | 7 | 1210 | 570 | 1979 | 0 | 98074 | 47.6170 | -122.051 | 1780 | 10640 | 
| 1 | 8598200070 | 20141208T000000 | 278000.0 | 2 | 2.50 | 1420 | 2229 | 2.0 | 0 | 0 | 3 | 7 | 1420 | 0 | 2004 | 0 | 98059 | 47.4871 | -122.165 | 1500 | 2230 | 
| 17288 | 7174800760 | 20140725T000000 | 667000.0 | 5 | 2.00 | 1900 | 5470 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 720 | 1930 | 1965 | 98105 | 47.6666 | -122.303 | 1300 | 3250 | 
| 17289 | 9521100280 | 20140612T000000 | 480000.0 | 3 | 2.50 | 1250 | 1103 | 3.0 | 0 | 2 | 3 | 8 | 1250 | 0 | 2005 | 0 | 98103 | 47.6619 | -122.352 | 1250 | 1188 | 
def clean_data(df,log=True,sq=True,logsq=True,dummy=True,dummy_cat=False):
    """Build the modelling features from a raw house-sales frame.

    The input is copied, never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data holding at least ``id``, ``date``, ``yr_built``,
        ``yr_renovated``, the ``sqft_*`` columns, ``bedrooms``,
        ``bathrooms``, ``floors``, ``waterfront``, ``view``, ``condition``
        and ``grade``.
    log : bool, default True
        Add ``log1p_<col>`` for the six square-footage columns.
    sq : bool, default True
        Add ``<col>_sq`` for the count-like columns and the two age columns.
    logsq : bool, default True
        Add ``log1p_<col>_sq``. Computed from the raw column, so it also
        works when ``log`` is False.
    dummy : bool, default True
        Add one-hot columns for waterfront, view, condition and grade.
    dummy_cat : bool, default False
        Add one-hot columns for the two binned age columns.

    Returns
    -------
    pandas.DataFrame
        Engineered frame without the ``id`` and ``date`` columns.

    Notes
    -----
    Bin edges (``pd.cut``) and dummy columns are derived from the frame that
    is passed in, so two frames cleaned separately can get different bin
    edges and different dummy columns.
    """
    df = df.copy()

    # Date time features
    df['date'] = pd.to_datetime(df['date'])
    df['yr_sales'] = df['date'].dt.year
    df['age'] = df['yr_sales'] - df['yr_built']
    # yr_renovated == 0 is treated as "never renovated": use the build year
    df['yr_renovated2'] = np.where(df['yr_renovated'].eq(0), df['yr_built'], df['yr_renovated'])
    df['age_after_renovation'] = df['yr_sales'] - df['yr_renovated2']

    # Boolean data types (vectorised instead of a per-row lambda)
    df['basement_bool'] = df['sqft_basement'].gt(0).astype(np.int64)
    df['renovation_bool'] = df['yr_renovated'].gt(0).astype(np.int64)

    # Numerical features binning: 10 equal-width bins labelled 0..9
    df['age_cat'] = pd.cut(df['age'], 10, labels=range(10)).astype(str)
    df['age_after_renovation_cat'] = pd.cut(df['age_after_renovation'],
                                            10, labels=range(10))

    # Log transformation of large numerical values
    cols_log = ['sqft_living', 'sqft_lot', 'sqft_above',
                'sqft_basement', 'sqft_living15', 'sqft_lot15']
    if log:
        for col in cols_log:
            df['log1p_' + col] = np.log1p(df[col])

    # squared columns
    cols_sq = [
        # cats
        'bedrooms','bathrooms','floors','waterfront','view',
        # created nums
        'age','age_after_renovation']
    if sq:
        for col in cols_sq:
            df[col + '_sq'] = df[col]**2

    # squared logs; taken from the raw column so that logsq=True does not
    # raise KeyError when log=False left the log1p_* columns out
    if logsq:
        for col in cols_log:
            df['log1p_' + col + '_sq'] = np.log1p(df[col])**2

    # Categorical Features
    cols_dummy     = ['waterfront', 'view', 'condition', 'grade']
    cols_dummy_cat = ['age_cat', 'age_after_renovation_cat']
    # get_dummies only encodes non-numeric columns, hence the cast to str
    for c in cols_dummy:
        df[c] = df[c].astype(str)

    # Create dummy variables
    if dummy:
        df_dummy = pd.get_dummies(df[cols_dummy],drop_first=False)
        df       = pd.concat([df,df_dummy], axis=1)

    # dummy variable for newly created cats from numerical feature
    if dummy_cat:
        df_dummy_cat = pd.get_dummies(df[cols_dummy_cat],drop_first=False)
        # concatenate the dummy frame (the list of column names used to be
        # passed here, which raised TypeError)
        df           = pd.concat([df,df_dummy_cat], axis=1)

    # after creating dummy, make the columns number
    for c in cols_dummy + cols_dummy_cat:
        df[c] = df[c].astype(np.int32)

    # Drop unwanted columns
    cols_drop = ['id','date']
    df = df.drop(cols_drop,axis=1)
    return df
# Feature-engineering switches, shared so train and test go through exactly
# the same transformations.
params_data = dict(log=True,sq=True,logsq=False,dummy=True,dummy_cat=False)
# NOTE(review): each frame is cleaned on its own, so the dummy columns follow
# the category values present in that frame; train and test may therefore end
# up with different column sets (the feature list is intersected later).
df_train = clean_data(df_train_raw,**params_data)
df_test = clean_data(df_test_raw,**params_data)
print(df_train.shape)
print(df_train.columns)
(17290, 64)
Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'yr_sales', 'age', 'yr_renovated2',
       'age_after_renovation', 'basement_bool', 'renovation_bool', 'age_cat',
       'age_after_renovation_cat', 'log1p_sqft_living', 'log1p_sqft_lot',
       'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15',
       'log1p_sqft_lot15', 'bedrooms_sq', 'bathrooms_sq', 'floors_sq',
       'waterfront_sq', 'view_sq', 'age_sq', 'age_after_renovation_sq',
       'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3',
       'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4',
       'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12',
       'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7',
       'grade_8', 'grade_9'],
      dtype='object')
# make sure no data leakage
df_train.filter(regex='price').columns
Index(['price'], dtype='object')
# make sure no nans
df_train.isna().sum().sum(), df_test.isna().sum().sum()
(0, 0)
# Train on log1p(price); predictions are mapped back with expm1 at evaluation.
log_target = True
if log_target:
    # NOTE: this overwrites df_train[target] in place, so re-running this cell
    # without re-creating df_train applies log1p a second time.
    df_train[target] = np.log1p(df_train[target])

# choose features to train, we can change it later:
# every non-target column that exists in both the train and the test frame
features = list(sorted(df_train.columns.drop(target)))
features = [i for i in features if i in df_test.columns if i in df_train.columns]

df_Xtrain_orig  = df_train[features]
ser_ytrain_orig = df_train[target]

# the test target is left on the original (non-log) scale
df_Xtest  = df_test[features]
ser_ytest = df_test[target]

ytrain_orig = np.array(ser_ytrain_orig).flatten()
ytest  = np.array(ser_ytest).flatten()

# hold out part of the training data for validation / early stopping;
# the fraction comes from the train_size constant instead of a repeated 0.8
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = model_selection.train_test_split(
    df_Xtrain_orig, ser_ytrain_orig,
    train_size=train_size, random_state=SEED)

ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()

print(f"df_train   : {df_train.shape}\n")
print(f"df_Xtrain  : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")
print(f"df_Xvalid  : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")
print(f"df_test    : {df_test.shape}")
# ser_ytest is created above, so report its shape rather than claiming it is missing
print(f"ser_ytest  : {ser_ytest.shape}")
df_Xtrain.head(2)
df_train : (17290, 64) df_Xtrain : (13832, 61) ser_ytrain : (13832,) df_Xvalid : (3458, 61) ser_yvalid : (3458,) df_test : (4323, 62) ser_ytest : This does not exist.
| age | age_after_renovation | age_after_renovation_cat | age_after_renovation_sq | age_cat | age_sq | basement_bool | bathrooms | bathrooms_sq | bedrooms | bedrooms_sq | condition | condition_1 | condition_2 | condition_3 | condition_4 | condition_5 | floors | floors_sq | grade | grade_10 | grade_11 | grade_12 | grade_13 | grade_4 | grade_5 | grade_6 | grade_7 | grade_8 | grade_9 | lat | log1p_sqft_above | log1p_sqft_basement | log1p_sqft_living | log1p_sqft_living15 | log1p_sqft_lot | log1p_sqft_lot15 | long | renovation_bool | sqft_above | sqft_basement | sqft_living | sqft_living15 | sqft_lot | sqft_lot15 | view | view_0 | view_1 | view_2 | view_3 | view_4 | view_sq | waterfront | waterfront_0 | waterfront_1 | waterfront_sq | yr_built | yr_renovated | yr_renovated2 | yr_sales | zipcode | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13832 | 30 | 30 | 2 | 900 | 2 | 900 | 0 | 3.0 | 9.00 | 5 | 25 | 4 | 0 | 0 | 0 | 1 | 0 | 1.5 | 2.25 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 47.3488 | 8.243019 | 0.000000 | 8.243019 | 7.855932 | 10.652944 | 10.485033 | -122.095 | 0 | 3800 | 0 | 3800 | 2580 | 42316 | 35775 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1984 | 0 | 1984 | 2014 | 98042 | 
| 4184 | 72 | 19 | 1 | 361 | 6 | 5184 | 1 | 2.5 | 6.25 | 3 | 9 | 5 | 0 | 0 | 0 | 0 | 1 | 2.0 | 4.00 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 47.5855 | 7.601402 | 6.508769 | 7.890208 | 7.749753 | 8.517393 | 8.517393 | -122.292 | 1 | 2000 | 670 | 2670 | 2320 | 5000 | 5000 | 3 | 0 | 0 | 0 | 1 | 0 | 9 | 0 | 1 | 0 | 0 | 1942 | 1995 | 1995 | 2014 | 98144 | 
# Keep only the features that exist in every frame used for modelling.
features = [
    name for name in features
    if name in df_Xtrain.columns
    and name in df_Xvalid.columns
    and name in df_test.columns
]
print(np.array(sorted(features)))
['age' 'age_after_renovation' 'age_after_renovation_cat' 'age_after_renovation_sq' 'age_cat' 'age_sq' 'basement_bool' 'bathrooms' 'bathrooms_sq' 'bedrooms' 'bedrooms_sq' 'condition' 'condition_1' 'condition_2' 'condition_3' 'condition_4' 'condition_5' 'floors' 'floors_sq' 'grade' 'grade_10' 'grade_11' 'grade_12' 'grade_13' 'grade_4' 'grade_5' 'grade_6' 'grade_7' 'grade_8' 'grade_9' 'lat' 'log1p_sqft_above' 'log1p_sqft_basement' 'log1p_sqft_living' 'log1p_sqft_living15' 'log1p_sqft_lot' 'log1p_sqft_lot15' 'long' 'renovation_bool' 'sqft_above' 'sqft_basement' 'sqft_living' 'sqft_living15' 'sqft_lot' 'sqft_lot15' 'view' 'view_0' 'view_1' 'view_2' 'view_3' 'view_4' 'view_sq' 'waterfront' 'waterfront_0' 'waterfront_1' 'waterfront_sq' 'yr_built' 'yr_renovated' 'yr_renovated2' 'yr_sales' 'zipcode']
# Optional feature scaling: None (default, no scaling), 'standard' or 'minmax'.
scaling = None

# option name -> scaler class, so both options share one code path
scalers = {'standard': preprocessing.StandardScaler,
           'minmax': preprocessing.MinMaxScaler}

if scaling in scalers:
    scaler = scalers[scaling]()
    # fit on the training split only, so validation/test statistics do not leak in
    scaler.fit(df_Xtrain)
    # Transform every feature frame the models below are fitted or evaluated
    # on (not just train and test), keeping each frame's columns and row index
    # so the rows stay aligned with their targets.
    df_Xtrain_orig, df_Xtrain, df_Xvalid, df_Xtest = [
        pd.DataFrame(scaler.transform(df_X),
                     columns=df_X.columns, index=df_X.index)
        for df_X in (df_Xtrain_orig, df_Xtrain, df_Xvalid, df_Xtest)]
elif scaling is not None:
    # a misspelled option used to be ignored silently
    raise ValueError(f"Unknown scaling option: {scaling!r}; "
                     f"use None, 'standard' or 'minmax'.")
df_Xtrain.head(2)
| age | age_after_renovation | age_after_renovation_cat | age_after_renovation_sq | age_cat | age_sq | basement_bool | bathrooms | bathrooms_sq | bedrooms | bedrooms_sq | condition | condition_1 | condition_2 | condition_3 | condition_4 | condition_5 | floors | floors_sq | grade | grade_10 | grade_11 | grade_12 | grade_13 | grade_4 | grade_5 | grade_6 | grade_7 | grade_8 | grade_9 | lat | log1p_sqft_above | log1p_sqft_basement | log1p_sqft_living | log1p_sqft_living15 | log1p_sqft_lot | log1p_sqft_lot15 | long | renovation_bool | sqft_above | sqft_basement | sqft_living | sqft_living15 | sqft_lot | sqft_lot15 | view | view_0 | view_1 | view_2 | view_3 | view_4 | view_sq | waterfront | waterfront_0 | waterfront_1 | waterfront_sq | yr_built | yr_renovated | yr_renovated2 | yr_sales | zipcode | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13832 | 30 | 30 | 2 | 900 | 2 | 900 | 0 | 3.0 | 9.00 | 5 | 25 | 4 | 0 | 0 | 0 | 1 | 0 | 1.5 | 2.25 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 47.3488 | 8.243019 | 0.000000 | 8.243019 | 7.855932 | 10.652944 | 10.485033 | -122.095 | 0 | 3800 | 0 | 3800 | 2580 | 42316 | 35775 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1984 | 0 | 1984 | 2014 | 98042 | 
| 4184 | 72 | 19 | 1 | 361 | 6 | 5184 | 1 | 2.5 | 6.25 | 3 | 9 | 5 | 0 | 0 | 0 | 0 | 1 | 2.0 | 4.00 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 47.5855 | 7.601402 | 6.508769 | 7.890208 | 7.749753 | 8.517393 | 8.517393 | -122.292 | 1 | 2000 | 670 | 2670 | 2320 | 5000 | 5000 | 3 | 0 | 0 | 0 | 1 | 0 | 9 | 0 | 1 | 0 | 0 | 1942 | 1995 | 1995 | 2014 | 98144 | 
class CatBoostRegressor(
iterations=None,
learning_rate=None,
loss_function='RMSE',
use_best_model=None,
verbose=None,
silent=None,
logging_level=None,
one_hot_max_size=None,
ignored_features=None,
train_dir=None,
custom_metric=None,
eval_metric=None,
subsample=None,
max_depth=None,
n_estimators=None,
num_boost_round=None,
num_trees=None,
colsample_bylevel=None,
random_state=None,
reg_lambda=None,
objective=None,
eta=None,
max_bin=None,
early_stopping_rounds=None,
cat_features=None,
min_child_samples=None,
max_leaves=None,
num_leaves=None,
score_function=None,
)
show_methods(catboost,4)
Object Type: <class 'module'>
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | CatBoost | EFstrType | Pool | to_regressor | 
| 1 | CatBoostClassifier | FeaturesData | core | train | 
| 2 | CatBoostError | MetricVisualizer | cv | version | 
| 3 | CatBoostRegressor | MultiRegressionCustomMetric | sum_models | widget | 
| 4 | CatboostError | MultiRegressionCustomObjective | to_classifier | 
%%time
# Baseline: CatBoost with default hyper-parameters fitted on the full training
# set (no validation split) and scored on the test set.
# verbose=1000 writes the metrics every 1000 iterations.
model = cb.CatBoostRegressor(verbose=1000,random_state=0)
model.fit(df_Xtrain_orig, ytrain_orig)
ypreds = model.predict(df_Xtest)
# ypreds are on the log1p scale when log_target is True; print_regr_eval
# converts them back before computing the metrics.
print_regr_eval(ytest,ypreds,df_Xtest.shape[1],log_target=log_target)
Learning rate set to 0.064823
0:	learn: 0.5033190	total: 13.5ms	remaining: 13.5s
999:	learn: 0.1245045	total: 5.9s	remaining: 0us
ytest : [285000. 239950. 460000.]
ypreds: [317520.05086988 221473.59319332 513032.95014102]
           
Explained Variance: 0.912510
         R-Squared: 0.911658
             RMSE : 109,113.67
Adjusted R-squared: 0.910414
CPU times: user 16.4 s, sys: 902 ms, total: 17.3 s
Wall time: 6.12 s
# Feature statistics for a continuous feature, with the interactive plot.
feature_name = 'sqft_living'
dict_stats = model.calc_feature_statistics(df_Xtrain_orig, ytrain_orig,
                                           feature_name, plot=True)
# Same statistics for a discrete feature, without the plot.
# NOTE(review): the original comment called this a "one hot feature", but
# 'bedrooms' is a plain integer column in this notebook.
feature_name = 'bedrooms'
# NOTE(review): cat_vals is not used anywhere in the visible cells.
cat_vals = df_Xtrain_orig[feature_name].unique().tolist()
# overwrites the statistics computed for 'sqft_living' above
dict_stats = model.calc_feature_statistics(df_Xtrain_orig, ytrain_orig, feature_name)
# show how many entries each returned statistic holds
for key in dict_stats.keys():
    print(key, len(dict_stats[key]))
borders 8 binarized_feature 17290 mean_target 9 mean_weighted_target 0 mean_prediction 9 objects_per_bin 9 predictions_on_varying_feature 9
# feature importance
df_imp = pd.DataFrame({'Feature': features,
                       'Importance': model.feature_importances_
                       }) 
df_imp.sort_values('Importance',ascending=False).head(10).style.background_gradient()
| Feature | Importance | |
|---|---|---|
| 30 | lat | 44.425659 | 
| 19 | grade | 10.737741 | 
| 41 | sqft_living | 8.457095 | 
| 33 | log1p_sqft_living | 6.725510 | 
| 37 | long | 6.467298 | 
| 60 | zipcode | 2.843294 | 
| 42 | sqft_living15 | 1.675843 | 
| 34 | log1p_sqft_living15 | 1.659347 | 
| 51 | view_sq | 1.578175 | 
| 35 | log1p_sqft_lot | 1.104773 | 
plot_feature_imp_catboost(model,n=10)
# help(model.fit)
Help on method fit in module catboost.core:
fit(X, y=None, cat_features=None, sample_weight=None, baseline=None, use_best_model=None, eval_set=None, verbose=None, logging_level=None, plot=False, column_description=None, verbose_eval=None, metric_period=None, silent=None, early_stopping_rounds=None, save_snapshot=None, snapshot_file=None, snapshot_interval=None, init_model=None) method of catboost.core.CatBoostRegressor instance
    Fit the CatBoost model.
    
    Parameters
    ----------
    X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series
        If not catboost.Pool, 2 dimensional Feature matrix or string - file with dataset.
    
    y : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
        Labels, 1 dimensional array like.
        Use only if X is not catboost.Pool.
    
    cat_features : list or numpy.ndarray, optional (default=None)
        If not None, giving the list of Categ columns indices.
        Use only if X is not catboost.Pool.
    
    sample_weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None)
        Instance weights, 1 dimensional array like.
    
    baseline : list or numpy.ndarray, optional (default=None)
        If not None, giving 2 dimensional array like data.
        Use only if X is not catboost.Pool.
    
    use_best_model : bool, optional (default=None)
        Flag to use best model
    
    eval_set : catboost.Pool or list, optional (default=None)
        A list of (X, y) tuple pairs to use as a validation set for
        early-stopping
    
    metric_period : int
        Frequency of evaluating metrics.
    
    verbose : bool or int
        If verbose is bool, then if set to True, logging_level is set to Verbose,
        if set to False, logging_level is set to Silent.
        If verbose is int, it determines the frequency of writing metrics to output and
        logging_level is set to Verbose.
    
    silent : bool
        If silent is True, logging_level is set to Silent.
        If silent is False, logging_level is set to Verbose.
    
    logging_level : string, optional (default=None)
        Possible values:
            - 'Silent'
            - 'Verbose'
            - 'Info'
            - 'Debug'
    
    plot : bool, optional (default=False)
        If True, draw train and eval error in Jupyter notebook
    
    verbose_eval : bool or int
        Synonym for verbose. Only one of these parameters should be set.
    
    early_stopping_rounds : int
        Activates Iter overfitting detector with od_wait set to early_stopping_rounds.
    
    save_snapshot : bool, [default=None]
        Enable progress snapshotting for restoring progress after crashes or interruptions
    
    snapshot_file : string, [default=None]
        Learn progress snapshot file path, if None will use default filename
    
    snapshot_interval: int, [default=600]
        Interval between saving snapshots (seconds)
    
    init_model : CatBoost class or string, [default=None]
        Continue training starting from the existing model.
        If this parameter is a string, load initial model from the path specified by this string.
    
    Returns
    -------
    model : CatBoost
# Large iteration budget; the validation split decides where training stops.
model = cb.CatBoostRegressor(verbose=1000,random_state=0,iterations=10_000)
# early_stopping_rounds activates the overfitting detector on eval_set,
# use_best_model is the flag to use the best model, and plot=True draws
# the train and eval error in the notebook.
model.fit(df_Xtrain, ytrain,
          eval_set=(df_Xvalid,yvalid),
          early_stopping_rounds=100,
          use_best_model=True,
          plot=True)
Learning rate set to 0.018035 0: learn: 0.5196319 test: 0.5196904 best: 0.5196904 (0) total: 35.6ms remaining: 5m 55s 1000: learn: 0.1512908 test: 0.1616362 best: 0.1616362 (1000) total: 5.41s remaining: 48.6s 2000: learn: 0.1364410 test: 0.1565496 best: 0.1565484 (1988) total: 10.6s remaining: 42.5s 3000: learn: 0.1262268 test: 0.1544832 best: 0.1544812 (2997) total: 15.6s remaining: 36.3s 4000: learn: 0.1185301 test: 0.1535202 best: 0.1535194 (3995) total: 21.1s remaining: 31.6s Stopped by overfitting detector (100 iterations wait) bestTest = 0.153097429 bestIteration = 4763 Shrink model to first 4764 iterations.
<catboost.core.CatBoostRegressor at 0x7f99c459ba50>
# Same data wrapped in catboost Pool objects; the empty list means no column
# is declared categorical.
categorical_features_indices = []
dtrain = cb.Pool(df_Xtrain, ytrain, cat_features=categorical_features_indices)
dvalid = cb.Pool(df_Xvalid, yvalid, cat_features=categorical_features_indices)
# Small 50-iteration model with logging switched off.
# NOTE(review): random_seed=42 differs from SEED = 0 used elsewhere in this
# notebook - confirm this is intended.
model = cb.CatBoostRegressor(iterations=50,
                             random_seed=42,
                             logging_level='Silent')
model.fit(dtrain,
          eval_set=dvalid,
          plot=True)
<catboost.core.CatBoostRegressor at 0x7f99b16ab910>
eval_metrics = model.eval_metrics(dvalid, ['RMSE'], plot=True)
# Round-trip the fitted model through disk.
path_model = '../models/catboost_model.dump'
# create the target folder up front so saving does not depend on it already
# existing (e.g. in a fresh checkout)
os.makedirs(os.path.dirname(path_model), exist_ok=True)
model.save_model(path_model)

# load into a fresh estimator to show the saved file is self-contained
model = cb.CatBoostRegressor()
model.load_model(path_model)
model
<catboost.core.CatBoostRegressor at 0x7f99b18d46d0>
# Remove the artifacts written by this notebook.
# This used to be a %%bash cell running `rm -rf ... $path_model`, but the bash
# subprocess cannot see Python variables: $path_model expanded to nothing and
# the saved model was left behind. Doing it in Python fixes that and also
# works where no bash is available.
import shutil

for path_artifact in ('catboost_model.dump', 'catboost_info', path_model):
    if os.path.isdir(path_artifact):
        shutil.rmtree(path_artifact, ignore_errors=True)
    elif os.path.exists(path_artifact):
        os.remove(path_artifact)