This dataset contains house sale prices for King County, Washington, which includes Seattle. The homes were sold between May 2014 and May 2015.
Task: estimate the sale price from the given features.
import time
time_start_notebook = time.time()
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## install modules
    !pip install watermark
    !pip install catboost
    !pip install shap eli5
    # if we update an existing module, we need to restart the colab runtime
    !pip install -U scikit-learn
    ## print
    print('Environment: Google Colaboratory.')
TREE_METHOD = 'gpu_hist' if ENV_COLAB else 'auto'
import numpy as np
import pandas as pd
# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# mixed
import os
import time
from pprint import pprint
import joblib
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# settings
pd.set_option('display.max_columns', 200)
# sklearn
import sklearn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
# boosting
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost
import lightgbm
import catboost
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-11-23 CPython 3.7.7 IPython 7.19.0 compiler : Clang 4.0.1 (tags/RELEASE_401/final) system : Darwin release : 19.6.0 machine : x86_64 processor : i386 CPU cores : 4 interpreter: 64bit xgboost 1.2.0 json 2.0.9 seaborn 0.11.0 catboost 0.23.2 lightgbm 2.3.1 matplotlib 3.2.1 joblib 0.17.0 pandas 1.1.0 sklearn 0.23.1 numpy 1.18.4 watermark 2.0.2
def show_methods(obj, ncols=7, start=None, inside=None):
    """Show all public attributes and methods of a given object.
    Example:
    ========
    show_methods(list)
    """
    print(f'Object Type: {type(obj)}\n')
    lst = [elem for elem in dir(obj) if elem[0] != '_']
    lst = [elem for elem in lst
           if elem not in 'os np pd sys time psycopg2'.split()]
    if isinstance(start, str):
        lst = [elem for elem in lst if elem.startswith(start)]
    if isinstance(start, (tuple, list)):
        lst = [elem for elem in lst for start_elem in start
               if elem.startswith(start_elem)]
    if isinstance(inside, str):
        lst = [elem for elem in lst if inside in elem]
    if isinstance(inside, (tuple, list)):
        lst = [elem for elem in lst for inside_elem in inside
               if inside_elem in elem]
    return pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
def adjustedR2(rsquared, nrows, ncols):
    return rsquared - (ncols - 1) / (nrows - ncols) * (1 - rsquared)
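For reference, this is the textbook adjusted R² rearranged, assuming `ncols` counts the intercept along with the predictors (writing `nrows` as $n$ and `ncols` as $k$):

$$\bar{R}^2 = 1 - (1 - R^2)\,\frac{n-1}{n-k} = R^2 - (1 - R^2)\,\frac{k-1}{n-k}$$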
def print_regr_eval(ytest, ypreds, ncols):
    rmse = np.sqrt(metrics.mean_squared_error(ytest, ypreds))
    r2 = metrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, len(ytest), ncols)
    evs = metrics.explained_variance_score(ytest, ypreds)
    print(f"""
    RMSE              : {rmse:,.2f}
    Explained Variance: {evs:.6f}
    R-Squared         : {r2:,.6f}
    Adjusted R-squared: {ar2:,.6f}
    """)
def plot_xgb_cv_res(df_cv_results):
    fig, ax = plt.subplots()
    plt.plot(df_cv_results['train-rmse-mean'], color='b', label='train-rmse')
    plt.plot(df_cv_results['test-rmse-mean'], color='r', label='test-rmse')
    plt.title('Cross validation score mean plot', fontsize=14)
    plt.legend()
    plt.show()
if ENV_COLAB:
    path_raw = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    proj = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_raw + proj
    data_path_train = data_path_parent + 'raw/train.csv'
    data_path_test = data_path_parent + 'raw/test.csv'
else:
    data_path_parent = '../data/'
    data_path_train = data_path_parent + 'raw/train.csv'
    data_path_test = data_path_parent + 'raw/test.csv'
target = 'price'
train_size = 0.8
print(data_path_train)
../data/raw/train.csv
df_train = pd.read_csv(data_path_train)
df_test = pd.read_csv(data_path_test)
print(df_train.shape)
print(df_train.columns)
display(df_train.head(2).append(df_train.tail(2)))
(17290, 21) Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'], dtype='object')
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2561340020 | 20140804T000000 | 325000.0 | 3 | 1.75 | 1780 | 11096 | 1.0 | 0 | 0 | 3 | 7 | 1210 | 570 | 1979 | 0 | 98074 | 47.6170 | -122.051 | 1780 | 10640 |
1 | 8598200070 | 20141208T000000 | 278000.0 | 2 | 2.50 | 1420 | 2229 | 2.0 | 0 | 0 | 3 | 7 | 1420 | 0 | 2004 | 0 | 98059 | 47.4871 | -122.165 | 1500 | 2230 |
17288 | 7174800760 | 20140725T000000 | 667000.0 | 5 | 2.00 | 1900 | 5470 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 720 | 1930 | 1965 | 98105 | 47.6666 | -122.303 | 1300 | 3250 |
17289 | 9521100280 | 20140612T000000 | 480000.0 | 3 | 2.50 | 1250 | 1103 | 3.0 | 0 | 2 | 3 | 8 | 1250 | 0 | 2005 | 0 | 98103 | 47.6619 | -122.352 | 1250 | 1188 |
def clean_data(df, log=True, sq=True, logsq=True, dummy=True, dummy_cat=False):
    # log-squared features need both the log and squared features
    if logsq:
        log = True
        sq = True

    df = df.copy()

    # Date time features
    df['date'] = pd.to_datetime(df['date'])
    df['yr_sales'] = df['date'].dt.year
    df['age'] = df['yr_sales'] - df['yr_built']
    df['yr_renovated2'] = np.where(df['yr_renovated'].eq(0), df['yr_built'], df['yr_renovated'])
    df['age_after_renovation'] = df['yr_sales'] - df['yr_renovated2']

    # Boolean data types
    f = lambda x: 1 if x > 0 else 0
    df['basement_bool'] = df['sqft_basement'].apply(f)
    df['renovation_bool'] = df['yr_renovated'].apply(f)

    # Numerical features binning
    cols_bin = ['age', 'age_after_renovation']
    df['age_cat'] = pd.cut(df['age'], 10, labels=range(10)).astype(str)
    df['age_after_renovation_cat'] = pd.cut(df['age_after_renovation'],
                                            10, labels=range(10))

    # Log transformation of large numerical values
    cols_log = ['sqft_living', 'sqft_lot', 'sqft_above',
                'sqft_basement', 'sqft_living15', 'sqft_lot15']
    if log:
        for col in cols_log:
            df['log1p_' + col] = np.log1p(df[col])

    # squared columns
    cols_sq = [
        # cats
        'bedrooms', 'bathrooms', 'floors', 'waterfront', 'view',
        # created nums
        'age', 'age_after_renovation']
    if sq:
        for col in cols_sq:
            df[col + '_sq'] = df[col]**2

    cols_log_sq = [
        # log nums
        'log1p_sqft_living', 'log1p_sqft_lot',
        'log1p_sqft_above', 'log1p_sqft_basement',
        'log1p_sqft_living15', 'log1p_sqft_lot15'
    ]
    if logsq:
        for col in cols_log_sq:
            df[col + '_sq'] = df[col]**2

    # Categorical Features
    cols_dummy = ['waterfront', 'view', 'condition', 'grade']
    cols_dummy_cat = ['age_cat', 'age_after_renovation_cat']
    for c in cols_dummy:
        df[c] = df[c].astype(str)

    # Create dummy variables
    if dummy:
        df_dummy = pd.get_dummies(df[cols_dummy], drop_first=False)
        df = pd.concat([df, df_dummy], axis=1)

    # dummy variables for the cats newly created from numerical features
    if dummy_cat:
        df_dummy = pd.get_dummies(df[cols_dummy_cat], drop_first=False)
        df = pd.concat([df, df_dummy], axis=1)

    # after creating the dummies, make the original columns numeric again
    for c in cols_dummy + cols_dummy_cat:
        df[c] = df[c].astype(np.int32)

    # Drop unwanted columns
    cols_drop = ['id', 'date']
    df = df.drop(cols_drop, axis=1)
    return df
params_data = dict(log=True,sq=True,logsq=True,
dummy=True,dummy_cat=False)
df_train = clean_data(df_train,**params_data)
df_test = clean_data(df_test,**params_data)
print(df_train.shape)
print(df_train.columns)
(17290, 70) Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age', 'yr_renovated2', 'age_after_renovation', 'basement_bool', 'renovation_bool', 'age_cat', 'age_after_renovation_cat', 'log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15', 'bedrooms_sq', 'bathrooms_sq', 'floors_sq', 'waterfront_sq', 'view_sq', 'age_sq', 'age_after_renovation_sq', 'log1p_sqft_living_sq', 'log1p_sqft_lot_sq', 'log1p_sqft_above_sq', 'log1p_sqft_basement_sq', 'log1p_sqft_living15_sq', 'log1p_sqft_lot15_sq', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9'], dtype='object')
# df_train.dtypes.to_numpy()
# make sure no data leakage
df_train.filter(regex='price').columns
Index(['price'], dtype='object')
# make sure no nans
df_train.isna().sum().sum(), df_test.isna().sum().sum()
(0, 0)
# choose features to train on; we can change this later
features = list(sorted(df_train.columns.drop(target)))
# print(np.array(features))
# keep only the features present in both train and test
features = [c for c in features if c in df_train.columns and c in df_test.columns]
# print(np.array(sorted(features)))
df_Xtrain = df_train[features]
ser_ytrain = df_train[target]
df_Xtest = df_test[features]
ser_ytest = df_test[target]
ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()
scaling = 'standard'
if scaling == 'standard':
    scaler = preprocessing.StandardScaler()
    scaler.fit(df_Xtrain)
    df_Xtrain = pd.DataFrame(scaler.transform(df_Xtrain), columns=features)
    df_Xtest = pd.DataFrame(scaler.transform(df_Xtest), columns=features)
elif scaling == 'minmax':
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(df_Xtrain)
    df_Xtrain = pd.DataFrame(scaler.transform(df_Xtrain), columns=features)
    df_Xtest = pd.DataFrame(scaler.transform(df_Xtest), columns=features)
df_Xtrain.head(2)
age | age_after_renovation | age_after_renovation_cat | age_after_renovation_sq | age_cat | age_sq | basement_bool | bathrooms | bathrooms_sq | bedrooms | bedrooms_sq | condition | condition_1 | condition_2 | condition_3 | condition_4 | condition_5 | floors | floors_sq | grade | grade_10 | grade_11 | grade_12 | grade_13 | grade_4 | grade_5 | grade_6 | grade_7 | grade_8 | grade_9 | lat | log1p_sqft_above | log1p_sqft_above_sq | log1p_sqft_basement | log1p_sqft_basement_sq | log1p_sqft_living | log1p_sqft_living15 | log1p_sqft_living15_sq | log1p_sqft_living_sq | log1p_sqft_lot | log1p_sqft_lot15 | log1p_sqft_lot15_sq | log1p_sqft_lot_sq | long | renovation_bool | sqft_above | sqft_basement | sqft_living | sqft_living15 | sqft_lot | sqft_lot15 | view | view_0 | view_1 | view_2 | view_3 | view_4 | view_sq | waterfront | waterfront_0 | waterfront_1 | waterfront_sq | yr_built | yr_renovated | yr_renovated2 | yr_sales | zipcode | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.288109 | -0.212303 | -0.062185 | -0.438016 | -0.139825 | -0.494698 | 1.247166 | -0.468811 | -0.537610 | -0.39033 | -0.302220 | -0.630613 | -0.035694 | -0.08937 | 0.735526 | -0.595921 | -0.294513 | -0.916249 | -0.837904 | -0.554878 | -0.238288 | -0.135782 | -0.066005 | -0.026354 | -0.036497 | -0.108453 | -0.324043 | 1.186907 | -0.624934 | -0.367371 | 0.410048 | -0.688967 | -0.698830 | 1.208375 | 1.137983 | -0.149505 | -0.169074 | -0.189252 | -0.177052 | 0.361630 | 0.383984 | 0.328910 | 0.301512 | 1.151178 | -0.207998 | -0.698239 | 0.636923 | -0.322100 | -0.302502 | -0.095727 | -0.078695 | -0.305512 | 0.329787 | -0.123077 | -0.217065 | -0.1533 | -0.124282 | -0.261712 | -0.089698 | 0.089698 | -0.089698 | -0.089698 | 0.277141 | -0.207992 | 0.201159 | -0.693043 | -0.071763 |
1 | -1.135161 | -1.074946 | -1.265291 | -0.814627 | -1.320662 | -0.856409 | -0.801818 | 0.506258 | 0.326221 | -1.46038 | -0.775165 | -0.630613 | -0.035694 | -0.08937 | 0.735526 | -0.595921 | -0.294513 | 0.933474 | 0.806845 | -0.554878 | -0.238288 | -0.135782 | -0.066005 | -0.026354 | -0.036497 | -0.108453 | -0.324043 | 1.186907 | -0.624934 | -0.367371 | -0.527440 | -0.314663 | -0.338123 | -0.795545 | -0.779839 | -0.681826 | -0.692075 | -0.700087 | -0.697163 | -1.411647 | -1.527957 | -1.398248 | -1.291600 | 0.344386 | -0.207998 | -0.442941 | -0.658262 | -0.716449 | -0.712318 | -0.302804 | -0.378759 | -0.305512 | 0.329787 | -0.123077 | -0.217065 | -0.1533 | -0.124282 | -0.261712 | -0.089698 | 0.089698 | -0.089698 | -0.089698 | 1.124268 | -0.207992 | 1.064027 | -0.693043 | -0.353180 |
s = f"""
df_Xtrain = {df_Xtrain.shape}
ytrain = {ytrain.shape}
df_Xtest = {df_Xtest.shape}
ytest = {ytest.shape}
"""
print(s)
df_Xtrain = (17290, 67) ytrain = (17290,) df_Xtest = (4323, 67) ytest = (4323,)
# persist data
df_Xtrain.to_csv('../data/processed/Xtrain.csv.zip',compression='zip',index=False)
df_Xtest.to_csv('../data/processed/Xtest.csv.zip',compression='zip',index=False)
np.savetxt('../data/processed/ytrain.csv',ytrain)
np.savetxt('../data/processed/ytest.csv',ytest)
https://xgboost.readthedocs.io/en/latest/parameter.html#general-parameters
Parameters:
-------------
max_depth=3
learning_rate=0.1
n_estimators=100 # number of trees you want to build.
verbosity=1 **NOTE: prints to the IPython terminal, not the browser
silent=None **deprecated, use verbosity
objective='binary:logistic' **for binary classification; this notebook uses 'reg:squarederror' for regression
booster='gbtree' **use the default tree booster, not linear, even for regression (dart may also be used instead of gbtree, but it needs tuning)
n_jobs=1 **set this to -1
nthread=None **deprecated, use n_jobs
gamma=0 # A higher value leads to fewer splits.
min_child_weight=1
max_delta_step=0
subsample=1 # percentage of samples used per tree. Low value can lead to underfitting.
colsample_bytree=1 # percentage of features used per tree. High value can lead to overfitting.
colsample_bylevel=1
colsample_bynode=1
reg_alpha=0 # A large value leads to more regularization.
reg_lambda=1 # L2 regularization on leaf weights and is smoother than L1 regularization.
scale_pos_weight=1
base_score=0.5
random_state=0 **use your own random state
seed=None **deprecated, use random_state
missing=None
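Below is a minimal sketch (values are illustrative, not tuned) of how these knobs map onto the two XGBoost APIs used in this notebook: the scikit-learn wrapper spells them reg_alpha/reg_lambda/n_estimators, while the native xgboost.train/xgboost.cv API spells them alpha/lambda and takes the number of trees as num_boost_round instead of a parameter.
import xgboost
# scikit-learn wrapper (illustrative values only)
skl_model = xgboost.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,       # number of trees
    max_depth=3,
    learning_rate=0.1,
    reg_alpha=0,            # L1 regularization on leaf weights
    reg_lambda=1,           # L2 regularization on leaf weights
    n_jobs=-1,
    random_state=SEED,      # SEED is defined near the top of the notebook
)
# the same settings for the native API
native_params = {
    'objective': 'reg:squarederror',
    'max_depth': 3,
    'learning_rate': 0.1,
    'alpha': 0,
    'lambda': 1,
    'seed': SEED,
}
# dtrain = xgboost.DMatrix(df_Xtrain, ytrain)
# bst = xgboost.train(native_params, dtrain, num_boost_round=100)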
If you have a validation set, you can use early stopping to find the optimal number of boosting rounds. Early stopping requires at least one set in evals. If there’s more than one, it will use the last.
train(..., evals=evals, early_stopping_rounds=10)
The model will train until the validation score stops improving. Validation error needs to decrease at least every early_stopping_rounds to continue training.
If early stopping occurs, the model will have three additional fields: bst.best_score, bst.best_iteration and bst.best_ntree_limit. Note that xgboost.train() will return a model from the last iteration, not the best one.
This works with both metrics to minimize (RMSE, log loss, etc.) and to maximize (MAP, NDCG, AUC). Note that if you specify more than one evaluation metric the last one in param['eval_metric'] is used for early stopping.
If early stopping is enabled during training, you can get predictions from the best iteration with bst.best_ntree_limit:
ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
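A minimal sketch of this pattern, assuming we carve an 80/20 validation split out of the training data (the names Xtr/Xvd/dtr/dvd are only illustrative):
import xgboost
from sklearn import model_selection
Xtr, Xvd, ytr, yvd = model_selection.train_test_split(
    df_Xtrain, ytrain, train_size=0.8, random_state=SEED)
dtr = xgboost.DMatrix(Xtr, ytr, feature_names=features)
dvd = xgboost.DMatrix(Xvd, yvd, feature_names=features)
evals = [(dtr, 'train'), (dvd, 'valid')]   # the last entry is used for early stopping
bst_es = xgboost.train(
    {'objective': 'reg:squarederror', 'eval_metric': 'rmse'},
    dtr,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=False,
)
print(bst_es.best_score, bst_es.best_iteration, bst_es.best_ntree_limit)
ypred_valid = bst_es.predict(dvd, ntree_limit=bst_es.best_ntree_limit)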
Flexible tree growing policies: the existing tree grower in XGBoost grows a tree depth-wise, executing splits in the first level before splits in the second, and so forth. The new grower lets you control the way new nodes are added to the tree:
grow_policy=depthwise (default): split at nodes closest to the root, i.e. grow depth-wise. grow_policy=lossguide: split at nodes with the highest loss change. This behavior mimics that of LightGBM. It has been reported that the lossguide policy often results in faster convergence in loss, though there is also a risk of over-fitting (see the preliminary results).
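As a sketch of the loss-guided policy (parameter values are illustrative; lossguide requires a hist-based tree method):
model_lossguide = xgboost.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',        # 'gpu_hist' on a GPU runtime; lossguide needs a hist-based grower
    grow_policy='lossguide',   # split at the node with the highest loss change, LightGBM-style
    max_leaves=63,             # caps leaves per tree, similar in spirit to LightGBM's num_leaves
    n_estimators=100,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=SEED,
)
# model_lossguide.fit(df_Xtrain, ytrain)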
model = xgboost.XGBRegressor(n_jobs=-1, random_state=SEED,
objective='reg:squarederror')
model.fit(df_Xtrain, ytrain)
ypreds = model.predict(df_Xtest)
print_regr_eval(ytest,ypreds,df_Xtest.shape[1])
RMSE : 124,475.57 Explained Variance: 0.885097 R-Squared: 0.885032 Adjusted R-squared: 0.883249
%%time
scoring = "neg_mean_squared_error"
kf = model_selection.KFold(5,shuffle=True,random_state=SEED)
cvs = model_selection.cross_val_score(model, df_Xtrain, ytrain,cv=kf,
scoring = scoring)
score = cvs.mean()
score_std = cvs.std()
print(f"{scoring}: {score:,.2f}\n std : {score_std:,.2f}")
neg_mean_squared_error: -16,122,317,314.99 std : 2,821,145,115.10 CPU times: user 23.1 s, sys: 86.4 ms, total: 23.2 s Wall time: 23.5 s
plt.plot(cvs*-1)
plt.xticks(range(len(cvs)))
plt.show()
%%time
dtrain = xgboost.DMatrix(df_Xtrain,ytrain,
feature_names=features)
params = {"objective":"reg:squarederror",
'colsample_bytree': 0.3,
'learning_rate': 0.1,
'max_depth': 5,
'alpha': 10}
num_boost_round=500
kf=model_selection.KFold(n_splits=5,shuffle=True,random_state=SEED)
# xgboost.cv is part of the native API, so it needs an xgboost DMatrix
df_cv_results = xgboost.cv(params,dtrain, num_boost_round,
nfold=5,
early_stopping_rounds=50,
metrics="rmse",
folds=kf,
verbose_eval=50, # show progress at Nth iteration
seed=SEED)
display(df_cv_results.head())
[0] train-rmse:595399.87500+3805.19258 test-rmse:595702.05000+15407.01215 [50] train-rmse:105573.52031+2156.19075 test-rmse:137950.91562+11666.75600 [100] train-rmse:87688.98750+1116.10493 test-rmse:128487.33281+11520.59900 [150] train-rmse:78634.67969+1029.28058 test-rmse:125073.05156+11155.93922 [200] train-rmse:72366.42188+670.32210 test-rmse:123669.81094+11340.60912 [250] train-rmse:67851.61406+268.31472 test-rmse:122734.95312+11381.78452 [300] train-rmse:63911.67812+313.72536 test-rmse:121927.51719+11399.70860 [350] train-rmse:60906.43203+203.21987 test-rmse:121528.95781+11560.37613 [400] train-rmse:58461.84766+410.89633 test-rmse:121319.82500+11537.08273 [450] train-rmse:55989.08672+345.70102 test-rmse:120972.04531+11696.68500 [499] train-rmse:53793.44531+314.52973 test-rmse:120762.97500+11641.49620
train-rmse-mean | train-rmse-std | test-rmse-mean | test-rmse-std | |
---|---|---|---|---|
0 | 595399.87500 | 3805.192577 | 595702.0500 | 15407.012146 |
1 | 544175.62500 | 3747.841473 | 545224.7125 | 15089.800899 |
2 | 497403.95000 | 4098.070245 | 499268.9125 | 14047.165952 |
3 | 455813.43125 | 4262.515081 | 458664.3125 | 13041.137876 |
4 | 418719.71875 | 4614.028706 | 422088.4500 | 12737.654828 |
CPU times: user 39 s, sys: 182 ms, total: 39.2 s Wall time: 40.2 s
plot_xgb_cv_res(df_cv_results)
%%time
params_xgb = dict(n_jobs=-1, random_state=SEED,
objective='reg:squarederror',
n_estimators=1200,
max_depth=3,
reg_alpha=1,
reg_lambda=5,
subsample=1,
gamma=0,
min_child_weight=1,
colsample_bytree=1,
learning_rate=0.1
)
model = xgboost.XGBRegressor(**params_xgb)
model.fit(df_Xtrain,ytrain)
ypreds = model.predict(df_Xtest)
print_regr_eval(ytest,ypreds,df_Xtest.shape[1])
RMSE : 114,726.79 Explained Variance: 0.902373 R-Squared: 0.902335 Adjusted R-squared: 0.900820 CPU times: user 33.4 s, sys: 83.9 ms, total: 33.5 s Wall time: 34.4 s
%%time
ytrain_log1p = np.log1p(ytrain)
model = xgboost.XGBRegressor(**params_xgb)
model.fit(df_Xtrain, ytrain_log1p)
# persist the model
path_model_xgb = '../models/model_xgb_logtarget.dump'
model.save_model(path_model_xgb)
# persist using joblib
path_model_xgb_joblib = '../models/model_xgb_logtarget.joblib'
joblib.dump(model,path_model_xgb_joblib)
model = xgboost.XGBRegressor()
model.load_model(fname='../models/model_xgb_logtarget.dump')
ypreds_log1p = model.predict(df_Xtest)
ypreds = np.expm1(ypreds_log1p)
print('ytest:', ytest[:3])
print('ypreds: ', ypreds[:3])
print_regr_eval(ytest,ypreds,df_Xtest.shape[1])
ytest: [285000. 239950. 460000.] ypreds: [343218.4 204292.33 508420.8 ] RMSE : 110,471.76 Explained Variance: 0.910365 R-Squared: 0.909445 Adjusted R-squared: 0.908041 CPU times: user 35.4 s, sys: 211 ms, total: 35.6 s Wall time: 38.1 s
# feature importance
df_imp = pd.DataFrame({'Feature': features,
'Importance_gain': model.feature_importances_
})
df_imp.nlargest(10,'Importance_gain').style.background_gradient()
Feature | Importance_gain | |
---|---|---|
19 | grade | 0.308453 |
35 | log1p_sqft_living | 0.177563 |
51 | view | 0.132501 |
58 | waterfront | 0.076463 |
30 | lat | 0.061198 |
11 | condition | 0.028256 |
36 | log1p_sqft_living15 | 0.026587 |
65 | yr_sales | 0.023181 |
0 | age | 0.020266 |
63 | yr_renovated | 0.014971 |
(df_imp
.set_index('Feature')
.nlargest(10,'Importance_gain')
.plot
.barh(figsize=(12,8))
.invert_yaxis()
)
show_methods(model,5)
Object Type: <class 'xgboost.sklearn.XGBRegressor'>
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | apply | fit | intercept_ | n_estimators | save_model |
1 | base_score | gamma | kwargs | n_features_in_ | scale_pos_weight |
2 | booster | get_booster | learning_rate | n_jobs | score |
3 | coef_ | get_num_boosting_rounds | load_model | num_parallel_tree | set_params |
4 | colsample_bylevel | get_params | max_delta_step | objective | subsample |
5 | colsample_bynode | get_xgb_params | max_depth | predict | tree_method |
6 | colsample_bytree | gpu_id | min_child_weight | random_state | validate_parameters |
7 | evals_result | importance_type | missing | reg_alpha | verbosity |
8 | feature_importances_ | interaction_constraints | monotone_constraints | reg_lambda |
bst = model.get_booster()
bst
<xgboost.core.Booster at 0x7fe077d74510>
fig,ax = plt.subplots(figsize=(12,8))
xgboost.plot_tree(bst,ax=ax,num_trees=4)
<matplotlib.axes._subplots.AxesSubplot at 0x7fe05da65cd0>
# help(xgboost.plot_importance)
fig,ax = plt.subplots(figsize=(12,8))
xgboost.plot_importance(bst,ax=ax,importance_type='weight',max_num_features=20)
plt.show()
from sklearn.inspection import permutation_importance
# permutation_importance?
Xtr,Xvd,ytr,yvd = model_selection.train_test_split(df_Xtrain,ytrain,
train_size=0.8,random_state=SEED)
%%time
model = xgboost.XGBRegressor(**params_xgb)
model.fit(Xtr,ytr)
perm_imp = permutation_importance(model, Xvd, yvd,
n_repeats=20,
n_jobs=-1,
random_state=SEED)
CPU times: user 28.9 s, sys: 219 ms, total: 29.1 s Wall time: 1min 22s
df_perm_imp = pd.DataFrame({
'importances_mean': abs(perm_imp.importances_mean),
'importance_std': perm_imp.importances_std
},index=features)
df_perm_imp = df_perm_imp.sort_values('importances_mean',ascending=False)
df_perm_imp.head(10)
importances_mean | importance_std | |
---|---|---|
lat | 0.335251 | 0.013772 |
log1p_sqft_living | 0.169468 | 0.003991 |
grade | 0.168113 | 0.006245 |
long | 0.120965 | 0.011823 |
log1p_sqft_living15 | 0.034103 | 0.001742 |
log1p_sqft_lot | 0.029592 | 0.001546 |
zipcode | 0.027605 | 0.003611 |
log1p_sqft_above | 0.021245 | 0.001288 |
waterfront | 0.021151 | 0.001104 |
view | 0.015423 | 0.001054 |
df_perm_imp.tail()
importances_mean | importance_std | |
---|---|---|
log1p_sqft_lot15_sq | 0.0 | 0.0 |
log1p_sqft_lot_sq | 0.0 | 0.0 |
sqft_above | 0.0 | 0.0 |
sqft_basement | 0.0 | 0.0 |
age_cat | 0.0 | 0.0 |
features_sel = df_perm_imp.query("importances_mean > 0.00").index.to_numpy()
print(features_sel)
features_sel = list(features_sel)
['lat' 'log1p_sqft_living' 'grade' 'long' 'log1p_sqft_living15' 'log1p_sqft_lot' 'zipcode' 'log1p_sqft_above' 'waterfront' 'view' 'bathrooms' 'condition' 'log1p_sqft_basement' 'yr_built' 'yr_sales' 'age_after_renovation' 'floors' 'log1p_sqft_lot15' 'age' 'bedrooms' 'yr_renovated' 'yr_renovated2' 'grade_10' 'condition_4' 'view_3' 'grade_9' 'grade_11' 'renovation_bool' 'grade_12' 'basement_bool' 'grade_7' 'condition_3' 'grade_8' 'view_1' 'grade_6' 'condition_2' 'view_2' 'grade_4']
%%time
model = xgboost.XGBRegressor(**params_xgb)
model.fit(df_Xtrain[features_sel],ytrain)
ypreds = model.predict(df_Xtest[features_sel])
print_regr_eval(ytest,ypreds,len(features_sel))
RMSE : 117,402.34 Explained Variance: 0.897784 R-Squared: 0.897726 Adjusted R-squared: 0.896843 CPU times: user 20.9 s, sys: 142 ms, total: 21.1 s Wall time: 23.7 s
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 4 min 18 secs