This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
Task: estimate the house sale price from the given features.
import sys
IN_BHISHAN = 'bhishan' in sys.modules
if IN_BHISHAN:
    import src
    import bhishan
%load_ext autoreload
%autoreload 2
import sys
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    !pip install hpsklearn
    # set OMP_NUM_THREADS=1 for the hpsklearn package
    !export OMP_NUM_THREADS=1
Collecting hpsklearn ... Building wheel for hpsklearn (setup.py) ... done Successfully built hpsklearn Installing collected packages: nose, hpsklearn Successfully installed hpsklearn-0.1.0 nose-1.3.7
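Note that !export OMP_NUM_THREADS=1 runs in a throwaway subshell, so the variable never reaches the notebook kernel (the hpsklearn fit below still warns OMP_NUM_THREADS=None). A sketch of setting it from Python instead, before hpsklearn spawns any work:
import os
os.environ['OMP_NUM_THREADS'] = '1'   # visible to this process and the subprocesses it launches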
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import time
# random state
SEED = 0
random_state = SEED  # name used by train_test_split, KFold and the models below
RNG = np.random.RandomState(SEED)
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
# pd.set_option('display.float_format', '{:,.4f}'.format) # numbers sep by comma
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
[('numpy', '1.17.4'), ('pandas', '0.25.3'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
import scipy
import sklearn
print([(x.__name__,x.__version__) for x in [scipy, sklearn]])
[('scipy', '1.3.2'), ('sklearn', '0.21.3')]
# scale and split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# regressors
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
# regressor preprocessing
from sklearn.preprocessing import PolynomialFeatures
# pipeline
from sklearn.pipeline import Pipeline
# metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error
# cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
df_eval = pd.DataFrame({'Model': [],
'Details':[],
'Root Mean Squared Error (RMSE)':[],
'R-squared (training)':[],
'Adjusted R-squared (training)':[],
'R-squared (test)':[],
'Adjusted R-squared (test)':[],
'5-Fold Cross Validation':[]})
def show_method_attributes(obj, ncols=7, start=None, inside=None):
    """Show all public attributes of a given object.

    Example:
    ========
    show_method_attributes(list)
    """
    print(f'Object Type: {type(obj)}\n')
    lst = [elem for elem in dir(obj) if elem[0] != '_']
    lst = [elem for elem in lst
           if elem not in 'os np pd sys time psycopg2'.split()]
    if isinstance(start, str):
        lst = [elem for elem in lst if elem.startswith(start)]
    if isinstance(start, (tuple, list)):
        lst = [elem for elem in lst for start_elem in start
               if elem.startswith(start_elem)]
    if isinstance(inside, str):
        lst = [elem for elem in lst if inside in elem]
    if isinstance(inside, (tuple, list)):
        lst = [elem for elem in lst for inside_elem in inside
               if inside_elem in elem]
    return pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
def adjustedR2(rsquared, nrows, kcols):
    """Adjusted R-squared: penalizes R-squared for the number of predictors."""
    return rsquared - (kcols-1)/(nrows-kcols) * (1-rsquared)
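A quick sanity check of the helper with illustrative numbers (18 features and a test set of 4323 rows, as used later):
print(adjustedR2(0.90, 4323, 18))   # ~0.8996 -- barely below the raw R-squared because nrows >> kcols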
# df_clean = pd.read_csv('../data/processed/data_cleaned_encoded.csv')
df_clean = pd.read_csv('https://github.com/bhishanpdl/Project_House_Price_Prediction/blob/master/data/processed/data_cleaned_encoded.csv?raw=true')
print(df_clean.shape)
df_clean.head()
(21613, 92)
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | yr_sales | age | yr_renovated2 | age_after_renovation | zipcode_top10 | zipcode_houses | basement_bool | renovation_bool | age_cat | age_after_renovation_cat | waterfront_0 | waterfront_1 | view_0 | view_1 | view_2 | view_3 | view_4 | condition_1 | condition_2 | condition_3 | condition_4 | condition_5 | grade_1 | grade_10 | grade_11 | grade_12 | grade_13 | grade_3 | grade_4 | grade_5 | grade_6 | grade_7 | grade_8 | grade_9 | zipcode_top10_98004 | zipcode_top10_98006 | zipcode_top10_98033 | zipcode_top10_98039 | zipcode_top10_98040 | zipcode_top10_98102 | zipcode_top10_98105 | zipcode_top10_98155 | zipcode_top10_98177 | zipcode_top10_others | age_cat_0 | age_cat_1 | age_cat_2 | age_cat_3 | age_cat_4 | age_cat_5 | age_cat_6 | age_cat_7 | age_cat_8 | age_cat_9 | age_after_renovation_cat_0 | age_after_renovation_cat_1 | age_after_renovation_cat_2 | age_after_renovation_cat_3 | age_after_renovation_cat_4 | age_after_renovation_cat_5 | age_after_renovation_cat_6 | age_after_renovation_cat_7 | age_after_renovation_cat_8 | age_after_renovation_cat_9 | log1p_price | log1p_sqft_living | log1p_sqft_lot | log1p_sqft_above | log1p_sqft_basement | log1p_sqft_living15 | log1p_sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7129300520 | 2014-10-13 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 | 2014 | 59 | 1955 | 59 | others | 262 | 0 | 0 | 5 | 5 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 12.309987 | 7.074117 | 8.639588 | 7.074117 | 0.000000 | 7.201171 | 8.639588 |
1 | 6414100192 | 2014-12-09 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 | 2014 | 63 | 1991 | 23 | others | 410 | 1 | 1 | 5 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13.195616 | 7.852050 | 8.887791 | 7.682943 | 5.993961 | 7.433075 | 8.941153 |
2 | 5631500400 | 2015-02-25 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | 3 | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 | 2015 | 82 | 1933 | 82 | others | 283 | 0 | 0 | 7 | 7 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 12.100718 | 6.647688 | 9.210440 | 6.647688 | 0.000000 | 7.908755 | 8.995041 |
3 | 2487200875 | 2014-12-09 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 | 2014 | 49 | 1965 | 49 | others | 263 | 1 | 0 | 4 | 4 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 13.311331 | 7.581210 | 8.517393 | 6.957497 | 6.814543 | 7.215975 | 8.517393 |
4 | 1954400510 | 2015-02-18 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 | 2015 | 28 | 1987 | 28 | others | 441 | 0 | 0 | 2 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13.142168 | 7.427144 | 8.997271 | 7.427144 | 0.000000 | 7.496097 | 8.923191 |
# I only take the column names from the raw file; the cleaned data loaded above is used for modeling.
# df_raw = pd.read_csv('../data/raw/kc_house_data.csv')
df_raw = pd.read_csv('https://github.com/bhishanpdl/Project_House_Price_Prediction/blob/master/data/raw/kc_house_data.csv?raw=true',nrows=1)
df_raw.columns
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'], dtype='object')
features_raw_all = ['bedrooms', 'bathrooms', 'sqft_living',
'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
'lat', 'long', 'sqft_living15', 'sqft_lot15']
df = df_clean[features_raw_all + ['price']].copy()  # .copy() so the log transform below does not raise SettingWithCopyWarning
log_cols = ['price', 'sqft_living', 'sqft_living15', 'sqft_lot', 'sqft_lot15']
for col in log_cols:
    df[col] = np.log1p(df[col].to_numpy())
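The target and the square-footage columns are modeled on the log1p scale, so predictions have to be mapped back with the inverse transform before they can be read as dollar prices. A minimal sketch (np.expm1 inverts np.log1p; 221900 is the first sale price in the data):
log_price = np.log1p(221900.0)   # forward transform, as applied above
print(log_price)                 # ~12.31, matching the log1p_price column
print(np.expm1(log_price))       # 221900.0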
X = df[features_raw_all].to_numpy()
y = df['price'].to_numpy()
Xtrain, Xtest, ytrain, ytest = train_test_split (X,y,
test_size=0.20,
random_state=random_state)
Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape
((17290, 18), (17290,), (4323, 18), (4323,))
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),
('LR',LinearRegression())])) )
pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()),
('LASSO', Lasso())])))
pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()),
('EN', ElasticNet())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),
('KNN', KNeighborsRegressor())])))
# decision tree may not need scaling
pipelines.append(('ScaledDT', Pipeline([('Scaler', StandardScaler())
,('DT', DecisionTreeRegressor())])))
# boosting may not need scaling
pipelines.append(('ScaledGBR', Pipeline([('Scaler', StandardScaler()),
('GBR', GradientBoostingRegressor())])))
df_cv = pd.DataFrame({'Model': [],
'10-Fold Cross Validation Mean':[],
'10-Fold Cross Validation Std':[]
})
# each pipeline scales the data and then fits the model
for name, model in pipelines:
    kfold = KFold(n_splits=10, random_state=random_state)
    cv_results = cross_val_score(model,
                                 Xtrain,
                                 ytrain,
                                 cv=kfold,
                                 scoring='neg_mean_squared_error')
    df_cv.loc[len(df_cv)] = [name, cv_results.mean(), cv_results.std()]
df_cv.sort_values('10-Fold Cross Validation Mean',ascending=False) # smallest MSE is best, i.e. the largest (least negative) neg-MSE is best.
# Observation:
# scores are negative MSE on the log1p(price) scale; ScaledGBR is clearly the strongest baseline, so it is tuned further below.
Model | 10-Fold Cross Validation Mean | 10-Fold Cross Validation Std | |
---|---|---|---|
5 | ScaledGBR | -0.034164 | 0.001455 |
3 | ScaledKNN | -0.046319 | 0.002454 |
0 | ScaledLR | -0.062666 | 0.002355 |
4 | ScaledDT | -0.064170 | 0.002628 |
1 | ScaledLASSO | -0.276666 | 0.010241 |
2 | ScaledEN | -0.276666 | 0.010241 |
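For readability, the best score can be converted to an RMSE on the log1p(price) scale; a quick sketch using the table above:
best_neg_mse = df_cv['10-Fold Cross Validation Mean'].max()   # least negative = best (ScaledGBR)
print(np.sqrt(-best_neg_mse))                                  # ~0.18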
Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
Grid search is extremely slow: it must fit the model once for every candidate value in the search space and for every cross-validation fold.
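A rough count of the work in the first grid below (illustrative arithmetic, not a benchmark):
n_candidates = 9   # values of n_estimators in the first grid
n_folds = 10       # KFold splits
print(n_candidates * n_folds)   # 90 model fits, plus one final refit on the full training set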
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
# we use gradient boosting regressor, since it gave us the best model.
param_grid = dict(n_estimators=np.array([50,100,200,300,400,500,600,700,800]))
model = GradientBoostingRegressor(random_state=random_state)
kfold = KFold(n_splits=10, random_state=random_state)
grid = GridSearchCV(estimator=model,
param_grid=param_grid,
scoring='neg_mean_squared_error',
cv=kfold)
grid_result = grid.fit(Xtrain, ytrain)
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
df_grid_gbr = pd.DataFrame({'n_estimators': params,
'neg_mean_squared_error': means,
'std': stds})
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
pd.set_option('display.float_format', '{:,.10f}'.format)
df_grid_gbr.sort_values('neg_mean_squared_error', ascending=False)
Best: -0.027883 using {'n_estimators': 800} CPU times: user 7min 45s, sys: 51.3 ms, total: 7min 46s Wall time: 7min 46s
%%time
# The best n_estimators was the largest value in the grid, so we search beyond it.
param_grid = dict(n_estimators=np.array([850,900,950,1000]))
model = GradientBoostingRegressor(random_state=random_state)
kfold = KFold(n_splits=10, random_state=random_state)
grid = GridSearchCV(estimator=model,
param_grid=param_grid,
scoring='neg_mean_squared_error',
cv=kfold)
grid_result = grid.fit(Xtrain, ytrain)
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
df_grid_gbr = pd.DataFrame({'n_estimators': params,
'neg_mean_squared_error': means,
'std': stds})
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
pd.set_option('display.float_format', '{:,.10f}'.format)
df_grid_gbr.sort_values('neg_mean_squared_error', ascending=False)
Best: -0.027759 using {'n_estimators': 1000} CPU times: user 7min 52s, sys: 38.9 ms, total: 7min 52s Wall time: 7min 52s
n_estimators_best = grid_result.best_params_['n_estimators']
n_estimators_best
1000
%%time
# Again the best n_estimators was the largest value in the grid, so we search beyond it once more.
param_grid = dict(n_estimators=np.array([1050,1100,1200,1300]))
model = GradientBoostingRegressor(random_state=random_state)
kfold = KFold(n_splits=10, random_state=random_state)
grid = GridSearchCV(estimator=model,
param_grid=param_grid,
scoring='neg_mean_squared_error',
cv=kfold)
grid_result = grid.fit(Xtrain, ytrain)
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
df_grid_gbr = pd.DataFrame({'n_estimators': params,
'neg_mean_squared_error': means,
'std': stds})
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
pd.set_option('display.float_format', '{:,.10f}'.format)
df_grid_gbr.sort_values('neg_mean_squared_error', ascending=False)
Best: -0.027719 using {'n_estimators': 1200} CPU times: user 10min 5s, sys: 64.9 ms, total: 10min 5s Wall time: 10min 6s
n_estimators_best = grid_result.best_params_['n_estimators']
n_estimators_best
1200
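Since GridSearchCV refits the best configuration on the full training set by default (refit=True), the tuned model can also be taken straight from the search object instead of being refit by hand as in the next cell; a minimal sketch:
best_gbr = grid_result.best_estimator_   # already refit on Xtrain with n_estimators=1200
print(mean_squared_error(ytest, best_gbr.predict(Xtest)))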
%%time
# Further grid search is possible, but I stop here.
# Only one hyperparameter (n_estimators) was tuned; a fuller search would also cover learning_rate, max_depth and subsample.
from sklearn.metrics import mean_squared_error
model = GradientBoostingRegressor(random_state=random_state,
n_estimators=n_estimators_best)
model.fit(Xtrain, ytrain)
ypreds = model.predict(Xtest)
print('Test MSE = ', mean_squared_error(ytest, ypreds))
Test MSE = 0.025643993963277448 CPU times: user 17.4 s, sys: 1.99 ms, total: 17.4 s Wall time: 17.4 s
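Because the target is log1p(price), the log-scale error can be read, roughly, as a relative price error; a hedged sketch of that conversion:
rmse_log = np.sqrt(mean_squared_error(ytest, ypreds))   # ~0.16 on the log1p(price) scale
print(rmse_log)
print(np.exp(rmse_log) - 1)   # ~0.17, i.e. predictions typically land within ~17% of the sale price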
show_method_attributes(model)
Object Type: <class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | alpha | get_params | max_depth | min_samples_leaf | n_estimators_ | random_state | tol |
1 | apply | init | max_features | min_samples_split | n_features_ | score | train_score_ |
2 | criterion | init_ | max_features_ | min_weight_fraction_leaf | n_iter_no_change | set_params | validation_fraction |
3 | estimators_ | learning_rate | max_leaf_nodes | n_classes_ | predict | staged_predict | verbose |
4 | feature_importances_ | loss | min_impurity_decrease | n_estimators | presort | subsample | warm_start |
5 | fit | loss_ | min_impurity_split |
r_squared = model.score(Xtest, ytest)
print('r_squared = ', r_squared)
r_squared = 0.9085287411926323
def adjustedR2(rsquared, nrows, kcols):
    return rsquared - (kcols-1)/(nrows-kcols) * (1-rsquared)
r2 = sklearn.metrics.r2_score(ytest, ypreds)
ar2 = adjustedR2(r2, Xtest.shape[0], Xtest.shape[1])
print('r_squared = ', r2)
print('adjustedr2 = ', ar2)
r_squared = 0.9085287411926322 adjustedr2 = 0.9081675306468191
df_preds = pd.DataFrame({'y_test': ytest,
'y_pred' : ypreds,
})
df_preds.head()
y_test | y_pred | |
---|---|---|
0 | 12.5602479680 | 12.7795806346 |
1 | 12.3881900148 | 12.2345561363 |
2 | 13.0389839424 | 13.1265819435 |
3 | 12.8929527288 | 13.1087008760 |
4 | 13.2085429085 | 13.2583263798 |
df_imp = pd.DataFrame({'Feature': features_raw_all,
'Importance': model.feature_importances_
})
df_imp.sort_values('Importance').style.background_gradient(subset=['Importance'])
Feature | Importance | |
---|---|---|
4 | floors | 0.000536463 |
0 | bedrooms | 0.000963564 |
12 | yr_renovated | 0.00168729 |
10 | sqft_basement | 0.00181538 |
1 | bathrooms | 0.00354188 |
7 | condition | 0.00678444 |
17 | sqft_lot15 | 0.00744306 |
5 | waterfront | 0.00803743 |
9 | sqft_above | 0.00841589 |
3 | sqft_lot | 0.0109287 |
13 | zipcode | 0.0145774 |
6 | view | 0.0192044 |
11 | yr_built | 0.022047 |
16 | sqft_living15 | 0.0278152 |
15 | long | 0.0304903 |
2 | sqft_living | 0.26218 |
8 | grade | 0.269238 |
14 | lat | 0.304293 |
df_imp.set_index('Feature').sort_values('Importance').plot.barh(figsize=(12,8))
<matplotlib.axes._subplots.AxesSubplot at 0x7f83a6904ba8>
For scikit-learn we used pipelines that handled scaling internally; for hyperparameter tuning with hyperopt, the data is scaled explicitly first.
scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain_scaled = scaler.transform(Xtrain)
Xtest_scaled = scaler.transform(Xtest)
# from hpsklearn import HyperoptEstimator, any_regressor
# from hyperopt import tpe
# estim = HyperoptEstimator(regressor=any_regressor('regressor'),
# algo=tpe.suggest, trial_timeout=300)
# estim.fit(Xtrain_scaled, ytrain )
# print(estim.score(Xtest_scaled, ytest ) )
# print(estim.best_model() )
# Official example fails:
#
# ---> 95 return sklearn.linear_model.SGDRegressor(*args, **kwargs)
# TypeError: __init__() got an unexpected keyword argument 'n_iter'
# %%time
# from hpsklearn import gradient_boosting_regression
# from hpsklearn import HyperoptEstimator, any_regressor
# from hyperopt import tpe
# estim = HyperoptEstimator(regressor=gradient_boosting_regression('gbr'),
# algo=tpe.suggest, trial_timeout=300)
# estim.fit(Xtrain_scaled, ytrain )
# print(estim.score(Xtest_scaled, ytest ) )
# print(estim.best_model() )
# scaled data gives fitting error
# ValueError: attempt to get argmin of an empty sequence
from hpsklearn import gradient_boosting_regression
from hpsklearn import HyperoptEstimator, any_regressor
from hyperopt import tpe
estim = HyperoptEstimator(regressor=gradient_boosting_regression('gbr'),
algo=tpe.suggest, trial_timeout=300)
estim.fit(Xtrain, ytrain )
print(estim.score(Xtest, ytest ) )
print(estim.best_model() )
WARN: OMP_NUM_THREADS=None => ... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely 100%|██████████| 1/1 [00:03<00:00, 3.24s/it, best loss: 0.4098841088459799] 100%|██████████| 1/1 [01:34<00:00, 94.59s/it, best loss: 0.4098841088459799] 100%|██████████| 1/1 [00:05<00:00, 5.94s/it, best loss: 0.4098841088459799] 100%|██████████| 1/1 [05:00<00:00, 300.13s/it, best loss: 0.4098841088459799] 100%|██████████| 1/1 [00:02<00:00, 2.47s/it, best loss: 0.3616076393294052] 100%|██████████| 1/1 [00:40<00:00, 40.23s/it, best loss: 0.2574011641441949] 100%|██████████| 1/1 [00:00<00:00, 1.16it/s, best loss: 0.2574011641441949] 100%|██████████| 1/1 [00:00<00:00, 5.19it/s, best loss: 0.2574011641441949] 100%|██████████| 1/1 [01:21<00:00, 81.89s/it, best loss: 0.10502638439188017] 100%|██████████| 1/1 [00:00<00:00, 1.49it/s, best loss: 0.10502638439188017] 0.8297883148727474 {'learner': GradientBoostingRegressor(alpha=0.5, criterion='friedman_mse', init=None, learning_rate=0.030436296469075703, loss='quantile', max_depth=None, max_features=0.8340107985008011, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=3, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=245, n_iter_no_change=None, presort='auto', random_state=3, subsample=0.5131351702909814, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), 'preprocs': (MinMaxScaler(copy=True, feature_range=(-1.0, 1.0)),), 'ex_preprocs': ()}
show_method_attributes(estim)
Object Type: <class 'hpsklearn.estimator.hyperopt_estimator'>
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | algo | fit_increment | info | n_ex_pps | rstate | set_params | trials |
1 | best_model | fit_increment_dump_filename | loss_fn | predict | score | space | use_partial_fit |
2 | classification | fit_iter | max_evals | retrain_best_model_on_full_data | seed | trial_timeout | verbose |
3 | fit | get_params |
best_model = estim.best_model()['learner']
best_model
GradientBoostingRegressor(alpha=0.5, criterion='friedman_mse', init=None, learning_rate=0.030436296469075703, loss='quantile', max_depth=None, max_features=0.8340107985008011, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=3, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=245, n_iter_no_change=None, presort='auto', random_state=3, subsample=0.5131351702909814, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False)
%%time
best_model.fit(Xtrain, ytrain)
ypreds = best_model.predict(Xtest)
r2 = sklearn.metrics.r2_score(ytest, ypreds)
ar2 = adjustedR2(r2, Xtest.shape[0], Xtest.shape[1])
print ('Test MSE = ', mean_squared_error(ytest, ypreds))
print('r_squared = ', r2)
print('adjustedr2 = ', ar2)
Test MSE = 0.02724777601038427 r_squared = 0.9028081048942618 adjustedr2 = 0.9024243041470382 CPU times: user 1min 56s, sys: 40 ms, total: 1min 56s Wall time: 1min 56s
# When I scale the data for the hyperopt GBR search, fitting fails (see the commented cell above).
# Without scaling it runs, but its adjusted R-squared (~0.89-0.90) is below that of the
# grid-searched GBR above (~0.91), so I keep that model.