This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
Task: Try to estimate the price based on given features.
import time
time_start_notebook = time.time()
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
## install modules
!pip install watermark
!pip install featuretools
# if we update existing module, we need to restart colab
!pip install -U scikit-learn
## print
print('Environment: Google Colaboratory.')
TREE_METHOD = 'gpu_hist' if ENV_COLAB else 'auto'
import numpy as np
import pandas as pd
# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# mixed
import os
import time
from pprint import pprint
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# settings
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows',200)
pd.set_option('display.max_colwidth',500)
# sklearn
import sklearn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
# boosting
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost
import lightgbm
import lightgbm as lgb
# special
import featuretools
import featuretools as ft
from featuretools import variable_types as vtypes
# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Bhishan Poudel 2020-11-12 CPython 3.6.9 IPython 5.5.0 compiler : GCC 8.4.0 system : Linux release : 4.19.112+ machine : x86_64 processor : x86_64 CPU cores : 2 interpreter: 64bit matplotlib 3.2.2 featuretools 0.21.0 lightgbm 2.2.3 pandas 1.1.4 sklearn 0.23.2 seaborn 0.11.0 xgboost 0.90 watermark 2.0.2 numpy 1.18.5
def show_methods(obj, ncols=7,start=None, inside=None):
""" Show all the attributes of a given method.
Example:
========
show_method_attributes(list)
"""
print(f'Object Type: {type(obj)}\n')
lst = [elem for elem in dir(obj) if elem[0]!='_' ]
lst = [elem for elem in lst
if elem not in 'os np pd sys time psycopg2'.split() ]
if isinstance(start,str):
lst = [elem for elem in lst if elem.startswith(start)]
if isinstance(start,tuple) or isinstance(start,list):
lst = [elem for elem in lst for start_elem in start
if elem.startswith(start_elem)]
if isinstance(inside,str):
lst = [elem for elem in lst if inside in elem]
if isinstance(inside,tuple) or isinstance(inside,list):
lst = [elem for elem in lst for inside_elem in inside
if inside_elem in elem]
return pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
def adjustedR2(rsquared,nrows,ncols):
return rsquared- (ncols-1)/(nrows-ncols) * (1-rsquared)
def print_regr_eval(ytest,ypreds,ncols):
rmse = np.sqrt(metrics.mean_squared_error(ytest,ypreds))
r2 = metrics.r2_score(ytest,ypreds)
ar2 = adjustedR2(r2,len(ytest),ncols)
evs = metrics.explained_variance_score(ytest, ypreds)
print(f"""
RMSE : {rmse:,.2f}
Explained Variance: {evs:.6f}
R-Squared: {r2:,.6f}
Adjusted R-squared: {ar2:,.6f}
""")
def plot_xgb_cv_res(df_cv_results):
fig,ax = plt.subplots()
plt.plot(df_cv_results['train-rmse-mean'],color='b',label='train-rmse')
plt.plot(df_cv_results['test-rmse-mean'],color='r',label='train-rmse')
plt.title('Cross validation score mean plot',fontsize=14)
plt.legend()
plt.show()
if ENV_COLAB:
path_raw = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
proj = 'Projects/King_County_Seattle_House_Price_Kaggle/'
data_path_parent = path_raw + proj
data_path_train = data_path_parent + 'raw/train.csv'
data_path_test = data_path_parent + 'raw/test.csv'
else:
data_path_parent = '../data/'
data_path_train = data_path_parent + 'raw/train.csv'
data_path_test = data_path_parent + 'raw/test.csv'
target = 'price'
train_size = 0.8
print(data_path_train)
https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/King_County_Seattle_House_Price_Kaggle/raw/train.csv
df_train = pd.read_csv(data_path_train)
df_test = pd.read_csv(data_path_test)
print(df_train.shape)
print(df_train.columns)
display(df_train.head(2).append(df_train.tail(2)))
(17290, 21) Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'], dtype='object')
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2561340020 | 20140804T000000 | 325000.0 | 3 | 1.75 | 1780 | 11096 | 1.0 | 0 | 0 | 3 | 7 | 1210 | 570 | 1979 | 0 | 98074 | 47.6170 | -122.051 | 1780 | 10640 |
1 | 8598200070 | 20141208T000000 | 278000.0 | 2 | 2.50 | 1420 | 2229 | 2.0 | 0 | 0 | 3 | 7 | 1420 | 0 | 2004 | 0 | 98059 | 47.4871 | -122.165 | 1500 | 2230 |
17288 | 7174800760 | 20140725T000000 | 667000.0 | 5 | 2.00 | 1900 | 5470 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 720 | 1930 | 1965 | 98105 | 47.6666 | -122.303 | 1300 | 3250 |
17289 | 9521100280 | 20140612T000000 | 480000.0 | 3 | 2.50 | 1250 | 1103 | 3.0 | 0 | 2 | 3 | 8 | 1250 | 0 | 2005 | 0 | 98103 | 47.6619 | -122.352 | 1250 | 1188 |
References
target = 'price'
ytrain = df_train.pop(target)
ytest = df_test.pop(target)
ytrain = np.array(ytrain).flatten()
ytest = np.array(ytest).flatten()
ytrain[:5]
array([325000., 278000., 710000., 389900., 489000.])
features = list(df_train.columns)
features # there is 'index' but not the target 'price'
['id', 'date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']
cols_drop = ['id']
try:
df_train = df_train_raw.drop(cols_drop,axis=1)
df_test = df_test_raw.drop(cols_drop,axis=1)
except:
pass
from featuretools import variable_types as vtypes
Numeric Ordinal Categorical Datetime Boolean ZIPCode Text TimeIndex URL
`
# df_train.head(2)
cols_int = ['bathrooms','floors']
for col in cols_int:
df_train[col] = df_train[col].astype(np.int8)
df_test[col] = df_test[col].astype(np.int8)
cols_date = ['date']
for col in cols_date:
df_train[col] = pd.to_datetime(df_train[col])
df_test[col] = pd.to_datetime(df_test[col])
df_train.head(2)
id | date | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2561340020 | 2014-08-04 | 3 | 1 | 1780 | 11096 | 1 | 0 | 0 | 3 | 7 | 1210 | 570 | 1979 | 0 | 98074 | 47.6170 | -122.051 | 1780 | 10640 |
1 | 8598200070 | 2014-12-08 | 2 | 2 | 1420 | 2229 | 2 | 0 | 0 | 3 | 7 | 1420 | 0 | 2004 | 0 | 98059 | 47.4871 | -122.165 | 1500 | 2230 |
df_train.apply(pd.Series.nunique).sort_values()
waterfront 2 floors 3 condition 5 view 5 bathrooms 9 grade 12 bedrooms 13 yr_renovated 66 zipcode 70 yr_built 116 sqft_basement 284 date 366 sqft_living15 713 long 735 sqft_above 844 sqft_living 927 lat 4832 sqft_lot15 7562 sqft_lot 8452 id 17182 dtype: int64
# variable types
# keep bathrooms bedrooms etc numeric
# we dont need to define type of numeric variable
var_types = {
'date': vtypes.Datetime,
'index': vtypes.Index,
'waterfront': vtypes.Boolean,
'zipcode': vtypes.ZIPCode
}
# Here index for all rows is unique, if we do aggregation
# e.g. min max, we get the same number for all (this is simply a bias term)
# So, we don't need agg primitives here.
agg_primitives = []
trans_primitives = [
'divide_numeric', # cross multiply all numeric features (not others)
#'Year' # get year from 'date' feature. do not use year, i will create it later
]
cols_exclude = ['lat','long'] # the division does not make sense for lat lon
def get_extra_features_from_ft(dfx,var_types=var_types,
agg_primitives=agg_primitives,
trans_primitives=trans_primitives,
features=features,
targetname='price',
cols_exclude=cols_exclude,
):
if cols_exclude:
dfx = dfx.drop(cols_exclude,axis=1)
var_types = {k:v for k,v in var_types.items() if k not in cols_exclude}
makeindex = False if 'index' in dfx.columns else True
es = ft.EntitySet("houses")
es.entity_from_dataframe(entity_id="data",
dataframe=dfx,
make_index=makeindex,
index='index',
time_index='date',
variable_types=var_types)
cutoff_time = es['data'].df[['index', 'date']]
es.add_interesting_values()
# additional features added
feature_matrix_extra, feature_defs_extra = ft.dfs(entityset=es,
target_entity='data',
agg_primitives=agg_primitives,
trans_primitives=trans_primitives,
drop_contains=[],
drop_exact=list(features)+[targetname],
cutoff_time=cutoff_time,
max_depth=2,
n_jobs=-1,
max_features=1000,
chunk_size=1000,
verbose=True,
)
return feature_matrix_extra
df_train_extra = get_extra_features_from_ft(df_train)
df_train_extra.head(2)
Built 210 features EntitySet scattered to 2 workers in 2 seconds Elapsed: 01:05 | Progress: 100%|██████████
bathrooms / bedrooms | bathrooms / condition | bathrooms / floors | bathrooms / grade | bathrooms / id | bathrooms / sqft_above | bathrooms / sqft_basement | bathrooms / sqft_living | bathrooms / sqft_living15 | bathrooms / sqft_lot | bathrooms / sqft_lot15 | bathrooms / view | bathrooms / yr_built | bathrooms / yr_renovated | bedrooms / bathrooms | bedrooms / condition | bedrooms / floors | bedrooms / grade | bedrooms / id | bedrooms / sqft_above | bedrooms / sqft_basement | bedrooms / sqft_living | bedrooms / sqft_living15 | bedrooms / sqft_lot | bedrooms / sqft_lot15 | bedrooms / view | bedrooms / yr_built | bedrooms / yr_renovated | condition / bathrooms | condition / bedrooms | condition / floors | condition / grade | condition / id | condition / sqft_above | condition / sqft_basement | condition / sqft_living | condition / sqft_living15 | condition / sqft_lot | condition / sqft_lot15 | condition / view | condition / yr_built | condition / yr_renovated | floors / bathrooms | floors / bedrooms | floors / condition | floors / grade | floors / id | floors / sqft_above | floors / sqft_basement | floors / sqft_living | floors / sqft_living15 | floors / sqft_lot | floors / sqft_lot15 | floors / view | floors / yr_built | floors / yr_renovated | grade / bathrooms | grade / bedrooms | grade / condition | grade / floors | grade / id | grade / sqft_above | grade / sqft_basement | grade / sqft_living | grade / sqft_living15 | grade / sqft_lot | grade / sqft_lot15 | grade / view | grade / yr_built | grade / yr_renovated | id / bathrooms | id / bedrooms | id / condition | id / floors | id / grade | id / sqft_above | id / sqft_basement | id / sqft_living | id / sqft_living15 | id / sqft_lot | id / sqft_lot15 | id / view | id / yr_built | id / yr_renovated | sqft_above / bathrooms | sqft_above / bedrooms | sqft_above / condition | sqft_above / floors | sqft_above / grade | sqft_above / id | sqft_above / sqft_basement | sqft_above / sqft_living | sqft_above / sqft_living15 | sqft_above / sqft_lot | sqft_above / sqft_lot15 | sqft_above / view | sqft_above / yr_built | sqft_above / yr_renovated | sqft_basement / bathrooms | sqft_basement / bedrooms | sqft_basement / condition | sqft_basement / floors | sqft_basement / grade | sqft_basement / id | sqft_basement / sqft_above | sqft_basement / sqft_living | sqft_basement / sqft_living15 | sqft_basement / sqft_lot | sqft_basement / sqft_lot15 | sqft_basement / view | sqft_basement / yr_built | sqft_basement / yr_renovated | sqft_living / bathrooms | sqft_living / bedrooms | sqft_living / condition | sqft_living / floors | sqft_living / grade | sqft_living / id | sqft_living / sqft_above | sqft_living / sqft_basement | sqft_living / sqft_living15 | sqft_living / sqft_lot | sqft_living / sqft_lot15 | sqft_living / view | sqft_living / yr_built | sqft_living / yr_renovated | sqft_living15 / bathrooms | sqft_living15 / bedrooms | sqft_living15 / condition | sqft_living15 / floors | sqft_living15 / grade | sqft_living15 / id | sqft_living15 / sqft_above | sqft_living15 / sqft_basement | sqft_living15 / sqft_living | sqft_living15 / sqft_lot | sqft_living15 / sqft_lot15 | sqft_living15 / view | sqft_living15 / yr_built | sqft_living15 / yr_renovated | sqft_lot / bathrooms | sqft_lot / bedrooms | sqft_lot / condition | sqft_lot / floors | sqft_lot / grade | sqft_lot / id | sqft_lot / sqft_above | sqft_lot / sqft_basement | sqft_lot / sqft_living | sqft_lot / sqft_living15 | sqft_lot / sqft_lot15 | sqft_lot / view | sqft_lot / yr_built | sqft_lot / yr_renovated | sqft_lot15 / bathrooms | sqft_lot15 / bedrooms | sqft_lot15 / condition | sqft_lot15 / floors | sqft_lot15 / grade | sqft_lot15 / id | sqft_lot15 / sqft_above | sqft_lot15 / sqft_basement | sqft_lot15 / sqft_living | sqft_lot15 / sqft_living15 | sqft_lot15 / sqft_lot | sqft_lot15 / view | sqft_lot15 / yr_built | sqft_lot15 / yr_renovated | view / bathrooms | view / bedrooms | view / condition | view / floors | view / grade | view / id | view / sqft_above | view / sqft_basement | view / sqft_living | view / sqft_living15 | view / sqft_lot | view / sqft_lot15 | view / yr_built | view / yr_renovated | yr_built / bathrooms | yr_built / bedrooms | yr_built / condition | yr_built / floors | yr_built / grade | yr_built / id | yr_built / sqft_above | yr_built / sqft_basement | yr_built / sqft_living | yr_built / sqft_living15 | yr_built / sqft_lot | yr_built / sqft_lot15 | yr_built / view | yr_built / yr_renovated | yr_renovated / bathrooms | yr_renovated / bedrooms | yr_renovated / condition | yr_renovated / floors | yr_renovated / grade | yr_renovated / id | yr_renovated / sqft_above | yr_renovated / sqft_basement | yr_renovated / sqft_living | yr_renovated / sqft_living15 | yr_renovated / sqft_lot | yr_renovated / sqft_lot15 | yr_renovated / view | yr_renovated / yr_built | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
39 | 0.333333 | 0.333333 | 1.0 | 0.142857 | 6.557037e-10 | 0.000556 | inf | 0.000556 | 0.000709 | 0.000043 | 0.000055 | inf | 0.000508 | inf | 3.0 | 1.00 | 3.0 | 0.428571 | 1.967111e-09 | 0.001667 | inf | 0.001667 | 0.002128 | 0.000130 | 0.000165 | inf | 0.001524 | inf | 3.0 | 1.000000 | 3.0 | 0.428571 | 1.967111e-09 | 0.001667 | inf | 0.001667 | 0.002128 | 0.000130 | 0.000165 | inf | 0.001524 | inf | 1.0 | 0.333333 | 0.333333 | 0.142857 | 6.557037e-10 | 0.000556 | inf | 0.000556 | 0.000709 | 0.000043 | 0.000055 | inf | 0.000508 | inf | 7.0 | 2.333333 | 2.333333 | 7.0 | 4.589926e-09 | 0.003889 | inf | 0.003889 | 0.004965 | 0.000303 | 0.000385 | inf | 0.003557 | inf | 1.525079e+09 | 5.083597e+08 | 5.083597e+08 | 1.525079e+09 | 2.178684e+08 | 8.472661e+05 | inf | 847266.142222 | 1.081616e+06 | 66012.165346 | 83966.253152 | inf | 774938.544715 | inf | 1800.0 | 600.0 | 600.0 | 1800.0 | 257.142857 | 1.180267e-06 | inf | 1.00000 | 1.276596 | 0.077912 | 0.099103 | inf | 0.914634 | inf | 0.0 | 0.000000 | 0.0 | 0.0 | 0.00 | 0.000000e+00 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | 1800.0 | 600.000000 | 600.0 | 1800.0 | 257.142857 | 0.000001 | 1.000000 | inf | 1.276596 | 0.077912 | 0.099103 | inf | 0.914634 | inf | 1410.0 | 470.0 | 470.0 | 1410.0 | 201.428571 | 9.245422e-07 | 0.783333 | inf | 0.783333 | 0.061031 | 0.07763 | inf | 0.716463 | inf | 23103.0 | 7701.000000 | 7701.0 | 23103.0 | 3300.428571 | 0.000015 | 12.835000 | inf | 12.835000 | 16.385106 | 1.271982 | inf | 11.739329 | inf | 18163.0 | 6054.333333 | 6054.333333 | 18163.0 | 2594.714286 | 0.000012 | 10.090556 | inf | 10.090556 | 12.881560 | 0.786175 | inf | 9.229167 | inf | 0.0 | 0.000000 | 0.0 | 0.0 | 0.00 | 0.000000e+00 | 0.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 1968.0 | 656.000000 | 656.00 | 1968.0 | 281.142857 | 0.000001 | 1.093333 | inf | 1.093333 | 1.395745 | 0.085184 | 0.108352 | inf | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 |
323 | 0.666667 | 0.500000 | 2.0 | 0.250000 | 1.099626e-09 | 0.001389 | 0.002105 | 0.000837 | 0.000995 | 0.000305 | 0.000305 | 1.0 | 0.001023 | inf | 1.5 | 0.75 | 3.0 | 0.375000 | 1.649439e-09 | 0.002083 | 0.003158 | 0.001255 | 0.001493 | 0.000458 | 0.000458 | 1.5 | 0.001535 | inf | 2.0 | 1.333333 | 4.0 | 0.500000 | 2.199252e-09 | 0.002778 | 0.004211 | 0.001674 | 0.001990 | 0.000611 | 0.000611 | 2.0 | 0.002046 | inf | 0.5 | 0.333333 | 0.250000 | 0.125000 | 5.498130e-10 | 0.000694 | 0.001053 | 0.000418 | 0.000498 | 0.000153 | 0.000153 | 0.5 | 0.000512 | inf | 4.0 | 2.666667 | 2.000000 | 8.0 | 4.398504e-09 | 0.005556 | 0.008421 | 0.003347 | 0.003980 | 0.001221 | 0.001221 | 4.0 | 0.004092 | inf | 9.094001e+08 | 6.062667e+08 | 4.547000e+08 | 1.818800e+09 | 2.273500e+08 | 1.263056e+06 | 1.914526e+06 | 761004.244351 | 9.048757e+05 | 277679.411298 | 277679.411298 | 909400072.0 | 930332.554476 | inf | 720.0 | 480.0 | 360.0 | 1440.0 | 180.000000 | 7.917307e-07 | 1.515789 | 0.60251 | 0.716418 | 0.219847 | 0.219847 | 720.0 | 0.736573 | inf | 475.0 | 316.666667 | 237.5 | 950.0 | 118.75 | 5.223224e-07 | 0.659722 | 0.39749 | 0.472637 | 0.145038 | 0.145038 | 475.0 | 0.485934 | inf | 1195.0 | 796.666667 | 597.5 | 2390.0 | 298.750000 | 0.000001 | 1.659722 | 2.515789 | 1.189055 | 0.364885 | 0.364885 | 1195.0 | 1.222506 | inf | 1005.0 | 670.0 | 502.5 | 2010.0 | 251.250000 | 1.105124e-06 | 1.395833 | 2.115789 | 0.841004 | 0.306870 | 0.30687 | 1005.0 | 1.028133 | inf | 3275.0 | 2183.333333 | 1637.5 | 6550.0 | 818.750000 | 0.000004 | 4.548611 | 6.894737 | 2.740586 | 3.258706 | 1.000000 | 3275.0 | 3.350384 | inf | 3275.0 | 2183.333333 | 1637.500000 | 6550.0 | 818.750000 | 0.000004 | 4.548611 | 6.894737 | 2.740586 | 3.258706 | 1.000000 | 3275.0 | 3.350384 | inf | 1.0 | 0.666667 | 0.5 | 2.0 | 0.25 | 1.099626e-09 | 0.001389 | 0.002105 | 0.000837 | 0.000995 | 0.000305 | 0.000305 | 0.001023 | inf | 977.5 | 651.666667 | 488.75 | 1955.0 | 244.375000 | 0.000001 | 1.357639 | 2.057895 | 0.817992 | 0.972637 | 0.298473 | 0.298473 | 977.5 | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
df_test_extra = get_extra_features_from_ft(df_test)
df_test_extra.head(2)
Built 210 features EntitySet scattered to 2 workers in 2 seconds Elapsed: 00:56 | Progress: 100%|██████████
bathrooms / bedrooms | bathrooms / condition | bathrooms / floors | bathrooms / grade | bathrooms / id | bathrooms / sqft_above | bathrooms / sqft_basement | bathrooms / sqft_living | bathrooms / sqft_living15 | bathrooms / sqft_lot | bathrooms / sqft_lot15 | bathrooms / view | bathrooms / yr_built | bathrooms / yr_renovated | bedrooms / bathrooms | bedrooms / condition | bedrooms / floors | bedrooms / grade | bedrooms / id | bedrooms / sqft_above | bedrooms / sqft_basement | bedrooms / sqft_living | bedrooms / sqft_living15 | bedrooms / sqft_lot | bedrooms / sqft_lot15 | bedrooms / view | bedrooms / yr_built | bedrooms / yr_renovated | condition / bathrooms | condition / bedrooms | condition / floors | condition / grade | condition / id | condition / sqft_above | condition / sqft_basement | condition / sqft_living | condition / sqft_living15 | condition / sqft_lot | condition / sqft_lot15 | condition / view | condition / yr_built | condition / yr_renovated | floors / bathrooms | floors / bedrooms | floors / condition | floors / grade | floors / id | floors / sqft_above | floors / sqft_basement | floors / sqft_living | floors / sqft_living15 | floors / sqft_lot | floors / sqft_lot15 | floors / view | floors / yr_built | floors / yr_renovated | grade / bathrooms | grade / bedrooms | grade / condition | grade / floors | grade / id | grade / sqft_above | grade / sqft_basement | grade / sqft_living | grade / sqft_living15 | grade / sqft_lot | grade / sqft_lot15 | grade / view | grade / yr_built | grade / yr_renovated | id / bathrooms | id / bedrooms | id / condition | id / floors | id / grade | id / sqft_above | id / sqft_basement | id / sqft_living | id / sqft_living15 | id / sqft_lot | id / sqft_lot15 | id / view | id / yr_built | id / yr_renovated | sqft_above / bathrooms | sqft_above / bedrooms | sqft_above / condition | sqft_above / floors | sqft_above / grade | sqft_above / id | sqft_above / sqft_basement | sqft_above / sqft_living | sqft_above / sqft_living15 | sqft_above / sqft_lot | sqft_above / sqft_lot15 | sqft_above / view | sqft_above / yr_built | sqft_above / yr_renovated | sqft_basement / bathrooms | sqft_basement / bedrooms | sqft_basement / condition | sqft_basement / floors | sqft_basement / grade | sqft_basement / id | sqft_basement / sqft_above | sqft_basement / sqft_living | sqft_basement / sqft_living15 | sqft_basement / sqft_lot | sqft_basement / sqft_lot15 | sqft_basement / view | sqft_basement / yr_built | sqft_basement / yr_renovated | sqft_living / bathrooms | sqft_living / bedrooms | sqft_living / condition | sqft_living / floors | sqft_living / grade | sqft_living / id | sqft_living / sqft_above | sqft_living / sqft_basement | sqft_living / sqft_living15 | sqft_living / sqft_lot | sqft_living / sqft_lot15 | sqft_living / view | sqft_living / yr_built | sqft_living / yr_renovated | sqft_living15 / bathrooms | sqft_living15 / bedrooms | sqft_living15 / condition | sqft_living15 / floors | sqft_living15 / grade | sqft_living15 / id | sqft_living15 / sqft_above | sqft_living15 / sqft_basement | sqft_living15 / sqft_living | sqft_living15 / sqft_lot | sqft_living15 / sqft_lot15 | sqft_living15 / view | sqft_living15 / yr_built | sqft_living15 / yr_renovated | sqft_lot / bathrooms | sqft_lot / bedrooms | sqft_lot / condition | sqft_lot / floors | sqft_lot / grade | sqft_lot / id | sqft_lot / sqft_above | sqft_lot / sqft_basement | sqft_lot / sqft_living | sqft_lot / sqft_living15 | sqft_lot / sqft_lot15 | sqft_lot / view | sqft_lot / yr_built | sqft_lot / yr_renovated | sqft_lot15 / bathrooms | sqft_lot15 / bedrooms | sqft_lot15 / condition | sqft_lot15 / floors | sqft_lot15 / grade | sqft_lot15 / id | sqft_lot15 / sqft_above | sqft_lot15 / sqft_basement | sqft_lot15 / sqft_living | sqft_lot15 / sqft_living15 | sqft_lot15 / sqft_lot | sqft_lot15 / view | sqft_lot15 / yr_built | sqft_lot15 / yr_renovated | view / bathrooms | view / bedrooms | view / condition | view / floors | view / grade | view / id | view / sqft_above | view / sqft_basement | view / sqft_living | view / sqft_living15 | view / sqft_lot | view / sqft_lot15 | view / yr_built | view / yr_renovated | yr_built / bathrooms | yr_built / bedrooms | yr_built / condition | yr_built / floors | yr_built / grade | yr_built / id | yr_built / sqft_above | yr_built / sqft_basement | yr_built / sqft_living | yr_built / sqft_living15 | yr_built / sqft_lot | yr_built / sqft_lot15 | yr_built / view | yr_built / yr_renovated | yr_renovated / bathrooms | yr_renovated / bedrooms | yr_renovated / condition | yr_renovated / floors | yr_renovated / grade | yr_renovated / id | yr_renovated / sqft_above | yr_renovated / sqft_basement | yr_renovated / sqft_living | yr_renovated / sqft_living15 | yr_renovated / sqft_lot | yr_renovated / sqft_lot15 | yr_renovated / view | yr_renovated / yr_built | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
249 | 0.500000 | 0.666667 | 2.0 | 0.285714 | 3.263921e-10 | 0.001316 | inf | 0.001316 | 0.001047 | 0.000323 | 0.000323 | inf | 0.001028 | inf | 2.0 | 1.333333 | 4.0 | 0.571429 | 6.527841e-10 | 0.002632 | inf | 0.002632 | 0.002094 | 0.000645 | 0.000645 | inf | 0.002057 | inf | 1.5 | 0.750000 | 3.0 | 0.428571 | 4.895881e-10 | 0.001974 | inf | 0.001974 | 0.001571 | 0.000484 | 0.000484 | inf | 0.001542 | inf | 0.5 | 0.250000 | 0.333333 | 0.142857 | 1.631960e-10 | 0.000658 | inf | 0.000658 | 0.000524 | 0.000161 | 0.000161 | inf | 0.000514 | inf | 3.5 | 1.750000 | 2.333333 | 7.0 | 1.142372e-09 | 0.004605 | inf | 0.004605 | 0.003665 | 0.001129 | 0.001129 | inf | 0.003599 | inf | 3.063800e+09 | 1.531900e+09 | 2.042533e+09 | 6.127600e+09 | 8.753714e+08 | 4.031316e+06 | inf | 4.031316e+06 | 3.208168e+06 | 988322.598387 | 988322.598387 | inf | 3.150437e+06 | inf | 760.0 | 380.0 | 506.666667 | 1520.0 | 217.142857 | 2.480580e-07 | inf | 1.0 | 0.795812 | 0.245161 | 0.245161 | inf | 0.781491 | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | NaN | 760.0 | 380.0 | 506.666667 | 1520.0 | 217.142857 | 2.480580e-07 | 1.0 | inf | 0.795812 | 0.245161 | 0.245161 | inf | 0.781491 | inf | 955.0 | 477.5 | 636.666667 | 1910.0 | 272.857143 | 3.117044e-07 | 1.256579 | inf | 1.256579 | 0.308065 | 0.308065 | inf | 0.982005 | inf | 3100.0 | 1550.0 | 2066.666667 | 6200.0 | 885.714286 | 0.000001 | 4.078947 | inf | 4.078947 | 3.246073 | 1.000000 | inf | 3.187661 | inf | 3100.0 | 1550.0 | 2066.666667 | 6200.0 | 885.714286 | 0.000001 | 4.078947 | inf | 4.078947 | 3.246073 | 1.000000 | inf | 3.187661 | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 972.5 | 486.25 | 648.333333 | 1945.0 | 277.857143 | 3.174163e-07 | 1.279605 | inf | 1.279605 | 1.018325 | 0.31371 | 0.313710 | inf | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 |
367 | 0.333333 | 0.250000 | 1.0 | 0.142857 | 3.819272e-10 | 0.000833 | inf | 0.000833 | 0.000725 | 0.000103 | 0.000097 | inf | 0.000509 | inf | 3.0 | 0.750000 | 3.0 | 0.428571 | 1.145782e-09 | 0.002500 | inf | 0.002500 | 0.002174 | 0.000309 | 0.000292 | inf | 0.001527 | inf | 4.0 | 1.333333 | 4.0 | 0.571429 | 1.527709e-09 | 0.003333 | inf | 0.003333 | 0.002899 | 0.000412 | 0.000389 | inf | 0.002036 | inf | 1.0 | 0.333333 | 0.250000 | 0.142857 | 3.819272e-10 | 0.000833 | inf | 0.000833 | 0.000725 | 0.000103 | 0.000097 | inf | 0.000509 | inf | 7.0 | 2.333333 | 1.750000 | 7.0 | 2.673490e-09 | 0.005833 | inf | 0.005833 | 0.005072 | 0.000720 | 0.000681 | inf | 0.003562 | inf | 2.618300e+09 | 8.727667e+08 | 6.545750e+08 | 2.618300e+09 | 3.740429e+08 | 2.181917e+06 | inf | 2.181917e+06 | 1.897319e+06 | 269372.436214 | 254599.385453 | inf | 1.332468e+06 | inf | 1200.0 | 400.0 | 300.000000 | 1200.0 | 171.428571 | 4.583126e-07 | inf | 1.0 | 0.869565 | 0.123457 | 0.116686 | inf | 0.610687 | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | NaN | 1200.0 | 400.0 | 300.000000 | 1200.0 | 171.428571 | 4.583126e-07 | 1.0 | inf | 0.869565 | 0.123457 | 0.116686 | inf | 0.610687 | inf | 1380.0 | 460.0 | 345.000000 | 1380.0 | 197.142857 | 5.270595e-07 | 1.150000 | inf | 1.150000 | 0.141975 | 0.134189 | inf | 0.702290 | inf | 9720.0 | 3240.0 | 2430.000000 | 9720.0 | 1388.571429 | 0.000004 | 8.100000 | inf | 8.100000 | 7.043478 | 0.945158 | inf | 4.946565 | inf | 10284.0 | 3428.0 | 2571.000000 | 10284.0 | 1469.142857 | 0.000004 | 8.570000 | inf | 8.570000 | 7.452174 | 1.058025 | inf | 5.233588 | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 1965.0 | 655.00 | 491.250000 | 1965.0 | 280.714286 | 7.504869e-07 | 1.637500 | inf | 1.637500 | 1.423913 | 0.20216 | 0.191074 | inf | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 |
def post_process_fm(fm,thr_miss=0.95,thr_corr=0.95):
"""Post process feature matrix.
1. remove duplicated features
2. remove features having many missing features
3. remvoe zero variance features
4. remove high collinear features
"""
# Remove duplicated features
start_features = fm.shape[1]
fm = fm.iloc[:, ~fm.columns.duplicated()]
n_dups = start_features - fm.shape[1]
print(f'There were {n_dups} duplicated features.')
fm = fm.replace({np.inf: np.nan, -np.inf:np.nan})
# Remove the ids and labels
idname = 'index'
targetname = 'price'
cols_drop_id = [ i for i in fm.columns if idname in i]
cols_drop_target = [ i for i in fm.columns if targetname in i]
cols_drop_id_target = cols_drop_id + cols_drop_target
print('Dropping ids and label: ', cols_drop_id_target)
fm = fm.drop(cols_drop_id_target,axis=1)
# One hot encoding (if necessary)
fm = pd.get_dummies(fm)
n_features_start = fm.shape[1]
print('Original shape: ', fm.shape)
# Find missing and percentage
df_miss = pd.DataFrame(fm.isnull().sum())
df_miss['frac'] = df_miss[0] / fm.shape[0]
df_miss.sort_values('frac', ascending = False, inplace = True)
# Missing above threshold
cols_miss = list(df_miss[df_miss['frac'] > thr_miss].index)
n_cols_miss = len(cols_miss)
# Remove missing columns
fm = fm[[i for i in fm if i not in cols_miss]]
print('{} missing columns with threshold: {}.'.format(
n_cols_miss, thr_miss))
# Zero variance
df_unq_ct = pd.DataFrame(fm.nunique()).sort_values(0,ascending=True)
cols_zero_var = list(df_unq_ct[df_unq_ct[0] == 1].index)
n_cols_zero_var = len(cols_zero_var)
# Remove zero variance columns
fm = fm[[i for i in fm if i not in cols_zero_var]]
print('{} zero variance columns.'.format(n_cols_zero_var))
# Correlations
df_corr = fm.corr()
# Extract the upper triangle of the correlation matrix
df_upper = df_corr.where(np.triu(np.ones(df_corr.shape), k = 1).astype(np.bool))
# Select the features with correlations above the threshold
# Need to use the absolute value
cols_drop = [col for col in df_upper.columns
if any(df_upper[col].abs() > thr_corr)]
n_collinear = len(cols_drop)
fm = fm[[i for i in fm if i not in cols_drop]]
print('{} collinear columns removed with correlation above {}.'.format(
n_collinear, thr_corr))
n_total_cols_removed = n_dups + n_cols_miss + n_cols_zero_var + n_collinear
print('Total columns removed: ', n_total_cols_removed)
print('Shape after feature selection: {}.'.format(fm.shape))
return fm
df_train_extra = post_process_fm(df_train_extra)
df_train_extra.head(2)
There were 0 duplicated features. Dropping ids and label: [] Original shape: (17290, 210) 14 missing columns with threshold: 0.95. 0 zero variance columns. 41 collinear columns removed with correlation above 0.95. Total columns removed: 55 Shape after feature selection: (17290, 155).
bathrooms / bedrooms | bathrooms / condition | bathrooms / floors | bathrooms / grade | bathrooms / id | bathrooms / sqft_above | bathrooms / sqft_basement | bathrooms / sqft_living | bathrooms / sqft_living15 | bathrooms / sqft_lot | bathrooms / sqft_lot15 | bathrooms / view | bathrooms / yr_built | bedrooms / bathrooms | bedrooms / condition | bedrooms / floors | bedrooms / grade | bedrooms / sqft_above | bedrooms / sqft_basement | bedrooms / sqft_living | bedrooms / sqft_living15 | bedrooms / sqft_lot | bedrooms / sqft_lot15 | bedrooms / view | bedrooms / yr_built | condition / bathrooms | condition / bedrooms | condition / floors | condition / grade | condition / id | condition / sqft_above | condition / sqft_basement | condition / sqft_living | condition / sqft_living15 | condition / sqft_lot | condition / sqft_lot15 | condition / view | condition / yr_built | floors / bathrooms | floors / bedrooms | floors / condition | floors / grade | floors / sqft_above | floors / sqft_basement | floors / sqft_living | floors / sqft_living15 | floors / sqft_lot | floors / sqft_lot15 | floors / view | floors / yr_built | grade / bathrooms | grade / bedrooms | grade / condition | grade / floors | grade / sqft_above | grade / sqft_living | grade / sqft_living15 | grade / view | grade / yr_built | id / bathrooms | id / bedrooms | id / condition | id / floors | id / grade | id / sqft_above | id / sqft_basement | id / sqft_living | id / sqft_living15 | id / sqft_lot | id / sqft_lot15 | id / view | sqft_above / bathrooms | sqft_above / bedrooms | sqft_above / condition | sqft_above / floors | sqft_above / grade | sqft_above / sqft_basement | sqft_above / sqft_living | sqft_above / sqft_living15 | sqft_above / sqft_lot | sqft_above / sqft_lot15 | sqft_above / view | sqft_above / yr_built | sqft_basement / bathrooms | sqft_basement / bedrooms | sqft_basement / condition | sqft_basement / floors | sqft_basement / id | sqft_basement / sqft_lot | sqft_basement / sqft_lot15 | sqft_basement / view | sqft_living / bathrooms | sqft_living / bedrooms | sqft_living / condition | sqft_living / floors | sqft_living / grade | sqft_living / sqft_living15 | sqft_living / view | sqft_living / yr_built | sqft_living15 / bathrooms | sqft_living15 / bedrooms | sqft_living15 / condition | sqft_living15 / floors | sqft_living15 / grade | sqft_living15 / sqft_above | sqft_living15 / sqft_basement | sqft_living15 / sqft_living | sqft_living15 / sqft_lot | sqft_living15 / sqft_lot15 | sqft_living15 / view | sqft_living15 / yr_built | sqft_lot / bathrooms | sqft_lot / bedrooms | sqft_lot / condition | sqft_lot / floors | sqft_lot / id | sqft_lot / sqft_above | sqft_lot / sqft_basement | sqft_lot / sqft_lot15 | sqft_lot15 / bathrooms | sqft_lot15 / bedrooms | sqft_lot15 / condition | sqft_lot15 / floors | sqft_lot15 / sqft_above | sqft_lot15 / sqft_basement | sqft_lot15 / sqft_living15 | sqft_lot15 / sqft_lot | sqft_lot15 / view | view / bathrooms | view / bedrooms | view / condition | view / floors | view / id | view / sqft_above | view / sqft_basement | view / sqft_lot | view / sqft_lot15 | yr_built / bathrooms | yr_built / bedrooms | yr_built / condition | yr_built / grade | yr_built / sqft_above | yr_built / sqft_living | yr_built / sqft_living15 | yr_built / view | yr_renovated / bathrooms | yr_renovated / bedrooms | yr_renovated / condition | yr_renovated / floors | yr_renovated / id | yr_renovated / sqft_above | yr_renovated / sqft_basement | yr_renovated / sqft_lot | yr_renovated / sqft_lot15 | yr_renovated / view | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
39 | 0.333333 | 0.333333 | 1.0 | 0.142857 | 6.557037e-10 | 0.000556 | NaN | 0.000556 | 0.000709 | 0.000043 | 0.000055 | NaN | 0.000508 | 3.0 | 1.00 | 3.0 | 0.428571 | 0.001667 | NaN | 0.001667 | 0.002128 | 0.000130 | 0.000165 | NaN | 0.001524 | 3.0 | 1.000000 | 3.0 | 0.428571 | 1.967111e-09 | 0.001667 | NaN | 0.001667 | 0.002128 | 0.000130 | 0.000165 | NaN | 0.001524 | 1.0 | 0.333333 | 0.333333 | 0.142857 | 0.000556 | NaN | 0.000556 | 0.000709 | 0.000043 | 0.000055 | NaN | 0.000508 | 7.0 | 2.333333 | 2.333333 | 7.0 | 0.003889 | 0.003889 | 0.004965 | NaN | 0.003557 | 1.525079e+09 | 5.083597e+08 | 5.083597e+08 | 1.525079e+09 | 2.178684e+08 | 8.472661e+05 | NaN | 847266.142222 | 1.081616e+06 | 66012.165346 | 83966.253152 | NaN | 1800.0 | 600.0 | 600.0 | 1800.0 | 257.142857 | NaN | 1.00000 | 1.276596 | 0.077912 | 0.099103 | NaN | 0.914634 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000e+00 | 0.000000 | 0.000000 | NaN | 1800.0 | 600.000000 | 600.0 | 1800.0 | 257.142857 | 1.276596 | NaN | 0.914634 | 1410.0 | 470.0 | 470.0 | 1410.0 | 201.428571 | 0.783333 | NaN | 0.783333 | 0.061031 | 0.07763 | NaN | 0.716463 | 23103.0 | 7701.000000 | 7701.0 | 23103.0 | 0.000015 | 12.835000 | NaN | 1.271982 | 18163.0 | 6054.333333 | 6054.333333 | 18163.0 | 10.090556 | NaN | 12.881560 | 0.786175 | NaN | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000e+00 | 0.000000 | NaN | 0.000000 | 0.000000 | 1968.0 | 656.000000 | 656.00 | 281.142857 | 1.093333 | 1.093333 | 1.395745 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN |
323 | 0.666667 | 0.500000 | 2.0 | 0.250000 | 1.099626e-09 | 0.001389 | 0.002105 | 0.000837 | 0.000995 | 0.000305 | 0.000305 | 1.0 | 0.001023 | 1.5 | 0.75 | 3.0 | 0.375000 | 0.002083 | 0.003158 | 0.001255 | 0.001493 | 0.000458 | 0.000458 | 1.5 | 0.001535 | 2.0 | 1.333333 | 4.0 | 0.500000 | 2.199252e-09 | 0.002778 | 0.004211 | 0.001674 | 0.001990 | 0.000611 | 0.000611 | 2.0 | 0.002046 | 0.5 | 0.333333 | 0.250000 | 0.125000 | 0.000694 | 0.001053 | 0.000418 | 0.000498 | 0.000153 | 0.000153 | 0.5 | 0.000512 | 4.0 | 2.666667 | 2.000000 | 8.0 | 0.005556 | 0.003347 | 0.003980 | 4.0 | 0.004092 | 9.094001e+08 | 6.062667e+08 | 4.547000e+08 | 1.818800e+09 | 2.273500e+08 | 1.263056e+06 | 1.914526e+06 | 761004.244351 | 9.048757e+05 | 277679.411298 | 277679.411298 | 909400072.0 | 720.0 | 480.0 | 360.0 | 1440.0 | 180.000000 | 1.515789 | 0.60251 | 0.716418 | 0.219847 | 0.219847 | 720.0 | 0.736573 | 475.0 | 316.666667 | 237.5 | 950.0 | 5.223224e-07 | 0.145038 | 0.145038 | 475.0 | 1195.0 | 796.666667 | 597.5 | 2390.0 | 298.750000 | 1.189055 | 1195.0 | 1.222506 | 1005.0 | 670.0 | 502.5 | 2010.0 | 251.250000 | 1.395833 | 2.115789 | 0.841004 | 0.306870 | 0.30687 | 1005.0 | 1.028133 | 3275.0 | 2183.333333 | 1637.5 | 6550.0 | 0.000004 | 4.548611 | 6.894737 | 1.000000 | 3275.0 | 2183.333333 | 1637.500000 | 6550.0 | 4.548611 | 6.894737 | 3.258706 | 1.000000 | 3275.0 | 1.0 | 0.666667 | 0.5 | 2.0 | 1.099626e-09 | 0.001389 | 0.002105 | 0.000305 | 0.000305 | 977.5 | 651.666667 | 488.75 | 244.375000 | 1.357639 | 0.817992 | 0.972637 | 977.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
df_test_extra = post_process_fm(df_test_extra)
df_test_extra.head(2)
There were 0 duplicated features. Dropping ids and label: [] Original shape: (4323, 210) 14 missing columns with threshold: 0.95. 0 zero variance columns. 48 collinear columns removed with correlation above 0.95. Total columns removed: 62 Shape after feature selection: (4323, 148).
bathrooms / bedrooms | bathrooms / condition | bathrooms / floors | bathrooms / grade | bathrooms / id | bathrooms / sqft_above | bathrooms / sqft_basement | bathrooms / sqft_living | bathrooms / sqft_living15 | bathrooms / sqft_lot | bathrooms / sqft_lot15 | bathrooms / view | bathrooms / yr_built | bedrooms / bathrooms | bedrooms / condition | bedrooms / floors | bedrooms / grade | bedrooms / sqft_above | bedrooms / sqft_basement | bedrooms / sqft_living | bedrooms / sqft_living15 | bedrooms / sqft_lot | bedrooms / sqft_lot15 | bedrooms / view | bedrooms / yr_built | condition / bathrooms | condition / bedrooms | condition / floors | condition / grade | condition / sqft_above | condition / sqft_basement | condition / sqft_living | condition / sqft_living15 | condition / sqft_lot | condition / sqft_lot15 | condition / view | condition / yr_built | floors / bathrooms | floors / bedrooms | floors / condition | floors / grade | floors / sqft_above | floors / sqft_living | floors / sqft_living15 | floors / sqft_lot | floors / sqft_lot15 | floors / view | floors / yr_built | grade / bathrooms | grade / bedrooms | grade / condition | grade / sqft_above | grade / sqft_living | grade / sqft_living15 | grade / view | grade / yr_built | id / bathrooms | id / bedrooms | id / condition | id / floors | id / grade | id / sqft_above | id / sqft_basement | id / sqft_living | id / sqft_living15 | id / sqft_lot | id / sqft_lot15 | id / view | sqft_above / bathrooms | sqft_above / bedrooms | sqft_above / condition | sqft_above / floors | sqft_above / grade | sqft_above / sqft_basement | sqft_above / sqft_living | sqft_above / sqft_living15 | sqft_above / sqft_lot | sqft_above / sqft_lot15 | sqft_above / view | sqft_above / yr_built | sqft_basement / bathrooms | sqft_basement / bedrooms | sqft_basement / condition | sqft_basement / floors | sqft_basement / id | sqft_basement / sqft_lot | sqft_basement / sqft_lot15 | sqft_basement / view | sqft_living / bathrooms | sqft_living / bedrooms | sqft_living / condition | sqft_living / floors | sqft_living / grade | sqft_living / sqft_living15 | sqft_living / view | sqft_living / yr_built | sqft_living15 / bathrooms | sqft_living15 / bedrooms | sqft_living15 / condition | sqft_living15 / floors | sqft_living15 / grade | sqft_living15 / sqft_above | sqft_living15 / sqft_living | sqft_living15 / sqft_lot | sqft_living15 / sqft_lot15 | sqft_living15 / view | sqft_living15 / yr_built | sqft_lot / bathrooms | sqft_lot / bedrooms | sqft_lot / condition | sqft_lot / floors | sqft_lot / id | sqft_lot / sqft_above | sqft_lot / sqft_basement | sqft_lot / sqft_living15 | sqft_lot / sqft_lot15 | sqft_lot15 / bathrooms | sqft_lot15 / bedrooms | sqft_lot15 / condition | sqft_lot15 / floors | sqft_lot15 / sqft_above | sqft_lot15 / sqft_basement | sqft_lot15 / sqft_lot | sqft_lot15 / view | view / bathrooms | view / bedrooms | view / condition | view / floors | view / id | view / sqft_above | view / sqft_basement | view / sqft_lot | view / sqft_lot15 | yr_built / bathrooms | yr_built / bedrooms | yr_built / condition | yr_built / sqft_living15 | yr_built / view | yr_renovated / bathrooms | yr_renovated / bedrooms | yr_renovated / condition | yr_renovated / floors | yr_renovated / id | yr_renovated / sqft_above | yr_renovated / sqft_basement | yr_renovated / sqft_lot | yr_renovated / sqft_lot15 | yr_renovated / view | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
249 | 0.500000 | 0.666667 | 2.0 | 0.285714 | 3.263921e-10 | 0.001316 | NaN | 0.001316 | 0.001047 | 0.000323 | 0.000323 | NaN | 0.001028 | 2.0 | 1.333333 | 4.0 | 0.571429 | 0.002632 | NaN | 0.002632 | 0.002094 | 0.000645 | 0.000645 | NaN | 0.002057 | 1.5 | 0.750000 | 3.0 | 0.428571 | 0.001974 | NaN | 0.001974 | 0.001571 | 0.000484 | 0.000484 | NaN | 0.001542 | 0.5 | 0.250000 | 0.333333 | 0.142857 | 0.000658 | 0.000658 | 0.000524 | 0.000161 | 0.000161 | NaN | 0.000514 | 3.5 | 1.750000 | 2.333333 | 0.004605 | 0.004605 | 0.003665 | NaN | 0.003599 | 3.063800e+09 | 1.531900e+09 | 2.042533e+09 | 6.127600e+09 | 8.753714e+08 | 4.031316e+06 | NaN | 4.031316e+06 | 3.208168e+06 | 988322.598387 | 988322.598387 | NaN | 760.0 | 380.0 | 506.666667 | 1520.0 | 217.142857 | NaN | 1.0 | 0.795812 | 0.245161 | 0.245161 | NaN | 0.781491 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 760.0 | 380.0 | 506.666667 | 1520.0 | 217.142857 | 0.795812 | NaN | 0.781491 | 955.0 | 477.5 | 636.666667 | 1910.0 | 272.857143 | 1.256579 | 1.256579 | 0.308065 | 0.308065 | NaN | 0.982005 | 3100.0 | 1550.0 | 2066.666667 | 6200.0 | 0.000001 | 4.078947 | NaN | 3.246073 | 1.000000 | 3100.0 | 1550.0 | 2066.666667 | 6200.0 | 4.078947 | NaN | 1.000000 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | 972.5 | 486.25 | 648.333333 | 1.018325 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN |
367 | 0.333333 | 0.250000 | 1.0 | 0.142857 | 3.819272e-10 | 0.000833 | NaN | 0.000833 | 0.000725 | 0.000103 | 0.000097 | NaN | 0.000509 | 3.0 | 0.750000 | 3.0 | 0.428571 | 0.002500 | NaN | 0.002500 | 0.002174 | 0.000309 | 0.000292 | NaN | 0.001527 | 4.0 | 1.333333 | 4.0 | 0.571429 | 0.003333 | NaN | 0.003333 | 0.002899 | 0.000412 | 0.000389 | NaN | 0.002036 | 1.0 | 0.333333 | 0.250000 | 0.142857 | 0.000833 | 0.000833 | 0.000725 | 0.000103 | 0.000097 | NaN | 0.000509 | 7.0 | 2.333333 | 1.750000 | 0.005833 | 0.005833 | 0.005072 | NaN | 0.003562 | 2.618300e+09 | 8.727667e+08 | 6.545750e+08 | 2.618300e+09 | 3.740429e+08 | 2.181917e+06 | NaN | 2.181917e+06 | 1.897319e+06 | 269372.436214 | 254599.385453 | NaN | 1200.0 | 400.0 | 300.000000 | 1200.0 | 171.428571 | NaN | 1.0 | 0.869565 | 0.123457 | 0.116686 | NaN | 0.610687 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 1200.0 | 400.0 | 300.000000 | 1200.0 | 171.428571 | 0.869565 | NaN | 0.610687 | 1380.0 | 460.0 | 345.000000 | 1380.0 | 197.142857 | 1.150000 | 1.150000 | 0.141975 | 0.134189 | NaN | 0.702290 | 9720.0 | 3240.0 | 2430.000000 | 9720.0 | 0.000004 | 8.100000 | NaN | 7.043478 | 0.945158 | 10284.0 | 3428.0 | 2571.000000 | 10284.0 | 8.570000 | NaN | 1.058025 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | 1965.0 | 655.00 | 491.250000 | 1.423913 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN |
features_extra = [ i for i in df_train_extra.columns if i in df_test_extra.columns]
print(features_extra)
['bathrooms / bedrooms', 'bathrooms / condition', 'bathrooms / floors', 'bathrooms / grade', 'bathrooms / id', 'bathrooms / sqft_above', 'bathrooms / sqft_basement', 'bathrooms / sqft_living', 'bathrooms / sqft_living15', 'bathrooms / sqft_lot', 'bathrooms / sqft_lot15', 'bathrooms / view', 'bathrooms / yr_built', 'bedrooms / bathrooms', 'bedrooms / condition', 'bedrooms / floors', 'bedrooms / grade', 'bedrooms / sqft_above', 'bedrooms / sqft_basement', 'bedrooms / sqft_living', 'bedrooms / sqft_living15', 'bedrooms / sqft_lot', 'bedrooms / sqft_lot15', 'bedrooms / view', 'bedrooms / yr_built', 'condition / bathrooms', 'condition / bedrooms', 'condition / floors', 'condition / grade', 'condition / sqft_above', 'condition / sqft_basement', 'condition / sqft_living', 'condition / sqft_living15', 'condition / sqft_lot', 'condition / sqft_lot15', 'condition / view', 'condition / yr_built', 'floors / bathrooms', 'floors / bedrooms', 'floors / condition', 'floors / grade', 'floors / sqft_above', 'floors / sqft_living', 'floors / sqft_living15', 'floors / sqft_lot', 'floors / sqft_lot15', 'floors / view', 'floors / yr_built', 'grade / bathrooms', 'grade / bedrooms', 'grade / condition', 'grade / sqft_above', 'grade / sqft_living', 'grade / sqft_living15', 'grade / view', 'grade / yr_built', 'id / bathrooms', 'id / bedrooms', 'id / condition', 'id / floors', 'id / grade', 'id / sqft_above', 'id / sqft_basement', 'id / sqft_living', 'id / sqft_living15', 'id / sqft_lot', 'id / sqft_lot15', 'id / view', 'sqft_above / bathrooms', 'sqft_above / bedrooms', 'sqft_above / condition', 'sqft_above / floors', 'sqft_above / grade', 'sqft_above / sqft_basement', 'sqft_above / sqft_living', 'sqft_above / sqft_living15', 'sqft_above / sqft_lot', 'sqft_above / sqft_lot15', 'sqft_above / view', 'sqft_above / yr_built', 'sqft_basement / bathrooms', 'sqft_basement / bedrooms', 'sqft_basement / condition', 'sqft_basement / floors', 'sqft_basement / id', 'sqft_basement / sqft_lot', 'sqft_basement / sqft_lot15', 'sqft_basement / view', 'sqft_living / bathrooms', 'sqft_living / bedrooms', 'sqft_living / condition', 'sqft_living / floors', 'sqft_living / grade', 'sqft_living / sqft_living15', 'sqft_living / view', 'sqft_living / yr_built', 'sqft_living15 / bathrooms', 'sqft_living15 / bedrooms', 'sqft_living15 / condition', 'sqft_living15 / floors', 'sqft_living15 / grade', 'sqft_living15 / sqft_above', 'sqft_living15 / sqft_living', 'sqft_living15 / sqft_lot', 'sqft_living15 / sqft_lot15', 'sqft_living15 / view', 'sqft_living15 / yr_built', 'sqft_lot / bathrooms', 'sqft_lot / bedrooms', 'sqft_lot / condition', 'sqft_lot / floors', 'sqft_lot / id', 'sqft_lot / sqft_above', 'sqft_lot / sqft_basement', 'sqft_lot / sqft_lot15', 'sqft_lot15 / bathrooms', 'sqft_lot15 / bedrooms', 'sqft_lot15 / condition', 'sqft_lot15 / floors', 'sqft_lot15 / sqft_above', 'sqft_lot15 / sqft_basement', 'sqft_lot15 / sqft_lot', 'sqft_lot15 / view', 'view / bathrooms', 'view / bedrooms', 'view / condition', 'view / floors', 'view / id', 'view / sqft_above', 'view / sqft_basement', 'view / sqft_lot', 'view / sqft_lot15', 'yr_built / bathrooms', 'yr_built / bedrooms', 'yr_built / condition', 'yr_built / sqft_living15', 'yr_built / view', 'yr_renovated / bathrooms', 'yr_renovated / bedrooms', 'yr_renovated / condition', 'yr_renovated / floors', 'yr_renovated / id', 'yr_renovated / sqft_above', 'yr_renovated / sqft_basement', 'yr_renovated / sqft_lot', 'yr_renovated / sqft_lot15', 'yr_renovated / view']
df_train_extra.isna().sum().sum()
319676
df_train_extra = df_train_extra.fillna(0.0)
df_test_extra = df_test_extra.fillna(0.0)
df_train = pd.concat([df_train,df_train_extra],axis=1)
df_test = pd.concat([df_test, df_test_extra],axis=1)
df_train.head(2)
id | date | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | bathrooms / bedrooms | bathrooms / condition | bathrooms / floors | bathrooms / grade | bathrooms / id | bathrooms / sqft_above | bathrooms / sqft_basement | bathrooms / sqft_living | bathrooms / sqft_living15 | bathrooms / sqft_lot | bathrooms / sqft_lot15 | bathrooms / view | bathrooms / yr_built | bedrooms / bathrooms | bedrooms / condition | bedrooms / floors | bedrooms / grade | bedrooms / sqft_above | bedrooms / sqft_basement | bedrooms / sqft_living | bedrooms / sqft_living15 | bedrooms / sqft_lot | bedrooms / sqft_lot15 | bedrooms / view | bedrooms / yr_built | condition / bathrooms | condition / bedrooms | condition / floors | condition / grade | condition / id | condition / sqft_above | condition / sqft_basement | condition / sqft_living | condition / sqft_living15 | condition / sqft_lot | condition / sqft_lot15 | condition / view | condition / yr_built | floors / bathrooms | floors / bedrooms | floors / condition | floors / grade | floors / sqft_above | floors / sqft_basement | floors / sqft_living | floors / sqft_living15 | floors / sqft_lot | floors / sqft_lot15 | floors / view | floors / yr_built | grade / bathrooms | grade / bedrooms | grade / condition | grade / floors | grade / sqft_above | grade / sqft_living | grade / sqft_living15 | grade / view | grade / yr_built | id / bathrooms | id / bedrooms | id / condition | id / floors | id / grade | id / sqft_above | id / sqft_basement | id / sqft_living | id / sqft_living15 | id / sqft_lot | id / sqft_lot15 | id / view | sqft_above / bathrooms | sqft_above / bedrooms | sqft_above / condition | sqft_above / floors | sqft_above / grade | sqft_above / sqft_basement | sqft_above / sqft_living | sqft_above / sqft_living15 | sqft_above / sqft_lot | sqft_above / sqft_lot15 | sqft_above / view | sqft_above / yr_built | sqft_basement / bathrooms | sqft_basement / bedrooms | sqft_basement / condition | sqft_basement / floors | sqft_basement / id | sqft_basement / sqft_lot | sqft_basement / sqft_lot15 | sqft_basement / view | sqft_living / bathrooms | sqft_living / bedrooms | sqft_living / condition | sqft_living / floors | sqft_living / grade | sqft_living / sqft_living15 | sqft_living / view | sqft_living / yr_built | sqft_living15 / bathrooms | sqft_living15 / bedrooms | sqft_living15 / condition | sqft_living15 / floors | sqft_living15 / grade | sqft_living15 / sqft_above | sqft_living15 / sqft_basement | sqft_living15 / sqft_living | sqft_living15 / sqft_lot | sqft_living15 / sqft_lot15 | sqft_living15 / view | sqft_living15 / yr_built | sqft_lot / bathrooms | sqft_lot / bedrooms | sqft_lot / condition | sqft_lot / floors | sqft_lot / id | sqft_lot / sqft_above | sqft_lot / sqft_basement | sqft_lot / sqft_lot15 | sqft_lot15 / bathrooms | sqft_lot15 / bedrooms | sqft_lot15 / condition | sqft_lot15 / floors | sqft_lot15 / sqft_above | sqft_lot15 / sqft_basement | sqft_lot15 / sqft_living15 | sqft_lot15 / sqft_lot | sqft_lot15 / view | view / bathrooms | view / bedrooms | view / condition | view / floors | view / id | view / sqft_above | view / sqft_basement | view / sqft_lot | view / sqft_lot15 | yr_built / bathrooms | yr_built / bedrooms | yr_built / condition | yr_built / grade | yr_built / sqft_above | yr_built / sqft_living | yr_built / sqft_living15 | yr_built / view | yr_renovated / bathrooms | yr_renovated / bedrooms | yr_renovated / condition | yr_renovated / floors | yr_renovated / id | yr_renovated / sqft_above | yr_renovated / sqft_basement | yr_renovated / sqft_lot | yr_renovated / sqft_lot15 | yr_renovated / view | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2561340020 | 2014-08-04 | 3 | 1 | 1780 | 11096 | 1 | 0 | 0 | 3 | 7 | 1210 | 570 | 1979 | 0 | 98074 | 47.6170 | -122.051 | 1780 | 10640 | 0.333333 | 0.333333 | 1.0 | 0.142857 | 3.904206e-10 | 0.000826 | 0.001754 | 0.000562 | 0.000562 | 0.000090 | 0.000094 | 0.0 | 0.000505 | 3.0 | 1.000000 | 3.0 | 0.428571 | 0.002479 | 0.005263 | 0.001685 | 0.001685 | 0.000270 | 0.000282 | 0.0 | 0.001516 | 3.0 | 1.0 | 3.0 | 0.428571 | 1.171262e-09 | 0.002479 | 0.005263 | 0.001685 | 0.001685 | 0.000270 | 0.000282 | 0.0 | 0.001516 | 1.0 | 0.333333 | 0.333333 | 0.142857 | 0.000826 | 0.001754 | 0.000562 | 0.000562 | 0.000090 | 0.000094 | 0.0 | 0.000505 | 7.0 | 2.333333 | 2.333333 | 7.0 | 0.005785 | 0.003933 | 0.003933 | 0.0 | 0.003537 | 2.561340e+09 | 8.537800e+08 | 8.537800e+08 | 2.561340e+09 | 3.659057e+08 | 2.116810e+06 | 4.493579e+06 | 1.438955e+06 | 1.438955e+06 | 2.308345e+05 | 2.407274e+05 | 0.0 | 1210.0 | 403.333333 | 403.333333 | 1210.0 | 172.857143 | 2.122807 | 0.679775 | 0.679775 | 0.109048 | 0.113722 | 0.0 | 0.611420 | 570.0 | 190.0 | 190.0 | 570.0 | 2.225398e-07 | 0.05137 | 0.053571 | 0.0 | 1780.0 | 593.333333 | 593.333333 | 1780.0 | 254.285714 | 1.000000 | 0.0 | 0.899444 | 1780.0 | 593.333333 | 593.333333 | 1780.0 | 254.285714 | 1.471074 | 3.122807 | 1.000000 | 0.160418 | 0.167293 | 0.0 | 0.899444 | 11096.0 | 3698.666667 | 3698.666667 | 11096.0 | 4.332107e-06 | 9.170248 | 19.466667 | 1.042857 | 10640.0 | 3546.666667 | 3546.666667 | 10640.0 | 8.793388 | 18.666667 | 5.977528 | 0.958904 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1979.0 | 659.666667 | 659.666667 | 282.714286 | 1.635537 | 1.111798 | 1.111798 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 8598200070 | 2014-12-08 | 2 | 2 | 1420 | 2229 | 2 | 0 | 0 | 3 | 7 | 1420 | 0 | 2004 | 0 | 98059 | 47.4871 | -122.165 | 1500 | 2230 | 1.000000 | 0.666667 | 1.0 | 0.285714 | 2.326068e-10 | 0.001408 | 0.000000 | 0.001408 | 0.001333 | 0.000897 | 0.000897 | 0.0 | 0.000998 | 1.0 | 0.666667 | 1.0 | 0.285714 | 0.001408 | 0.000000 | 0.001408 | 0.001333 | 0.000897 | 0.000897 | 0.0 | 0.000998 | 1.5 | 1.5 | 1.5 | 0.428571 | 3.489102e-10 | 0.002113 | 0.000000 | 0.002113 | 0.002000 | 0.001346 | 0.001345 | 0.0 | 0.001497 | 1.0 | 1.000000 | 0.666667 | 0.285714 | 0.001408 | 0.000000 | 0.001408 | 0.001333 | 0.000897 | 0.000897 | 0.0 | 0.000998 | 3.5 | 3.500000 | 2.333333 | 3.5 | 0.004930 | 0.004930 | 0.004667 | 0.0 | 0.003493 | 4.299100e+09 | 4.299100e+09 | 2.866067e+09 | 4.299100e+09 | 1.228314e+09 | 6.055070e+06 | 0.000000e+00 | 6.055070e+06 | 5.732133e+06 | 3.857425e+06 | 3.855695e+06 | 0.0 | 710.0 | 710.000000 | 473.333333 | 710.0 | 202.857143 | 0.000000 | 1.000000 | 0.946667 | 0.637057 | 0.636771 | 0.0 | 0.708583 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.00000 | 0.000000 | 0.0 | 710.0 | 710.000000 | 473.333333 | 710.0 | 202.857143 | 0.946667 | 0.0 | 0.708583 | 750.0 | 750.000000 | 500.000000 | 750.0 | 214.285714 | 1.056338 | 0.000000 | 1.056338 | 0.672948 | 0.672646 | 0.0 | 0.748503 | 1114.5 | 1114.500000 | 743.000000 | 1114.5 | 2.592403e-07 | 1.569718 | 0.000000 | 0.999552 | 1115.0 | 1115.000000 | 743.333333 | 1115.0 | 1.570423 | 0.000000 | 1.486667 | 1.000449 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1002.0 | 1002.000000 | 668.000000 | 286.285714 | 1.411268 | 1.411268 | 1.336000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
df_test.head(2)
id | date | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | bathrooms / bedrooms | bathrooms / condition | bathrooms / floors | bathrooms / grade | bathrooms / id | bathrooms / sqft_above | bathrooms / sqft_basement | bathrooms / sqft_living | bathrooms / sqft_living15 | bathrooms / sqft_lot | bathrooms / sqft_lot15 | bathrooms / view | bathrooms / yr_built | bedrooms / bathrooms | bedrooms / condition | bedrooms / floors | bedrooms / grade | bedrooms / sqft_above | bedrooms / sqft_basement | bedrooms / sqft_living | bedrooms / sqft_living15 | bedrooms / sqft_lot | bedrooms / sqft_lot15 | bedrooms / view | bedrooms / yr_built | condition / bathrooms | condition / bedrooms | condition / floors | condition / grade | condition / sqft_above | condition / sqft_basement | condition / sqft_living | condition / sqft_living15 | condition / sqft_lot | condition / sqft_lot15 | condition / view | condition / yr_built | floors / bathrooms | floors / bedrooms | floors / condition | floors / grade | floors / sqft_above | floors / sqft_living | floors / sqft_living15 | floors / sqft_lot | floors / sqft_lot15 | floors / view | floors / yr_built | grade / bathrooms | grade / bedrooms | grade / condition | grade / sqft_above | grade / sqft_living | grade / sqft_living15 | grade / view | grade / yr_built | id / bathrooms | id / bedrooms | id / condition | id / floors | id / grade | id / sqft_above | id / sqft_basement | id / sqft_living | id / sqft_living15 | id / sqft_lot | id / sqft_lot15 | id / view | sqft_above / bathrooms | sqft_above / bedrooms | sqft_above / condition | sqft_above / floors | sqft_above / grade | sqft_above / sqft_basement | sqft_above / sqft_living | sqft_above / sqft_living15 | sqft_above / sqft_lot | sqft_above / sqft_lot15 | sqft_above / view | sqft_above / yr_built | sqft_basement / bathrooms | sqft_basement / bedrooms | sqft_basement / condition | sqft_basement / floors | sqft_basement / id | sqft_basement / sqft_lot | sqft_basement / sqft_lot15 | sqft_basement / view | sqft_living / bathrooms | sqft_living / bedrooms | sqft_living / condition | sqft_living / floors | sqft_living / grade | sqft_living / sqft_living15 | sqft_living / view | sqft_living / yr_built | sqft_living15 / bathrooms | sqft_living15 / bedrooms | sqft_living15 / condition | sqft_living15 / floors | sqft_living15 / grade | sqft_living15 / sqft_above | sqft_living15 / sqft_living | sqft_living15 / sqft_lot | sqft_living15 / sqft_lot15 | sqft_living15 / view | sqft_living15 / yr_built | sqft_lot / bathrooms | sqft_lot / bedrooms | sqft_lot / condition | sqft_lot / floors | sqft_lot / id | sqft_lot / sqft_above | sqft_lot / sqft_basement | sqft_lot / sqft_living15 | sqft_lot / sqft_lot15 | sqft_lot15 / bathrooms | sqft_lot15 / bedrooms | sqft_lot15 / condition | sqft_lot15 / floors | sqft_lot15 / sqft_above | sqft_lot15 / sqft_basement | sqft_lot15 / sqft_lot | sqft_lot15 / view | view / bathrooms | view / bedrooms | view / condition | view / floors | view / id | view / sqft_above | view / sqft_basement | view / sqft_lot | view / sqft_lot15 | yr_built / bathrooms | yr_built / bedrooms | yr_built / condition | yr_built / sqft_living15 | yr_built / view | yr_renovated / bathrooms | yr_renovated / bedrooms | yr_renovated / condition | yr_renovated / floors | yr_renovated / id | yr_renovated / sqft_above | yr_renovated / sqft_basement | yr_renovated / sqft_lot | yr_renovated / sqft_lot15 | yr_renovated / view | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8669180390 | 2014-06-04 | 3 | 2 | 2437 | 5136 | 2 | 0 | 0 | 3 | 7 | 2437 | 0 | 2011 | 0 | 98002 | 47.3517 | -122.210 | 2437 | 4614 | 0.666667 | 0.666667 | 1.0 | 0.285714 | 2.307023e-10 | 0.000821 | 0.0 | 0.000821 | 0.000821 | 0.000389 | 0.000433 | 0.0 | 0.000995 | 1.5 | 1.00 | 1.5 | 0.428571 | 0.001231 | 0.0 | 0.001231 | 0.001231 | 0.000584 | 0.000650 | 0.0 | 0.001492 | 1.5 | 1.000000 | 1.5 | 0.428571 | 0.001231 | 0.0 | 0.001231 | 0.001231 | 0.000584 | 0.000650 | 0.0 | 0.001492 | 1.0 | 0.666667 | 0.666667 | 0.285714 | 0.000821 | 0.000821 | 0.000821 | 0.000389 | 0.000433 | 0.0 | 0.000995 | 3.5 | 2.333333 | 2.333333 | 0.002872 | 0.002872 | 0.002872 | 0.0 | 0.003481 | 4.334590e+09 | 2.889727e+09 | 2.889727e+09 | 4.334590e+09 | 1.238454e+09 | 3.557317e+06 | 0.0 | 3.557317e+06 | 3.557317e+06 | 1.687925e+06 | 1.878886e+06 | 0.0 | 1218.5 | 812.333333 | 812.333333 | 1218.5 | 348.142857 | 0.0 | 1.0 | 1.000000 | 0.474494 | 0.528175 | 0.0 | 1.211835 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1218.5 | 812.333333 | 812.333333 | 1218.5 | 348.142857 | 1.000000 | 0.0 | 1.211835 | 1218.5 | 812.333333 | 812.333333 | 1218.5 | 348.142857 | 1.000000 | 1.000000 | 0.474494 | 0.528175 | 0.0 | 1.211835 | 2568.0 | 1712.0 | 1712.0 | 2568.0 | 5.924435e-07 | 2.107509 | 0.0 | 2.107509 | 1.113134 | 2307.0 | 1538.0 | 1538.0 | 2307.0 | 1.893311 | 0.0 | 0.898364 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1005.5 | 670.333333 | 670.333333 | 0.825195 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 3750603471 | 2015-03-27 | 3 | 2 | 1560 | 4800 | 2 | 0 | 0 | 4 | 7 | 1560 | 0 | 1974 | 0 | 98001 | 47.2653 | -122.285 | 1510 | 12240 | 0.666667 | 0.500000 | 1.0 | 0.285714 | 5.332475e-10 | 0.001282 | 0.0 | 0.001282 | 0.001325 | 0.000417 | 0.000163 | 0.0 | 0.001013 | 1.5 | 0.75 | 1.5 | 0.428571 | 0.001923 | 0.0 | 0.001923 | 0.001987 | 0.000625 | 0.000245 | 0.0 | 0.001520 | 2.0 | 1.333333 | 2.0 | 0.571429 | 0.002564 | 0.0 | 0.002564 | 0.002649 | 0.000833 | 0.000327 | 0.0 | 0.002026 | 1.0 | 0.666667 | 0.500000 | 0.285714 | 0.001282 | 0.001282 | 0.001325 | 0.000417 | 0.000163 | 0.0 | 0.001013 | 3.5 | 2.333333 | 1.750000 | 0.004487 | 0.004487 | 0.004636 | 0.0 | 0.003546 | 1.875302e+09 | 1.250201e+09 | 9.376509e+08 | 1.875302e+09 | 5.358005e+08 | 2.404233e+06 | 0.0 | 2.404233e+06 | 2.483843e+06 | 7.813757e+05 | 3.064219e+05 | 0.0 | 780.0 | 520.000000 | 390.000000 | 780.0 | 222.857143 | 0.0 | 1.0 | 1.033113 | 0.325000 | 0.127451 | 0.0 | 0.790274 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 780.0 | 520.000000 | 390.000000 | 780.0 | 222.857143 | 1.033113 | 0.0 | 0.790274 | 755.0 | 503.333333 | 377.500000 | 755.0 | 215.714286 | 0.967949 | 0.967949 | 0.314583 | 0.123366 | 0.0 | 0.764944 | 2400.0 | 1600.0 | 1200.0 | 2400.0 | 1.279794e-06 | 3.076923 | 0.0 | 3.178808 | 0.392157 | 6120.0 | 4080.0 | 3060.0 | 6120.0 | 7.846154 | 0.0 | 2.550000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 987.0 | 658.000000 | 493.500000 | 1.307285 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
def clean_data(df,log=True,sq=True,logsq=True,dummy=True,dummy_cat=False):
# log sq
if logsq:
log = True
sq = True
df = df.copy()
# Date time features
df['date'] = pd.to_datetime(df['date'])
df['yr_sales'] = df['date'].dt.year
df['age'] = df['yr_sales'] - df['yr_built']
df['yr_renovated2'] = np.where(df['yr_renovated'].eq(0), df['yr_built'], df['yr_renovated'])
df['age_after_renovation'] = df['yr_sales'] - df['yr_renovated2']
# Boolean data types
f = lambda x: 1 if x>0 else 0
df['basement_bool'] = df['sqft_basement'].apply(f)
df['renovation_bool'] = df['yr_renovated'].apply(f)
# Numerical features binning
cols_bin = ['age','age_after_renovation']
df['age_cat'] = pd.cut(df['age'], 10, labels=range(10)).astype(str)
df['age_after_renovation_cat'] = pd.cut(df['age_after_renovation'],
10, labels=range(10))
# Log transformation of large numerical values
cols_log = ['sqft_living', 'sqft_lot', 'sqft_above',
'sqft_basement', 'sqft_living15', 'sqft_lot15']
if log:
for col in cols_log:
df['log1p_' + col] = np.log1p(df[col])
# squared columns
cols_sq = [
# cats
'bedrooms','bathrooms','floors','waterfront','view',
# created nums
'age','age_after_renovation']
if sq:
for col in cols_sq:
df[col + '_sq'] = df[col]**2
cols_log_sq = [
# log nums
'log1p_sqft_living','log1p_sqft_lot',
'log1p_sqft_above','log1p_sqft_basement',
'log1p_sqft_living15','log1p_sqft_lot15'
]
if logsq:
for col in cols_log_sq:
df[col + '_sq'] = df[col]**2
# Categorical Features
cols_dummy = ['waterfront', 'view', 'condition', 'grade']
cols_dummy_cat = ['age_cat', 'age_after_renovation_cat']
for c in cols_dummy:
df[c] = df[c].astype(str)
# Create dummy variables
if dummy:
df_dummy = pd.get_dummies(df[cols_dummy],drop_first=False)
df = pd.concat([df,df_dummy], axis=1)
# dummy variable for newly created cats from numerical feature
if dummy_cat:
df_dummy = pd.get_dummies(df[cols_dummy_cat],drop_first=False)
df = pd.concat([df,cols_dummy_cat], axis=1)
# after creating dummy, make the columns number
for c in cols_dummy + cols_dummy_cat:
df[c] = df[c].astype(np.int32)
# Drop unwanted columns
cols_drop = ['date']
df = df.drop(cols_drop,axis=1)
return df
params_data = dict(log=True,sq=True,logsq=True,
dummy=False,dummy_cat=False)
df_train = clean_data(df_train,**params_data)
df_test = clean_data(df_test,**params_data)
features = list(sorted(df_train.columns))
features = [i for i in features if i in df_test.columns]
# print(np.array(features))
df_Xtrain = df_train[features]
df_Xtest = df_test[features]
scaling = 'standard'
if scaling == 'standard':
scaler = preprocessing.StandardScaler()
scaler.fit(df_Xtrain)
df_Xtrain = pd.DataFrame(scaler.transform(df_Xtrain),columns=features)
df_Xtest = pd.DataFrame(scaler.transform(df_Xtest),columns=features)
elif scaling == 'minmax':
scaler = preprocessing.MinMaxScaler()
scaler.fit(df_Xtrain)
df_Xtrain = pd.DataFrame(scaler.transform(df_Xtrain),columns=features)
df_Xtest = pd.DataFrame(scaler.transform(df_Xtest),columns=features)
df_Xtrain.head(2)
age | age_after_renovation | age_after_renovation_cat | age_after_renovation_sq | age_cat | age_sq | basement_bool | bathrooms | bathrooms / bedrooms | bathrooms / condition | bathrooms / floors | bathrooms / grade | bathrooms / id | bathrooms / sqft_above | bathrooms / sqft_basement | bathrooms / sqft_living | bathrooms / sqft_living15 | bathrooms / sqft_lot | bathrooms / sqft_lot15 | bathrooms / view | bathrooms / yr_built | bathrooms_sq | bedrooms | bedrooms / bathrooms | bedrooms / condition | bedrooms / floors | bedrooms / grade | bedrooms / sqft_above | bedrooms / sqft_basement | bedrooms / sqft_living | bedrooms / sqft_living15 | bedrooms / sqft_lot | bedrooms / sqft_lot15 | bedrooms / view | bedrooms / yr_built | bedrooms_sq | condition | condition / bathrooms | condition / bedrooms | condition / floors | condition / grade | condition / sqft_above | condition / sqft_basement | condition / sqft_living | condition / sqft_living15 | condition / sqft_lot | condition / sqft_lot15 | condition / view | condition / yr_built | floors | floors / bathrooms | floors / bedrooms | floors / condition | floors / grade | floors / sqft_above | floors / sqft_living | floors / sqft_living15 | floors / sqft_lot | floors / sqft_lot15 | floors / view | floors / yr_built | floors_sq | grade | grade / bathrooms | grade / bedrooms | grade / condition | grade / sqft_above | grade / sqft_living | grade / sqft_living15 | grade / view | grade / yr_built | id | id / bathrooms | id / bedrooms | id / condition | id / floors | id / grade | id / sqft_above | id / sqft_basement | id / sqft_living | id / sqft_living15 | id / sqft_lot | id / sqft_lot15 | id / view | lat | log1p_sqft_above | log1p_sqft_above_sq | log1p_sqft_basement | log1p_sqft_basement_sq | log1p_sqft_living | log1p_sqft_living15 | log1p_sqft_living15_sq | log1p_sqft_living_sq | log1p_sqft_lot | log1p_sqft_lot15 | log1p_sqft_lot15_sq | log1p_sqft_lot_sq | long | renovation_bool | sqft_above | sqft_above / bathrooms | sqft_above / bedrooms | sqft_above / condition | sqft_above / floors | sqft_above / grade | sqft_above / sqft_basement | sqft_above / sqft_living | sqft_above / sqft_living15 | sqft_above / sqft_lot | sqft_above / sqft_lot15 | sqft_above / view | sqft_above / yr_built | sqft_basement | sqft_basement / bathrooms | sqft_basement / bedrooms | sqft_basement / condition | sqft_basement / floors | sqft_basement / id | sqft_basement / sqft_lot | sqft_basement / sqft_lot15 | sqft_basement / view | sqft_living | sqft_living / bathrooms | sqft_living / bedrooms | sqft_living / condition | sqft_living / floors | sqft_living / grade | sqft_living / sqft_living15 | sqft_living / view | sqft_living / yr_built | sqft_living15 | sqft_living15 / bathrooms | sqft_living15 / bedrooms | sqft_living15 / condition | sqft_living15 / floors | sqft_living15 / grade | sqft_living15 / sqft_above | sqft_living15 / sqft_living | sqft_living15 / sqft_lot | sqft_living15 / sqft_lot15 | sqft_living15 / view | sqft_living15 / yr_built | sqft_lot | sqft_lot / bathrooms | sqft_lot / bedrooms | sqft_lot / condition | sqft_lot / floors | sqft_lot / id | sqft_lot / sqft_above | sqft_lot / sqft_basement | sqft_lot / sqft_lot15 | sqft_lot15 | sqft_lot15 / bathrooms | sqft_lot15 / bedrooms | sqft_lot15 / condition | sqft_lot15 / floors | sqft_lot15 / sqft_above | sqft_lot15 / sqft_basement | sqft_lot15 / sqft_lot | sqft_lot15 / view | view | view / bathrooms | view / bedrooms | view / condition | view / floors | view / id | view / sqft_above | view / sqft_basement | view / sqft_lot | view / sqft_lot15 | view_sq | waterfront | waterfront_sq | yr_built | yr_built / bathrooms | yr_built / bedrooms | yr_built / condition | yr_built / sqft_living15 | yr_built / view | yr_renovated | yr_renovated / bathrooms | yr_renovated / bedrooms | yr_renovated / condition | yr_renovated / floors | yr_renovated / id | yr_renovated / sqft_above | yr_renovated / sqft_basement | yr_renovated / sqft_lot | yr_renovated / sqft_lot15 | yr_renovated / view | yr_renovated2 | yr_sales | zipcode | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.288109 | -0.212303 | -0.062185 | -0.438016 | -0.139825 | -0.494698 | 1.247166 | -1.016588 | -0.965489 | -0.795055 | -0.515282 | -1.036172 | -0.047142 | -0.549962 | 0.122535 | -1.052856 | -0.945733 | -0.596395 | -0.630031 | -0.275942 | -1.031661 | -0.80839 | -0.39033 | 0.956028 | -0.052238 | 0.376809 | -0.117223 | 0.442409 | 0.537185 | -0.188636 | -0.205849 | -0.580044 | -0.587172 | -0.285714 | -0.403133 | -0.302220 | -0.630613 | 0.628679 | -0.229485 | 0.233242 | -0.241038 | 0.146134 | 0.448307 | -0.294107 | -0.309360 | -0.575244 | -0.600751 | -0.288873 | -0.632018 | -0.808735 | 0.30806 | -0.577040 | -0.553597 | -0.710706 | -0.181107 | -0.586724 | -0.606877 | -0.452037 | -0.477107 | -0.28162 | -0.830371 | -0.758275 | -0.554878 | 1.186812 | -0.122193 | 0.013307 | 0.534029 | -0.194425 | -0.204672 | -0.295723 | -0.598321 | -0.703484 | -0.209438 | -0.570833 | -0.578746 | -0.393265 | -0.620426 | -0.395239 | 0.081008 | -0.562928 | -0.598333 | -0.529761 | -0.556437 | -0.250563 | 0.410048 | -0.688967 | -0.698830 | 1.208375 | 1.137983 | -0.149505 | -0.169074 | -0.189252 | -0.177052 | 0.361630 | 0.383984 | 0.328910 | 0.301512 | 1.151178 | -0.207998 | -0.698239 | 0.364219 | -0.644904 | -0.503214 | -0.156539 | -0.695779 | 0.401074 | -1.148752 | -0.776109 | -0.705422 | -0.731047 | -0.275857 | -0.711145 | 0.636923 | 1.470655 | 0.881187 | 0.821995 | 0.787345 | -0.006871 | 0.106317 | 0.137237 | -0.21882 | -0.322100 | 1.229815 | -0.110761 | -0.120360 | 0.358513 | -0.114995 | -0.156381 | -0.27966 | -0.330010 | -0.302502 | 1.148152 | -0.090805 | -0.046248 | 0.478096 | -0.034106 | 0.680979 | -0.104754 | -0.584182 | -0.621035 | -0.287892 | -0.314306 | -0.095727 | 0.058690 | -0.072737 | -0.068863 | -0.025666 | -0.054941 | 0.020711 | 0.278160 | -0.070267 | -0.078695 | 0.138218 | -0.050869 | -0.039686 | 0.021829 | 0.084487 | 0.370281 | -0.162056 | -0.128165 | -0.305512 | -0.278944 | -0.287562 | -0.299222 | -0.290962 | -0.096935 | -0.286254 | -0.179678 | -0.226533 | -0.245098 | -0.261712 | -0.089698 | -0.089698 | 0.277141 | 1.265699 | 0.121183 | 0.554447 | 0.027355 | -0.296008 | -0.207992 | -0.189127 | -0.193615 | -0.206256 | -0.197852 | -0.037477 | -0.191291 | -0.116972 | -0.167636 | -0.169396 | -0.087856 | 0.201159 | -0.693043 | -0.071763 |
1 | -1.135161 | -1.074946 | -1.265291 | -0.814627 | -1.320662 | -0.856409 | -0.801818 | 0.346378 | 2.314693 | 0.532571 | -0.515282 | 0.772336 | -0.053339 | 0.864671 | -0.443053 | 1.641342 | 1.117504 | 1.576367 | 1.801556 | -0.275942 | 0.311806 | 0.12929 | -1.46038 | -1.304126 | -1.067188 | -1.511103 | -1.299996 | -0.895852 | -0.524318 | -0.661349 | -0.753927 | 0.671055 | 0.754381 | -0.285714 | -1.498260 | -0.775165 | -0.630613 | -0.736564 | 0.955764 | -1.071623 | -0.241038 | -0.181044 | -0.484700 | 0.140838 | 0.106092 | 1.344084 | 1.579766 | -0.288873 | -0.687060 | 1.000897 | 0.30806 | 2.560298 | 1.082087 | 1.531381 | 1.350601 | 1.622643 | 1.506380 | 1.302110 | 1.567133 | -0.28162 | 0.975135 | 0.867877 | -0.554878 | -0.848913 | 1.523043 | 0.013307 | 0.006846 | 0.492647 | 0.478280 | -0.295723 | -0.675111 | 1.397196 | 0.520103 | 2.602291 | 1.582860 | 0.273961 | 1.559159 | 1.226992 | -0.396782 | 1.602825 | 1.675097 | 2.569011 | 2.895279 | -0.250563 | -0.527440 | -0.314663 | -0.338123 | -0.795545 | -0.779839 | -0.681826 | -0.692075 | -0.700087 | -0.697163 | -1.411647 | -1.527957 | -1.398248 | -1.291600 | 0.344386 | -0.207998 | -0.442941 | -0.938924 | 0.855453 | -0.257004 | -1.211439 | -0.309818 | -0.476151 | 0.726617 | 0.149507 | 1.458647 | 1.602488 | -0.275857 | -0.474104 | -0.658262 | -0.637064 | -0.670877 | -0.648914 | -0.633365 | -0.041318 | -0.582151 | -0.583993 | -0.21882 | -0.716449 | -1.218547 | 0.428944 | -0.511883 | -1.228662 | -0.714969 | -0.323710 | -0.27966 | -0.747292 | -0.712318 | -1.056693 | 0.635603 | -0.428294 | -1.294299 | -0.647989 | -0.407430 | 0.074037 | 1.325638 | 1.594532 | -0.287892 | -0.757198 | -0.302804 | -0.300517 | -0.250741 | -0.283355 | -0.315586 | -0.075109 | -0.279917 | -0.207854 | -0.103913 | -0.378759 | -0.388285 | -0.304125 | -0.353199 | -0.406129 | -0.387329 | -0.240127 | -0.071222 | -0.128165 | -0.305512 | -0.278944 | -0.287562 | -0.299222 | -0.290962 | -0.096935 | -0.286254 | -0.179678 | -0.226533 | -0.245098 | -0.261712 | -0.089698 | -0.089698 | 1.124268 | -0.603046 | 1.750243 | 0.628987 | 0.664598 | -0.296008 | -0.207992 | -0.189127 | -0.193615 | -0.206256 | -0.197852 | -0.037477 | -0.191291 | -0.116972 | -0.167636 | -0.169396 | -0.087856 | 1.064027 | -0.693043 | -0.353180 |
References:
lgb.LGBMRegressor
boosting_type='gbdt', num_leaves=31,
max_depth=-1, learning_rate=0.1,
n_estimators=100, subsample_for_bin=200000,
objective=None, class_weight=None,
min_split_gain=0.0, min_child_weight=0.001,
min_child_samples=20, subsample=1.0,
subsample_freq=0, colsample_bytree=1.0,
reg_alpha=0.0, reg_lambda=0.0,
random_state=None, n_jobs=-1, silent=True,
importance_type='split'
# help(lgb.LGBMRegressor)
%%time
# log transform target
ytrain_log1p = np.log1p(ytrain)
model = lgb.LGBMRegressor(random_state=SEED, n_estimators=10_000)
model.fit(df_Xtrain, ytrain_log1p)
ypreds_log1p = model.predict(df_Xtest)
ypreds = np.expm1(ypreds_log1p)
print('ytest:', ytest[:3])
print('ypreds: ', ypreds[:3])
print_regr_eval(ytest,ypreds,df_Xtest.shape[1])
ytest: [285000. 239950. 460000.] ypreds: [327161.39212613 220042.54307535 540233.58275963] RMSE : 116,354.63 Explained Variance: 0.900479 R-Squared: 0.899544 Adjusted R-squared: 0.894874 CPU times: user 10min, sys: 2.7 s, total: 10min 2s Wall time: 5min 5s
# help(model.fit)
def do_crossval(df_Xtrain, ytrain, features, params,
nfolds=5,SEED=SEED,verbose=0):
"""Modelling gbm using nfolds.
We should not look at test data during cross-validation.
Fit the cross-validation with different params and choose
the model with best params.
"""
model = lgb.LGBMRegressor(**params)
# make arrays
Xtrain = np.array(df_Xtrain)
Xtest = np.array(df_Xtest)
# kfold cross validation
kf = model_selection.KFold(nfolds, shuffle=True,random_state=SEED)
importances = np.zeros(len(features)) # must be numpy array
df_preds = pd.DataFrame()
lst_valid_rmse = []
# iteration
for i, (idx_tr, idx_vd) in enumerate(kf.split(Xtrain, ytrain)):
# Dataframe for fold
df_fold = pd.DataFrame()
# Training and validation data
Xtr = Xtrain[idx_tr]
Xvd = Xtrain[idx_vd]
ytr = ytrain[idx_tr]
yvd = ytrain[idx_vd]
# Train with early stopping
model.fit(Xtr, ytr, early_stopping_rounds = 100,
eval_metric = 'l2',
eval_set = [(Xtr, ytr), (Xvd, yvd)],
eval_names = ['train', 'valid'],
verbose = verbose)
# Record the validation fold score
# l1 is mae and l2 is mse
score = model.best_score_['valid']['l2']
rmse = np.sqrt(score)
lst_valid_rmse.append(rmse)
# if we have done log transform of target
# ypred_fold_log1p = model.predict(Xvd)
# ypred_fold = np.exp1m(ypreds_fold_log1p)
importances += model.feature_importances_ / nfolds
print(f'''
Fold : {i + 1}
Validation RMSE : {rmse:,.0f}
Estimators Trained: {model.best_iteration_}
''')
# outside for-loop
arr_valid_rmse = np.array(lst_valid_rmse)
m = arr_valid_rmse.mean()
s = arr_valid_rmse.std()
# feature importances
df_feat_imp = pd.DataFrame({'feature' : features,
'importance': importances})
df_feat_imp = df_feat_imp.sort_values('importance',ascending=False)
top10_feats = df_feat_imp['feature'].head(10).to_list()
print(f"""
{nfolds} fold cross validation score is {m:,.0f} +- {s:,.0f}
Top 10 features: {top10_feats}
""")
return
params_lgb = {'random_state': SEED,'n_jobs': -1,
'n_estimators': 10_000
}
do_crossval(df_Xtrain, ytrain, features, params_lgb)
Fold : 1 Validation RMSE : 135,442 Estimators Trained: 584 Fold : 2 Validation RMSE : 127,327 Estimators Trained: 124 Fold : 3 Validation RMSE : 134,282 Estimators Trained: 133 Fold : 4 Validation RMSE : 110,940 Estimators Trained: 254 Fold : 5 Validation RMSE : 112,953 Estimators Trained: 142 5 fold cross validation score is 124,189 +- 10,394 Top 10 features: ['lat', 'long', 'grade / yr_built', 'zipcode', 'bathrooms / yr_built', 'condition / yr_built', 'sqft_lot / sqft_lot15', 'floors / yr_built', 'grade / sqft_living15', 'age_after_renovation']
# feature importance
df_imp = pd.DataFrame({'Feature': features,
'Importance_gain': model.feature_importances_
})
df_imp.nlargest(10,'Importance_gain').style.background_gradient()
Feature | Importance_gain | |
---|---|---|
84 | lat | 11324 |
191 | yr_sales | 9151 |
97 | long | 9100 |
150 | sqft_lot / sqft_lot15 | 9041 |
0 | age | 8742 |
192 | zipcode | 5847 |
1 | age_after_renovation | 5822 |
70 | grade / yr_built | 5306 |
127 | sqft_living / sqft_living15 | 5128 |
68 | grade / sqft_living15 | 5051 |
(df_imp
.set_index('Feature')
.nlargest(10,'Importance_gain')
.plot
.barh(figsize=(12,8))
.invert_yaxis()
)
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 8 min 33 secs