This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
Task: Try to estimate the price based on given features.
# sklearn is slower
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(model, param_grid,cv=5,n_jobs=-1
scoring='accuracy',verbose=2,random_state=random_state)
grid_search.fit(Xtrain, ytrain)
# using dask is faster
import dask
import joblib
with joblib.parallel_backend('dask'):
grid_search.fit(Xtrain, ytrain)
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
ENV_BHISHAN = None
try:
import bhishan
%load_ext autoreload
%autoreload 2
ENV_BHISHAN = True
print('Environment: Bhishan')
except:
pass
Environment: Bhishan
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import time
import io
import json
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', '{:,.3f}'.format) # numbers sep by comma
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
[('numpy', '1.16.4'), ('pandas', '0.25.2'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]
# scale and split
from sklearn.model_selection import train_test_split
# regressors
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
# regressor preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
# metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error
# cross validation
from sklearn.model_selection import cross_val_score
def show_method_attributes(obj, ncols=7,start=None, inside=None):
""" Show all the attributes of a given method.
Example:
========
show_method_attributes(list)
"""
print(f'Object Type: {type(obj)}\n')
lst = [elem for elem in dir(obj) if elem[0]!='_' ]
lst = [elem for elem in lst
if elem not in 'os np pd sys time psycopg2'.split() ]
if isinstance(start,str):
lst = [elem for elem in lst if elem.startswith(start)]
if isinstance(start,tuple) or isinstance(start,list):
lst = [elem for elem in lst for start_elem in start
if elem.startswith(start_elem)]
if isinstance(inside,str):
lst = [elem for elem in lst if inside in elem]
if isinstance(inside,tuple) or isinstance(inside,list):
lst = [elem for elem in lst for inside_elem in inside
if inside_elem in elem]
return pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
df_eval = pd.DataFrame({'Model': [],
'Details':[],
'Root Mean Squared Error (RMSE)':[],
'R-squared (training)':[],
'Adjusted R-squared (training)':[],
'R-squared (test)':[],
'Adjusted R-squared (test)':[],
'5-Fold Cross Validation':[]})
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
!ls ../data/processed/
data_cleaned.csv data_cleaned_encoded.csv
ifile = 'https://github.com/bhishanpdl/Project_House_Price_Prediction/blob/master/data/processed/data_cleaned_encoded.csv?raw=true'
ifile = '../data/processed/data_cleaned_encoded.csv'
df_raw = pd.read_csv(ifile)
print(df_raw.shape)
df_raw.head()
(21613, 92)
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | yr_sales | age | yr_renovated2 | age_after_renovation | zipcode_top10 | zipcode_houses | basement_bool | renovation_bool | age_cat | age_after_renovation_cat | waterfront_0 | waterfront_1 | view_0 | view_1 | view_2 | view_3 | view_4 | condition_1 | condition_2 | condition_3 | condition_4 | condition_5 | grade_1 | grade_10 | grade_11 | grade_12 | grade_13 | grade_3 | grade_4 | grade_5 | grade_6 | grade_7 | grade_8 | grade_9 | zipcode_top10_98004 | zipcode_top10_98006 | zipcode_top10_98033 | zipcode_top10_98039 | zipcode_top10_98040 | zipcode_top10_98102 | zipcode_top10_98105 | zipcode_top10_98155 | zipcode_top10_98177 | zipcode_top10_others | age_cat_0 | age_cat_1 | age_cat_2 | age_cat_3 | age_cat_4 | age_cat_5 | age_cat_6 | age_cat_7 | age_cat_8 | age_cat_9 | age_after_renovation_cat_0 | age_after_renovation_cat_1 | age_after_renovation_cat_2 | age_after_renovation_cat_3 | age_after_renovation_cat_4 | age_after_renovation_cat_5 | age_after_renovation_cat_6 | age_after_renovation_cat_7 | age_after_renovation_cat_8 | age_after_renovation_cat_9 | log1p_price | log1p_sqft_living | log1p_sqft_lot | log1p_sqft_above | log1p_sqft_basement | log1p_sqft_living15 | log1p_sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7129300520 | 2014-10-13 | 221,900.000 | 3 | 1.000 | 1180 | 5650 | 1.000 | 0 | 0 | 3 | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.511 | -122.257 | 1340 | 5650 | 2014 | 59 | 1955 | 59 | others | 262 | 0 | 0 | 5 | 5 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 12.310 | 7.074 | 8.640 | 7.074 | 0.000 | 7.201 | 8.640 |
1 | 6414100192 | 2014-12-09 | 538,000.000 | 3 | 2.250 | 2570 | 7242 | 2.000 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.721 | -122.319 | 1690 | 7639 | 2014 | 63 | 1991 | 23 | others | 410 | 1 | 1 | 5 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13.196 | 7.852 | 8.888 | 7.683 | 5.994 | 7.433 | 8.941 |
2 | 5631500400 | 2015-02-25 | 180,000.000 | 2 | 1.000 | 770 | 10000 | 1.000 | 0 | 0 | 3 | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.738 | -122.233 | 2720 | 8062 | 2015 | 82 | 1933 | 82 | others | 283 | 0 | 0 | 7 | 7 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 12.101 | 6.648 | 9.210 | 6.648 | 0.000 | 7.909 | 8.995 |
3 | 2487200875 | 2014-12-09 | 604,000.000 | 4 | 3.000 | 1960 | 5000 | 1.000 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.521 | -122.393 | 1360 | 5000 | 2014 | 49 | 1965 | 49 | others | 263 | 1 | 0 | 4 | 4 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 13.311 | 7.581 | 8.517 | 6.957 | 6.815 | 7.216 | 8.517 |
4 | 1954400510 | 2015-02-18 | 510,000.000 | 3 | 2.000 | 1680 | 8080 | 1.000 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.617 | -122.045 | 1800 | 7503 | 2015 | 28 | 1987 | 28 | others | 441 | 0 | 0 | 2 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13.142 | 7.427 | 8.997 | 7.427 | 0.000 | 7.496 | 8.923 |
$$ h_{\theta}(X)=\theta_{0}+\theta_{1} x $$Simple linear model has only one feature and one target. Here our target is price. From the correlation plot, I see that sqft_living is the most important feature. So, I will build a simple linear regression with sqft_living vs price.
theta_0 = lr.intercept_
theta_1 = lr.coef_
df_corr = df_raw.corr(method='pearson')
cols10 = df_corr.nlargest(10, 'price').index
df_corr = df_raw[cols10].corr()
df_corr.style.background_gradient(cmap='coolwarm', axis=None)
price | log1p_price | sqft_living | grade | log1p_sqft_living | sqft_above | sqft_living15 | log1p_sqft_living15 | log1p_sqft_above | bathrooms | |
---|---|---|---|---|---|---|---|---|---|---|
price | 1 | 0.891654 | 0.702035 | 0.667434 | 0.611757 | 0.605567 | 0.585379 | 0.544014 | 0.542774 | 0.525138 |
log1p_price | 0.891654 | 1 | 0.695341 | 0.703634 | 0.67494 | 0.601802 | 0.619312 | 0.607201 | 0.586322 | 0.550802 |
sqft_living | 0.702035 | 0.695341 | 1 | 0.762704 | 0.954368 | 0.876597 | 0.75642 | 0.732194 | 0.84324 | 0.754665 |
grade | 0.667434 | 0.703634 | 0.762704 | 1 | 0.743711 | 0.755923 | 0.713202 | 0.688419 | 0.743416 | 0.664983 |
log1p_sqft_living | 0.611757 | 0.67494 | 0.954368 | 0.743711 | 1 | 0.832336 | 0.736567 | 0.746137 | 0.865382 | 0.761316 |
sqft_above | 0.605567 | 0.601802 | 0.876597 | 0.755923 | 0.832336 | 1 | 0.73187 | 0.701817 | 0.962353 | 0.685342 |
sqft_living15 | 0.585379 | 0.619312 | 0.75642 | 0.713202 | 0.736567 | 0.73187 | 1 | 0.976821 | 0.714572 | 0.568634 |
log1p_sqft_living15 | 0.544014 | 0.607201 | 0.732194 | 0.688419 | 0.746137 | 0.701817 | 0.976821 | 1 | 0.712634 | 0.570834 |
log1p_sqft_above | 0.542774 | 0.586322 | 0.84324 | 0.743416 | 0.865382 | 0.962353 | 0.714572 | 0.712634 | 1 | 0.694954 |
bathrooms | 0.525138 | 0.550802 | 0.754665 | 0.664983 | 0.761316 | 0.685342 | 0.568634 | 0.570834 | 0.694954 | 1 |
train, test = train_test_split(df_raw,train_size = 0.8,random_state=SEED)
feature = 'sqft_living'
target = 'price'
df = df_raw
X = df[feature].values.reshape(-1,1)
y = df[target].values.reshape(-1,1)
Xtrain = train[feature].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
Xtest = test[feature].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)
lr = linear_model.LinearRegression(n_jobs=-1)
lr.fit(Xtrain,ytrain)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)
print('Intercept: {}'.format(lr.intercept_))
print('Coefficient: {}'.format(lr.coef_))
Intercept: [-42628.97651509] Coefficient: [[280.68541679]]
ypreds = lr.predict(Xtest)
rmse = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
r2_train = lr.score(Xtrain, ytrain).round(3)
r2_test = lr.score(Xtest, ytest).round(3)
cv = cross_val_score(lr, X, y, cv=5,n_jobs=-1).mean().round(3)
df_eval.columns
Index(['Model', 'Details', 'Root Mean Squared Error (RMSE)', 'R-squared (training)', 'Adjusted R-squared (training)', 'R-squared (test)', 'Adjusted R-squared (test)', '5-Fold Cross Validation'], dtype='object')
df_eval.loc[len(df_eval)] = ['Simple Linear Regression','-',
rmse,r2_train,r2_train,r2_test,r2_test,cv]
df_eval
Model | Details | Root Mean Squared Error (RMSE) | R-squared (training) | Adjusted R-squared (training) | R-squared (test) | Adjusted R-squared (test) | 5-Fold Cross Validation | |
---|---|---|---|---|---|---|---|---|
0 | Simple Linear Regression | - | 255,511.380 | 0.487 | 0.487 | 0.516 | 0.516 | 0.491 |
if ENV_BHISHAN:
from bhishan.util_model_plot import plot_simple_linear_regression
plot_simple_linear_regression(Xtest, ytest, lr,"Living Space",'Price')
df_raw.head(2)
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | yr_sales | age | yr_renovated2 | age_after_renovation | zipcode_top10 | zipcode_houses | basement_bool | renovation_bool | age_cat | age_after_renovation_cat | waterfront_0 | waterfront_1 | view_0 | view_1 | view_2 | view_3 | view_4 | condition_1 | condition_2 | condition_3 | condition_4 | condition_5 | grade_1 | grade_10 | grade_11 | grade_12 | grade_13 | grade_3 | grade_4 | grade_5 | grade_6 | grade_7 | grade_8 | grade_9 | zipcode_top10_98004 | zipcode_top10_98006 | zipcode_top10_98033 | zipcode_top10_98039 | zipcode_top10_98040 | zipcode_top10_98102 | zipcode_top10_98105 | zipcode_top10_98155 | zipcode_top10_98177 | zipcode_top10_others | age_cat_0 | age_cat_1 | age_cat_2 | age_cat_3 | age_cat_4 | age_cat_5 | age_cat_6 | age_cat_7 | age_cat_8 | age_cat_9 | age_after_renovation_cat_0 | age_after_renovation_cat_1 | age_after_renovation_cat_2 | age_after_renovation_cat_3 | age_after_renovation_cat_4 | age_after_renovation_cat_5 | age_after_renovation_cat_6 | age_after_renovation_cat_7 | age_after_renovation_cat_8 | age_after_renovation_cat_9 | log1p_price | log1p_sqft_living | log1p_sqft_lot | log1p_sqft_above | log1p_sqft_basement | log1p_sqft_living15 | log1p_sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7129300520 | 2014-10-13 | 221,900.000 | 3 | 1.000 | 1180 | 5650 | 1.000 | 0 | 0 | 3 | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.511 | -122.257 | 1340 | 5650 | 2014 | 59 | 1955 | 59 | others | 262 | 0 | 0 | 5 | 5 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 12.310 | 7.074 | 8.640 | 7.074 | 0.000 | 7.201 | 8.640 |
1 | 6414100192 | 2014-12-09 | 538,000.000 | 3 | 2.250 | 2570 | 7242 | 2.000 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.721 | -122.319 | 1690 | 7639 | 2014 | 63 | 1991 | 23 | others | 410 | 1 | 1 | 5 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13.196 | 7.852 | 8.888 | 7.683 | 5.994 | 7.433 | 8.941 |
df_raw.columns
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age', 'yr_renovated2', 'age_after_renovation', 'zipcode_top10', 'zipcode_houses', 'basement_bool', 'renovation_bool', 'age_cat', 'age_after_renovation_cat', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9', 'zipcode_top10_98004', 'zipcode_top10_98006', 'zipcode_top10_98033', 'zipcode_top10_98039', 'zipcode_top10_98040', 'zipcode_top10_98102', 'zipcode_top10_98105', 'zipcode_top10_98155', 'zipcode_top10_98177', 'zipcode_top10_others', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_4', 'age_after_renovation_cat_5', 'age_after_renovation_cat_6', 'age_after_renovation_cat_7', 'age_after_renovation_cat_8', 'age_after_renovation_cat_9', 'log1p_price', 'log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15'], dtype='object')
features_raw_few = ['bedrooms','bathrooms','sqft_living',
'sqft_lot','floors','zipcode']
features_raw_all = ['bedrooms','bathrooms','sqft_living','sqft_lot',
'floors','waterfront','view','condition','grade',
'sqft_above','yr_built','yr_renovated',
'zipcode','lat','long','sqft_living15','sqft_lot15']
df.columns
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age', 'yr_renovated2', 'age_after_renovation', 'zipcode_top10', 'zipcode_houses', 'basement_bool', 'renovation_bool', 'age_cat', 'age_after_renovation_cat', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9', 'zipcode_top10_98004', 'zipcode_top10_98006', 'zipcode_top10_98033', 'zipcode_top10_98039', 'zipcode_top10_98040', 'zipcode_top10_98102', 'zipcode_top10_98105', 'zipcode_top10_98155', 'zipcode_top10_98177', 'zipcode_top10_others', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_4', 'age_after_renovation_cat_5', 'age_after_renovation_cat_6', 'age_after_renovation_cat_7', 'age_after_renovation_cat_8', 'age_after_renovation_cat_9', 'log1p_price', 'log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15'], dtype='object')
df.filter(regex='age').columns
Index(['age', 'age_after_renovation', 'age_cat', 'age_after_renovation_cat', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_4', 'age_after_renovation_cat_5', 'age_after_renovation_cat_6', 'age_after_renovation_cat_7', 'age_after_renovation_cat_8', 'age_after_renovation_cat_9'], dtype='object')
features_processed_cat_age = [ 'age_cat_0', 'age_cat_1', 'age_cat_2',
'age_cat_3', 'age_cat_4', 'age_cat_5',
'age_cat_6', 'age_cat_7', 'age_cat_8',
'age_cat_9']
features_processed_cat_agernv = [
'age_after_renovation_cat_0','age_after_renovation_cat_1',
'age_after_renovation_cat_2', 'age_after_renovation_cat_3',
'age_after_renovation_cat_4', 'age_after_renovation_cat_5',
'age_after_renovation_cat_6', 'age_after_renovation_cat_7',
'age_after_renovation_cat_8', 'age_after_renovation_cat_9']
features_processed_few = features_raw_all + features_processed_cat_age
features_processed_many = (features_raw_all + features_processed_cat_age
+ features_processed_cat_agernv)
print(features_processed_many)
['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_4', 'age_after_renovation_cat_5', 'age_after_renovation_cat_6', 'age_after_renovation_cat_7', 'age_after_renovation_cat_8', 'age_after_renovation_cat_9']
import io
import json
myjson = {
'features_raw_few' : features_raw_few,
'features_raw_all': features_raw_all,
'features_processed_few': features_processed_few,
'features_processed_many': features_processed_many
}
with io.open('../models/features_names.json', 'w', encoding='utf8') as fo:
str_ = json.dumps(myjson,
indent=4,
sort_keys=False,
separators=(',', ': '),
ensure_ascii=False)
fo.write(str(str_))
When we have more than one features to estimate the target, it is called multiple linear regression. The equation of the model is given below:
$$ h_{\theta}(X)=\theta_{0}+\theta_{1} x_{1}+\theta_{2} x_{2}+\ldots+\theta_{n} x_{n} $$For multiple linear regression we also have Adjusted R-squared values to better account additional number of features used.
$$ \overline{R^{2}}=R^{2}-\frac{k-1}{n-k}\left(1-R^{2}\right) $$where n
is the number of observations and k
is the number of parameters.
def adjustedR2(rsquared,nrows,kcols):
return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)
def multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest,cv=5):
""" Multiple Linear Regression Modelling using given model.
Returns:
rmse, r2_train, ar2_train, r2_test, ar2_test, cv
"""
def adjustedR2(rsquared,nrows,kcols):
return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)
# fitting
model.fit(Xtrain,ytrain)
# prediction
ypreds = model.predict(Xtest)
# metrics
rmse = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
r2_train = model.score(Xtrain, ytrain).round(3)
r2_test = model.score(Xtest, ytest).round(3)
cv = cross_val_score(model, X, y, cv=5,n_jobs=-1).mean().round(3)
ar2_train = adjustedR2(model.score(Xtrain,ytrain),
Xtrain.shape[0],
len(features)).round(3)
ar2_test = adjustedR2(model.score(Xtest,ytest),
Xtest.shape[0] ,
len(features)).round(3)
return (rmse, r2_train, ar2_train, r2_test, ar2_test, cv)
features = features_raw_few
target = ['price']
df = df_raw[features_raw_few + target]
X = df[features].values
y = df[target].values.reshape(-1,1)
Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)
Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)
model = linear_model.LinearRegression()
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
df_eval.loc[len(df_eval)] = ['Multiple Linear Regression','few features, unprocessed',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
df_eval
Model | Details | Root Mean Squared Error (RMSE) | R-squared (training) | Adjusted R-squared (training) | R-squared (test) | Adjusted R-squared (test) | 5-Fold Cross Validation | |
---|---|---|---|---|---|---|---|---|
0 | Simple Linear Regression | - | 255,511.380 | 0.487 | 0.487 | 0.516 | 0.516 | 0.491 |
1 | Multiple Linear Regression | few features, unprocessed | 250,717.169 | 0.510 | 0.510 | 0.534 | 0.533 | 0.512 |
features = features_processed_few
target = ['price']
df = df_raw[features + target]
X = df[features].values
y = df[target].values.reshape(-1,1)
X_train = train[features].values
y_train = train[target].values.reshape(-1,1)
X_test = test[features].values
y_test = test[target].values.reshape(-1,1)
model = linear_model.LinearRegression()
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
df_eval.loc[row] = ['Multiple Linear Regression','few features, processed',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
df_eval
Model | Details | Root Mean Squared Error (RMSE) | R-squared (training) | Adjusted R-squared (training) | R-squared (test) | Adjusted R-squared (test) | 5-Fold Cross Validation | |
---|---|---|---|---|---|---|---|---|
0 | Simple Linear Regression | - | 255,511.380 | 0.487 | 0.487 | 0.516 | 0.516 | 0.491 |
1 | Multiple Linear Regression | few features, unprocessed | 250,717.169 | 0.510 | 0.510 | 0.534 | 0.533 | 0.512 |
2 | Multiple Linear Regression | few features, processed | 250,717.169 | 0.510 | 0.510 | 0.534 | 0.531 | 0.702 |
All raw features + age_binned + age_renovated_binned
features = features_processed_many
target = ['price']
df = df_raw[features + target]
X = df[features].values
y = df[target].values.reshape(-1,1)
Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)
Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)
model = linear_model.LinearRegression()
rmse, r2_train, ar2_train, r2_test, ar2_test, cv =\
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
df_eval.loc[row] = ['Multiple Linear Regression','many features, processed',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
df_eval
Model | Details | Root Mean Squared Error (RMSE) | R-squared (training) | Adjusted R-squared (training) | R-squared (test) | Adjusted R-squared (test) | 5-Fold Cross Validation | |
---|---|---|---|---|---|---|---|---|
0 | Simple Linear Regression | - | 255,511.380 | 0.487 | 0.487 | 0.516 | 0.516 | 0.491 |
1 | Multiple Linear Regression | few features, unprocessed | 250,717.169 | 0.510 | 0.510 | 0.534 | 0.533 | 0.512 |
2 | Multiple Linear Regression | few features, processed | 250,717.169 | 0.510 | 0.510 | 0.534 | 0.531 | 0.702 |
3 | Multiple Linear Regression | many features, processed | 197,259.488 | 0.706 | 0.706 | 0.711 | 0.709 | 0.704 |
Popular regularization methods:
Ridge regression is called L2 regularization and by adding a penalty, we obtain the below equation $$ R S S_{R I D G E}=\sum_{i=1}^{m}\left(h_{\theta}\left(x_{i}\right)-y_{i}\right)^{2}+\alpha \sum_{j=1}^{n} \theta_{j}^{2} $$
features = features_processed_many
target = ['price']
df = df_raw[features+target]
X = df[features].values
y = df[target].values.reshape(-1,1)
Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)
Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)
model = linear_model.Ridge(alpha=1)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
df_eval.loc[row] = ['Multiple Linear Regression Ridge Regularization (L2)',
'alpha=1, many features, processed',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
features = features_processed_many
target = ['price']
df = df_raw[features+target]
X = df[features].values
y = df[target].values.reshape(-1,1)
Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)
Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)
model = linear_model.Ridge(alpha=1000)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
df_eval.loc[row] = ['Multiple Linear Regression Ridge Regularization (L2)',
'alpha=1000, many features, processed',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
Technically the Lasso model is optimizing the same objective function as the Elastic Net with l1_ratio=1.0 (no L2 penalty).
The optimization objective for Lasso is:
(1 / (2 n_samples)) ||y - Xw||^2_2 + alpha * ||w||_1
features = features_processed_many
target = ['price']
df = df_raw[features+target]
X = df[features].values
y = df[target].values.reshape(-1,1)
Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)
Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)
model = linear_model.Lasso(alpha=1, random_state=SEED)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
df_eval.loc[row] = ['Multiple Linear Regression Lasso Regularization (L1)',
'alpha=1, many features, processed',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:475: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 336521445098471.8, tolerance: 233028249172.05145 positive)
features = features_processed_many
target = ['price']
df = df_raw[features+target]
X = df[features].values
y = df[target].values.reshape(-1,1)
Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)
Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)
model = linear_model.Lasso(alpha=100,random_state=SEED)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
df_eval.loc[len(df_eval)] = ['Multiple Linear Regression Lasso Regularization',
'alpha=100, many features, processed',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
df_eval
Model | Details | Root Mean Squared Error (RMSE) | R-squared (training) | Adjusted R-squared (training) | R-squared (test) | Adjusted R-squared (test) | 5-Fold Cross Validation | |
---|---|---|---|---|---|---|---|---|
0 | Simple Linear Regression | - | 255,511.380 | 0.487 | 0.487 | 0.516 | 0.516 | 0.491 |
1 | Multiple Linear Regression | few features, unprocessed | 250,717.169 | 0.510 | 0.510 | 0.534 | 0.533 | 0.512 |
2 | Multiple Linear Regression | few features, processed | 250,717.169 | 0.510 | 0.510 | 0.534 | 0.531 | 0.702 |
3 | Multiple Linear Regression | many features, processed | 197,259.488 | 0.706 | 0.706 | 0.711 | 0.709 | 0.704 |
4 | Multiple Linear Regression Ridge Regularization (L2) | alpha=1, many features, processed | 197,256.275 | 0.706 | 0.706 | 0.711 | 0.709 | 0.704 |
5 | Multiple Linear Regression Ridge Regularization (L2) | alpha=1000, many features, processed | 210,224.755 | 0.664 | 0.663 | 0.672 | 0.669 | 0.661 |
6 | Multiple Linear Regression Lasso Regularization (L1) | alpha=1, many features, processed | 197,259.114 | 0.706 | 0.706 | 0.711 | 0.709 | 0.704 |
7 | Multiple Linear Regression Lasso Regularization | alpha=100, many features, processed | 197,276.866 | 0.706 | 0.706 | 0.711 | 0.709 | 0.704 |
features = features_raw_few
target = ['price']
df = df_raw[features+target]
polyfeat = PolynomialFeatures(degree=2)
X = polyfeat.fit_transform(df[features])
Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])
y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)
model = linear_model.LinearRegression(n_jobs=-1)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg=2, few features,\
unprocessed, no regularization',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
features = features_raw_few
target = ['price']
df = df_raw[features+target]
polyfeat = PolynomialFeatures(degree=3)
X = polyfeat.fit_transform(df[features])
Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])
y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)
model = linear_model.LinearRegression(n_jobs=-1)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg=3, \
few features, unprocessed, no regularization',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
features = features_raw_all
target = ['price']
df = df_raw[features+target]
polyfeat = PolynomialFeatures(degree=2)
X = polyfeat.fit_transform(df[features])
Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])
y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)
model = linear_model.LinearRegression(n_jobs=-1)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg=2, all features,\
unprocessed, no regularization',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
features = features_raw_all
target = ['price']
df = df_raw[features+target]
polyfeat = PolynomialFeatures(degree=3)
X = polyfeat.fit_transform(df[features])
X_train = polyfeat.fit_transform(train[features])
X_test = polyfeat.fit_transform(test[features])
y = df[target].values.reshape(-1,1)
y_train = train[target].values.reshape(-1,1)
y_test = test[target].values.reshape(-1,1)
model = linear_model.LinearRegression(n_jobs=-1)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg =3, all features,\
unprocessed, no regularization',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
time_start = time.time()
features = features_processed_many
target = ['price']
df = df_raw[features+target]
polyfeat = PolynomialFeatures(degree=2)
X = polyfeat.fit_transform(df[features])
Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])
y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)
model = linear_model.LinearRegression(n_jobs=-1)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
df_eval.loc[row] = ['Polynomial Regression','deg =2, many features,\
processed, no regularization',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py:706: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak. "timeout or by a memory leak.", UserWarning
Time taken: 0 min 7 secs
time_start = time.time()
features = features_processed_many
target = ['price']
df = df_raw[features+target]
polyfeat = PolynomialFeatures(degree=2)
X = polyfeat.fit_transform(df[features])
Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])
y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)
model = linear_model.Ridge(alpha=1,random_state=SEED)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg=2, many features,\
processed, Ridge alpha=1',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/linear_model/ridge.py:147: LinAlgWarning: Ill-conditioned matrix (rcond=1.89017e-26): result may not be accurate. overwrite_a=True).T
Time taken: 0 min 3 secs
features = features_processed_many
target = ['price']
df = df_raw[features+target]
polyfeat = PolynomialFeatures(degree=2)
X = polyfeat.fit_transform(df[features])
Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])
y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)
model = linear_model.Ridge(alpha=50000,random_state=SEED)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg =2, many features, processed,\
Ridge alpha=50000',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/linear_model/ridge.py:147: LinAlgWarning: Ill-conditioned matrix (rcond=8.59361e-22): result may not be accurate. overwrite_a=True).T
time_start = time.time()
features = features_processed_many
target = ['price']
df = df_raw[features+target]
polyfeat = PolynomialFeatures(degree=2)
X = polyfeat.fit_transform(df[features])
Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])
y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)
model = linear_model.Lasso(alpha=1,random_state=SEED)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg=2, all features,\
processed, Lasso alpha=1',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:475: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 219134165009892.25, tolerance: 233028249172.05145 positive)
Time taken: 0 min 45 secs
time_start = time.time()
features = features_processed_many
target = ['price']
df = df_raw[features+target]
polyfeat = PolynomialFeatures(degree=2)
X = polyfeat.fit_transform(df[features])
Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])
y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)
model = linear_model.Lasso(alpha=50000,random_state=SEED)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
df_eval.loc[len(df_eval)] = ['Polynomial Regression','deg =2, all features,\
processed, Lasso alpha=50000',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:475: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 232071906362195.44, tolerance: 233028249172.05145 positive)
Time taken: 0 min 35 secs
Notes
The object solves the same problem as the LassoCV object. However, unlike the LassoCV, it find the relevant alphas values by itself. In general, because of this property, it will be more stable. However, it is more fragile to heavily multicollinear datasets.
It is more efficient than the LassoCV if only a small number of features are selected compared to the total number, for instance if there are very few samples compared to the number of features.
def adjustedR2(rsquared,nrows,kcols):
return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)
from sklearn.linear_model import LassoLarsCV
time_start = time.time()
features = features_processed_many
target = ['price']
df = df_raw[features+target]
X = df[features].values
y = df[target].values.reshape(-1,1)
Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)
Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)
model = linear_model.LassoLarsCV(cv=5,n_jobs=-1,verbose=2,max_iter=1000)
# fitting
model.fit(Xtrain,ytrain)
# prediction
ypreds = model.predict(Xtest)
# metrics
rmse = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
r2_train = model.score(Xtrain, ytrain).round(3)
r2_test = model.score(Xtest, ytest).round(3)
# rows and cols
nrows = df.shape[0]
kcols = len(features)
# adjusted rsquared
ar2_train = adjustedR2(r2_train,nrows,kcols)
ar2_test = adjustedR2(r2_test,nrows,kcols)
df_eval.loc[len(df_eval)] = ['Linear Regression LassoLarsCV',
'many features processed',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/utils/validation.py:724: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
.Time taken: 0 min 0 secs
[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 0.2s finished
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows
df_eval.sort_values('Adjusted R-squared (test)',ascending=False)
Model | Details | Root Mean Squared Error (RMSE) | R-squared (training) | Adjusted R-squared (training) | R-squared (test) | Adjusted R-squared (test) | 5-Fold Cross Validation | |
---|---|---|---|---|---|---|---|---|
10 | Polynomial Regression | deg=2, all features, unprocessed, no regularization | 158,822.055 | 0.831 | 0.831 | 0.813 | 0.812 | 0.813 |
11 | Polynomial Regression | deg =3, all features, unprocessed, no regularization | 158,822.055 | 0.831 | 0.831 | 0.813 | 0.812 | -2.454 |
13 | Polynomial Regression | deg=2, many features, processed, Ridge alpha=1 | 159,185.146 | 0.846 | 0.846 | 0.812 | 0.810 | 0.808 |
14 | Polynomial Regression | deg =2, many features, processed, Ridge alpha=50000 | 165,440.773 | 0.821 | 0.820 | 0.797 | 0.795 | 0.792 |
15 | Polynomial Regression | deg=2, all features, processed, Lasso alpha=1 | 174,534.762 | 0.812 | 0.812 | 0.774 | 0.772 | 0.778 |
16 | Polynomial Regression | deg =2, all features, processed, Lasso alpha=50000 | 176,297.103 | 0.803 | 0.802 | 0.769 | 0.767 | 0.781 |
17 | Linear Regression LassoLarsCV | many features processed | 197,309.546 | 0.706 | 0.706 | 0.711 | 0.711 | 0.781 |
3 | Multiple Linear Regression | many features, processed | 197,259.488 | 0.706 | 0.706 | 0.711 | 0.709 | 0.704 |
4 | Multiple Linear Regression Ridge Regularization (L2) | alpha=1, many features, processed | 197,256.275 | 0.706 | 0.706 | 0.711 | 0.709 | 0.704 |
6 | Multiple Linear Regression Lasso Regularization (L1) | alpha=1, many features, processed | 197,259.114 | 0.706 | 0.706 | 0.711 | 0.709 | 0.704 |
7 | Multiple Linear Regression Lasso Regularization | alpha=100, many features, processed | 197,276.866 | 0.706 | 0.706 | 0.711 | 0.709 | 0.704 |
5 | Multiple Linear Regression Ridge Regularization (L2) | alpha=1000, many features, processed | 210,224.755 | 0.664 | 0.663 | 0.672 | 0.669 | 0.661 |
12 | Polynomial Regression | deg =2, many features, processed, no regularization | 229,581.128 | 0.849 | 0.848 | 0.609 | 0.606 | -3.857 |
9 | Polynomial Regression | deg=3, few features, unprocessed, no regularization | 237,502.457 | 0.584 | 0.584 | 0.581 | 0.581 | 0.490 |
8 | Polynomial Regression | deg=2, few features, unprocessed, no regularization | 237,956.794 | 0.569 | 0.568 | 0.580 | 0.579 | 0.540 |
1 | Multiple Linear Regression | few features, unprocessed | 250,717.169 | 0.510 | 0.510 | 0.534 | 0.533 | 0.512 |
2 | Multiple Linear Regression | few features, processed | 250,717.169 | 0.510 | 0.510 | 0.534 | 0.531 | 0.702 |
0 | Simple Linear Regression | - | 255,511.380 | 0.487 | 0.487 | 0.516 | 0.516 | 0.491 |
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
target = ['price']
features = features_processed_many
df = df_raw[features+target]
X = df[features].values
y = df[target].values.reshape(-1,1)
Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)
Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)
# scaling
import joblib
from sklearn.preprocessing import StandardScaler
# for linear models scaling is useful
scaler = StandardScaler()
scaler.fit(Xtrain)
# persist the model for future use
joblib.dump(scaler, '../models/MinMaxScaler_features_processed_many.pkl')
# scale transform
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)
model = linear_model.Lasso(alpha=0.05,random_state=SEED,
max_iter=10_000, tol=0.01)
model.fit(Xtrain, ytrain)
# persist the model for future use
joblib.dump(model, '../models/lasso_regression_alpha_05_features_processed_many.pkl')
# prediction
ypreds = model.predict(Xtest)
# metrics
rmse_test = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
r2_train = model.score(Xtrain, ytrain).round(3)
r2_test = model.score(Xtest, ytest).round(3)
# prints
print('model = Lasso with Standard Scaling and many processed features')
print('rmse test = ', rmse_test)
print('rsquared train = ', r2_train)
print('rsquared test = ', r2_test)
model = Lasso with MinMax Scaling and many processed features rmse test = 197259.614 rsquared train = 0.706 rsquared test = 0.711
# Feature importance
sel = SelectFromModel(Lasso(alpha=0.05, random_state=SEED,tol=0.01))
sel.fit(Xtrain, ytrain)
sel.get_support()
array([ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True])
selected_feat = train[features].columns[sel.get_support()]
selected_feat
Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_4', 'age_after_renovation_cat_5', 'age_after_renovation_cat_6', 'age_after_renovation_cat_7', 'age_after_renovation_cat_8', 'age_after_renovation_cat_9'], dtype='object')
print('total features: {}'.format(len(features)))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
np.sum(sel.estimator_.coef_ == 0)))
total features: 37 selected features: 37 features with coefficients shrank to zero: 0
import io
import json
myjson = {'model': 'Lasso with alpha 0.05 and many processed features',
'selected_features': selected_feat.values.tolist()}
with io.open('../models/lasso_alpha_005_selected_features.json', 'w', encoding='utf8') as fo:
str_ = json.dumps(myjson,
indent=4,
sort_keys=True,
separators=(',', ': '),
ensure_ascii=False)
fo.write(str(str_))
# Laso evaluation
plt.scatter(ytest, ypreds)
plt.xlabel('True House Price')
plt.ylabel('Predicted House Price')
plt.title('Evaluation of Lasso Predictions')
plt.xticks(rotation=90)
(array([-1000000., 0., 1000000., 2000000., 3000000., 4000000., 5000000., 6000000., 7000000., 8000000.]), <a list of 10 Text xticklabel objects>)
errors = ytest.ravel() - ypreds.ravel()
errors
array([-131627.15561054, -9655.37749367, -131804.98611956, ..., 198212.68183312, -154168.31099857, 87765.5346244 ])
plt.hist(errors, bins=20);
sns.distplot(errors,norm_hist=True,) # errors should follow normal distribution
<matplotlib.axes._subplots.AxesSubplot at 0x11b689320>
ser_fimp = pd.Series(np.abs(model.coef_.ravel()))
ser_fimp.index = features
ser_fimp.sort_values(inplace=True)
ax = ser_fimp.plot.barh(figsize=(18,18))
ax.tick_params(axis='both', which='major', labelsize=14)
plt.xlabel('Lasso Coefficients',fontsize=24)
plt.ylabel('Features',fontsize=24)
plt.title('Feature Importance',fontsize=24)
plt.tight_layout()
plt.show()
Best model: Polynomial Regression deg=2, all features, unprocessed, no regularization
features = features_raw_all
target = ['price']
df = df_raw[features+target]
polyfeat = PolynomialFeatures(degree=2)
X = polyfeat.fit_transform(df[features])
Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])
y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)
model = linear_model.LinearRegression(n_jobs=-1)
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)
row = df_eval.shape[0]
print(ar2_test)
0.812
df.head(2)
bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 1.000 | 1180 | 5650 | 1.000 | 0 | 0 | 3 | 7 | 1180 | 1955 | 0 | 98178 | 47.511 | -122.257 | 1340 | 5650 | 221,900.000 |
1 | 3 | 2.250 | 2570 | 7242 | 2.000 | 0 | 0 | 3 | 7 | 2170 | 1951 | 1991 | 98125 | 47.721 | -122.319 | 1690 | 7639 | 538,000.000 |
from sklearn.compose import TransformedTargetRegressor as TTR
from sklearn.preprocessing import StandardScaler
time_start = time.time()
X = polyfeat.fit_transform(df[features])
Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])
y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)
# for linear models scaling is useful
scaler = StandardScaler()
scaler.fit(Xtrain)
# scale transform
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)
model = linear_model.LinearRegression(n_jobs=-1)
model = TTR(model, func=np.log1p, inverse_func=np.expm1)
model.fit(Xtrain,ytrain)
# prediction
ypreds = model.predict(Xtest)
# metrics
rmse_test = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
r2_train = model.score(Xtrain, ytrain).round(3)
r2_test = model.score(Xtest, ytest).round(3)
# prints
print('model = Best model')
print('rmse test = ', rmse_test)
print('rsquared train = ', r2_train)
print('rsquared test = ', r2_test)
df_eval.loc[len(df_eval)] = ['Linear Regression',
'all features, standard scaler, transform ytrain',
rmse,r2_train,ar2_train,r2_test,ar2_test,cv]
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
df_eval.sort_values('Adjusted R-squared (test)',ascending=False)
model = Best model rmse test = 152548.214 rsquared train = 0.84 rsquared test = 0.827 Time taken: 0 min 0 secs
Model | Details | Root Mean Squared Error (RMSE) | R-squared (training) | Adjusted R-squared (training) | R-squared (test) | Adjusted R-squared (test) | 5-Fold Cross Validation | |
---|---|---|---|---|---|---|---|---|
18 | Linear Regression | all features, standard scaler, transform ytrain | 158,822.055 | 0.840 | 0.831 | 0.827 | 0.812 | 0.813 |
10 | Polynomial Regression | deg=2, all features, unprocessed, no regularization | 158,822.055 | 0.831 | 0.831 | 0.813 | 0.812 | 0.813 |
11 | Polynomial Regression | deg =3, all features, unprocessed, no regularization | 158,822.055 | 0.831 | 0.831 | 0.813 | 0.812 | -2.454 |
13 | Polynomial Regression | deg=2, many features, processed, Ridge alpha=1 | 159,185.146 | 0.846 | 0.846 | 0.812 | 0.810 | 0.808 |
14 | Polynomial Regression | deg =2, many features, processed, Ridge alpha=50000 | 165,440.773 | 0.821 | 0.820 | 0.797 | 0.795 | 0.792 |
15 | Polynomial Regression | deg=2, all features, processed, Lasso alpha=1 | 174,534.762 | 0.812 | 0.812 | 0.774 | 0.772 | 0.778 |
16 | Polynomial Regression | deg =2, all features, processed, Lasso alpha=50000 | 176,297.103 | 0.803 | 0.802 | 0.769 | 0.767 | 0.781 |
17 | Linear Regression LassoLarsCV | many features processed | 197,309.546 | 0.706 | 0.706 | 0.711 | 0.711 | 0.781 |
3 | Multiple Linear Regression | many features, processed | 197,259.488 | 0.706 | 0.706 | 0.711 | 0.709 | 0.704 |
4 | Multiple Linear Regression Ridge Regularization (L2) | alpha=1, many features, processed | 197,256.275 | 0.706 | 0.706 | 0.711 | 0.709 | 0.704 |
6 | Multiple Linear Regression Lasso Regularization (L1) | alpha=1, many features, processed | 197,259.114 | 0.706 | 0.706 | 0.711 | 0.709 | 0.704 |
7 | Multiple Linear Regression Lasso Regularization | alpha=100, many features, processed | 197,276.866 | 0.706 | 0.706 | 0.711 | 0.709 | 0.704 |
5 | Multiple Linear Regression Ridge Regularization (L2) | alpha=1000, many features, processed | 210,224.755 | 0.664 | 0.663 | 0.672 | 0.669 | 0.661 |
12 | Polynomial Regression | deg =2, many features, processed, no regularization | 229,581.128 | 0.849 | 0.848 | 0.609 | 0.606 | -3.857 |
9 | Polynomial Regression | deg=3, few features, unprocessed, no regularization | 237,502.457 | 0.584 | 0.584 | 0.581 | 0.581 | 0.490 |
8 | Polynomial Regression | deg=2, few features, unprocessed, no regularization | 237,956.794 | 0.569 | 0.568 | 0.580 | 0.579 | 0.540 |
1 | Multiple Linear Regression | few features, unprocessed | 250,717.169 | 0.510 | 0.510 | 0.534 | 0.533 | 0.512 |
2 | Multiple Linear Regression | few features, processed | 250,717.169 | 0.510 | 0.510 | 0.534 | 0.531 | 0.702 |
0 | Simple Linear Regression | - | 255,511.380 | 0.487 | 0.487 | 0.516 | 0.516 | 0.491 |