%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;


ENV_BHISHAN = None

try:
    import bhishan
    %load_ext autoreload
    %autoreload 2
    
    ENV_BHISHAN = True
    
    print('Environment: Bhishan')
except:
    pass

Environment: Bhishan


import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import os
import time

import io
import json

# random state
SEED = 0
RNG = np.random.RandomState(SEED)


# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', '{:,.3f}'.format) # numbers sep by comma
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)

print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])

[('numpy', '1.16.4'), ('pandas', '0.25.2'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]


# scale and split
from sklearn.model_selection import train_test_split


# regressors
from sklearn import linear_model
from sklearn.linear_model import LinearRegression


# regressor preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures


# metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error


# cross validation
from sklearn.model_selection import cross_val_score


def show_method_attributes(obj, ncols=7,start=None, inside=None):
    """ Show all the attributes of a given method.
    Example:
    ========
    show_method_attributes(list)
     """

    print(f'Object Type: {type(obj)}\n')
    lst = [elem for elem in dir(obj) if elem[0]!='_' ]
    lst = [elem for elem in lst 
           if elem not in 'os np pd sys time psycopg2'.split() ]

    if isinstance(start,str):
        lst = [elem for elem in lst if elem.startswith(start)]
        
    if isinstance(start,tuple) or isinstance(start,list):
        lst = [elem for elem in lst for start_elem in start
               if elem.startswith(start_elem)]
        
    if isinstance(inside,str):
        lst = [elem for elem in lst if inside in elem]
        
    if isinstance(inside,tuple) or isinstance(inside,list):
        lst = [elem for elem in lst for inside_elem in inside
               if inside_elem in elem]

    return pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')


df_eval = pd.DataFrame({'Model': [],
                           'Details':[],
                           'Root Mean Squared Error (RMSE)':[],
                           'R-squared (training)':[],
                           'Adjusted R-squared (training)':[],
                           'R-squared (test)':[],
                           'Adjusted R-squared (test)':[],
                           '5-Fold Cross Validation':[]})


%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;


!ls ../data/processed/

data_cleaned.csv         data_cleaned_encoded.csv


ifile = 'https://github.com/bhishanpdl/Project_House_Price_Prediction/blob/master/data/processed/data_cleaned_encoded.csv?raw=true'
ifile = '../data/processed/data_cleaned_encoded.csv'
df_raw = pd.read_csv(ifile)
print(df_raw.shape)
df_raw.head()

(21613, 92)


df_corr = df_raw.corr(method='pearson')

cols10 = df_corr.nlargest(10, 'price').index

df_corr = df_raw[cols10].corr()
df_corr.style.background_gradient(cmap='coolwarm', axis=None)


train, test = train_test_split(df_raw,train_size = 0.8,random_state=SEED)

feature = 'sqft_living'
target = 'price'

df = df_raw

X = df[feature].values.reshape(-1,1)
y = df[target].values.reshape(-1,1)

Xtrain = train[feature].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)

Xtest = test[feature].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)


lr = linear_model.LinearRegression(n_jobs=-1)

lr.fit(Xtrain,ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)


print('Intercept: {}'.format(lr.intercept_))
print('Coefficient: {}'.format(lr.coef_))

Intercept: [-42628.97651509]
Coefficient: [[280.68541679]]


ypreds = lr.predict(Xtest)


rmse = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
r2_train = lr.score(Xtrain, ytrain).round(3)
r2_test = lr.score(Xtest, ytest).round(3)

cv = cross_val_score(lr, X, y, cv=5,n_jobs=-1).mean().round(3)


df_eval.columns

Index(['Model', 'Details', 'Root Mean Squared Error (RMSE)',
       'R-squared (training)', 'Adjusted R-squared (training)',
       'R-squared (test)', 'Adjusted R-squared (test)',
       '5-Fold Cross Validation'],
      dtype='object')


df_eval.loc[len(df_eval)] = ['Simple Linear Regression','-',
                             rmse,r2_train,r2_train,r2_test,r2_test,cv]
df_eval


if ENV_BHISHAN:
    from bhishan.util_model_plot import plot_simple_linear_regression

    plot_simple_linear_regression(Xtest, ytest, lr,"Living Space",'Price')


df_raw.head(2)


df_raw.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age',
       'yr_renovated2', 'age_after_renovation', 'zipcode_top10',
       'zipcode_houses', 'basement_bool', 'renovation_bool', 'age_cat',
       'age_after_renovation_cat', 'waterfront_0', 'waterfront_1', 'view_0',
       'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2',
       'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10',
       'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5',
       'grade_6', 'grade_7', 'grade_8', 'grade_9', 'zipcode_top10_98004',
       'zipcode_top10_98006', 'zipcode_top10_98033', 'zipcode_top10_98039',
       'zipcode_top10_98040', 'zipcode_top10_98102', 'zipcode_top10_98105',
       'zipcode_top10_98155', 'zipcode_top10_98177', 'zipcode_top10_others',
       'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4',
       'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9',
       'age_after_renovation_cat_0', 'age_after_renovation_cat_1',
       'age_after_renovation_cat_2', 'age_after_renovation_cat_3',
       'age_after_renovation_cat_4', 'age_after_renovation_cat_5',
       'age_after_renovation_cat_6', 'age_after_renovation_cat_7',
       'age_after_renovation_cat_8', 'age_after_renovation_cat_9',
       'log1p_price', 'log1p_sqft_living', 'log1p_sqft_lot',
       'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15',
       'log1p_sqft_lot15'],
      dtype='object')


features_raw_few = ['bedrooms','bathrooms','sqft_living',
                    'sqft_lot','floors','zipcode']


features_raw_all = ['bedrooms','bathrooms','sqft_living','sqft_lot',
                    'floors','waterfront','view','condition','grade',
                    'sqft_above','yr_built','yr_renovated',
                    'zipcode','lat','long','sqft_living15','sqft_lot15']


df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age',
       'yr_renovated2', 'age_after_renovation', 'zipcode_top10',
       'zipcode_houses', 'basement_bool', 'renovation_bool', 'age_cat',
       'age_after_renovation_cat', 'waterfront_0', 'waterfront_1', 'view_0',
       'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2',
       'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10',
       'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5',
       'grade_6', 'grade_7', 'grade_8', 'grade_9', 'zipcode_top10_98004',
       'zipcode_top10_98006', 'zipcode_top10_98033', 'zipcode_top10_98039',
       'zipcode_top10_98040', 'zipcode_top10_98102', 'zipcode_top10_98105',
       'zipcode_top10_98155', 'zipcode_top10_98177', 'zipcode_top10_others',
       'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4',
       'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9',
       'age_after_renovation_cat_0', 'age_after_renovation_cat_1',
       'age_after_renovation_cat_2', 'age_after_renovation_cat_3',
       'age_after_renovation_cat_4', 'age_after_renovation_cat_5',
       'age_after_renovation_cat_6', 'age_after_renovation_cat_7',
       'age_after_renovation_cat_8', 'age_after_renovation_cat_9',
       'log1p_price', 'log1p_sqft_living', 'log1p_sqft_lot',
       'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15',
       'log1p_sqft_lot15'],
      dtype='object')


df.filter(regex='age').columns

Index(['age', 'age_after_renovation', 'age_cat', 'age_after_renovation_cat',
       'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4',
       'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9',
       'age_after_renovation_cat_0', 'age_after_renovation_cat_1',
       'age_after_renovation_cat_2', 'age_after_renovation_cat_3',
       'age_after_renovation_cat_4', 'age_after_renovation_cat_5',
       'age_after_renovation_cat_6', 'age_after_renovation_cat_7',
       'age_after_renovation_cat_8', 'age_after_renovation_cat_9'],
      dtype='object')


features_processed_cat_age = [ 'age_cat_0', 'age_cat_1', 'age_cat_2',
                               'age_cat_3', 'age_cat_4', 'age_cat_5',
                               'age_cat_6', 'age_cat_7', 'age_cat_8',
                               'age_cat_9']

features_processed_cat_agernv = [
                'age_after_renovation_cat_0','age_after_renovation_cat_1',
                'age_after_renovation_cat_2', 'age_after_renovation_cat_3',
                'age_after_renovation_cat_4', 'age_after_renovation_cat_5',
                'age_after_renovation_cat_6', 'age_after_renovation_cat_7',
                'age_after_renovation_cat_8', 'age_after_renovation_cat_9']


features_processed_few = features_raw_all + features_processed_cat_age
features_processed_many = (features_raw_all + features_processed_cat_age
                          + features_processed_cat_agernv)

print(features_processed_many)

['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_4', 'age_after_renovation_cat_5', 'age_after_renovation_cat_6', 'age_after_renovation_cat_7', 'age_after_renovation_cat_8', 'age_after_renovation_cat_9']


import io
import json

myjson = {
    'features_raw_few' : features_raw_few,
    'features_raw_all': features_raw_all,
    'features_processed_few': features_processed_few,
    'features_processed_many': features_processed_many
}

with io.open('../models/features_names.json', 'w', encoding='utf8') as fo:
    str_ = json.dumps(myjson,
                      indent=4,
                      sort_keys=False,
                      separators=(',', ': '),
                      ensure_ascii=False)
    fo.write(str(str_))


def adjustedR2(rsquared,nrows,kcols):
    return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)


def multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest,cv=5):
    """ Multiple Linear Regression Modelling using given model.
    
    
    Returns:
    rmse, r2_train, ar2_train, r2_test, ar2_test, cv
    """
    def adjustedR2(rsquared,nrows,kcols):
        return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)
    
    # fitting
    model.fit(Xtrain,ytrain)

    # prediction
    ypreds = model.predict(Xtest)

    # metrics
    rmse = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
    r2_train = model.score(Xtrain, ytrain).round(3)
    r2_test = model.score(Xtest, ytest).round(3)

    cv = cross_val_score(model, X, y, cv=5,n_jobs=-1).mean().round(3)

    ar2_train = adjustedR2(model.score(Xtrain,ytrain),
                           Xtrain.shape[0],
                           len(features)).round(3)
    ar2_test  = adjustedR2(model.score(Xtest,ytest),
                           Xtest.shape[0] ,
                           len(features)).round(3)
    
    return (rmse, r2_train, ar2_train, r2_test, ar2_test, cv)


features = features_raw_few
target = ['price']

df = df_raw[features_raw_few + target]

X = df[features].values
y = df[target].values.reshape(-1,1)

Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)

Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)

model = linear_model.LinearRegression()
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)


df_eval.loc[len(df_eval)] = ['Multiple Linear Regression','few features, unprocessed',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]

df_eval


features = features_processed_few
target = ['price']

df = df_raw[features + target]

X = df[features].values
y = df[target].values.reshape(-1,1)

X_train = train[features].values
y_train = train[target].values.reshape(-1,1)

X_test = test[features].values
y_test = test[target].values.reshape(-1,1)

model = linear_model.LinearRegression()
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]
df_eval.loc[row] = ['Multiple Linear Regression','few features, processed',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]

df_eval


features = features_processed_many
target = ['price']

df = df_raw[features + target]

X = df[features].values
y = df[target].values.reshape(-1,1)

Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)

Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)

model = linear_model.LinearRegression()
rmse, r2_train, ar2_train, r2_test, ar2_test, cv =\
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]
df_eval.loc[row] = ['Multiple Linear Regression','many features, processed',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]

df_eval


features = features_processed_many
target = ['price']

df = df_raw[features+target]

X = df[features].values
y = df[target].values.reshape(-1,1)

Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)

Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)

model = linear_model.Ridge(alpha=1)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]
df_eval.loc[row] = ['Multiple Linear Regression Ridge Regularization (L2)',
                    'alpha=1, many features, processed',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]


features = features_processed_many
target = ['price']

df = df_raw[features+target]

X = df[features].values
y = df[target].values.reshape(-1,1)

Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)

Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)

model = linear_model.Ridge(alpha=1000)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]
df_eval.loc[row] = ['Multiple Linear Regression Ridge Regularization (L2)',
                    'alpha=1000, many features, processed',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]


features = features_processed_many
target = ['price']

df = df_raw[features+target]

X = df[features].values
y = df[target].values.reshape(-1,1)

Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)

Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)

model = linear_model.Lasso(alpha=1, random_state=SEED)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]
df_eval.loc[row] = ['Multiple Linear Regression Lasso Regularization (L1)',
                    'alpha=1, many features, processed',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]

/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:475: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 336521445098471.8, tolerance: 233028249172.05145
  positive)


features = features_processed_many

target = ['price']

df = df_raw[features+target]

X = df[features].values
y = df[target].values.reshape(-1,1)

Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)

Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)

model = linear_model.Lasso(alpha=100,random_state=SEED)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)


df_eval.loc[len(df_eval)] = ['Multiple Linear Regression Lasso Regularization',
                    'alpha=100, many features, processed',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]

df_eval


features = features_raw_few
target = ['price']

df = df_raw[features+target]

polyfeat = PolynomialFeatures(degree=2)

X = polyfeat.fit_transform(df[features])

Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])

y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)


model = linear_model.LinearRegression(n_jobs=-1)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg=2, few features,\
                     unprocessed, no regularization',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]


features = features_raw_few
target = ['price']

df = df_raw[features+target]

polyfeat = PolynomialFeatures(degree=3)

X = polyfeat.fit_transform(df[features])

Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])

y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)


model = linear_model.LinearRegression(n_jobs=-1)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg=3, \
                     few features, unprocessed, no regularization',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]


features = features_raw_all
target = ['price']

df = df_raw[features+target]


polyfeat = PolynomialFeatures(degree=2)

X = polyfeat.fit_transform(df[features])

Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])

y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)


model = linear_model.LinearRegression(n_jobs=-1)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg=2, all features,\
                     unprocessed, no regularization',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]


features = features_raw_all
target = ['price']

df = df_raw[features+target]

polyfeat = PolynomialFeatures(degree=3)

X = polyfeat.fit_transform(df[features])

X_train = polyfeat.fit_transform(train[features])
X_test = polyfeat.fit_transform(test[features])

y = df[target].values.reshape(-1,1)
y_train = train[target].values.reshape(-1,1)
y_test = test[target].values.reshape(-1,1)


model = linear_model.LinearRegression(n_jobs=-1)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg =3, all features,\
                     unprocessed, no regularization',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]


time_start = time.time()


features = features_processed_many
target = ['price']

df = df_raw[features+target]


polyfeat = PolynomialFeatures(degree=2)

X = polyfeat.fit_transform(df[features])

Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])

y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)


model = linear_model.LinearRegression(n_jobs=-1)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]

time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))

df_eval.loc[row] = ['Polynomial Regression','deg =2, many features,\
                     processed, no regularization',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]

/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py:706: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
  "timeout or by a memory leak.", UserWarning

Time taken: 0 min 7 secs


time_start = time.time()


features = features_processed_many
target = ['price']

df = df_raw[features+target]

polyfeat = PolynomialFeatures(degree=2)

X = polyfeat.fit_transform(df[features])

Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])

y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)


model = linear_model.Ridge(alpha=1,random_state=SEED)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg=2, many features,\
                     processed, Ridge alpha=1',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]

time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))

/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/linear_model/ridge.py:147: LinAlgWarning: Ill-conditioned matrix (rcond=1.89017e-26): result may not be accurate.
  overwrite_a=True).T

Time taken: 0 min 3 secs


features = features_processed_many
target = ['price']

df = df_raw[features+target]

polyfeat = PolynomialFeatures(degree=2)

X = polyfeat.fit_transform(df[features])

Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])

y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)


model = linear_model.Ridge(alpha=50000,random_state=SEED)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg =2, many features, processed,\
                     Ridge alpha=50000',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]

/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/linear_model/ridge.py:147: LinAlgWarning: Ill-conditioned matrix (rcond=8.59361e-22): result may not be accurate.
  overwrite_a=True).T


time_start = time.time()


features = features_processed_many
target = ['price']

df = df_raw[features+target]

polyfeat = PolynomialFeatures(degree=2)

X = polyfeat.fit_transform(df[features])

Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])

y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)


model = linear_model.Lasso(alpha=1,random_state=SEED)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]
df_eval.loc[row] = ['Polynomial Regression','deg=2, all features,\
                     processed, Lasso alpha=1',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]

time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))

/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:475: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 219134165009892.25, tolerance: 233028249172.05145
  positive)

Time taken: 0 min 45 secs


time_start = time.time()


features = features_processed_many
target = ['price']

df = df_raw[features+target]

polyfeat = PolynomialFeatures(degree=2)

X = polyfeat.fit_transform(df[features])

Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])

y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)


model = linear_model.Lasso(alpha=50000,random_state=SEED)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)


df_eval.loc[len(df_eval)] = ['Polynomial Regression','deg =2, all features,\
                    processed, Lasso alpha=50000',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]

time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))

/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py:475: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 232071906362195.44, tolerance: 233028249172.05145
  positive)

Time taken: 0 min 35 secs


def adjustedR2(rsquared,nrows,kcols):
    return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)


from sklearn.linear_model import LassoLarsCV

time_start = time.time()


features = features_processed_many
target = ['price']

df = df_raw[features+target]


X = df[features].values
y = df[target].values.reshape(-1,1)

Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)

Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)

model = linear_model.LassoLarsCV(cv=5,n_jobs=-1,verbose=2,max_iter=1000)

# fitting
model.fit(Xtrain,ytrain)

# prediction
ypreds = model.predict(Xtest)

# metrics
rmse = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
r2_train = model.score(Xtrain, ytrain).round(3)
r2_test = model.score(Xtest, ytest).round(3)


# rows and cols
nrows = df.shape[0]
kcols = len(features)

# adjusted rsquared
ar2_train = adjustedR2(r2_train,nrows,kcols)
ar2_test = adjustedR2(r2_test,nrows,kcols)



df_eval.loc[len(df_eval)] = ['Linear Regression LassoLarsCV',
                    'many features processed',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]

time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))

/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/utils/validation.py:724: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.

.Time taken: 0 min 0 secs

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished


%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;


# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100) # None for all the rows


df_eval.sort_values('Adjusted R-squared (test)',ascending=False)


from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

target = ['price']
features = features_processed_many

df = df_raw[features+target]

X = df[features].values
y = df[target].values.reshape(-1,1)

Xtrain = train[features].values
ytrain = train[target].values.reshape(-1,1)

Xtest = test[features].values
ytest = test[target].values.reshape(-1,1)

# scaling
import joblib
from sklearn.preprocessing import StandardScaler

# for linear models scaling is useful
scaler = StandardScaler()
scaler.fit(Xtrain)

# persist the model for future use
joblib.dump(scaler, '../models/MinMaxScaler_features_processed_many.pkl')

# scale transform
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)

model = linear_model.Lasso(alpha=0.05,random_state=SEED,
                           max_iter=10_000, tol=0.01)
model.fit(Xtrain, ytrain)

# persist the model for future use
joblib.dump(model, '../models/lasso_regression_alpha_05_features_processed_many.pkl')

# prediction
ypreds = model.predict(Xtest)

# metrics
rmse_test = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
r2_train = model.score(Xtrain, ytrain).round(3)
r2_test = model.score(Xtest, ytest).round(3)

# prints
print('model = Lasso with Standard Scaling and many processed features')
print('rmse test = ', rmse_test)
print('rsquared train = ', r2_train)
print('rsquared test = ', r2_test)

model = Lasso with MinMax Scaling and many processed features
rmse test =  197259.614
rsquared train =  0.706
rsquared test =  0.711


# Feature importance


sel = SelectFromModel(Lasso(alpha=0.05, random_state=SEED,tol=0.01)) 

sel.fit(Xtrain, ytrain)

sel.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])


selected_feat = train[features].columns[sel.get_support()]
selected_feat

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'yr_built',
       'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15',
       'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4',
       'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9',
       'age_after_renovation_cat_0', 'age_after_renovation_cat_1',
       'age_after_renovation_cat_2', 'age_after_renovation_cat_3',
       'age_after_renovation_cat_4', 'age_after_renovation_cat_5',
       'age_after_renovation_cat_6', 'age_after_renovation_cat_7',
       'age_after_renovation_cat_8', 'age_after_renovation_cat_9'],
      dtype='object')


print('total features: {}'.format(len(features)))
print('selected features: {}'.format(len(selected_feat)))
      
print('features with coefficients shrank to zero: {}'.format(
    np.sum(sel.estimator_.coef_ == 0)))

total features: 37
selected features: 37
features with coefficients shrank to zero: 0


import io
import json

myjson = {'model': 'Lasso with alpha 0.05 and many processed features',
         'selected_features': selected_feat.values.tolist()}

with io.open('../models/lasso_alpha_005_selected_features.json', 'w', encoding='utf8') as fo:
    str_ = json.dumps(myjson,
                      indent=4,
                      sort_keys=True,
                      separators=(',', ': '),
                      ensure_ascii=False)
    fo.write(str(str_))


# Laso evaluation


plt.scatter(ytest, ypreds)
plt.xlabel('True House Price')
plt.ylabel('Predicted House Price')
plt.title('Evaluation of Lasso Predictions')
plt.xticks(rotation=90)

(array([-1000000.,        0.,  1000000.,  2000000.,  3000000.,  4000000.,
         5000000.,  6000000.,  7000000.,  8000000.]),
 <a list of 10 Text xticklabel objects>)


errors = ytest.ravel() - ypreds.ravel()
errors

array([-131627.15561054,   -9655.37749367, -131804.98611956, ...,
        198212.68183312, -154168.31099857,   87765.5346244 ])


plt.hist(errors, bins=20);


sns.distplot(errors,norm_hist=True,) # errors should follow normal distribution

<matplotlib.axes._subplots.AxesSubplot at 0x11b689320>


ser_fimp = pd.Series(np.abs(model.coef_.ravel()))
ser_fimp.index = features
ser_fimp.sort_values(inplace=True)

ax = ser_fimp.plot.barh(figsize=(18,18))
ax.tick_params(axis='both', which='major', labelsize=14)

plt.xlabel('Lasso Coefficients',fontsize=24)
plt.ylabel('Features',fontsize=24)
plt.title('Feature Importance',fontsize=24)

plt.tight_layout()
plt.show()


features = features_raw_all
target = ['price']

df = df_raw[features+target]


polyfeat = PolynomialFeatures(degree=2)

X = polyfeat.fit_transform(df[features])

Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])

y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)


model = linear_model.LinearRegression(n_jobs=-1)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest)

row = df_eval.shape[0]

print(ar2_test)

0.812


df.head(2)


from sklearn.compose import TransformedTargetRegressor as TTR
from sklearn.preprocessing import StandardScaler

time_start = time.time()


X = polyfeat.fit_transform(df[features])

Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])

y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)

# for linear models scaling is useful
scaler = StandardScaler()
scaler.fit(Xtrain)

# scale transform
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)


model = linear_model.LinearRegression(n_jobs=-1)

model = TTR(model, func=np.log1p, inverse_func=np.expm1)


model.fit(Xtrain,ytrain)

# prediction
ypreds = model.predict(Xtest)

# metrics
rmse_test = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
r2_train = model.score(Xtrain, ytrain).round(3)
r2_test = model.score(Xtest, ytest).round(3)

# prints
print('model = Best model')
print('rmse test = ', rmse_test)
print('rsquared train = ', r2_train)
print('rsquared test = ', r2_test)

df_eval.loc[len(df_eval)] = ['Linear Regression',
                    'all features, standard scaler, transform ytrain',
                    rmse,r2_train,ar2_train,r2_test,ar2_test,cv]

time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))

df_eval.sort_values('Adjusted R-squared (test)',ascending=False)

model = Best model
rmse test =  152548.214
rsquared train =  0.84
rsquared test =  0.827
Time taken: 0 min 0 secs

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15	yr_sales	age	yr_renovated2	age_after_renovation	zipcode_top10	zipcode_houses	basement_bool	renovation_bool	age_cat	age_after_renovation_cat	waterfront_0	view_0	condition_3	condition_5	grade_6	grade_7	grade_8	zipcode_top10_others	age_cat_2	age_cat_4	age_cat_5	age_cat_7	age_after_renovation_cat_2	age_after_renovation_cat_4	age_after_renovation_cat_5	age_after_renovation_cat_7	log1p_price	log1p_sqft_living	log1p_sqft_lot	log1p_sqft_above	log1p_sqft_basement	log1p_sqft_living15	log1p_sqft_lot15
0	7129300520	2014-10-13	221,900.000	3	1.000	1180	5650	1.000	3	7	1180	0	1955	0	98178	47.511	-122.257	1340	5650	2014	59	1955	59	others	262	0	0	5	5	1	1	1	0	0	1	0	1	0	0	1	0	0	0	1	0	12.310	7.074	8.640	7.074	0.000	7.201	8.640
1	6414100192	2014-12-09	538,000.000	3	2.250	2570	7242	2.000	3	7	2170	400	1951	1991	98125	47.721	-122.319	1690	7639	2014	63	1991	23	others	410	1	1	5	2	1	1	1	0	0	1	0	1	0	0	1	0	1	0	0	0	13.196	7.852	8.888	7.683	5.994	7.433	8.941
2	5631500400	2015-02-25	180,000.000	2	1.000	770	10000	1.000	3	6	770	0	1933	0	98028	47.738	-122.233	2720	8062	2015	82	1933	82	others	283	0	0	7	7	1	1	1	0	1	0	0	1	0	0	0	1	0	0	0	1	12.101	6.648	9.210	6.648	0.000	7.909	8.995
3	2487200875	2014-12-09	604,000.000	4	3.000	1960	5000	1.000	5	7	1050	910	1965	0	98136	47.521	-122.393	1360	5000	2014	49	1965	49	others	263	1	0	4	4	1	1	0	1	0	1	0	1	0	1	0	0	0	1	0	0	13.311	7.581	8.517	6.957	6.815	7.216	8.517
4	1954400510	2015-02-18	510,000.000	3	2.000	1680	8080	1.000	3	8	1680	0	1987	0	98074	47.617	-122.045	1800	7503	2015	28	1987	28	others	441	0	0	2	2	1	1	1	0	0	0	1	1	1	0	0	0	1	0	0	0	13.142	7.427	8.997	7.427	0.000	7.496	8.923

	price	log1p_price	sqft_living	grade	log1p_sqft_living	sqft_above	sqft_living15	log1p_sqft_living15	log1p_sqft_above	bathrooms
price	1	0.891654	0.702035	0.667434	0.611757	0.605567	0.585379	0.544014	0.542774	0.525138
log1p_price	0.891654	1	0.695341	0.703634	0.67494	0.601802	0.619312	0.607201	0.586322	0.550802
sqft_living	0.702035	0.695341	1	0.762704	0.954368	0.876597	0.75642	0.732194	0.84324	0.754665
grade	0.667434	0.703634	0.762704	1	0.743711	0.755923	0.713202	0.688419	0.743416	0.664983
log1p_sqft_living	0.611757	0.67494	0.954368	0.743711	1	0.832336	0.736567	0.746137	0.865382	0.761316
sqft_above	0.605567	0.601802	0.876597	0.755923	0.832336	1	0.73187	0.701817	0.962353	0.685342
sqft_living15	0.585379	0.619312	0.75642	0.713202	0.736567	0.73187	1	0.976821	0.714572	0.568634
log1p_sqft_living15	0.544014	0.607201	0.732194	0.688419	0.746137	0.701817	0.976821	1	0.712634	0.570834
log1p_sqft_above	0.542774	0.586322	0.84324	0.743416	0.865382	0.962353	0.714572	0.712634	1	0.694954
bathrooms	0.525138	0.550802	0.754665	0.664983	0.761316	0.685342	0.568634	0.570834	0.694954	1

	Model	Details	Root Mean Squared Error (RMSE)	R-squared (training)	Adjusted R-squared (training)	R-squared (test)	Adjusted R-squared (test)	5-Fold Cross Validation
0	Simple Linear Regression	-	255,511.380	0.487	0.487	0.516	0.516	0.491
1	Multiple Linear Regression	few features, unprocessed	250,717.169	0.510	0.510	0.534	0.533	0.512

	Model	Details	Root Mean Squared Error (RMSE)	R-squared (training)	Adjusted R-squared (training)	R-squared (test)	Adjusted R-squared (test)	5-Fold Cross Validation
0	Simple Linear Regression	-	255,511.380	0.487	0.487	0.516	0.516	0.491
1	Multiple Linear Regression	few features, unprocessed	250,717.169	0.510	0.510	0.534	0.533	0.512
2	Multiple Linear Regression	few features, processed	250,717.169	0.510	0.510	0.534	0.531	0.702

	Model	Details	Root Mean Squared Error (RMSE)	R-squared (training)	Adjusted R-squared (training)	R-squared (test)	Adjusted R-squared (test)	5-Fold Cross Validation
0	Simple Linear Regression	-	255,511.380	0.487	0.487	0.516	0.516	0.491
1	Multiple Linear Regression	few features, unprocessed	250,717.169	0.510	0.510	0.534	0.533	0.512
2	Multiple Linear Regression	few features, processed	250,717.169	0.510	0.510	0.534	0.531	0.702
3	Multiple Linear Regression	many features, processed	197,259.488	0.706	0.706	0.711	0.709	0.704

Table of Contents

Data Description¶

Notes for Linear Regression¶

General Modelling Tips¶

Imports¶

Useful Scripts¶

Load the data¶

Simple Linear Regression¶

Train-test split¶

modelling¶

model weights¶

prediction¶

evaluation¶

prediction visualization¶

Feature Selection¶

Multiple Linear Regression¶

Multiple Linear Regression - Some processed features¶

Multiple Linear Regression - Many processed features¶

Multiple Linear Regrerssion - Ridge Regularization L2¶

Multiple Linear Regression - Lasso Regularization L1¶

Polynomial Regression¶

Polynomial Regression - deg = 2 few raw features¶

Polynomial Regression - deg = 3 few raw features¶

Polynomial Regression - deg = 2 all raw features¶

Polynomial Regression - deg = 3 all raw features¶

Polynomial Regression - deg = 2 many processed features¶

Polynomial Regression - deg = 2 many processed features Ridge alpha = 1¶

Polynomial Regression - deg = 2 many processed features Ridge alpha = 50_000¶

Polynomial Regression - deg = 2 many processed features Lasso alpha = 1¶

Polynomial Regression - deg = 2 many processed features Lasso alpha = 50_000¶

Linear Model LassoLarsCV¶

Summary¶

Feature Importance for Lasso Regression¶

Best Model So Far¶

Transform Target Regressor¶

	Model	Details	Root Mean Squared Error (RMSE)	R-squared (training)	Adjusted R-squared (training)	R-squared (test)	Adjusted R-squared (test)	5-Fold Cross Validation
10	Polynomial Regression	deg=2, all features, unprocessed, no regularization	158,822.055	0.831	0.831	0.813	0.812	0.813
11	Polynomial Regression	deg =3, all features, unprocessed, no regularization	158,822.055	0.831	0.831	0.813	0.812	-2.454
13	Polynomial Regression	deg=2, many features, processed, Ridge alpha=1	159,185.146	0.846	0.846	0.812	0.810	0.808
14	Polynomial Regression	deg =2, many features, processed, Ridge alpha=50000	165,440.773	0.821	0.820	0.797	0.795	0.792
15	Polynomial Regression	deg=2, all features, processed, Lasso alpha=1	174,534.762	0.812	0.812	0.774	0.772	0.778
16	Polynomial Regression	deg =2, all features, processed, Lasso alpha=50000	176,297.103	0.803	0.802	0.769	0.767	0.781
17	Linear Regression LassoLarsCV	many features processed	197,309.546	0.706	0.706	0.711	0.711	0.781
3	Multiple Linear Regression	many features, processed	197,259.488	0.706	0.706	0.711	0.709	0.704
4	Multiple Linear Regression Ridge Regularization (L2)	alpha=1, many features, processed	197,256.275	0.706	0.706	0.711	0.709	0.704
6	Multiple Linear Regression Lasso Regularization (L1)	alpha=1, many features, processed	197,259.114	0.706	0.706	0.711	0.709	0.704
7	Multiple Linear Regression Lasso Regularization	alpha=100, many features, processed	197,276.866	0.706	0.706	0.711	0.709	0.704
5	Multiple Linear Regression Ridge Regularization (L2)	alpha=1000, many features, processed	210,224.755	0.664	0.663	0.672	0.669	0.661
12	Polynomial Regression	deg =2, many features, processed, no regularization	229,581.128	0.849	0.848	0.609	0.606	-3.857
9	Polynomial Regression	deg=3, few features, unprocessed, no regularization	237,502.457	0.584	0.584	0.581	0.581	0.490
8	Polynomial Regression	deg=2, few features, unprocessed, no regularization	237,956.794	0.569	0.568	0.580	0.579	0.540
1	Multiple Linear Regression	few features, unprocessed	250,717.169	0.510	0.510	0.534	0.533	0.512
2	Multiple Linear Regression	few features, processed	250,717.169	0.510	0.510	0.534	0.531	0.702
0	Simple Linear Regression	-	255,511.380	0.487	0.487	0.516	0.516	0.491

	Model	Details	Root Mean Squared Error (RMSE)	R-squared (training)	Adjusted R-squared (training)	R-squared (test)	Adjusted R-squared (test)	5-Fold Cross Validation
18	Linear Regression	all features, standard scaler, transform ytrain	158,822.055	0.840	0.831	0.827	0.812	0.813
10	Polynomial Regression	deg=2, all features, unprocessed, no regularization	158,822.055	0.831	0.831	0.813	0.812	0.813
11	Polynomial Regression	deg =3, all features, unprocessed, no regularization	158,822.055	0.831	0.831	0.813	0.812	-2.454
13	Polynomial Regression	deg=2, many features, processed, Ridge alpha=1	159,185.146	0.846	0.846	0.812	0.810	0.808
14	Polynomial Regression	deg =2, many features, processed, Ridge alpha=50000	165,440.773	0.821	0.820	0.797	0.795	0.792
15	Polynomial Regression	deg=2, all features, processed, Lasso alpha=1	174,534.762	0.812	0.812	0.774	0.772	0.778
16	Polynomial Regression	deg =2, all features, processed, Lasso alpha=50000	176,297.103	0.803	0.802	0.769	0.767	0.781
17	Linear Regression LassoLarsCV	many features processed	197,309.546	0.706	0.706	0.711	0.711	0.781
3	Multiple Linear Regression	many features, processed	197,259.488	0.706	0.706	0.711	0.709	0.704
4	Multiple Linear Regression Ridge Regularization (L2)	alpha=1, many features, processed	197,256.275	0.706	0.706	0.711	0.709	0.704
6	Multiple Linear Regression Lasso Regularization (L1)	alpha=1, many features, processed	197,259.114	0.706	0.706	0.711	0.709	0.704
7	Multiple Linear Regression Lasso Regularization	alpha=100, many features, processed	197,276.866	0.706	0.706	0.711	0.709	0.704
5	Multiple Linear Regression Ridge Regularization (L2)	alpha=1000, many features, processed	210,224.755	0.664	0.663	0.672	0.669	0.661
12	Polynomial Regression	deg =2, many features, processed, no regularization	229,581.128	0.849	0.848	0.609	0.606	-3.857
9	Polynomial Regression	deg=3, few features, unprocessed, no regularization	237,502.457	0.584	0.584	0.581	0.581	0.490
8	Polynomial Regression	deg=2, few features, unprocessed, no regularization	237,956.794	0.569	0.568	0.580	0.579	0.540
1	Multiple Linear Regression	few features, unprocessed	250,717.169	0.510	0.510	0.534	0.533	0.512
2	Multiple Linear Regression	few features, processed	250,717.169	0.510	0.510	0.534	0.531	0.702
0	Simple Linear Regression	-	255,511.380	0.487	0.487	0.516	0.516	0.491