%%time
# imports
import time
import numpy as np
import pandas as pd
import os
# sklearn
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
def adjustedR2(rsquared: float, nrows: int, kcols: int) -> float:
    """Return R-squared adjusted for the number of fitted parameters.

    Evaluates ``rsquared - (kcols - 1) / (nrows - kcols) * (1 - rsquared)``,
    which is algebraically the same as
    ``1 - (1 - rsquared) * (nrows - 1) / (nrows - kcols)``.

    Args:
        rsquared: Plain (unadjusted) coefficient of determination.
        nrows: Number of observations the score was computed on.
        kcols: Number of columns/parameters charged against the score.

    Returns:
        The adjusted score; equal to ``rsquared`` when ``kcols == 1`` and
        lower than it when ``kcols > 1`` and ``rsquared < 1``.

    Raises:
        ZeroDivisionError: If ``nrows == kcols`` (plain Python numbers),
            because no residual degrees of freedom remain.
    """
    # Penalty weight: extra parameters relative to residual degrees of freedom.
    penalty = (kcols - 1) / (nrows - kcols)
    unexplained = 1 - rsquared
    return rsquared - penalty * unexplained
# Reproducibility: a single seed feeds both the generator and the splitter.
SEED = 0
RNG = np.random.RandomState(SEED)

# Column the model is trained to predict.
target = 'price'

# Read the input table from disk.
ifile = '../data/processed/data_cleaned_encoded.csv'
df = pd.read_csv(ifile)

# For every column listed here, append a squared copy named "<column>_sq".
cols_sq = [
    'bedrooms', 'bathrooms', 'floors', 'waterfront', 'view',
    'age', 'age_after_renovation', 'log1p_sqft_living', 'log1p_sqft_lot',
    'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15',
    'log1p_sqft_lot15',
]
for col in cols_sq:
    df[col + '_sq'] = df[col].pow(2)

# Everything except these columns (which include the target) is a feature.
cols_drop = ['id', 'date', 'price', 'zipcode_top10']
features = [name for name in df.columns if name not in cols_drop]

# Hold out 20% of the rows for testing; the split is seeded with SEED.
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df[features],
    df[target],
    random_state=SEED,
    test_size=0.2,
)

# Flat NumPy copies of the two target series.
ytrain, ytest = (
    np.array(ser).ravel() for ser in (ser_ytrain, ser_ytest)
)
# Build the forest and train it in one step (fit returns the estimator).
rf = RandomForestRegressor(
    n_estimators=1200,
    max_depth=15,
    min_samples_leaf=5,
    min_samples_split=5,
    max_features=None,
    oob_score=True,
    random_state=SEED,
    n_jobs=-1,
).fit(df_Xtrain, ser_ytrain)

# Predictions for the held-out rows.
ypreds = rf.predict(df_Xtest)

# Root of the mean squared error on the test split.
rmse = np.sqrt(metrics.mean_squared_error(ytest, ypreds))

# Plain R-squared, then the adjusted variant using the test split's
# (row count, column count) as nrows and kcols.
r2 = metrics.r2_score(ytest, ypreds)
ar2 = adjustedR2(r2, *df_Xtest.shape)

# Last expression of the cell: shown as the cell's output in a notebook.
rmse, r2, ar2