import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# mixed
import os
import time
import scipy
import six
import pickle
import joblib
# sklearn
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import metrics
# special
import xgboost as xgb
# Target column we predict; it is log1p-transformed below and inverted
# with expm1 at evaluation time.
target = 'price'
#============================== load the data
df = pd.read_csv('../data/raw/kc_house_data.csv')
#============================== data processing
# 'id' and 'date' carry no predictive signal for this model.
df = df.drop(['id','date'],axis=1)
# These columns are heavily right-skewed; log1p makes their
# distributions more symmetric for the regressor.
log_cols = ['price','sqft_living','sqft_living15',
            'sqft_lot','sqft_lot15']
df[log_cols] = df[log_cols].apply(np.log1p)
#============================== train-test split
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split (
df.drop(target,axis=1),df[target],
test_size=0.20,random_state=SEED)
ytest = np.array(ser_ytest).flatten()
#============================= scaling
scaler = StandardScaler()
scaler.fit(df_Xtrain)
Xtrain = scaler.transform(df_Xtrain)
Xtest = scaler.transform(df_Xtest)
# modelling
# Hyperparameters appear to be pre-chosen (presumably from earlier tuning —
# TODO confirm); kept fixed here for reproducibility.
model = xgb.XGBRegressor(
    n_jobs=-1,
    random_state=SEED,
    objective='reg:squarederror',
    n_estimators=1200,
    max_depth=3,
    reg_alpha=1,
    reg_lambda=5,
    subsample=1,
    gamma=0,
    min_child_weight=1,
    colsample_bytree=1,
    learning_rate=0.1,
)
model.fit(Xtrain, ser_ytrain)
# BUG FIX: original read `model_xgb.predict(...)`, but the fitted estimator
# is bound to `model`; `model_xgb` was never defined, so the script raised
# a NameError before producing any predictions.
ypreds = model.predict(Xtest)
# NOTE: the target was log1p-transformed during preprocessing, so invert
# with expm1 before computing metrics in the original price units.
ytest = np.expm1(ytest)
ypreds = np.expm1(ypreds)
rmse = np.sqrt(metrics.mean_squared_error(ytest, ypreds))
r2 = sklearn.metrics.r2_score(ytest, ypreds)
print('rmse=', rmse)
print('r2=', r2)