This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
Task: Estimate the sale price from the given features.
%load_ext autoreload
%autoreload 2
# my custom module
import bhishan
import src
from bhishan.util_statsmodels import print_statsmodels_summary
from bhishan.util_statsmodels import regression_residual_plots
from bhishan.util_statsmodels import lm_plot
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import time
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 100)
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)
import IPython
from IPython.display import display, HTML, Image
print([(x.__name__, x.__version__) for x in [pd,np,sns,matplotlib]])
[('pandas', '0.25.0'), ('numpy', '1.16.4'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]
import scipy
import statsmodels.api as sm
import statsmodels.formula.api as smf
# _, ci_low, ci_high = wls_prediction_std(results)
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import sklearn
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
def show_method_attributes(method, ncols=2):
    """Show the public (lower-case) attributes of a given object.

    Example:
    ========
    show_method_attributes(list)
    """
    attrs = [a for a in dir(method) if a[0].islower()]
    attrs = [a for a in attrs if a not in 'os np pd sys time psycopg2'.split()]
    return pd.DataFrame(np.array_split(attrs, ncols)).T.fillna('')
df = pd.read_csv('../data/raw/kc_house_data.csv')
print(df.shape)
df.head()
(21613, 21)
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | 3 | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
df_train, df_test = train_test_split(df, test_size=0.2,random_state=100)
print(df.shape, df_train.shape, df_test.shape)
df_train.head(2)
(21613, 21) (17290, 21) (4323, 21)
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16000 | 2561340020 | 20140804T000000 | 325000.0 | 3 | 1.75 | 1780 | 11096 | 1.0 | 0 | 0 | 3 | 7 | 1210 | 570 | 1979 | 0 | 98074 | 47.6170 | -122.051 | 1780 | 10640 |
11286 | 8598200070 | 20141208T000000 | 278000.0 | 2 | 2.50 | 1420 | 2229 | 2.0 | 0 | 0 | 3 | 7 | 1420 | 0 | 2004 | 0 | 98059 | 47.4871 | -122.165 | 1500 | 2230 |
# using sm
features = ['sqft_living']
target = ['price']
df_Xtrain = df_train[features]
df_ytrain = df_train[target]
df_Xtest = df_test[features]
df_ytest = df_test[target]
df_X1train = sm.add_constant(df_Xtrain)
model = sm.OLS(df_ytrain, df_X1train )
results = model.fit()
summary = results.summary()
print_statsmodels_summary(summary)
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/numpy/core/fromnumeric.py:2389: FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead. return ptp(axis=axis, out=out, **kwargs)
Dep. Variable: | price | R-squared: | 0.487 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.487 |
Method: | Least Squares | F-statistic: | 1.642e+04 |
Date: | Fri, 18 Oct 2019 | Prob (F-statistic): | 0.00 |
Time: | 21:22:01 | Log-Likelihood: | -2.4030e+05 |
No. Observations: | 17290 | AIC: | 4.806e+05 |
Df Residuals: | 17288 | BIC: | 4.806e+05 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -4.263e+04 | 4963.377 | -8.589 | 0.000 | -5.24e+04 | -3.29e+04 |
sqft_living | 280.6854 | 2.190 | 128.149 | 0.000 | 276.392 | 284.979 |
Omnibus: | 11909.901 | Durbin-Watson: | 2.018 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 419310.893 |
Skew: | 2.854 | Prob(JB): | 0.00 |
Kurtosis: | 26.441 | Cond. No. | 5.62e+03 |
# using smf
formula = 'price ~ sqft_living'
model = smf.ols(formula=formula, data= pd.concat([df_Xtrain,df_ytrain],axis=1))
results = model.fit()
summary = results.summary()
show_method_attributes(results,7)
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | aic | conf_int_el | df_resid | get_influence | mse_resid | resid | t_test |
1 | bic | cov_HC0 | diagn | get_prediction | mse_total | resid_pearson | t_test_pairwise |
2 | bse | cov_HC1 | eigenvals | get_robustcov_results | nobs | rsquared | tvalues |
3 | centered_tss | cov_HC2 | el_test | initialize | normalized_cov_params | rsquared_adj | uncentered_tss |
4 | compare_f_test | cov_HC3 | ess | k_constant | outlier_test | save | use_t |
5 | compare_lm_test | cov_kwds | f_pvalue | llf | params | scale | wald_test |
6 | compare_lr_test | cov_params | f_test | load | predict | ssr | wald_test_terms |
7 | condition_number | cov_type | fittedvalues | model | pvalues | summary | wresid |
8 | conf_int | df_model | fvalue | mse_model | remove_data | summary2 |
results.summary()
Dep. Variable: | price | R-squared: | 0.487 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.487 |
Method: | Least Squares | F-statistic: | 1.642e+04 |
Date: | Fri, 18 Oct 2019 | Prob (F-statistic): | 0.00 |
Time: | 21:22:02 | Log-Likelihood: | -2.4030e+05 |
No. Observations: | 17290 | AIC: | 4.806e+05 |
Df Residuals: | 17288 | BIC: | 4.806e+05 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -4.263e+04 | 4963.377 | -8.589 | 0.000 | -5.24e+04 | -3.29e+04 |
sqft_living | 280.6854 | 2.190 | 128.149 | 0.000 | 276.392 | 284.979 |
Omnibus: | 11909.901 | Durbin-Watson: | 2.018 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 419310.893 |
Skew: | 2.854 | Prob(JB): | 0.00 |
Kurtosis: | 26.441 | Cond. No. | 5.62e+03 |
Yes, the low p-value associated with the t-statistic for sqft_living suggests so.
For a unit (one square foot) increase in sqft_living, our model predicts that price will increase by about 280.69.
Positive.
The predicted price is 518741.86 and the confidence interval is [500426.67775749206, 537057.0363632903].
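Aside: statsmodels can also report the point prediction and its intervals directly via `get_prediction`; a minimal sketch, assuming the fitted `results` object above (the name `new_house` is just illustrative, and the reported interval need not exactly match the rough one quoted above, which pairs the coefficient bounds).
# Hedged sketch: let statsmodels compute the prediction and its intervals.
new_house = pd.DataFrame({'sqft_living': [2000]})
pred = results.get_prediction(new_house)
# mean_ci_lower / mean_ci_upper: confidence interval for the mean response;
# obs_ci_lower / obs_ci_upper: wider prediction interval for a single sale.
print(pred.summary_frame(alpha=0.05))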
results.params
Intercept     -42628.976515
sqft_living      280.685417
dtype: float64
value = 2000
ans = -42628.976515 + 280.685417 * value
round(ans,2)
518741.86
df.loc[:2,'sqft_living']
0    1180
1    2570
2     770
Name: sqft_living, dtype: int64
text = " y = {:.2f} x + ({:.2f})".format(*results.params.values[::-1])
text
' y = 280.69 x + (-42628.98)'
value = 2000
X1_test = np.array([1, value])  # order must match results.params: [Intercept, sqft_living]
X1_test.shape
(2,)
def predict(params, X1):
    """Dot product of params [Intercept, sqft_living] with X1 = [1, value]."""
    return params.T @ X1

def get_confidence_interval(X1, results):
    # Rough interval: evaluate the prediction at the lower bounds of both coefficients,
    # then at the upper bounds (see get_prediction above for a formal interval).
    ci_min = results.conf_int(alpha=0.05)[0]   # lower bounds of Intercept and slope
    ci_max = results.conf_int(alpha=0.05)[1]   # upper bounds of Intercept and slope
    confidence_interval = [predict(ci_min, X1), predict(ci_max, X1)]
    print(confidence_interval)
formula = 'price ~ sqft_living'
model = smf.ols(formula=formula, data= pd.concat([df_Xtrain,df_ytrain],axis=1))
results = model.fit()
get_confidence_interval(X1_test,results)
[500426.67775749206, 537057.0363632903]
results.rsquared, results.rsquared_adj
(0.4871565982411399, 0.48712693353719727)
df_Xtest.shape
(4323, 1)
df_X1test = df_Xtest.assign(const=1)
df_X1test.head(2)
sqft_living | const | |
---|---|---|
19836 | 2437 | 1 |
10442 | 1560 | 1 |
df_ypreds = results.predict(df_X1test)
df_ypreds.shape, df_ytest.shape
((4323,), (4323, 1))
df_ypreds.head(2)
19836    641401.384197
10442    395240.273674
dtype: float64
df_ytrain.shape, df_ytest.shape
((17290, 1), (4323, 1))
df_ytest.head()
price | |
---|---|
19836 | 285000.0 |
10442 | 239950.0 |
20548 | 460000.0 |
11014 | 397500.0 |
4138 | 545000.0 |
df_comparison = df_ytest.copy(deep=True)
df_comparison['ypred'] = df_ypreds
df_comparison['error'] = df_comparison['price'] - df_comparison['ypred']
df_comparison.head()
price | ypred | error | |
---|---|---|---|
19836 | 285000.0 | 641401.384197 | -356401.384197 |
10442 | 239950.0 | 395240.273674 | -155290.273674 |
20548 | 460000.0 | 628209.169608 | -168209.169608 |
11014 | 397500.0 | 372785.440331 | 24714.559669 |
4138 | 545000.0 | 485059.607046 | 59940.392954 |
mse = df_comparison['error'].pow(2).mean()
mse  # this is very large, but it is in squared dollars; the RMSE below is easier to interpret.
65286065304.1562
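Since the MSE is in squared dollars, here is a small sketch (using `sklearn.metrics` and the `df_comparison` frame built above) of reporting RMSE and MAE on the dollar scale, which are easier to read:
# Hedged sketch: error metrics on the original dollar scale.
from sklearn.metrics import mean_squared_error, mean_absolute_error
mse = mean_squared_error(df_comparison['price'], df_comparison['ypred'])
rmse = np.sqrt(mse)   # roughly 255,512 dollars for the MSE printed above
mae = mean_absolute_error(df_comparison['price'], df_comparison['ypred'])
print(f'RMSE = {rmse:,.0f}, MAE = {mae:,.0f}')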
plt.scatter(x=df_comparison['price'], y=df_comparison['ypred'], color='b')  # predicted vs actual price
plt.xticks(rotation=90)
plt.tight_layout()
plt.plot([0, 7_000_000], [0, 7_000_000], c='r')  # y = x reference line
[<matplotlib.lines.Line2D at 0x1180a9630>]
from bhishan.util_statsmodels import lm_plot
df_ypreds_train = results.predict(df_X1train)
lm_plot(df_X1train, df_ytrain, df_ypreds_train)
1. Externally Studentised Residual Plot (Outliers Test):
   - The horizontal red dashed lines are studentised t values t = ± 3.
   - Points outside t = ± 3 may be considered outliers.
   - If the fitted solid blue line is U-shaped, our data is non-linear; we might need to transform features or include polynomial features.
2. Normal Q-Q Plot (Test of Normality):
   - If the fitted points align with the 45-degree line, the assumption of normality is likely to hold.
3. Scale-Location Plot (Test of Constant Variance, homoskedasticity):
   - Small residuals on the y-axis are better.
   - If we see a conical shape, the data is heteroskedastic.
   - If data points are clustered at the left or right, we observe heteroskedasticity.
   - If we see a few outliers with high y-values, we have high-variance residual outliers.
4. Residuals vs Leverage Plot (Outliers Test):
   - Studentised residuals larger than 3 are potential outliers.
   - High-leverage points are potential outliers.
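`lm_plot` is a helper from the author's custom module; a minimal sketch of how the externally studentised residuals behind plots 1 and 4 could be pulled directly from statsmodels, assuming the fitted `results` object above:
# Hedged sketch: studentised residuals straight from the influence object.
influence = results.get_influence()
student_resid = influence.resid_studentized_external   # externally studentised residuals
print('points beyond |t| = 3:', (np.abs(student_resid) > 3).sum())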
from bhishan.util_statsmodels import regression_residual_plots
formula = 'price ~ sqft_living'
model = smf.ols(formula=formula, data=df_train)
results = model.fit()
regression_residual_plots(results, dependent_var="price",
data=df_train[['sqft_living','price']],
annotate_outliers=False,verbose=False)
feature = 'sqft_living'
fig = plt.figure(figsize=(15,8))
fig = sm.graphics.plot_regress_exog(results, feature, fig=fig)
from statsmodels.sandbox.regression.predstd import wls_prediction_std
# data
x = df_Xtrain[['sqft_living']].values.reshape(-1,1)
y = df_ytrain.values.reshape(-1,1)
best_fit = results.fittedvalues.values.reshape(-1,1)
# figure
fig, ax = plt.subplots(figsize=(10,7))
# text
text = " y = {:.2f} x + ({:.2f})".format(*results.params.values[::-1])
ax.text(0, 7_000_000, text)
ax.plot(x, y, 'o', label="data")
ax.plot(x, best_fit, 'g--.', label="OLS")
# prediction interval band (for individual observations) from wls_prediction_std
_, ci_low, ci_high = wls_prediction_std(results)
ax.plot(x, ci_high, 'r--')
ax.plot(x, ci_low, 'r--')
# plot legend
ax.legend(loc='best')
plt.tight_layout()
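A possible refinement (a sketch, not part of the original cell): the rows of df_Xtrain are not sorted by sqft_living, so the dashed interval lines above criss-cross; sorting by x first gives a cleaner band.
# Hedged sketch: sort by sqft_living before plotting the interval band.
order = np.argsort(x.ravel())
xs = x.ravel()[order]
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(x, y, 'o', label='data')
ax.plot(xs, best_fit.ravel()[order], 'g--', label='OLS fit')
ax.plot(xs, ci_high[order], 'r--', label='95% prediction interval')
ax.plot(xs, ci_low[order], 'r--')
ax.legend(loc='best')
plt.tight_layout()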
df_Xtrain.shape, df_ytrain.shape
((17290, 1), (17290, 1))
def sns_regression_plot():
    plt.figure(figsize=(12, 8))
    p = sns.regplot(x='sqft_living', y='price',
                    data=pd.concat([df_Xtrain, df_ytrain], axis=1),
                    line_kws={'color': 'g'})
    # recover slope and intercept from the plotted regression line
    slope, intercept, r_value, p_value, std_err = \
        scipy.stats.linregress(x=p.get_lines()[0].get_xdata(),
                               y=p.get_lines()[0].get_ydata())
    text = 'y = {:.2f} x + ({:.2f})'.format(slope, intercept)
    plt.text(0, 7_000_000, text, size=24, alpha=0.5)
    plt.tight_layout()
    plt.show()
from bhishan.util_statsmodels import regression_residual_plots
formula = 'price ~ sqft_living'
model = smf.ols(formula=formula, data=df)
results = model.fit()
regression_residual_plots(results, dependent_var="price",data=df[['sqft_living','price']])
1. Residuals vs Fitted Plot (Test of linearity):
   - If the solid red line lies along y = 0, the assumption of linearity may hold.
2. Normal Q-Q Plot (Test of Normality):
   - If the fitted points align with the 45-degree line, the assumption of normality may hold.
3. Scale-Location Plot (Test of Constant Variance, homoskedasticity):
   - Small residuals on the y-axis are better.
   - If data points are clustered at the left or right, we observe heteroskedasticity.
   - If we see a few outliers with high y-values, we have high-variance residual outliers.
4. Residuals vs Leverage Plot (Cook's test of outliers):
   - The solid red line is the leverage best-fit line.
   - The dashed red line is Cook's distance for 0.5 * p * (1 - x) / x.
   - The dotted red line is Cook's distance for 1 * p * (1 - x) / x.
   - p is the number of model parameters; x is the plot values np.linspace(0.001, 0.200, 50).
   - If a point lies beyond the farthest dotted red line, it has high leverage on the fit and may be considered an outlier.
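`regression_residual_plots` is a custom helper; a minimal sketch of computing the Cook's distances that drive plot 4 directly from statsmodels (the 4/n cut-off used here is a common rough rule of thumb, not something the original notebook uses):
# Hedged sketch: Cook's distances from the influence object of the current fit.
influence = results.get_influence()
cooks_d, cooks_p = influence.cooks_distance
print('points above the 4/n cut-off:', (cooks_d > 4 / len(cooks_d)).sum())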
df_train.head()
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16000 | 2561340020 | 20140804T000000 | 325000.0 | 3 | 1.75 | 1780 | 11096 | 1.0 | 0 | 0 | 3 | 7 | 1210 | 570 | 1979 | 0 | 98074 | 47.6170 | -122.051 | 1780 | 10640 |
11286 | 8598200070 | 20141208T000000 | 278000.0 | 2 | 2.50 | 1420 | 2229 | 2.0 | 0 | 0 | 3 | 7 | 1420 | 0 | 2004 | 0 | 98059 | 47.4871 | -122.165 | 1500 | 2230 |
3201 | 6788200931 | 20140520T000000 | 710000.0 | 2 | 1.00 | 1790 | 4000 | 1.0 | 0 | 0 | 4 | 7 | 1040 | 750 | 1923 | 0 | 98112 | 47.6405 | -122.301 | 1310 | 4000 |
11049 | 3023059012 | 20140910T000000 | 389900.0 | 4 | 1.00 | 1710 | 117176 | 1.5 | 0 | 0 | 4 | 6 | 1710 | 0 | 1942 | 0 | 98055 | 47.4497 | -122.212 | 1940 | 12223 |
9716 | 5683500030 | 20150320T000000 | 489000.0 | 4 | 1.00 | 1150 | 5217 | 1.5 | 0 | 0 | 3 | 7 | 1150 | 0 | 1951 | 0 | 98115 | 47.6806 | -122.287 | 1220 | 5217 |
model_fit = results
model_norm_residuals = model_fit.get_influence().resid_studentized_internal
# Annotations of Outliers
abs_norm_resid = np.flip(np.argsort(np.abs(model_norm_residuals)), 0)
abs_norm_resid_top_3 = abs_norm_resid[:3]
for i in abs_norm_resid_top_3:
print(i)
7252
3914
9254
from bhishan.util_statsmodels import regression_residual_plots
formula = 'price ~ sqft_living'
model = smf.ols(formula=formula, data=df_train)
results = model.fit()
regression_residual_plots(results, dependent_var="price",data=df_train[['sqft_living','price']],annotate_outliers=True)
(Same interpretation notes as printed above.)
# lets try log transformation and plot again
df_train['log_sqft_living'] = np.log(df_train['sqft_living'].values)
features = ['log_sqft_living']
target = ['price']
df_X = df_train[features]
df_y = df_train[target]
df_X1 = df_X.assign(const=1)
model = sm.OLS(df_y, df_X1 )
results = model.fit()
from bhishan.util_statsmodels import regression_residual_plots
regression_residual_plots(results, dependent_var="price",
data= df_train[['log_sqft_living','price']],
verbose=False,
annotate_outliers=False,
ofile='../reports/figures/regression_residual_plots.png')
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
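One way to avoid the SettingWithCopyWarning above (a sketch, assuming df_train is meant to be an independent copy of the split) is to copy the slice explicitly before adding the derived column:
# Hedged sketch: an explicit copy makes the column assignment unambiguous.
df_train = df_train.copy()
df_train['log_sqft_living'] = np.log(df_train['sqft_living'])
# equivalently: df_train = df_train.assign(log_sqft_living=np.log(df_train['sqft_living']))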
# With the formula API, patsy adds the intercept automatically, so we do not need to add a constant term ourselves.
formula = 'price ~ np.log(sqft_living)'
model = smf.ols(formula=formula, data=df_train)
results = model.fit()
print(results.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                  price   R-squared:                       0.369
Model:                            OLS   Adj. R-squared:                  0.369
Method:                 Least Squares   F-statistic:                 1.012e+04
Date:                Fri, 18 Oct 2019   Prob (F-statistic):               0.00
Time:                        22:27:32   Log-Likelihood:            -2.4210e+05
No. Observations:               17290   AIC:                         4.842e+05
Df Residuals:                   17288   BIC:                         4.842e+05
Df Model:                           1
Covariance Type:            nonrobust
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept           -3.427e+06   3.95e+04    -86.762      0.000    -3.5e+06   -3.35e+06
np.log(sqft_living)  5.255e+05   5224.766    100.580      0.000    5.15e+05    5.36e+05
==============================================================================
Omnibus:                    15529.189   Durbin-Watson:                   2.014
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1270840.426
Skew:                           3.994   Prob(JB):                         0.00
Kurtosis:                      44.234   Cond. No.                         137.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
df_train['log_sqft_living'] = np.log(df_train['sqft_living'].values)
features = ['log_sqft_living']
target = ['price']
df_X = df_train[features]
df_y = df_train[target]
df_X1 = df_X.assign(const=1)
model = sm.OLS(df_y, df_X1 )
results = model.fit()
print(results.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                  price   R-squared:                       0.369
Model:                            OLS   Adj. R-squared:                  0.369
Method:                 Least Squares   F-statistic:                 1.012e+04
Date:                Fri, 18 Oct 2019   Prob (F-statistic):               0.00
Time:                        22:25:20   Log-Likelihood:            -2.4210e+05
No. Observations:               17290   AIC:                         4.842e+05
Df Residuals:                   17288   BIC:                         4.842e+05
Df Model:                           1
Covariance Type:            nonrobust
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
log_sqft_living  5.255e+05   5224.766    100.580      0.000    5.15e+05    5.36e+05
const           -3.427e+06   3.95e+04    -86.762      0.000    -3.5e+06   -3.35e+06
==============================================================================
Omnibus:                    15529.189   Durbin-Watson:                   2.014
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1270840.426
Skew:                           3.994   Prob(JB):                         0.00
Kurtosis:                      44.234   Cond. No.                         137.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy """Entry point for launching an IPython kernel.
# linearity still does not hold well
# normality still does not hold well
# heteroskedasticity is slightly improved
# there are no high-leverage outlier points
# we may need to include more than one feature to estimate price,
# rather than only the log of the living area.
from bhishan.util_statsmodels import lm_residual_corr_plot
lm_residual_corr_plot(df_X1train, df_ytrain, df_ypreds_train)
1. Correlation of Error Terms (Test of error independence):
   - If the magnitude of the errors increases or decreases as we move along the x-axis, the error terms may be correlated.
   - This could mean that our estimated standard errors underestimate the true standard errors.
   - Our confidence and prediction intervals may then be narrower than they should be.
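`lm_residual_corr_plot` is a custom helper; the same question can be checked numerically with the Durbin-Watson statistic (a sketch using the training residuals computed from `df_ypreds_train` above; values near 2 indicate little lag-1 autocorrelation):
# Hedged sketch: Durbin-Watson on the training residuals of the simple model.
from statsmodels.stats.stattools import durbin_watson
resid_train = df_ytrain['price'].values - np.asarray(df_ypreds_train)
print('Durbin-Watson:', durbin_watson(resid_train))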
df.head(2)
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
df.columns
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'], dtype='object')
features = ['bedrooms', 'bathrooms',
'sqft_living','sqft_lot', 'sqft_above',
'sqft_living15', 'sqft_lot15',
'floors', 'waterfront', 'view', 'condition', 'grade',
'yr_built', 'yr_renovated', 'zipcode',
'lat', 'long' ]
formula = 'price ~ ' + ' + '.join(features)
formula
'price ~ bedrooms + bathrooms + sqft_living + sqft_lot + sqft_above + sqft_living15 + sqft_lot15 + floors + waterfront + view + condition + grade + yr_built + yr_renovated + zipcode + lat + long'
formula = 'price ~ ' + ' + '.join(features)
model = smf.ols(formula=formula, data=df_train)
results = model.fit()
print_statsmodels_summary(results.summary())
1. R-squared (Coefficient of Determination): https://www.wikiwand.com/en/Coefficient_of_determination
   - R-squared = 1 - (SS_res / SS_tot), where SS_res = sum (y_i - f_i)**2 = sum(e_i**2) and SS_tot = sum (y_i - y_bar)**2.
   - If R2 = 0.49, then 49% of the variability of the dependent variable has been accounted for, and the remaining 51% of the variability is still unaccounted for.
2. Adjusted R2 (R bar squared):
   - R_bar_squared = 1 - (1 - R2) * (n - 1) / (n - p - 1)
   - The adjusted R2 can be negative.
   - It is always less than or equal to R2.
   - Adjusted R2 can be interpreted as an unbiased (or less biased) estimator of the population R2, whereas R2 is a positively biased estimate of the population value. When p increases, R2 increases, but R_bar_squared may not.
3. F-statistic: https://www.wikiwand.com/en/F-test
   - The one-way ANOVA F-test statistic is F = explained variance / unexplained variance.
   - When there are only two groups for the one-way ANOVA F-test, F = t**2, where t is the Student's t statistic.
   - For two nested models 1 and 2, F = [(RSS1 - RSS2) / (p2 - p1)] / [RSS2 / (n - p2)].
4. AIC: https://www.wikiwand.com/en/Akaike_information_criterion
   - Akaike Information Criterion: AIC = 2k - 2 ln(L), where k is the number of model parameters and L is the maximized likelihood.
   - Adjusts the log-likelihood based on the number of observations and the complexity of the model.
   - Penalizes the model selection metric when more independent variables are added.
5. BIC: https://www.wikiwand.com/en/Bayesian_information_criterion
   - Bayesian Information Criterion: BIC = ln(n) * k - 2 ln(L), where k is the number of model parameters and L is the maximized likelihood.
   - Similar to the AIC, but with a stronger penalty for models with more parameters.
   - BIC is only valid when the sample size n is much larger than the number k of parameters in the model.
6. P > |t|: https://www.wikiwand.com/en/P-value
   - The p-value is the probability of observing a t-statistic at least this extreme if the null hypothesis (the coefficient equals 0) is true.
   - If it is less than the significance level, often 0.05, there is a statistically significant relationship between the predictor and the response.
7. Skewness: https://www.wikiwand.com/en/Skewness
   - A measure of the symmetry of the data about the mean.
   - Normally distributed errors should be symmetrically distributed about the mean.
   - The normal distribution has 0 skew.
8. Kurtosis: https://www.wikiwand.com/en/Kurtosis
   - A measure of the shape of the distribution.
   - The normal distribution has a kurtosis of 3.
   - If kurtosis is greater than 3, the curve is tall and peaked.
9. Omnibus (D'Agostino's test): https://www.wikiwand.com/en/D%27Agostino%27s_K-squared_test
   - A combined statistical test for the presence of skewness and kurtosis.
   - A goodness-of-fit measure of departure from normality: the test aims to establish whether or not the given sample comes from a normally distributed population.
   - The test is based on transformations of the sample kurtosis and skewness, and has power only against the alternatives that the distribution is skewed and/or kurtic.
10. Jarque-Bera: https://www.wikiwand.com/en/Jarque%E2%80%93Bera_test
    - Another test of skewness and kurtosis: a goodness-of-fit test of whether sample data have the skewness and kurtosis matching a normal distribution.
    - The test is named after Carlos Jarque and Anil K. Bera.
    - The test statistic is always nonnegative; if it is far from zero, it signals that the data do not have a normal distribution.
11. Durbin-Watson: https://www.wikiwand.com/en/Durbin%E2%80%93Watson_statistic
    - A test statistic used to detect the presence of autocorrelation at lag 1 in the residuals.
    - Often important in time-series analysis.
    - A similar assessment can also be carried out with the Breusch-Godfrey test and the Ljung-Box test.
12. Cond. No.: https://www.wikiwand.com/en/Condition_number
    - The condition number of a function measures how much the output can change for a small change in the input, i.e. how sensitive the function is to changes or errors in the input.
    - In linear regression the condition number of the moment matrix can be used as a diagnostic for multicollinearity.
    - A problem with a low condition number is said to be well-conditioned; one with a high condition number is ill-conditioned.
Dep. Variable: | price | R-squared: | 0.699 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.698 |
Method: | Least Squares | F-statistic: | 2356. |
Date: | Fri, 18 Oct 2019 | Prob (F-statistic): | 0.00 |
Time: | 23:37:09 | Log-Likelihood: | -2.3571e+05 |
No. Observations: | 17290 | AIC: | 4.714e+05 |
Df Residuals: | 17272 | BIC: | 4.716e+05 |
Df Model: | 17 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 5.497e+06 | 3.3e+06 | 1.663 | 0.096 | -9.81e+05 | 1.2e+07 |
bedrooms | -3.505e+04 | 2100.847 | -16.681 | 0.000 | -3.92e+04 | -3.09e+04 |
bathrooms | 4.746e+04 | 3658.519 | 12.973 | 0.000 | 4.03e+04 | 5.46e+04 |
sqft_living | 142.2869 | 4.947 | 28.763 | 0.000 | 132.591 | 151.983 |
sqft_lot | 0.1045 | 0.051 | 2.062 | 0.039 | 0.005 | 0.204 |
sqft_above | 35.0195 | 4.916 | 7.123 | 0.000 | 25.383 | 44.656 |
sqft_living15 | 21.1896 | 3.882 | 5.458 | 0.000 | 13.580 | 28.799 |
sqft_lot15 | -0.3449 | 0.078 | -4.419 | 0.000 | -0.498 | -0.192 |
floors | 6793.8805 | 4035.403 | 1.684 | 0.092 | -1115.918 | 1.47e+04 |
waterfront | 5.901e+05 | 1.9e+04 | 31.006 | 0.000 | 5.53e+05 | 6.27e+05 |
view | 5.197e+04 | 2410.166 | 21.563 | 0.000 | 4.72e+04 | 5.67e+04 |
condition | 2.755e+04 | 2635.735 | 10.451 | 0.000 | 2.24e+04 | 3.27e+04 |
grade | 9.735e+04 | 2397.894 | 40.597 | 0.000 | 9.26e+04 | 1.02e+05 |
yr_built | -2747.9351 | 80.962 | -33.941 | 0.000 | -2906.628 | -2589.242 |
yr_renovated | 10.0597 | 4.130 | 2.436 | 0.015 | 1.964 | 18.156 |
zipcode | -555.7652 | 37.192 | -14.943 | 0.000 | -628.664 | -482.866 |
lat | 6.003e+05 | 1.2e+04 | 49.907 | 0.000 | 5.77e+05 | 6.24e+05 |
long | -2.059e+05 | 1.46e+04 | -14.065 | 0.000 | -2.35e+05 | -1.77e+05 |
Omnibus: | 14886.668 | Durbin-Watson: | 2.036 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1534176.331 |
Skew: | 3.632 | Prob(JB): | 0.00 |
Kurtosis: | 48.572 | Cond. No. | 2.17e+08 |
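The AIC and BIC formulas quoted in the notes above can be sanity-checked against what statsmodels reports; a small sketch, assuming the multi-feature `results` object fitted above:
# Hedged sketch: recompute AIC/BIC from the log-likelihood and compare.
k = results.df_model + 1            # parameters, including the intercept
aic_manual = 2 * k - 2 * results.llf
bic_manual = np.log(results.nobs) * k - 2 * results.llf
print(aic_manual, results.aic)
print(bic_manual, results.bic)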
# using sm.OLS
target = ['price']
df_Xtrain = df_train[features]
df_ytrain = df_train[target]
df_Xtest = df_test[features]
df_ytest = df_test[target]
df_X1train = sm.add_constant(df_Xtrain)
model = sm.OLS(df_ytrain, df_X1train )
results = model.fit()
print_statsmodels_summary(results.summary(),verbose=False)
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/numpy/core/fromnumeric.py:2389: FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead. return ptp(axis=axis, out=out, **kwargs)
Dep. Variable: | price | R-squared: | 0.699 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.698 |
Method: | Least Squares | F-statistic: | 2356. |
Date: | Fri, 18 Oct 2019 | Prob (F-statistic): | 0.00 |
Time: | 23:39:08 | Log-Likelihood: | -2.3571e+05 |
No. Observations: | 17290 | AIC: | 4.714e+05 |
Df Residuals: | 17272 | BIC: | 4.716e+05 |
Df Model: | 17 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 5.497e+06 | 3.3e+06 | 1.663 | 0.096 | -9.81e+05 | 1.2e+07 |
bedrooms | -3.505e+04 | 2100.847 | -16.681 | 0.000 | -3.92e+04 | -3.09e+04 |
bathrooms | 4.746e+04 | 3658.519 | 12.973 | 0.000 | 4.03e+04 | 5.46e+04 |
sqft_living | 142.2869 | 4.947 | 28.763 | 0.000 | 132.591 | 151.983 |
sqft_lot | 0.1045 | 0.051 | 2.062 | 0.039 | 0.005 | 0.204 |
sqft_above | 35.0195 | 4.916 | 7.123 | 0.000 | 25.383 | 44.656 |
sqft_living15 | 21.1896 | 3.882 | 5.458 | 0.000 | 13.580 | 28.799 |
sqft_lot15 | -0.3449 | 0.078 | -4.419 | 0.000 | -0.498 | -0.192 |
floors | 6793.8805 | 4035.403 | 1.684 | 0.092 | -1115.918 | 1.47e+04 |
waterfront | 5.901e+05 | 1.9e+04 | 31.006 | 0.000 | 5.53e+05 | 6.27e+05 |
view | 5.197e+04 | 2410.166 | 21.563 | 0.000 | 4.72e+04 | 5.67e+04 |
condition | 2.755e+04 | 2635.735 | 10.451 | 0.000 | 2.24e+04 | 3.27e+04 |
grade | 9.735e+04 | 2397.894 | 40.597 | 0.000 | 9.26e+04 | 1.02e+05 |
yr_built | -2747.9351 | 80.962 | -33.941 | 0.000 | -2906.628 | -2589.242 |
yr_renovated | 10.0597 | 4.130 | 2.436 | 0.015 | 1.964 | 18.156 |
zipcode | -555.7652 | 37.192 | -14.943 | 0.000 | -628.664 | -482.866 |
lat | 6.003e+05 | 1.2e+04 | 49.907 | 0.000 | 5.77e+05 | 6.24e+05 |
long | -2.059e+05 | 1.46e+04 | -14.065 | 0.000 | -2.35e+05 | -1.77e+05 |
Omnibus: | 14886.668 | Durbin-Watson: | 2.036 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1534176.331 |
Skew: | 3.632 | Prob(JB): | 0.00 |
Kurtosis: | 48.572 | Cond. No. | 2.17e+08 |
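The very large condition number (2.17e+08) hints at multicollinearity among the predictors; a minimal sketch of checking this with variance inflation factors on the design matrix df_X1train built above (the VIF of the constant column can be ignored):
# Hedged sketch: VIF per column of the design matrix.
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.Series([variance_inflation_factor(df_X1train.values, i)
                 for i in range(df_X1train.shape[1])],
                index=df_X1train.columns)
print(vif.sort_values(ascending=False).head(10))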
results.rsquared, results.rsquared_adj
(0.6987007086088071, 0.6984041541881465)
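The R-squared above is computed on the training split only; a short sketch (an addition, not in the original notebook) of scoring the 17-feature model on the held-out test set:
# Hedged sketch: out-of-sample RMSE for the multi-feature model.
df_X1test = sm.add_constant(df_Xtest)        # note: this redefines the earlier one-feature frame
ypred_test = np.asarray(results.predict(df_X1test))
rmse_test = np.sqrt(np.mean((df_ytest['price'].values - ypred_test) ** 2))
print(f'test RMSE = {rmse_test:,.0f}')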
formula = 'price ~ np.log(sqft_living)'
model = smf.ols(formula=formula, data=df_train)
results = model.fit()
print(results.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                  price   R-squared:                       0.369
Model:                            OLS   Adj. R-squared:                  0.369
Method:                 Least Squares   F-statistic:                 1.012e+04
Date:                Fri, 18 Oct 2019   Prob (F-statistic):               0.00
Time:                        22:30:14   Log-Likelihood:            -2.4210e+05
No. Observations:               17290   AIC:                         4.842e+05
Df Residuals:                   17288   BIC:                         4.842e+05
Df Model:                           1
Covariance Type:            nonrobust
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept           -3.427e+06   3.95e+04    -86.762      0.000    -3.5e+06   -3.35e+06
np.log(sqft_living)  5.255e+05   5224.766    100.580      0.000    5.15e+05    5.36e+05
==============================================================================
Omnibus:                    15529.189   Durbin-Watson:                   2.014
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1270840.426
Skew:                           3.994   Prob(JB):                         0.00
Kurtosis:                      44.234   Cond. No.                         137.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
formula = 'price ~ np.log(sqft_living) + np.power(np.log(sqft_living),2)'
model = smf.ols(formula=formula, data=df_train)
results = model.fit()
print(results.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                  price   R-squared:                       0.499
Model:                            OLS   Adj. R-squared:                  0.499
Method:                 Least Squares   F-statistic:                     8604.
Date:                Fri, 18 Oct 2019   Prob (F-statistic):               0.00
Time:                        22:30:58   Log-Likelihood:            -2.4011e+05
No. Observations:               17290   AIC:                         4.802e+05
Df Residuals:                   17287   BIC:                         4.802e+05
Df Model:                           2
Covariance Type:            nonrobust
====================================================================================================
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercept                         2.648e+07   4.49e+05     59.040      0.000    2.56e+07    2.74e+07
np.log(sqft_living)              -7.435e+06   1.19e+05    -62.422      0.000   -7.67e+06    -7.2e+06
np.power(np.log(sqft_living), 2)   5.28e+05   7893.686     66.886      0.000    5.13e+05    5.43e+05
==============================================================================
Omnibus:                    11316.265   Durbin-Watson:                   2.017
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           387529.438
Skew:                           2.649   Prob(JB):                         0.00
Kurtosis:                      25.580   Cond. No.                     1.36e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.36e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
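PolynomialFeatures was imported at the top but never used; a minimal sketch of the equivalent quadratic-in-log model with scikit-learn, scored on the held-out test split (an illustration, not a reproduction of the statsmodels numbers above):
# Hedged sketch: quadratic in log(sqft_living) via PolynomialFeatures + LinearRegression.
X_tr = np.log(df_train[['sqft_living']].values)
X_te = np.log(df_test[['sqft_living']].values)
poly = PolynomialFeatures(degree=2, include_bias=False)
lr = LinearRegression()
lr.fit(poly.fit_transform(X_tr), df_train['price'].values)
print('test R^2:', lr.score(poly.transform(X_te), df_test['price'].values))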
df_train.columns
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'log_sqft_living'], dtype='object')
features = ['bedrooms', 'bathrooms', 'sqft_living',
'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
'lat', 'long', 'sqft_living15', 'sqft_lot15', 'log_sqft_living']
['log_sqft_living']