This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
Task: Estimate the sale price from the given features.
%load_ext autoreload
%autoreload 2
# my custom module
import bhishan
import src
from bhishan.util_statsmodels import print_statsmodels_summary
from bhishan.util_statsmodels import regression_residual_plots
from bhishan.util_statsmodels import lm_plot
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import time
# random state
SEED = 0
RNG = np.random.RandomState(SEED)
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 100)
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)
import IPython
from IPython.display import display, HTML, Image
print([(x.__name__, x.__version__) for x in [pd,np,sns,matplotlib]])
[('pandas', '0.25.0'), ('numpy', '1.16.4'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]
import scipy
import statsmodels.api as sm
import statsmodels.formula.api as smf
# _, ci_low, ci_high = wls_prediction_std(results)
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import sklearn
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
def show_method_attributes(method, ncols=2):
    """Show the public (lower-case) attributes of a given object.

    Example:
    ========
    show_method_attributes(list)
    """
    attrs = [a for a in dir(method) if a[0].islower()]
    attrs = [a for a in attrs if a not in 'os np pd sys time psycopg2'.split()]
    return pd.DataFrame(np.array_split(attrs, ncols)).T.fillna('')
df = pd.read_csv('../data/raw/kc_house_data.csv')
print(df.shape)
df.head()
(21613, 21)
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | 3 | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
df_train, df_test = train_test_split(df, test_size=0.2,random_state=100)
print(df.shape, df_train.shape, df_test.shape)
df_train.head(2)
(21613, 21) (17290, 21) (4323, 21)
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16000 | 2561340020 | 20140804T000000 | 325000.0 | 3 | 1.75 | 1780 | 11096 | 1.0 | 0 | 0 | 3 | 7 | 1210 | 570 | 1979 | 0 | 98074 | 47.6170 | -122.051 | 1780 | 10640 |
11286 | 8598200070 | 20141208T000000 | 278000.0 | 2 | 2.50 | 1420 | 2229 | 2.0 | 0 | 0 | 3 | 7 | 1420 | 0 | 2004 | 0 | 98059 | 47.4871 | -122.165 | 1500 | 2230 |
# using sm
features = ['sqft_living']
target = ['price']
df_Xtrain = df_train[features]
df_ytrain = df_train[target]
df_Xtest = df_test[features]
df_ytest = df_test[target]
df_X1train = sm.add_constant(df_Xtrain)
model = sm.OLS(df_ytrain, df_X1train )
results = model.fit()
summary = results.summary()
print_statsmodels_summary(summary)
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/numpy/core/fromnumeric.py:2389: FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead. return ptp(axis=axis, out=out, **kwargs)
Dep. Variable: | price | R-squared: | 0.487 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.487 |
Method: | Least Squares | F-statistic: | 1.642e+04 |
Date: | Fri, 18 Oct 2019 | Prob (F-statistic): | 0.00 |
Time: | 21:22:01 | Log-Likelihood: | -2.4030e+05 |
No. Observations: | 17290 | AIC: | 4.806e+05 |
Df Residuals: | 17288 | BIC: | 4.806e+05 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -4.263e+04 | 4963.377 | -8.589 | 0.000 | -5.24e+04 | -3.29e+04 |
sqft_living | 280.6854 | 2.190 | 128.149 | 0.000 | 276.392 | 284.979 |
Omnibus: | 11909.901 | Durbin-Watson: | 2.018 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 419310.893 |
Skew: | 2.854 | Prob(JB): | 0.00 |
Kurtosis: | 26.441 | Cond. No. | 5.62e+03 |
# using smf
formula = 'price ~ sqft_living'
model = smf.ols(formula=formula, data= pd.concat([df_Xtrain,df_ytrain],axis=1))
results = model.fit()
summary = results.summary()
show_method_attributes(results,7)
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | aic | conf_int_el | df_resid | get_influence | mse_resid | resid | t_test |
1 | bic | cov_HC0 | diagn | get_prediction | mse_total | resid_pearson | t_test_pairwise |
2 | bse | cov_HC1 | eigenvals | get_robustcov_results | nobs | rsquared | tvalues |
3 | centered_tss | cov_HC2 | el_test | initialize | normalized_cov_params | rsquared_adj | uncentered_tss |
4 | compare_f_test | cov_HC3 | ess | k_constant | outlier_test | save | use_t |
5 | compare_lm_test | cov_kwds | f_pvalue | llf | params | scale | wald_test |
6 | compare_lr_test | cov_params | f_test | load | predict | ssr | wald_test_terms |
7 | condition_number | cov_type | fittedvalues | model | pvalues | summary | wresid |
8 | conf_int | df_model | fvalue | mse_model | remove_data | summary2 |
results.summary()
Dep. Variable: | price | R-squared: | 0.487 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.487 |
Method: | Least Squares | F-statistic: | 1.642e+04 |
Date: | Fri, 18 Oct 2019 | Prob (F-statistic): | 0.00 |
Time: | 21:22:02 | Log-Likelihood: | -2.4030e+05 |
No. Observations: | 17290 | AIC: | 4.806e+05 |
Df Residuals: | 17288 | BIC: | 4.806e+05 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -4.263e+04 | 4963.377 | -8.589 | 0.000 | -5.24e+04 | -3.29e+04 |
sqft_living | 280.6854 | 2.190 | 128.149 | 0.000 | 276.392 | 284.979 |
Omnibus: | 11909.901 | Durbin-Watson: | 2.018 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 419310.893 |
Skew: | 2.854 | Prob(JB): | 0.00 |
Kurtosis: | 26.441 | Cond. No. | 5.62e+03 |
Yes, the low p-value associated with the t-statistic for sqft_living suggests so.
For a unit (one square foot) increase in sqft_living, our model predicts that price will increase by about 280.69.
Positive.
The predicted price is 518741.86 and the confidence interval is [500426.67775749206, 537057.0363632903].
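Aside: statsmodels can also report the point prediction and its intervals directly via `get_prediction`; a minimal sketch, assuming the fitted `results` object above (the name `new_house` is just illustrative, and the reported interval need not exactly match the rough one quoted above, which pairs the coefficient bounds).
# Hedged sketch: let statsmodels compute the prediction and its intervals.
new_house = pd.DataFrame({'sqft_living': [2000]})
pred = results.get_prediction(new_house)
# mean_ci_lower / mean_ci_upper: confidence interval for the mean response;
# obs_ci_lower / obs_ci_upper: wider prediction interval for a single sale.
print(pred.summary_frame(alpha=0.05))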
results.params
Intercept     -42628.976515
sqft_living      280.685417
dtype: float64
value = 2000
ans = -42628.976515 + 280.685417 * value
round(ans,2)
518741.86
df.loc[:2,'sqft_living']
0    1180
1    2570
2     770
Name: sqft_living, dtype: int64
text = " y = {:.2f} x + ({:.2f})".format(*results.params.values[::-1])
text
' y = 280.69 x + (-42628.98)'
value = 2000
X1_test = np.array([1, value])  # order must match results.params: [Intercept, sqft_living]
X1_test.shape
(2,)
def predict(params, X1):
    """Dot product of params [Intercept, sqft_living] with X1 = [1, value]."""
    return params.T @ X1

def get_confidence_interval(X1, results):
    # Rough interval: evaluate the prediction at the lower bounds of both coefficients,
    # then at the upper bounds (see get_prediction above for a formal interval).
    ci_min = results.conf_int(alpha=0.05)[0]   # lower bounds of Intercept and slope
    ci_max = results.conf_int(alpha=0.05)[1]   # upper bounds of Intercept and slope
    confidence_interval = [predict(ci_min, X1), predict(ci_max, X1)]
    print(confidence_interval)
formula = 'price ~ sqft_living'
model = smf.ols(formula=formula, data= pd.concat([df_Xtrain,df_ytrain],axis=1))
results = model.fit()
get_confidence_interval(X1_test,results)
[500426.67775749206, 537057.0363632903]
results.rsquared, results.rsquared_adj
(0.4871565982411399, 0.48712693353719727)
df_Xtest.shape
(4323, 1)
df_X1test = df_Xtest.assign(const=1)
df_X1test.head(2)
sqft_living | const | |
---|---|---|
19836 | 2437 | 1 |
10442 | 1560 | 1 |
df_ypreds = results.predict(df_X1test)
df_ypreds.shape, df_ytest.shape
((4323,), (4323, 1))
df_ypreds.head(2)
19836    641401.384197
10442    395240.273674
dtype: float64
df_ytrain.shape, df_ytest.shape
((17290, 1), (4323, 1))
df_ytest.head()
price | |
---|---|
19836 | 285000.0 |
10442 | 239950.0 |
20548 | 460000.0 |
11014 | 397500.0 |
4138 | 545000.0 |
df_comparison = df_ytest.copy(deep=True)
df_comparison['ypred'] = df_ypreds
df_comparison['error'] = df_comparison['price'] - df_comparison['ypred']
df_comparison.head()
price | ypred | error | |
---|---|---|---|
19836 | 285000.0 | 641401.384197 | -356401.384197 |
10442 | 239950.0 | 395240.273674 | -155290.273674 |
20548 | 460000.0 | 628209.169608 | -168209.169608 |
11014 | 397500.0 | 372785.440331 | 24714.559669 |
4138 | 545000.0 | 485059.607046 | 59940.392954 |
mse = df_comparison['error'].pow(2).mean()
mse  # this is very large, but it is in squared dollars; the RMSE below is easier to interpret.
65286065304.1562
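Since the MSE is in squared dollars, here is a small sketch (using `sklearn.metrics` and the `df_comparison` frame built above) of reporting RMSE and MAE on the dollar scale, which are easier to read:
# Hedged sketch: error metrics on the original dollar scale.
from sklearn.metrics import mean_squared_error, mean_absolute_error
mse = mean_squared_error(df_comparison['price'], df_comparison['ypred'])
rmse = np.sqrt(mse)   # roughly 255,512 dollars for the MSE printed above
mae = mean_absolute_error(df_comparison['price'], df_comparison['ypred'])
print(f'RMSE = {rmse:,.0f}, MAE = {mae:,.0f}')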
plt.scatter(x=df_comparison['price'], y=df_comparison['ypred'], color='b')  # predicted vs actual price
plt.xticks(rotation=90)
plt.tight_layout()
plt.plot([0, 7_000_000], [0, 7_000_000], c='r')  # y = x reference line
[<matplotlib.lines.Line2D at 0x1180a9630>]
from bhishan.util_statsmodels import lm_plot
df_ypreds_train = results.predict(df_X1train)
lm_plot(df_X1train, df_ytrain, df_ypreds_train)
1. Externally Studentised Residual Plot (Outliers Test):
   - The horizontal red dashed lines are studentised t values t = ± 3.
   - Points outside t = ± 3 may be considered outliers.
   - If the fitted solid blue line is U-shaped, our data is non-linear; we might need to transform features or include polynomial features.
2. Normal Q-Q Plot (Test of Normality):
   - If the fitted points align with the 45-degree line, the assumption of normality is likely to hold.
3. Scale-Location Plot (Test of Constant Variance, homoskedasticity):
   - Small residuals on the y-axis are better.
   - If we see a conical shape, the data is heteroskedastic.
   - If data points are clustered at the left or right, we observe heteroskedasticity.
   - If we see a few outliers with high y-values, we have high-variance residual outliers.
4. Residuals vs Leverage Plot (Outliers Test):
   - Studentised residuals larger than 3 are potential outliers.
   - High-leverage points are potential outliers.
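`lm_plot` is a helper from the author's custom module; a minimal sketch of how the externally studentised residuals behind plots 1 and 4 could be pulled directly from statsmodels, assuming the fitted `results` object above:
# Hedged sketch: studentised residuals straight from the influence object.
influence = results.get_influence()
student_resid = influence.resid_studentized_external   # externally studentised residuals
print('points beyond |t| = 3:', (np.abs(student_resid) > 3).sum())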
from bhishan.util_statsmodels import regression_residual_plots
formula = 'price ~ sqft_living'
model = smf.ols(formula=formula, data=df_train)
results = model.fit()
regression_residual_plots(results, dependent_var="price",
data=df_train[['sqft_living','price']],
annotate_outliers=False,verbose=False)
feature = 'sqft_living'
fig = plt.figure(figsize=(15,8))
fig = sm.graphics.plot_regress_exog(results, feature, fig=fig)
from statsmodels.sandbox.regression.predstd import wls_prediction_std
# data
x = df_Xtrain[['sqft_living']].values.reshape(-1,1)
y = df_ytrain.values.reshape(-1,1)
best_fit = results.fittedvalues.values.reshape(-1,1)
# figure
fig, ax = plt.subplots(figsize=(10,7))
# text
text = " y = {:.2f} x + ({:.2f})".format(*results.params.values[::-1])
ax.text(0, 7_000_000, text)
ax.plot(x, y, 'o', label="data")
ax.plot(x, best_fit, 'g--.', label="OLS")
# prediction interval band (for individual observations) from wls_prediction_std
_, ci_low, ci_high = wls_prediction_std(results)
ax.plot(x, ci_high, 'r--')
ax.plot(x, ci_low, 'r--')
# plot legend
ax.legend(loc='best')
plt.tight_layout()
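A possible refinement (a sketch, not part of the original cell): the rows of df_Xtrain are not sorted by sqft_living, so the dashed interval lines above criss-cross; sorting by x first gives a cleaner band.
# Hedged sketch: sort by sqft_living before plotting the interval band.
order = np.argsort(x.ravel())
xs = x.ravel()[order]
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(x, y, 'o', label='data')
ax.plot(xs, best_fit.ravel()[order], 'g--', label='OLS fit')
ax.plot(xs, ci_high[order], 'r--', label='95% prediction interval')
ax.plot(xs, ci_low[order], 'r--')
ax.legend(loc='best')
plt.tight_layout()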
df_Xtrain.shape, df_ytrain.shape
((17290, 1), (17290, 1))
def sns_regression_plot():
    plt.figure(figsize=(12, 8))
    p = sns.regplot(x='sqft_living', y='price',
                    data=pd.concat([df_Xtrain, df_ytrain], axis=1),
                    line_kws={'color': 'g'})
    # recover slope and intercept from the plotted regression line
    slope, intercept, r_value, p_value, std_err = \
        scipy.stats.linregress(x=p.get_lines()[0].get_xdata(),
                               y=p.get_lines()[0].get_ydata())
    text = 'y = {:.2f} x + ({:.2f})'.format(slope, intercept)
    plt.text(0, 7_000_000, text, size=24, alpha=0.5)
    plt.tight_layout()
    plt.show()
from bhishan.util_statsmodels import regression_residual_plots
formula = 'price ~ sqft_living'
model = smf.ols(formula=formula, data=df)
results = model.fit()
regression_residual_plots(results, dependent_var="price",data=df[['sqft_living','price']])
1. Residuals vs Fitted Plot (Test of linearity):
   - If the solid red line lies along y = 0, the assumption of linearity may hold.
2. Normal Q-Q Plot (Test of Normality):
   - If the fitted points align with the 45-degree line, the assumption of normality may hold.
3. Scale-Location Plot (Test of Constant Variance, homoskedasticity):
   - Small residuals on the y-axis are better.
   - If data points are clustered at the left or right, we observe heteroskedasticity.
   - If we see a few outliers with high y-values, we have high-variance residual outliers.
4. Residuals vs Leverage Plot (Cook's test of outliers):
   - The solid red line is the leverage best-fit line.
   - The dashed red line is Cook's distance for 0.5 * p * (1 - x) / x.
   - The dotted red line is Cook's distance for 1 * p * (1 - x) / x.
   - p is the number of model parameters; x is the plot values np.linspace(0.001, 0.200, 50).
   - If a point lies beyond the farthest dotted red line, it has high leverage on the fit and may be considered an outlier.
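`regression_residual_plots` is a custom helper; a minimal sketch of computing the Cook's distances that drive plot 4 directly from statsmodels (the 4/n cut-off used here is a common rough rule of thumb, not something the original notebook uses):
# Hedged sketch: Cook's distances from the influence object of the current fit.
influence = results.get_influence()
cooks_d, cooks_p = influence.cooks_distance
print('points above the 4/n cut-off:', (cooks_d > 4 / len(cooks_d)).sum())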
df_train.head()
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16000 | 2561340020 | 20140804T000000 | 325000.0 | 3 | 1.75 | 1780 | 11096 | 1.0 | 0 | 0 | 3 | 7 | 1210 | 570 | 1979 | 0 | 98074 | 47.6170 | -122.051 | 1780 | 10640 |
11286 | 8598200070 | 20141208T000000 | 278000.0 | 2 | 2.50 | 1420 | 2229 | 2.0 | 0 | 0 | 3 | 7 | 1420 | 0 | 2004 | 0 | 98059 | 47.4871 | -122.165 | 1500 | 2230 |
3201 | 6788200931 | 20140520T000000 | 710000.0 | 2 | 1.00 | 1790 | 4000 | 1.0 | 0 | 0 | 4 | 7 | 1040 | 750 | 1923 | 0 | 98112 | 47.6405 | -122.301 | 1310 | 4000 |
11049 | 3023059012 | 20140910T000000 | 389900.0 | 4 | 1.00 | 1710 | 117176 | 1.5 | 0 | 0 | 4 | 6 | 1710 | 0 | 1942 | 0 | 98055 | 47.4497 | -122.212 | 1940 | 12223 |
9716 | 5683500030 | 20150320T000000 | 489000.0 | 4 | 1.00 | 1150 | 5217 | 1.5 | 0 | 0 | 3 | 7 | 1150 | 0 | 1951 | 0 | 98115 | 47.6806 | -122.287 | 1220 | 5217 |
model_fit = results
model_norm_residuals = model_fit.get_influence().resid_studentized_internal
# Annotations of Outliers
abs_norm_resid = np.flip(np.argsort(np.abs(model_norm_residuals)), 0)
abs_norm_resid_top_3 = abs_norm_resid[:3]
for i in abs_norm_resid_top_3:
print(i)
7252
3914
9254
from bhishan.util_statsmodels import regression_residual_plots
formula = 'price ~ sqft_living'
model = smf.ols(formula=formula, data=df_train)
results = model.fit()
regression_residual_plots(results, dependent_var="price",data=df_train[['sqft_living','price']],annotate_outliers=True)
(Same interpretation notes as printed above.)
# lets try log transformation and plot again
df_train['log_sqft_living'] = np.log(df_train['sqft_living'].values)
features = ['log_sqft_living']
target = ['price']
df_X = df_train[features]
df_y = df_train[target]
df_X1 = df_X.assign(const=1)
model = sm.OLS(df_y, df_X1 )
results = model.fit()
from bhishan.util_statsmodels import regression_residual_plots
regression_residual_plots(results, dependent_var="price",
data= df_train[['log_sqft_living','price']],
verbose=False,
annotate_outliers=False,
ofile='../reports/figures/regression_residual_plots.png')
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
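One way to avoid the SettingWithCopyWarning above (a sketch, assuming df_train is meant to be an independent copy of the split) is to copy the slice explicitly before adding the derived column:
# Hedged sketch: an explicit copy makes the column assignment unambiguous.
df_train = df_train.copy()
df_train['log_sqft_living'] = np.log(df_train['sqft_living'])
# equivalently: df_train = df_train.assign(log_sqft_living=np.log(df_train['sqft_living']))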
# With the formula API, patsy adds the intercept automatically, so we do not need to add a constant term ourselves.
formula = 'price ~ np.log(sqft_living)'
model = smf.ols(formula=formula, data=df_train)
results = model.fit()
print(results.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                  price   R-squared:                       0.369
Model:                            OLS   Adj. R-squared:                  0.369
Method:                 Least Squares   F-statistic:                 1.012e+04
Date:                Fri, 18 Oct 2019   Prob (F-statistic):               0.00
Time:                        22:27:32   Log-Likelihood:            -2.4210e+05
No. Observations:               17290   AIC:                         4.842e+05
Df Residuals:                   17288   BIC:                         4.842e+05
Df Model:                           1
Covariance Type:            nonrobust
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept           -3.427e+06   3.95e+04    -86.762      0.000    -3.5e+06   -3.35e+06
np.log(sqft_living)  5.255e+05   5224.766    100.580      0.000    5.15e+05    5.36e+05
==============================================================================
Omnibus:                    15529.189   Durbin-Watson:                   2.014
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1270840.426
Skew:                           3.994   Prob(JB):                         0.00
Kurtosis:                      44.234   Cond. No.                         137.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
df_train['log_sqft_living'] = np.log(df_train['sqft_living'].values)
features = ['log_sqft_living']
target = ['price']
df_X = df_train[features]
df_y = df_train[target]
df_X1 = df_X.assign(const=1)
model = sm.OLS(df_y, df_X1 )
results = model.fit()
print(results.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                  price   R-squared:                       0.369
Model:                            OLS   Adj. R-squared:                  0.369
Method:                 Least Squares   F-statistic:                 1.012e+04
Date:                Fri, 18 Oct 2019   Prob (F-statistic):               0.00
Time:                        22:25:20   Log-Likelihood:            -2.4210e+05
No. Observations:               17290   AIC:                         4.842e+05
Df Residuals:                   17288   BIC:                         4.842e+05
Df Model:                           1
Covariance Type:            nonrobust
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
log_sqft_living  5.255e+05   5224.766    100.580      0.000    5.15e+05    5.36e+05
const           -3.427e+06   3.95e+04    -86.762      0.000    -3.5e+06   -3.35e+06
==============================================================================
Omnibus:                    15529.189   Durbin-Watson:                   2.014
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1270840.426
Skew:                           3.994   Prob(JB):                         0.00
Kurtosis:                      44.234   Cond. No.                         137.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy """Entry point for launching an IPython kernel.
# linearity still does not hold well
# normality still does not hold well
# heteroskedasticity is slightly improved
# there are no high-leverage outlier points
# we may need to include more than one feature to estimate price,
# rather than only the log of the living area.
from bhishan.util_statsmodels import lm_residual_corr_plot
lm_residual_corr_plot(df_X1train, df_ytrain, df_ypreds_train)
1. Correlation of Error Terms (Test of error independence):
   - If the magnitude of the errors increases or decreases as we move along the x-axis, the error terms may be correlated.
   - This could mean that our estimated standard errors underestimate the true standard errors.
   - Our confidence and prediction intervals may then be narrower than they should be.
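`lm_residual_corr_plot` is a custom helper; the same question can be checked numerically with the Durbin-Watson statistic (a sketch using the training residuals computed from `df_ypreds_train` above; values near 2 indicate little lag-1 autocorrelation):
# Hedged sketch: Durbin-Watson on the training residuals of the simple model.
from statsmodels.stats.stattools import durbin_watson
resid_train = df_ytrain['price'].values - np.asarray(df_ypreds_train)
print('Durbin-Watson:', durbin_watson(resid_train))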
df.head(2)
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | 3 | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
df.columns
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'], dtype='object')
features = ['bedrooms', 'bathrooms',
'sqft_living','sqft_lot', 'sqft_above',
'sqft_living15', 'sqft_lot15',
'floors', 'waterfront', 'view', 'condition', 'grade',
'yr_built', 'yr_renovated', 'zipcode',
'lat', 'long' ]
formula = 'price ~ ' + ' + '.join(features)
formula
'price ~ bedrooms + bathrooms + sqft_living + sqft_lot + sqft_above + sqft_living15 + sqft_lot15 + floors + waterfront + view + condition + grade + yr_built + yr_renovated + zipcode + lat + long'
formula = 'price ~ ' + ' + '.join(features)
model = smf.ols(formula=formula, data=df_train)
results = model.fit()
print_statsmodels_summary(results.summary())
1. R-squared (Coefficient of Determination): https://www.wikiwand.com/en/Coefficient_of_determination
   - R-squared = 1 - (SS_res / SS_tot), where SS_res = sum (y_i - f_i)**2 = sum(e_i**2) and SS_tot = sum (y_i - y_bar)**2.
   - If R2 = 0.49, then 49% of the variability of the dependent variable has been accounted for, and the remaining 51% of the variability is still unaccounted for.
2. Adjusted R2 (R bar squared):
   - R_bar_squared = 1 - (1 - R2) * (n - 1) / (n - p - 1)
   - The adjusted R2 can be negative.
   - It is always less than or equal to R2.
   - Adjusted R2 can be interpreted as an unbiased (or less biased) estimator of the population R2, whereas R2 is a positively biased estimate of the population value. When p increases, R2 increases, but R_bar_squared may not.
3. F-statistic: https://www.wikiwand.com/en/F-test
   - The one-way ANOVA F-test statistic is F = explained variance / unexplained variance.
   - When there are only two groups for the one-way ANOVA F-test, F = t**2, where t is the Student's t statistic.
   - For two nested models 1 and 2, F = [(RSS1 - RSS2) / (p2 - p1)] / [RSS2 / (n - p2)].
4. AIC: https://www.wikiwand.com/en/Akaike_information_criterion
   - Akaike Information Criterion: AIC = 2k - 2 ln(L), where k is the number of model parameters and L is the maximized likelihood.
   - Adjusts the log-likelihood based on the number of observations and the complexity of the model.
   - Penalizes the model selection metric when more independent variables are added.
5. BIC: https://www.wikiwand.com/en/Bayesian_information_criterion
   - Bayesian Information Criterion: BIC = ln(n) * k - 2 ln(L), where k is the number of model parameters and L is the maximized likelihood.
   - Similar to the AIC, but with a stronger penalty for models with more parameters.
   - BIC is only valid when the sample size n is much larger than the number k of parameters in the model.
6. P > |t|: https://www.wikiwand.com/en/P-value
   - The p-value is the probability of observing a t-statistic at least this extreme if the null hypothesis (the coefficient equals 0) is true.
   - If it is less than the significance level, often 0.05, there is a statistically significant relationship between the predictor and the response.
7. Skewness: https://www.wikiwand.com/en/Skewness
   - A measure of the symmetry of the data about the mean.
   - Normally distributed errors should be symmetrically distributed about the mean.
   - The normal distribution has 0 skew.
8. Kurtosis: https://www.wikiwand.com/en/Kurtosis
   - A measure of the shape of the distribution.
   - The normal distribution has a kurtosis of 3.
   - If kurtosis is greater than 3, the curve is tall and peaked.
9. Omnibus (D'Agostino's test): https://www.wikiwand.com/en/D%27Agostino%27s_K-squared_test
   - A combined statistical test for the presence of skewness and kurtosis.
   - A goodness-of-fit measure of departure from normality: the test aims to establish whether or not the given sample comes from a normally distributed population.
   - The test is based on transformations of the sample kurtosis and skewness, and has power only against the alternatives that the distribution is skewed and/or kurtic.
10. Jarque-Bera: https://www.wikiwand.com/en/Jarque%E2%80%93Bera_test
    - Another test of skewness and kurtosis: a goodness-of-fit test of whether sample data have the skewness and kurtosis matching a normal distribution.
    - The test is named after Carlos Jarque and Anil K. Bera.
    - The test statistic is always nonnegative; if it is far from zero, it signals that the data do not have a normal distribution.
11. Durbin-Watson: https://www.wikiwand.com/en/Durbin%E2%80%93Watson_statistic
    - A test statistic used to detect the presence of autocorrelation at lag 1 in the residuals.
    - Often important in time-series analysis.
    - A similar assessment can also be carried out with the Breusch-Godfrey test and the Ljung-Box test.
12. Cond. No.: https://www.wikiwand.com/en/Condition_number
    - The condition number of a function measures how much the output can change for a small change in the input, i.e. how sensitive the function is to changes or errors in the input.
    - In linear regression the condition number of the moment matrix can be used as a diagnostic for multicollinearity.
    - A problem with a low condition number is said to be well-conditioned; one with a high condition number is ill-conditioned.
Dep. Variable: | price | R-squared: | 0.699 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.698 |
Method: | Least Squares | F-statistic: | 2356. |
Date: | Fri, 18 Oct 2019 | Prob (F-statistic): | 0.00 |
Time: | 23:37:09 | Log-Likelihood: | -2.3571e+05 |
No. Observations: | 17290 | AIC: | 4.714e+05 |
Df Residuals: | 17272 | BIC: | 4.716e+05 |
Df Model: | 17 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 5.497e+06 | 3.3e+06 | 1.663 | 0.096 | -9.81e+05 | 1.2e+07 |
bedrooms | -3.505e+04 | 2100.847 | -16.681 | 0.000 | -3.92e+04 | -3.09e+04 |
bathrooms | 4.746e+04 | 3658.519 | 12.973 | 0.000 | 4.03e+04 | 5.46e+04 |
sqft_living | 142.2869 | 4.947 | 28.763 | 0.000 | 132.591 | 151.983 |
sqft_lot | 0.1045 | 0.051 | 2.062 | 0.039 | 0.005 | 0.204 |
sqft_above | 35.0195 | 4.916 | 7.123 | 0.000 | 25.383 | 44.656 |
sqft_living15 | 21.1896 | 3.882 | 5.458 | 0.000 | 13.580 | 28.799 |
sqft_lot15 | -0.3449 | 0.078 | -4.419 | 0.000 | -0.498 | -0.192 |
floors | 6793.8805 | 4035.403 | 1.684 | 0.092 | -1115.918 | 1.47e+04 |
waterfront | 5.901e+05 | 1.9e+04 | 31.006 | 0.000 | 5.53e+05 | 6.27e+05 |
view | 5.197e+04 | 2410.166 | 21.563 | 0.000 | 4.72e+04 | 5.67e+04 |
condition | 2.755e+04 | 2635.735 | 10.451 | 0.000 | 2.24e+04 | 3.27e+04 |
grade | 9.735e+04 | 2397.894 | 40.597 | 0.000 | 9.26e+04 | 1.02e+05 |
yr_built | -2747.9351 | 80.962 | -33.941 | 0.000 | -2906.628 | -2589.242 |
yr_renovated | 10.0597 | 4.130 | 2.436 | 0.015 | 1.964 | 18.156 |
zipcode | -555.7652 | 37.192 | -14.943 | 0.000 | -628.664 | -482.866 |
lat | 6.003e+05 | 1.2e+04 | 49.907 | 0.000 | 5.77e+05 | 6.24e+05 |
long | -2.059e+05 | 1.46e+04 | -14.065 | 0.000 | -2.35e+05 | -1.77e+05 |
Omnibus: | 14886.668 | Durbin-Watson: | 2.036 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1534176.331 |
Skew: | 3.632 | Prob(JB): | 0.00 |
Kurtosis: | 48.572 | Cond. No. | 2.17e+08 |
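The AIC and BIC formulas quoted in the notes above can be sanity-checked against what statsmodels reports; a small sketch, assuming the multi-feature `results` object fitted above:
# Hedged sketch: recompute AIC/BIC from the log-likelihood and compare.
k = results.df_model + 1            # parameters, including the intercept
aic_manual = 2 * k - 2 * results.llf
bic_manual = np.log(results.nobs) * k - 2 * results.llf
print(aic_manual, results.aic)
print(bic_manual, results.bic)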
# using sm.OLS
target = ['price']
df_Xtrain = df_train[features]
df_ytrain = df_train[target]
df_Xtest = df_test[features]
df_ytest = df_test[target]
df_X1train = sm.add_constant(df_Xtrain)
model = sm.OLS(df_ytrain, df_X1train )
results = model.fit()
print_statsmodels_summary(results.summary(),verbose=False)
/Users/poudel/miniconda3/envs/dataSc/lib/python3.7/site-packages/numpy/core/fromnumeric.py:2389: FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead. return ptp(axis=axis, out=out, **kwargs)
Dep. Variable: | price | R-squared: | 0.699 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.698 |
Method: | Least Squares | F-statistic: | 2356. |
Date: | Fri, 18 Oct 2019 | Prob (F-statistic): | 0.00 |
Time: | 23:39:08 | Log-Likelihood: | -2.3571e+05 |
No. Observations: | 17290 | AIC: | 4.714e+05 |
Df Residuals: | 17272 | BIC: | 4.716e+05 |
Df Model: | 17 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 5.497e+06 | 3.3e+06 | 1.663 | 0.096 | -9.81e+05 | 1.2e+07 |
bedrooms | -3.505e+04 | 2100.847 | -16.681 | 0.000 | -3.92e+04 | -3.09e+04 |
bathrooms | 4.746e+04 | 3658.519 | 12.973 | 0.000 | 4.03e+04 | 5.46e+04 |
sqft_living | 142.2869 | 4.947 | 28.763 | 0.000 | 132.591 | 151.983 |
sqft_lot | 0.1045 | 0.051 | 2.062 | 0.039 | 0.005 | 0.204 |
sqft_above | 35.0195 | 4.916 | 7.123 | 0.000 | 25.383 | 44.656 |
sqft_living15 | 21.1896 | 3.882 | 5.458 | 0.000 | 13.580 | 28.799 |
sqft_lot15 | -0.3449 | 0.078 | -4.419 | 0.000 | -0.498 | -0.192 |
floors | 6793.8805 | 4035.403 | 1.684 | 0.092 | -1115.918 | 1.47e+04 |
waterfront | 5.901e+05 | 1.9e+04 | 31.006 | 0.000 | 5.53e+05 | 6.27e+05 |
view | 5.197e+04 | 2410.166 | 21.563 | 0.000 | 4.72e+04 | 5.67e+04 |
condition | 2.755e+04 | 2635.735 | 10.451 | 0.000 | 2.24e+04 | 3.27e+04 |
grade | 9.735e+04 | 2397.894 | 40.597 | 0.000 | 9.26e+04 | 1.02e+05 |
yr_built | -2747.9351 | 80.962 | -33.941 | 0.000 | -2906.628 | -2589.242 |
yr_renovated | 10.0597 | 4.130 | 2.436 | 0.015 | 1.964 | 18.156 |
zipcode | -555.7652 | 37.192 | -14.943 | 0.000 | -628.664 | -482.866 |
lat | 6.003e+05 | 1.2e+04 | 49.907 | 0.000 | 5.77e+05 | 6.24e+05 |
long | -2.059e+05 | 1.46e+04 | -14.065 | 0.000 | -2.35e+05 | -1.77e+05 |
Omnibus: | 14886.668 | Durbin-Watson: | 2.036 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1534176.331 |
Skew: | 3.632 | Prob(JB): | 0.00 |
Kurtosis: | 48.572 | Cond. No. | 2.17e+08 |
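The very large condition number (2.17e+08) hints at multicollinearity among the predictors; a minimal sketch of checking this with variance inflation factors on the design matrix df_X1train built above (the VIF of the constant column can be ignored):
# Hedged sketch: VIF per column of the design matrix.
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.Series([variance_inflation_factor(df_X1train.values, i)
                 for i in range(df_X1train.shape[1])],
                index=df_X1train.columns)
print(vif.sort_values(ascending=False).head(10))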
results.rsquared, results.rsquared_adj
(0.6987007086088071, 0.6984041541881465)
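The R-squared above is computed on the training split only; a short sketch (an addition, not in the original notebook) of scoring the 17-feature model on the held-out test set:
# Hedged sketch: out-of-sample RMSE for the multi-feature model.
df_X1test = sm.add_constant(df_Xtest)        # note: this redefines the earlier one-feature frame
ypred_test = np.asarray(results.predict(df_X1test))
rmse_test = np.sqrt(np.mean((df_ytest['price'].values - ypred_test) ** 2))
print(f'test RMSE = {rmse_test:,.0f}')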
formula = 'price ~ np.log(sqft_living)'
model = smf.ols(formula=formula, data=df_train)
results = model.fit()
print(results.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                  price   R-squared:                       0.369
Model:                            OLS   Adj. R-squared:                  0.369
Method:                 Least Squares   F-statistic:                 1.012e+04
Date:                Fri, 18 Oct 2019   Prob (F-statistic):               0.00
Time:                        22:30:14   Log-Likelihood:            -2.4210e+05
No. Observations:               17290   AIC:                         4.842e+05
Df Residuals:                   17288   BIC:                         4.842e+05
Df Model:                           1
Covariance Type:            nonrobust
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept           -3.427e+06   3.95e+04    -86.762      0.000    -3.5e+06   -3.35e+06
np.log(sqft_living)  5.255e+05   5224.766    100.580      0.000    5.15e+05    5.36e+05
==============================================================================
Omnibus:                    15529.189   Durbin-Watson:                   2.014
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1270840.426
Skew:                           3.994   Prob(JB):                         0.00
Kurtosis:                      44.234   Cond. No.                         137.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
formula = 'price ~ np.log(sqft_living) + np.power(np.log(sqft_living),2)'
model = smf.ols(formula=formula, data=df_train)
results = model.fit()
print(results.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                  price   R-squared:                       0.499
Model:                            OLS   Adj. R-squared:                  0.499
Method:                 Least Squares   F-statistic:                     8604.
Date:                Fri, 18 Oct 2019   Prob (F-statistic):               0.00
Time:                        22:30:58   Log-Likelihood:            -2.4011e+05
No. Observations:               17290   AIC:                         4.802e+05
Df Residuals:                   17287   BIC:                         4.802e+05
Df Model:                           2
Covariance Type:            nonrobust
====================================================================================================
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercept                         2.648e+07   4.49e+05     59.040      0.000    2.56e+07    2.74e+07
np.log(sqft_living)              -7.435e+06   1.19e+05    -62.422      0.000   -7.67e+06    -7.2e+06
np.power(np.log(sqft_living), 2)   5.28e+05   7893.686     66.886      0.000    5.13e+05    5.43e+05
==============================================================================
Omnibus:                    11316.265   Durbin-Watson:                   2.017
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           387529.438
Skew:                           2.649   Prob(JB):                         0.00
Kurtosis:                      25.580   Cond. No.                     1.36e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.36e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
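PolynomialFeatures was imported at the top but never used; a minimal sketch of the equivalent quadratic-in-log model with scikit-learn, scored on the held-out test split (an illustration, not a reproduction of the statsmodels numbers above):
# Hedged sketch: quadratic in log(sqft_living) via PolynomialFeatures + LinearRegression.
X_tr = np.log(df_train[['sqft_living']].values)
X_te = np.log(df_test[['sqft_living']].values)
poly = PolynomialFeatures(degree=2, include_bias=False)
lr = LinearRegression()
lr.fit(poly.fit_transform(X_tr), df_train['price'].values)
print('test R^2:', lr.score(poly.transform(X_te), df_test['price'].values))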
df_train.columns
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'log_sqft_living'], dtype='object')
features = ['bedrooms', 'bathrooms', 'sqft_living',
'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
'lat', 'long', 'sqft_living15', 'sqft_lot15', 'log_sqft_living']
['log_sqft_living']